In [136]:
import gymnasium as gym
from gymnasium.spaces import MultiDiscrete, Discrete
from gymnasium import Env
import numpy as np
from itertools import permutations

# Candidate codes: every ordered pair of two distinct digits drawn from 0-3,
# stored as an integer (so a leading-zero code such as "01" is stored as 1).
numbers = [10 * tens + ones for tens, ones in permutations(range(4), 2)]

# A - digit was explored and was at the correct position
# B - digit was explored but was at the wrong position
# N - digit was explored but was not in the correct number
def determine_ABN(a, b):
    """Classify each digit of the two-digit number ``a`` against ``b``.

    Both numbers are read as (tens, ones) pairs. Walking the digits of ``a``
    from the tens position to the ones position, each digit is placed in
    exactly one list:

    * A - ``b`` has the same digit at the same position
    * B - the digit occurs in ``b`` but at the other position
    * N - the digit does not occur in ``b``

    Returns:
        Tuple ``(A, B, N)`` of lists holding the digit values (not positions),
        in the order they appear in ``a``.
    """
    probe = divmod(a, 10)
    reference = divmod(b, 10)
    buckets = {"A": [], "B": [], "N": []}
    for position, value in enumerate(probe):
        if reference[position] == value:
            label = "A"
        elif value in reference:
            label = "B"
        else:
            label = "N"
        buckets[label].append(value)
    return buckets["A"], buckets["B"], buckets["N"]

# Number of per-digit slots in the environment state. The state is indexed by
# digit value, so this matches the digit range 0-3 used to build `numbers`.
num_digits = 4
# Size of the remaining-guesses counter in the observation space. The counter
# starts at total_guesses - 1, i.e. the agent gets three guesses per episode.
total_guesses = 3 + 1

class BullsnCows(Env):
    '''
    Reduced Bulls-and-Cows guessing game exposed as a Gymnasium environment.

    State Space:
        Integer array of length num_digits + 1.
        state[d] for d in 0..num_digits-1 is the best knowledge about digit d:
            0 - unexplored
            1 - N: digit was guessed and is not in the secret
            2 - B: digit is in the secret, but was guessed at the wrong position
            3 - A: digit was guessed at its correct position
        state[-1] is the number of guesses remaining.
    Action Space:
        Index into the module-level `numbers` list (the guessed code).
    Reward:
        +1 once both digits of the secret are marked 3 (A)
        -1 if that has not happened when the guesses run out
        0 otherwise
    '''

    # Gymnasium reads the "render_modes" key (the legacy gym key was "render.modes").
    metadata = {"render_modes": ["human"]}

    def __init__(self):
        super().__init__()
        # state space - one 4-valued slot per digit plus the remaining-guess counter
        self.observation_space = MultiDiscrete(np.array([4] * num_digits + [total_guesses]))
        # action space - size is total number of possible numbers
        self.action_space = Discrete(len(numbers))
        self._start_episode()

    def _start_episode(self):
        """Initialise all per-episode bookkeeping and draw a new secret."""
        # guessed actions, their digits and the ABN snapshot after each guess (for render)
        self.guesses = []
        self.guesses_digits = []
        self.explored_states = []

        # all digits start unexplored; the last slot counts the remaining guesses
        self.state = np.zeros(num_digits + 1, dtype=np.int64)
        self.state[-1] = total_guesses - 1
        # per-digit result of the most recent guess touching that digit (for render)
        self.ABN = np.zeros(num_digits)

        # Draw the secret from the environment's own generator so that
        # reset(seed=...) makes episodes reproducible.
        self.correct_word = int(self.np_random.choice(numbers))
        self.correct_digits = [self.correct_word // 10, self.correct_word % 10]

        self.terminated = False
        self.truncated = False

    def _solved(self):
        """Return True when both digits of the secret are marked 3 (A) in the state."""
        return np.all([self.state[d] == 3 for d in self.correct_digits])

    def step(self, action):
        """Apply one guess.

        Args:
            action: index into `numbers` selecting the guessed code.

        Returns:
            (observation, reward, terminated, truncated, info) where the
            observation is a copy of the internal state array.
        """
        self.guesses.append(action)

        guess = numbers[action]
        guess_digits = [guess // 10, guess % 10]
        self.guesses_digits.append(guess_digits)

        # determine_ABN returns digit VALUES; the state is indexed by digit value.
        A, B, N = determine_ABN(guess, self.correct_word)
        # A - correct digit at the correct position
        for digit in A:
            self.ABN[digit] = 3
            self.state[digit] = 3
        # B - correct digit at the wrong position
        for digit in B:
            self.ABN[digit] = 2
            # a digit already located (3) keeps that stronger knowledge
            if self.state[digit] != 3:
                self.state[digit] = 2
        # N - digit is not part of the secret
        for digit in N:
            self.ABN[digit] = 1
            self.state[digit] = 1

        # one guess used up
        self.state[-1] -= 1
        self.explored_states.append(self.ABN.copy())

        # The win check comes first, so solving on the last guess still earns +1.
        if self._solved():
            reward = 1
            self.terminated = True
        elif self.state[-1] == 0:
            reward = -1
            self.terminated = True
        else:
            reward = 0
        # Return a copy so later in-place updates do not alter observations
        # the caller has already stored.
        return self.state.copy(), reward, self.terminated, self.truncated, {}

    def reset(self, seed=None, options=None):
        """Start a new episode and return (observation, info)."""
        # seeds self.np_random when a seed is given
        super().reset(seed=seed)
        self._start_episode()
        return self.state.copy(), {}

    def render(self, mode="human"):
        """Print the secret, every guess so far with its A/B/N result, and a win line if solved."""
        print("Correct Word:", self.correct_word)
        history = zip(self.guesses, self.guesses_digits, self.explored_states)
        for n, (guess, guess_digits, snapshot) in enumerate(history, start=1):
            ABN_result = []
            for digit in guess_digits:
                value = snapshot[digit]
                if value == 3:
                    ABN_result.append('A')
                elif value == 2:
                    ABN_result.append('B')
                else:
                    ABN_result.append('N')
            print(f'Guess {n}: {numbers[guess]}, ABN: {ABN_result}')
        # The state can only be solved after at least one guess, so `n` is bound here.
        if self._solved():
            print(f'Guess {n+1}: {self.correct_word}, You win!')


In [108]:
env = BullsnCows()
# Sanity check of the space sizes: per-slot cardinalities, then the number of actions.
print(env.observation_space.nvec, env.action_space.n, sep="\n")

[4 4 4 4 4]
12


In [133]:
# Random-policy smoke test: play full episodes with uniformly sampled actions.
env = BullsnCows()
episodes = 100
for episode in range(1, episodes+1):
    state, _ = env.reset()
    score = 0
    print(state)

    # Drive the loop from the flags returned by step() rather than from the
    # env's internal attribute, so it keeps working if the env is wrapped.
    terminated = truncated = False
    while not (terminated or truncated):
        action = env.action_space.sample()
        state, reward, terminated, truncated, info = env.step(action)
        print(action, state, reward)
        env.render()
        score += reward

    print(f'Episode: {episode}, Score: {score}')

[0 0 0 0 3]
7 [0 3 1 0 2] 0
Correct Word: 1
Guess 1: 21, ABN: ['N', 'A']
8 [0 3 1 1 1] 0
Correct Word: 1
Guess 1: 21, ABN: ['N', 'A']
Guess 2: 23, ABN: ['N', 'N']
9 [2 3 1 1 0] -1
Correct Word: 1
Guess 1: 21, ABN: ['N', 'A']
Guess 2: 23, ABN: ['N', 'N']
Guess 3: 30, ABN: ['N', 'B']
Episode: 1, Score: -1
[0 0 0 0 3]
7 [0 1 2 0 2] 0
Correct Word: 32
Guess 1: 21, ABN: ['B', 'N']
9 [1 1 2 3 1] 0
Correct Word: 32
Guess 1: 21, ABN: ['B', 'N']
Guess 2: 30, ABN: ['A', 'N']
8 [1 1 2 3 0] -1
Correct Word: 32
Guess 1: 21, ABN: ['B', 'N']
Guess 2: 30, ABN: ['A', 'N']
Guess 3: 23, ABN: ['B', 'B']
Episode: 2, Score: -1
[0 0 0 0 3]
4 [0 1 2 0 2] 0
Correct Word: 23
Guess 1: 12, ABN: ['N', 'B']
3 [1 1 2 0 1] 0
Correct Word: 23
Guess 1: 12, ABN: ['N', 'B']
Guess 2: 10, ABN: ['N', 'N']
3 [1 1 2 0 0] -1
Correct Word: 23
Guess 1: 12, ABN: ['N', 'B']
Guess 2: 10, ABN: ['N', 'N']
Guess 3: 10, ABN: ['N', 'N']
Episode: 3, Score: -1
[0 0 0 0 3]
4 [0 1 1 0 2] 0
Correct Word: 30
Guess 1: 12, ABN: ['N', 'N']
5 [0 

In [139]:
from stable_baselines3 import DQN, PPO, A2C
from stable_baselines3.common.evaluation import evaluate_policy
import os
import torch

# TensorBoard log directories, one per algorithm, under a common parent.
logdir_dqn = "alternative_reduced_logs/dqn/"
logdir_ppo = "alternative_reduced_logs/ppo/"
logdir_a2c = "alternative_reduced_logs/a2c/"

for logdir in [logdir_a2c, logdir_dqn, logdir_ppo]:
    # exist_ok avoids the check-then-create race and makes the cell re-runnable
    os.makedirs(logdir, exist_ok=True)

# NOTE: all three models are handed the same environment instance.
env = BullsnCows()
# Pick the GPU only when one is actually present.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_dqn = DQN("MlpPolicy", env, verbose=1, learning_starts=5000, tensorboard_log=logdir_dqn, device=device)
model_ppo = PPO("MlpPolicy", env, verbose=1, tensorboard_log=logdir_ppo, device=device)
model_a2c = A2C("MlpPolicy", env, verbose=1, tensorboard_log=logdir_a2c, device=device)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [140]:
# Train A2C, then persist it; learn() returns the trained model, so save() can be chained.
model_a2c.learn(
    total_timesteps=20000,
    log_interval=4,
).save("a2c_BullsnCows_alt_reduced")

Logging to alternative_reduced_logs/a2c/A2C_9


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 2.71      |
|    ep_rew_mean        | -0.714    |
| time/                 |           |
|    fps                | 345       |
|    iterations         | 4         |
|    time_elapsed       | 0         |
|    total_timesteps    | 20        |
| train/                |           |
|    entropy_loss       | -2.48     |
|    explained_variance | -1.99e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 3         |
|    policy_loss        | -0.00834  |
|    value_loss         | 0.111     |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 2.53      |
|    ep_rew_mean        | -0.2      |
| time/                 |           |
|    fps                | 439       |
|    iterations         | 8         |
|    time_elapsed       | 0         |
|    total_timesteps    | 40        |
| train/    

In [151]:
# Play one episode greedily with the trained A2C policy.
state, _ = env.reset()
# Use the flags returned by step() instead of the env's internal attribute.
terminated = truncated = False
while not (terminated or truncated):
    action, _states = model_a2c.predict(state, deterministic=True)
    state, reward, terminated, truncated, info = env.step(action)
    env.render()

Correct Word: 13
Guess 1: 21, ABN: ['N', 'B']
Correct Word: 13
Guess 1: 21, ABN: ['N', 'B']
Guess 2: 13, ABN: ['A', 'A']
Guess 3: 13, You win!


In [45]:
# Train DQN, then persist it; learn() returns the trained model, so save() can be chained.
model_dqn.learn(
    total_timesteps=20000,
    log_interval=4,
).save("dqn_BullsnCows_alt_reduced")

Logging to alternative_reduced_logs/dqn/DQN_12
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 5        |
|    ep_rew_mean      | -0.5     |
|    exploration_rate | 0.991    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 2334     |
|    time_elapsed     | 0        |
|    total_timesteps  | 20       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 4.88     |
|    ep_rew_mean      | -0.5     |
|    exploration_rate | 0.981    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3533     |
|    time_elapsed     | 0        |
|    total_timesteps  | 39       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 4.58     |
|    ep_rew_mean      | -0.167   |
|    exploration_rate | 0.974    |
| time/ 

In [35]:
# Play one episode greedily with the trained DQN policy.
state, _ = env.reset()
# Use the flags returned by step() instead of the env's internal attribute.
terminated = truncated = False
while not (terminated or truncated):
    action, _states = model_dqn.predict(state, deterministic=True)
    state, reward, terminated, truncated, info = env.step(action)
    env.render()

Correct Word: 3
Guess 1: 12, ABN: ['N', 'N']
Correct Word: 3
Guess 1: 12, ABN: ['N', 'N']
Guess 2: 3, ABN: ['A', 'A']
Guess 3: 3, You win!


In [141]:
# Open TensorBoard inline on the parent directory of the dqn/, ppo/ and a2c/ log folders.
%load_ext tensorboard
%tensorboard --logdir='alternative_reduced_logs'

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 54112), started 0:36:19 ago. (Use '!kill 54112' to kill it.)