# Imports

In [99]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import gymnasium as gym
import numpy as np

# Environment

In [88]:
class TicTacToeEnv(gym.Env):
    def __init__(self, opponent: lambda obs: np.ndarray):
        super(TicTacToeEnv, self).__init__()
        self.action_space = gym.spaces.Discrete(9)
        self.observation_space = gym.spaces.MultiDiscrete([3]*9)
        self.state = np.zeros(9, dtype=int)
        self.opponent = opponent


    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.state = np.zeros(9, dtype=int)
        return self.state, {}


    def step(self, action):
        # Model's move
        #Punish model for invalid action
        if self.state[action] != 0:
            return self.state.copy(), -1, True, False, {}

        # Make move and check for win
        self.state[action] = 1
        if self.checkState() is not None:
            return self.state.copy(), self.checkState(), True, False, {}
        
        # Opponent's move
        # Invert the state for opponent's perspective
        state_for_opponent = self.getStateInversed()
        opponent_action = self.opponent(state_for_opponent)
        # If opponent's action is invalid, choose a random valid action
        if self.state[opponent_action] != 0:
            opponent_action = np.random.choice(np.where(self.state == 0)[0])
        
        self.state[opponent_action] = 2
        if self.checkState() is not None:
            return self.state.copy(), self.checkState(), True, False, {}

        return self.state.copy(), 0, False, False, {}


    def render(self, mode='human'):
        if mode == 'human':
            print(np.array(self.state).reshape(3, 3))


    def checkState(self):
        if np.any(np.all(self.state.reshape(3, 3) == 1, axis=0)) or \
           np.any(np.all(self.state.reshape(3, 3) == 1, axis=1)) or \
           np.all(np.diag(self.state.reshape(3, 3)) == 1) or \
           np.all(np.diag(np.fliplr(self.state.reshape(3, 3))) == 1):
            return 1
        if np.any(np.all(self.state.reshape(3, 3) == 2, axis=0)) or \
           np.any(np.all(self.state.reshape(3, 3) == 2, axis=1)) or \
           np.all(np.diag(self.state.reshape(3, 3)) == 2) or \
           np.all(np.diag(np.fliplr(self.state.reshape(3, 3))) == 2):
            return -1
        if not np.any(self.state == 0):
            return 0
        return None
    

    def getStateInversed(self):
        state = np.zeros(9, dtype=int)
        state[self.state == 1] = 2
        state[self.state == 2] = 1
        return state

# Training

### Prep

In [None]:
opponent = lambda obs: np.random.choice(np.where(obs == 0)[0])

env = TicTacToeEnv(opponent)
env = DummyVecEnv([lambda: env])

model = PPO("MlpPolicy", env, verbose=1)

Using cpu device


### Training

In [90]:
model.learn(total_timesteps=10_000)

-----------------------------
| time/              |      |
|    fps             | 853  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 691         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.012489583 |
|    clip_fraction        | 0.104       |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.19       |
|    explained_variance   | -0.136      |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0627      |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0285     |
|    value_loss           | 0.325       |
-----------------------------------------
----------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x7f89f42a7b10>

### Evaluation

In [91]:
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=1000)
print(f'{mean_reward} +/- {std_reward}')



0.83 +/- 0.5577633906953737


### Examples

As each step makes model's and opponent's move, on each render you see +2 moves. Not sure why last render returns zeros tho. Chat says that after returning done=True it is automatically reset.

In [92]:
vec_env = model.get_env()
for i in range(10):
    obs = vec_env.reset()
    done = False
    while not done:
        action = model.predict(obs, deterministic=True)
        obs, reward, done, info = vec_env.step(action)
        vec_env.envs[0].render('human')
    print(f"Ended with reward: {reward}")

[[0 2 0]
 [0 0 0]
 [1 0 0]]
[[0 2 2]
 [0 1 0]
 [1 0 0]]
[[1 2 2]
 [0 1 2]
 [1 0 0]]
[[0 0 0]
 [0 0 0]
 [0 0 0]]
Ended with reward: [1.]
[[0 0 0]
 [2 0 0]
 [1 0 0]]
[[0 0 0]
 [2 0 2]
 [1 0 1]]
[[2 0 1]
 [2 0 2]
 [1 0 1]]
[[0 0 0]
 [0 0 0]
 [0 0 0]]
Ended with reward: [1.]
[[0 2 0]
 [0 0 0]
 [1 0 0]]
[[0 2 0]
 [0 1 0]
 [1 0 2]]
[[2 2 0]
 [0 1 0]
 [1 1 2]]
[[0 0 0]
 [0 0 0]
 [0 0 0]]
Ended with reward: [1.]
[[0 0 0]
 [0 0 0]
 [1 2 0]]
[[2 0 0]
 [0 1 0]
 [1 2 0]]
[[0 0 0]
 [0 0 0]
 [0 0 0]]
Ended with reward: [1.]
[[2 0 0]
 [0 0 0]
 [1 0 0]]
[[2 2 1]
 [0 0 0]
 [1 0 0]]
[[0 0 0]
 [0 0 0]
 [0 0 0]]
Ended with reward: [1.]
[[0 2 0]
 [0 0 0]
 [1 0 0]]
[[2 2 0]
 [0 1 0]
 [1 0 0]]
[[0 0 0]
 [0 0 0]
 [0 0 0]]
Ended with reward: [1.]
[[0 2 0]
 [0 0 0]
 [1 0 0]]
[[0 2 0]
 [0 1 0]
 [1 2 0]]
[[1 2 0]
 [0 1 2]
 [1 2 0]]
[[0 0 0]
 [0 0 0]
 [0 0 0]]
Ended with reward: [1.]
[[0 0 0]
 [0 0 0]
 [1 0 2]]
[[2 0 1]
 [0 0 0]
 [1 0 2]]
[[0 0 0]
 [0 0 0]
 [0 0 0]]
Ended with reward: [1.]
[[0 0 0]
 [2 0 0]
 [1 0 

# Training model on previous models

To make model better lets train it against previous model

### Prep

In [None]:
def opponent(obs): return np.random.choice(np.where(obs == 0)[0])

model = PPO("MlpPolicy", env, verbose=1)

Using cpu device


### Training with switching models

In [None]:
for i in range(3):
    env = TicTacToeEnv(opponent)
    env = DummyVecEnv([lambda: env])

    model.learn(total_timesteps=10_000)

    prev_model = PPO("MlpPolicy", env, verbose=1)
    prev_model.set_parameters(model.get_parameters())
    def opponent(obs): return prev_model.predict(obs, deterministic=True)[0]

    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
    print(f'{mean_reward} +/- {std_reward}')

-----------------------------
| time/              |      |
|    fps             | 939  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 757         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009481499 |
|    clip_fraction        | 0.0609      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.19       |
|    explained_variance   | -0.286      |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0431      |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0233     |
|    value_loss           | 0.304       |
-----------------------------------------
----------------------------------

### Evaluation

As you can see model found optimal win strategy

In [96]:
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=1000)
print(f'{mean_reward} +/- {std_reward}')



1.0 +/- 0.0


In [95]:
vec_env = model.get_env()
for i in range(10):
    obs = vec_env.reset()
    done = False
    while not done:
        action = model.predict(obs, deterministic=True)
        obs, reward, done, info = vec_env.step(action)
        vec_env.envs[0].render('human')
    print(f"Ended with reward: {reward}")

[[0 2 0]
 [0 1 0]
 [0 0 0]]
[[0 2 0]
 [0 1 0]
 [0 2 1]]
[[0 0 0]
 [0 0 0]
 [0 0 0]]
Ended with reward: [1.]
[[0 0 0]
 [0 1 0]
 [0 0 2]]
[[0 0 1]
 [0 1 0]
 [0 2 2]]
[[0 0 0]
 [0 0 0]
 [0 0 0]]
Ended with reward: [1.]
[[0 0 0]
 [0 1 0]
 [0 2 0]]
[[2 0 0]
 [0 1 0]
 [0 2 1]]
[[2 0 1]
 [0 1 0]
 [2 2 1]]
[[0 0 0]
 [0 0 0]
 [0 0 0]]
Ended with reward: [1.]
[[0 0 0]
 [0 1 0]
 [0 2 0]]
[[2 0 0]
 [0 1 0]
 [0 2 1]]
[[2 0 1]
 [2 1 0]
 [0 2 1]]
[[0 0 0]
 [0 0 0]
 [0 0 0]]
Ended with reward: [1.]
[[0 0 0]
 [0 1 0]
 [2 0 0]]
[[0 0 0]
 [0 1 0]
 [2 2 1]]
[[0 0 0]
 [0 0 0]
 [0 0 0]]
Ended with reward: [1.]
[[0 0 0]
 [0 1 2]
 [0 0 0]]
[[0 0 0]
 [0 1 2]
 [0 2 1]]
[[0 0 0]
 [0 0 0]
 [0 0 0]]
Ended with reward: [1.]
[[0 0 0]
 [0 1 0]
 [0 0 2]]
[[2 0 1]
 [0 1 0]
 [0 0 2]]
[[0 0 0]
 [0 0 0]
 [0 0 0]]
Ended with reward: [1.]
[[0 0 0]
 [0 1 0]
 [2 0 0]]
[[0 0 0]
 [2 1 0]
 [2 0 1]]
[[0 0 0]
 [0 0 0]
 [0 0 0]]
Ended with reward: [1.]
[[0 0 0]
 [0 1 0]
 [0 2 0]]
[[0 2 0]
 [0 1 0]
 [0 2 1]]
[[0 0 0]
 [0 0 0]
 [0 0 