<a href="https://colab.research.google.com/github/kozoB/TicTacToeRL/blob/main/TicTacToeRL_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Reinforcement Learning

## Module Installations And Imports

In [1]:
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np
import random

## Define Labels

In [2]:
# Define state labels for tic tac toe game cells ('-' (empty), 'X' and 'O')
mark_labels = ['-', 'X', 'O']
# Define game result labels (0 - game ongoing, 1 - 'X' won, 2 - 'O' won, 3 - draw)
game_result_labels = ['ongoing', 'X-won', 'O-won', 'draw']

  and should_run_async(code)


## Create Game Environment And Rules

In [65]:
class TicTacToeEnv(Env):
  def __init__(self):
    # Actions we can take - Square in grid to mark (1-9)
    self.action_space = Discrete(9)
    # Observation space: 3x3 grid with 3 possible values (-, X, O) encoded as integers (0, 1, 2)
    self.observation_space = Box(low=0, high=2, shape=(3, 3), dtype=np.int32)
    # Initialize the game grid
    self.game_grid = np.full((3, 3), 0, dtype=np.int32)
    # Set starting player as the index of 'X' (1)
    self.current_player = 1 # 1 for 'X', 2 for 'O'
    # Episode status
    self.done = False

  def step(self, action):
    # Convert action (0-8) to row and column indices (0-2)
    row, col = divmod(action, 3)

    # Check if the chosen sqaure is empty
    if self.game_grid[row, col] != 0:
      # Invalid action, return a large negative reward
      return self.game_grid, -10, False, {}

    # Mark the chosen square with the current player's mark
    self.game_grid[row, col] = self.current_player

    # Check the game result
    game_result = self.check_game_result()

    # Determine the reward based on the game result
    if game_result == 1:
        reward = 1 if self.current_player == 1 else -2  # 'X' wins
        self.done = True
    elif game_result == 2:
        reward = -2 if self.current_player == 1 else 1  # 'O' wins
        self.done = True
    elif game_result == 3:
        reward = -1  # Draw
        self.done = True
    else:
        reward = 0  # Game ongoing

    # Switch to the other player
    self.current_player = 2 if self.current_player == 1 else 1

    print("Observation shape:", self.game_grid.shape)

    # Return the updated state, reward, done flag, and additional info
    return self.game_grid.copy(), reward, self.done, {}


  def reset(self):
    # Reset the game grid and player
    self.game_grid = np.full((3, 3), 0, dtype=np.int32)
    self.current_player = 1  # 'X' goes first
    self.done = False

    print("Observation shape:", self.game_grid.shape)

    # Return the initial observation
    return self.game_grid.copy()

  def render(self):
    print('\nGrid state:\n***********************************************')

    # Create a 3x3 array for rendering the grid with the appropriate symbols
    grid_drawing = np.full((3, 3), '-', dtype=str)

    # Loop through each cell in the game grid
    for row in range(3):
      for col in range(3):
        # Get the value in the current cell of the game grid
        square = self.game_grid[row, col]
        # Convert the numerical value to the corresponding mark ('-', 'X', 'O')
        grid_drawing[row, col] = mark_labels[square]

    # Print the rendered game grid
    for row in grid_drawing:
        print(' '.join(row))
    print('***********************************************\n')

  def check_game_result(self):
    # Check rows, columns, and diagonals for a win condition
    for i in range(3):
        # Check rows
        if self.game_grid[i, 0] == self.game_grid[i, 1] == self.game_grid[i, 2] and self.game_grid[i, 0] != 0:
            return self.game_grid[i, 0]
        # Check columns
        if self.game_grid[0, i] == self.game_grid[1, i] == self.game_grid[2, i] and self.game_grid[0, i] != 0:
            return self.game_grid[0, i]

    # Check diagonals
    if self.game_grid[0, 0] == self.game_grid[1, 1] == self.game_grid[2, 2] and self.game_grid[0, 0] != 0:
        return self.game_grid[0, 0]
    if self.game_grid[0, 2] == self.game_grid[1, 1] == self.game_grid[2, 0] and self.game_grid[0, 2] != 0:
        return self.game_grid[0, 2]

    # Check for draw (grid is full)
    if not np.any(self.game_grid == 0):
        return 3  # Draw

    # Game ongoing
    return 0


In [66]:
env = TicTacToeEnv()

In [67]:
action = env.action_space.sample()
print(f"marked square idx: {action}")
env.render()

marked square idx: 3

Grid state:
***********************************************
- - -
- - -
- - -
***********************************************



In [68]:
episodes = 10

for episode in range(1, episodes+1):
  state = env.reset()
  done = False
  score = 0

  while not done:
    env.render()
    action = env.action_space.sample()
    next_state, reward, done, info = env.step(action)
    score += reward

    # Update the current state for the next iteration
    state = next_state

  env.render()
  print(f'Episode: {episode} Score: {score}\n')



Observation shape: (3, 3)

Grid state:
***********************************************
- - -
- - -
- - -
***********************************************

Observation shape: (3, 3)

Grid state:
***********************************************
- - -
X - -
- - -
***********************************************


Grid state:
***********************************************
- - -
X - -
- - -
***********************************************

Observation shape: (3, 3)

Grid state:
***********************************************
- - -
X - -
O - -
***********************************************

Observation shape: (3, 3)

Grid state:
***********************************************
X - -
X - -
O - -
***********************************************

Observation shape: (3, 3)

Grid state:
***********************************************
X - O
X - -
O - -
***********************************************


Grid state:
***********************************************
X - O
X - -
O - -
***********************

# Create Deep Learning Model

## Imports

In [69]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten

In [70]:
states = env.observation_space.shape
actions = env.action_space.n

print(states)
print(actions)

(3, 3)
9


## Create DL Model

In [71]:
from keras import __version__
import tensorflow as tf
tf.keras.__version__ = __version__

In [None]:
!pip install keras-rl2

In [73]:
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from tensorflow.keras.optimizers.legacy import Adam

In [111]:
def build_model(actions):
    model = Sequential()
    # Flatten the 3x3 grid to a 1D array
    model.add(Flatten(input_shape=(1, 3, 3)))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model


In [113]:
model = build_model(actions)

In [114]:
model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_7 (Flatten)         (None, 9)                 0         
                                                                 
 dense_24 (Dense)            (None, 24)                240       
                                                                 
 dense_25 (Dense)            (None, 24)                600       
                                                                 
 dense_26 (Dense)            (None, 9)                 225       
                                                                 
Total params: 1065 (4.16 KB)
Trainable params: 1065 (4.16 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Build Agent With Keras-RL

In [115]:
def build_agent(model, actions):
  policy = BoltzmannQPolicy()
  memory = SequentialMemory(limit=50000, window_length=1)
  dqn = DQNAgent(model=model, memory=memory, policy=policy, nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)
  return dqn

In [116]:
optimizer = Adam(learning_rate=1e-3)

In [118]:
dqn = build_agent(model, actions)
dqn.compile(optimizer=optimizer, metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)

Training for 50000 steps ...
Observation shape: (3, 3)
Interval 1 (0 steps performed)


  updates=self.state_updates,


Observation shape: (3, 3)
    1/10000 [..............................] - ETA: 54:22 - reward: 0.0000e+00Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)


  batch_idxs = np.random.random_integers(low, high - 1, size=size)


   12/10000 [..............................] - ETA: 19:33 - reward: -5.8333   Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
   18/10000 [..............................] - ETA: 13:18 - reward: -5.5000Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
   22/10000 [..............................] - ETA: 11:09 - reward: -4.9545Observation shape: (3, 3)
   28/10000 [..............................] - ETA: 9:01 - reward: -5.6786 

  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=s

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)

<keras.src.callbacks.History at 0x7916fecbcb80>

In [119]:
scores = dqn.test(env, nb_episodes=100, visualize=True)
print(np.mean(scores.history['episode_reward']))

Testing for 100 episodes ...
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Episode 1: reward: 1.000, steps: 9
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Episode 2: reward: 1.000, steps: 9
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Episode 3: reward: 1.000, steps: 9
Observation shape: (3, 3)
Observation shape: (3, 3)
Observation shape: (3, 3)
Observat

# Save The Model And Agent

In [10]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
path = "/content/drive/MyDrive/Machine Learning Projects/DL Practice/Tic Tac Toe RL"

## Save Model

In [144]:
import os

# Create directory if it doesn't exist
os.makedirs(f"{path}/tic_tac_toe_model", exist_ok=True)

model.save(f"{path}/tic_tac_toe_model/tic_tac_toe_model.h5")

  saving_api.save_model(


## Save Agent

In [163]:
import json

# Get the agent's configuration dictionary
agent_config = dqn.get_config()

# Save the agent's configuration to a JSON file
with open(f"{path}/tic_tac_toe_agent_weights/tic_tac_toe_agent_config.json", "w") as config_out:
    json.dump(agent_config, config_out)

# Create directory if it doesn't exist
os.makedirs(f"{path}/tic_tac_toe_agent_weights", exist_ok=True)

# Save the agent's weights
dqn.save_weights(f"{path}/tic_tac_toe_agent_weights/tic_tac_toe_agent_weights.h5f", overwrite=True)

## Load Model

In [9]:
from keras.models import model_from_config

# Load the agent's configuration from the JSON file
with open(f"{path}/tic_tac_toe_agent_weights/tic_tac_toe_agent_config.json", "r") as config_in:
    config = json.load(config_in)

# Load the agent's model
model = model_from_config(config["model"])

# Load the agent's weights
model.load_weights(f"{path}/tic_tac_toe_agent_weights/tic_tac_toe_agent_weights.h5f")

# Rebuild the agent with the loaded model and its configuration
dqn = build_agent(model, actions)

NameError: name 'path' is not defined

# Load model and agent

In [207]:
# from keras.models import load_model
# from keras.models import model_from_json

# # Load model
# model = load_model(f"{path}/tic_tac_toe_model/tic_tac_toe_model.h5")

# # Load agent
# agent = dqn.load_weights(f"{path}/tic_tac_toe_agent_weights/tic_tac_toe_agent_weights.h5f")

# Prepare Environment For Model And Agent Testing

In [32]:
from keras import __version__
import tensorflow as tf
tf.keras.__version__ = __version__

In [34]:
!pip install keras-rl2

Collecting keras-rl2
  Downloading keras_rl2-1.0.5-py3-none-any.whl (52 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/52.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.1/52.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras-rl2
Successfully installed keras-rl2-1.0.5


In [36]:
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
import json

In [37]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
path = "/content/drive/MyDrive/Machine Learning Projects/DL Practice/Tic Tac Toe RL"

In [39]:
def build_agent(model, actions):
  policy = BoltzmannQPolicy()
  memory = SequentialMemory(limit=50000, window_length=1)
  dqn = DQNAgent(model=model, memory=memory, policy=policy, nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)
  return dqn

In [40]:
states = env.observation_space.shape
actions = env.action_space.n

In [41]:
from keras.models import model_from_config

# Load the agent's configuration from the JSON file
with open(f"{path}/tic_tac_toe_agent_weights/tic_tac_toe_agent_config.json", "r") as config_in:
    config = json.load(config_in)

# Load the agent's model
model = model_from_config(config["model"])

# Load the agent's weights
model.load_weights(f"{path}/tic_tac_toe_agent_weights/tic_tac_toe_agent_weights.h5f")

# Rebuild the agent with the loaded model and its configuration
dqn = build_agent(model, actions)

In [42]:
# Define state labels for tic tac toe game cells ('-' (empty), 'X' and 'O')
mark_labels = ['-', 'X', 'O']
# Define game result labels (0 - game ongoing, 1 - 'X' won, 2 - 'O' won, 3 - draw)
game_result_labels = ['ongoing', 'X-won', 'O-won', 'draw']

In [43]:
def build_model(actions):
    model = Sequential()
    # Flatten the 3x3 grid to a 1D array
    model.add(Flatten(input_shape=(1, 3, 3)))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model


## Agent VS Agent Environment

In [None]:
class AgentVSAgentEnv(Env):
  def __init__(self):
    # Actions we can take - Square in grid to mark (1-9)
    self.action_space = Discrete(9)
    # Observation space: 3x3 grid with 3 possible values (-, X, O) encoded as integers (0, 1, 2)
    self.observation_space = Box(low=0, high=2, shape=(3, 3), dtype=np.int32)
    # Initialize the game grid
    self.game_grid = np.full((3, 3), 0, dtype=np.int32)
    # Set starting player as the index of 'X' (1)
    self.current_player = 1 # 1 for 'X', 2 for 'O'
    # Episode status
    self.done = False

  def step(self, action):
    # Convert action (0-8) to row and column indices (0-2)
    row, col = divmod(action, 3)

    # Check if the chosen sqaure is empty
    if self.game_grid[row, col] != 0:
      # Invalid action, return a large negative reward
      return self.game_grid, -10, False, {}

    # Mark the chosen square with the current player's mark
    self.game_grid[row, col] = self.current_player

    # Check the game result
    game_result = self.check_game_result()

    # Determine the reward based on the game result
    if game_result == 1:
        reward = 1 if self.current_player == 1 else -2  # 'X' wins
        self.done = True
    elif game_result == 2:
        reward = -2 if self.current_player == 1 else 1  # 'O' wins
        self.done = True
    elif game_result == 3:
        reward = -1  # Draw
        self.done = True
    else:
        reward = 0  # Game ongoing

    # Switch to the other player
    self.current_player = 2 if self.current_player == 1 else 1

    print("Observation shape:", self.game_grid.shape)

    # Return the updated state, reward, done flag, and additional info
    return self.game_grid.copy(), reward, self.done, {}


  def reset(self):
    # Reset the game grid and player
    self.game_grid = np.full((3, 3), 0, dtype=np.int32)
    self.current_player = 1  # 'X' goes first
    self.done = False

    print("Observation shape:", self.game_grid.shape)

    # Return the initial observation
    return self.game_grid.copy()

  def render(self):
    print('\nGrid state:\n******')

    # Create a 3x3 array for rendering the grid with the appropriate symbols
    grid_drawing = np.full((3, 3), '-', dtype=str)

    # Loop through each cell in the game grid
    for row in range(3):
      for col in range(3):
        # Get the value in the current cell of the game grid
        square = self.game_grid[row, col]
        # Convert the numerical value to the corresponding mark ('-', 'X', 'O')
        grid_drawing[row, col] = mark_labels[square]

    # Print the rendered game grid
    for row in grid_drawing:
        print(' '.join(row))
    print('******\n')

  def check_game_result(self):
    # Check rows, columns, and diagonals for a win condition
    for i in range(3):
        # Check rows
        if self.game_grid[i, 0] == self.game_grid[i, 1] == self.game_grid[i, 2] and self.game_grid[i, 0] != 0:
            return self.game_grid[i, 0]
        # Check columns
        if self.game_grid[0, i] == self.game_grid[1, i] == self.game_grid[2, i] and self.game_grid[0, i] != 0:
            return self.game_grid[0, i]

    # Check diagonals
    if self.game_grid[0, 0] == self.game_grid[1, 1] == self.game_grid[2, 2] and self.game_grid[0, 0] != 0:
        return self.game_grid[0, 0]
    if self.game_grid[0, 2] == self.game_grid[1, 1] == self.game_grid[2, 0] and self.game_grid[0, 2] != 0:
        return self.game_grid[0, 2]

    # Check for draw (grid is full)
    if not np.any(self.game_grid == 0):
        return 3  # Draw

    # Game ongoing
    return 0


In [None]:
env = AgentVSAgentEnv()

In [None]:
state = env.reset()
done = False
score = 0

while not done:
    env.render()
    action = dqn.forward(state)  # Use forward method instead of act
    next_state, reward, done, info = env.step(action)
    score += reward

    # Update the current state for the next iteration
    state = next_state

env.render()
print(f'Score: {score}\n')

## Agent VS Human Environment

In [44]:
class AgentVSHumanEnv(Env):
    def __init__(self):
        # Actions we can take - Square in grid to mark (1-9)
        self.action_space = Discrete(9)
        # Observation space: 3x3 grid with 3 possible values (-, X, O) encoded as integers (0, 1, 2)
        self.observation_space = Box(low=0, high=2, shape=(3, 3), dtype=np.int32)
        # Initialize the game grid
        self.game_grid = np.full((3, 3), 0, dtype=np.int32)
        # Set starting player as the index of 'X' (1)
        self.current_player = 1 # 1 for 'X', 2 for 'O'
        # Episode status
        self.done = False

    def step(self, action):
        # Convert action (0-8) to row and column indices (0-2)
        row, col = divmod(action, 3)

        # Check if the chosen square is empty
        if self.game_grid[row, col] != 0:
            # Invalid action, return a large negative reward
            return self.game_grid, -10, False, {}

        # Mark the chosen square with the current player's mark
        self.game_grid[row, col] = self.current_player

        # Check the game result
        game_result = self.check_game_result()

        # Determine the reward based on the game result
        if game_result == 1:
            reward = 1 if self.current_player == 1 else -2  # 'X' wins
            self.done = True
        elif game_result == 2:
            reward = -2 if self.current_player == 1 else 1  # 'O' wins
            self.done = True
        elif game_result == 3:
            reward = -1  # Draw
            self.done = True
        else:
            reward = 0  # Game ongoing

        # Switch to the other player
        self.current_player = 2 if self.current_player == 1 else 1

        print("Observation shape:", self.game_grid.shape)

        # Return the updated state, reward, done flag, and additional info
        return self.game_grid.copy(), reward, self.done, {}

    def reset(self):
        # Reset the game grid and player
        self.game_grid = np.full((3, 3), 0, dtype=np.int32)
        self.current_player = 1  # 'X' goes first
        self.done = False

        # Return the initial observation
        return self.game_grid.copy()

    def render(self):
        print('\nGrid state:\n******')

        # Create a 3x3 array for rendering the grid with the appropriate symbols
        grid_drawing = np.full((3, 3), '-', dtype=str)

        # Loop through each cell in the game grid
        for row in range(3):
            for col in range(3):
                # Get the value in the current cell of the game grid
                square = self.game_grid[row, col]
                # Convert the numerical value to the corresponding mark ('-', 'X', 'O')
                grid_drawing[row, col] = mark_labels[square]

        # Print the rendered game grid
        for row in grid_drawing:
            print(' '.join(row))
        print('******\n')

    def check_game_result(self):
        # Check rows, columns, and diagonals for a win condition
        for i in range(3):
            # Check rows
            if self.game_grid[i, 0] == self.game_grid[i, 1] == self.game_grid[i, 2] and self.game_grid[i, 0] != 0:
                return self.game_grid[i, 0]
            # Check columns
            if self.game_grid[0, i] == self.game_grid[1, i] == self.game_grid[2, i] and self.game_grid[0, i] != 0:
                return self.game_grid[0, i]

        # Check diagonals
        if self.game_grid[0, 0] == self.game_grid[1, 1] == self.game_grid[2, 2] and self.game_grid[0, 0] != 0:
            return self.game_grid[0, 0]
        if self.game_grid[0, 2] == self.game_grid[1, 1] == self.game_grid[2, 0] and self.game_grid[0, 2] != 0:
            return self.game_grid[0, 2]

        # Check for draw (grid is full)
        if not np.any(self.game_grid == 0):
            return 3  # Draw

        # Game ongoing
        return 0


In [45]:
env = AgentVSHumanEnv()

In [48]:
# Reset the environment
state = env.reset()
done = False
score = 0

# Choose player symbol (X or O)
player_symbol = input("Choose your symbol:\n1 for X\n2 for O\n").upper()

# Determine the agent's symbol
agent_symbol = '1' if player_symbol == '2' else '2'

if player_symbol == '1':
  env.render()

# Game loop
while not done:
  # Your turn (if applicable)
  if env.current_player == int(player_symbol):  # Your turn
    print(f"Your turn! ({mark_labels[env.current_player]})")
    # Allow the palyer to choose only valid input
    while True:
      player_selected_square = int(input("\nChoose Square (0-8): ").upper()) # (0-8)
      if player_selected_square < 0 or player_selected_square > 8:
        print("Invalid Square number. Valid squares are 0-8. try another square!")
        continue

      row, col = divmod(player_selected_square, 3)
      # try:
      if env.game_grid[row][col] == 0:
        break
      else:
        print("Square already marked, try another square!")

    env.game_grid[row][col] = player_symbol
    result = env.check_game_result()
    # Check if turn ended in a draw/win
    if result == 3:
      done = True
      break
    elif result != 0:
      done = True
      break

    # Change player to agent
    env.current_player = int(agent_symbol)

  # Agent's turn
  else:
    print("Agent's turn!")
    action = dqn.forward(state)

    # Perform the action
    next_state, reward, done, _ = env.step(action)
    score += reward
    state = next_state
    env.render()
    result = env.check_game_result()
    if result == 3:
      done = True
      break
    elif result != 0:
      done = True
      break

# Render the final state and display the score
env.render()
if env.check_game_result() == 3:
  print(f"\nGame ended in a Draw!")
else:
  print(f"\nGame ended in a Victory for {mark_labels[env.current_player]}!")
  if env.current_player == int(player_symbol):
    print("You won against the AI!")
  else:
    print("You lost against the AI!")

print(f'\nAI Score: {score}\n')


Choose your symbol:
1 for X
2 for O
1

Grid state:
******
- - -
- - -
- - -
******

Your turn! (X)

Choose Square (0-8): 2
Agent's turn!

Grid state:
******
- - X
- - -
- - -
******

Agent's turn!
Observation shape: (3, 3)

Grid state:
******
- - X
- - O
- - -
******

Your turn! (X)

Choose Square (0-8): 4
Agent's turn!
Observation shape: (3, 3)

Grid state:
******
- - X
- X O
- O -
******

Your turn! (X)

Choose Square (0-8): 6

Grid state:
******
- - X
- X O
X O -
******


Game ended in a Victory for X!
You won against the AI!

AI Score: -10

