In [1]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import matplotlib.pyplot as plt
import pandas as pd
import random
import graphviz
import seaborn as sns
import time
import math
from torch.nn.utils import clip_grad_norm_

from google.colab import drive
drive.mount('/content/drive')
from google.colab import files

Mounted at /content/drive


In [2]:
# 1. Define your environment
class TicTacToe:
  def __init__(self):
    self.board = np.zeros((3, 3))
    self.one_move_back = np.zeros((3, 3))
    self.two_moves_back = np.zeros((3, 3))

  def reset(self):
    self.board = np.zeros((3, 3))
    self.one_move_back = np.zeros((3, 3))
    self.two_moves_back = np.zeros((3, 3))
    return [self.two_moves_back, self.one_move_back, self.board]

  def check_win(self, player):
    # Check rows
    for i in range(3):
      if np.all(self.board[i, :] == player):
        return player

    # Check columns
    for j in range(3):
      if np.all(self.board[:, j] == player):
        return player

    #Check diagonal
    if np.all(np.diag(self.board) == player):
      return player
    if np.all(np.diag(np.fliplr(self.board)) == player):
      return player

    #Check tie
    if np.all(self.board != 0):
      return -1

    return 0

  # Check for empty places on board
  def possibilities(self):
    l = []
    for i in range(len(self.board)):
        for j in range(len(self.board)):

            if self.board[i][j] == 0:
                l.append((i, j))
    return(l)

  #training
  def random_step(self, player):
    selection = self.possibilities()
    current_loc = random.choice(selection)
    self.two_moves_back = self.one_move_back.copy()
    self.one_move_back = self.board.copy()
    self.board[current_loc] = player
    done = self.check_win(player)
    reward = 0
    if done == 2:
      reward = -1
    state = np.concatenate((self.two_moves_back, self.one_move_back, self.board), axis=0)
    return state, reward, done

  #inference
  def random_move(self, player):
    selection = self.possibilities()
    current_loc = random.choice(selection)
    self.two_moves_back = self.one_move_back.copy()
    self.one_move_back = self.board.copy()
    self.board[current_loc] = player
    done = self.check_win(player)
    reward = 0
    if done == 2:
      reward = -1
    # state = np.concatenate((self.two_moves_back, self.one_move_back, self.board), axis=0)
    return self.board, done

  def network_step(self, action, player):
    #print(action)
    self.two_moves_back = self.one_move_back.copy()
    self.one_move_back = self.board.copy()
    row, col = action
    self.board[row, col] = player
    done = self.check_win(player)
    reward = 0
    if done == 1:
      reward = 1
    state = np.concatenate((self.two_moves_back, self.one_move_back, self.board), axis=0)
    return state, reward, done

  def network_best_move(self):
    self.two_moves_back = self.one_move_back.copy()
    self.one_move_back = self.board.copy()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Get the device
    policy.to(device)

    action_mask = np.zeros((3, 3))
    for i in range(len(action_mask)):
      for j in range(len(action_mask)):
        if self.board[(i,j)] == 0:
          action_mask[i][j] = 1
    action_mask = torch.FloatTensor(action_mask.flatten()).to(device)

    state = np.concatenate((self.two_moves_back, self.one_move_back, self.board), axis=0)
    #print("transformer input: ", state)
    state = torch.FloatTensor(state.flatten()).unsqueeze(0).to(device)
    probs = policy(state)


    action_probabilities = Categorical(probs * action_mask)
    #print("action_probabilities: ", action_probabilities.probs)
    action = torch.argmax(action_probabilities.probs, dim=0)

    self.board[tensor_to_tuple[action.item()]] = 1
    done = self.check_win(1)

    return self.board, done

#takes a 2-d numpy array and create a string representation
def numpy_array_to_string(array):
  string_array = ""
  for row in array:
    for element in row:
      string_array += str(element)
  return string_array

  and should_run_async(code)


In [3]:
class TransformerAgent(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim, nhead, num_layers, dropout=0.10):
        super(TransformerAgent, self).__init__()
        self.transformer = nn.Transformer(d_model=hidden_dim, nhead=nhead,
                                          num_decoder_layers=num_layers,
                                          num_encoder_layers=num_layers,
                                          dropout=dropout)
        # Instead of an Embedding layer, use a Linear layer to handle continuous state values
        self.state_embedding = nn.Linear(state_dim, hidden_dim)
        self.actor = nn.Linear(hidden_dim, action_dim)
        self.softmax = nn.Softmax(dim=-1)
        self.state_dim = state_dim

    def forward(self, x):
        # Embed the input using the Linear layer
        x = self.state_embedding(x.float())
        # Reshape to (sequence_length, batch_size, embedding_dim)
        x = x.view(1, 1, -1) # Reshape for Transformer

        # Pass through the Transformer
        output = self.transformer(x, x)[0, -1, :]
        # output (logits)
        logits = self.actor(output)
        #Softmax to get probs
        probs = self.softmax(logits)
        return probs

In [4]:
tensor_to_tuple = {}
for i in range(9):
  tensor_to_tuple[i] = (i//3, i%3)
print(tensor_to_tuple)

{0: (0, 0), 1: (0, 1), 2: (0, 2), 3: (1, 0), 4: (1, 1), 5: (1, 2), 6: (2, 0), 7: (2, 1), 8: (2, 2)}


In [5]:
from logging import log
#Tic Tac Toe version
def compute_discounted_rewards(rewards, gamma=0.99):
    discounted_rewards = []
    discounted_reward = 0
    i = 0
    for reward in reversed(rewards):
        discounted_reward = reward + gamma * discounted_reward
        discounted_rewards.insert(0, discounted_reward)
        # print(i," discounted_rewards: ", discounted_rewards)
        i += 1
    discounted_rewards = torch.tensor(discounted_rewards)


    #discounted_rewards = discounted_rewards / (discounted_rewards.std() + 1e-5)
    #discounted_rewards = discounted_reward - discounted_rewards.mean() / (discounted_rewards.std() + 1e-5)
    # print("discounted rewards: ", discounted_rewards)

    return discounted_rewards

#%%debug
debug = False

def train(env, policy, optimizer, episodes=450001):

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  print(f"Using device: {device}")
  policy.to(device)

  games_won = 0
  games_lost = 0
  games_skipped = 0
  for episode in range(episodes):
    print("trajectory: ", episode)
    state = [entry.flatten() for entry in env.reset()]
    state = np.concatenate((state[0], state[1], state[2]), axis=None)


    log_probs = []
    rewards = []


    done = 0
    not_done_counter = 0
    while done == 0:
        action_mask = np.zeros((3, 3))
        state = state.flatten()
        current_board = state[18:27].copy().reshape(3, 3)
        #print("current board: \n",
        print(current_board)


        for i in range(len(action_mask)):
          for j in range(len(action_mask)):
            if current_board[(i,j)] == 0:
              action_mask[i][j] = 1
        action_mask = torch.FloatTensor(action_mask.flatten()).to(device) # Move action_mask to device

        if debug:
          print("inner episode loop ", not_done_counter)

        state = torch.FloatTensor(state).unsqueeze(0).to(device) # Move state to device
        if debug:
          print("inner episode loop ", not_done_counter, " state: ", state)

        probs = policy(state)
        if debug:
          print("inner episode loop ", not_done_counter, " probs: ", probs)


        # Then apply the action mask
        masked_probs = probs * action_mask
        if debug:
          print("masked_probs: ", masked_probs)
        # Renormalize to ensure they sum to 1
        masked_probs = masked_probs / masked_probs.sum()
        if debug:
          print("normalized masked_probs: ", masked_probs)


        action_probabilities = Categorical(masked_probs)
        if debug:
          print("inner episode loop ", not_done_counter, " action_probabilities: ", action_probabilities)

        action = action_probabilities.sample()
        if debug:
          print("inner episode loop ", not_done_counter, " action: ", action)
          print("")
          print("")
          print("")

        if debug:
          print("inner episode loop ", not_done_counter, "step input action.item(): ", action.item())


        state, reward, done = env.network_step(tensor_to_tuple[action.item()], player = 1)

        #Only generate random move is state is not terminal
        if done == 0:
          state, reward, done = env.random_step(player = 2)
        if debug:
          print("inner episode loop after step ", not_done_counter)
          print("step output state: \n", state)
          print("step output reward: ", reward)
          print("step output done: ", done)
          print("")
          print("")
          print("")



        log_probs.append(action_probabilities.log_prob(action))
        rewards.append(reward)

        # Inside the train function, after an episode ends:
        if done != 0:
            print(state.flatten()[18:27].reshape(3,3))
            print("DONE!!! ", done)
            if sum(rewards) == 1:
              games_won += 1
            elif sum(rewards) == -1:
              games_lost += 1
            episode_rewards.append(sum(rewards))
            discounted_rewards = compute_discounted_rewards(rewards).to(device) # Move discounted_rewards to device

            policy_loss = []
            for log_prob, Gt in zip(log_probs, discounted_rewards):
                # Convert log_prob to a 1-dimensional tensor before appending
                policy_loss.append((-log_prob * Gt).unsqueeze(0))
            optimizer.zero_grad()
            # Now you can safely concatenate and sum
            policy_loss = torch.cat(policy_loss).sum()
            policy_loss.backward()
            optimizer.step()
            episode_losses.append(policy_loss.item())


            if episode % 50 == 0:
                print(f"Episode {episode}, Total Reward: {sum(rewards)}")
                print("games_won: ", games_won)
                print("games lost: ", games_lost)
                print("games_skipped: ", games_skipped)
                print("")
                print("")
                print("")

            if episode % 50000 == 0:
              # Save the trained policy network
              torch.save(policy.state_dict(), '/content/drive/MyDrive/Monarch/RL/Tic Tac Toe/Transformers/transformer_policy_network_attention_trial2.pth')

            break

In [None]:
#Globals
episode_rewards = []
episode_losses = []

#Driver code for training the model
my_env = TicTacToe()
print("")
policy = TransformerAgent(state_dim=27, action_dim=9, hidden_dim=1024, nhead=8, num_layers=6)
print("Policy: ", policy)
policy.train()
total_params = sum(p.numel() for p in policy.parameters())
print(f"Number of parameters: {total_params}")
optimizer = optim.Adam(policy.parameters(), lr=1e-8)

#train(env, policy, optimizer)
train(my_env, policy, optimizer)






[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 [2. 1. 0.]
 [0. 0. 0.]]
[[1. 2. 0.]
 [2. 1. 0.]
 [1. 0. 2.]]
[[1. 2. 1.]
 [2. 1. 0.]
 [1. 0. 2.]]
DONE!!!  1
trajectory:  100582
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
[[0. 0. 0.]
 [0. 0. 0.]
 [1. 2. 0.]]
[[1. 0. 2.]
 [0. 0. 0.]
 [1. 2. 0.]]
[[1. 0. 2.]
 [2. 1. 0.]
 [1. 2. 0.]]
[[1. 2. 2.]
 [2. 1. 1.]
 [1. 2. 0.]]
[[1. 2. 2.]
 [2. 1. 1.]
 [1. 2. 1.]]
DONE!!!  1
trajectory:  100583
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
[[0. 0. 2.]
 [0. 0. 0.]
 [1. 0. 0.]]
[[2. 0. 2.]
 [0. 1. 0.]
 [1. 0. 0.]]
[[2. 0. 2.]
 [0. 1. 2.]
 [1. 1. 0.]]
[[2. 0. 2.]
 [0. 1. 2.]
 [1. 1. 1.]]
DONE!!!  1
trajectory:  100584
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
[[0. 0. 0.]
 [0. 0. 0.]
 [1. 2. 0.]]
[[0. 0. 2.]
 [0. 0. 0.]
 [1. 2. 1.]]
[[0. 0. 2.]
 [0. 1. 2.]
 [1. 2. 1.]]
[[1. 0. 2.]
 [0. 1. 2.]
 [1. 2. 1.]]
DONE!!!  1
trajectory:  100585
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
[[0. 0. 0.]
 [2. 1. 0.]
 [0. 0. 0.]]
[[0. 0. 0.]
 [2. 1. 0.]
 [2. 0. 1.]]
[[0. 0. 2

In [None]:
cumulative_episode_rewards = []
for i in range(len(episode_rewards)):
  if i == 0:
    cumulative_episode_rewards.append(episode_rewards[i])
  else:
    cumulative_episode_rewards.append(cumulative_episode_rewards[i-1] + episode_rewards[i])


plt.plot(cumulative_episode_rewards)
plt.title('Training Reward Over Episodes')
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.show()


plt.plot(episode_losses)
plt.title('Loss Over Episodes')
plt.xlabel('Episode')
plt.ylabel('Loss')
plt.show()

In [None]:
#Experiments to verify Deep Q learning actually works
#Testing agent against random moves, ideally the agent would win ~99% of games
#Is the win percentage affected if agent is player 1 or player 2?
#function for agent as player 1

#losing RL_training_states
losing_states = []

def player1_agent_vs_random_moves(test_env):

  winner, counter = 0, 1

  state = [entry.flatten() for entry in test_env.reset()]
  state = np.concatenate((state[0], state[1], state[2]), axis=None)


  print(state[18:27].reshape(3,3))
  player1 = 1
  player2 = 2
  reward = 0
  #state_list = []
  #state_list.append(board)
  while winner == 0:
    for player in [player1, player2]:
      if player == 1:
        board, winner = test_env.network_best_move()
        print("Board after " + str(counter) + " move")
        print(board)
        counter += 1

      if player == 2:
        board, winner  = test_env.random_move(player2)
        print("Board after " + str(counter) + " move")
        print(board)
        counter += 1
      #state_list.append(board)

      if winner != 0:
          break
    #if winner == 2:
      #losing_states.append(state_list)

  print("Winner: ", winner)
  return winner, counter-1

In [None]:
# Load a torch model saved from colab

# Load the saved model
# policy = TransformerAgent(state_dim=27, action_dim=9, hidden_dim=1024, nhead=8, num_layers=6)
# policy.load_state_dict(torch.load('/content/drive/MyDrive/Monarch/RL/Tic Tac Toe/Transformers/transformer_policy_network_attention1024.pth'))
# Set the model to evaluation mode
policy.eval()


In [None]:
n = 1000
agent_win_counter = 0
agent_loss_counter = 0
agent_tie_counter = 0

test_env = TicTacToe()
move_histogram = []
policy.eval()



start_player_1_agent = time.time()
for i in range(n):
  print("Game number: ", i)
  winner, num_of_moves = player1_agent_vs_random_moves(test_env)
  if winner == 1:
    agent_win_counter += 1
  elif winner == 2:
    agent_loss_counter += 1
  else:
    agent_tie_counter += 1
  move_histogram.append(num_of_moves)
end_player_1_agent = time.time()

print("Trials took " + str(end_player_1_agent-start_player_1_agent) + " seconds")
print(str(agent_win_counter) + " wins out of " + str(n) + " trials")

print("win percentage: ", (agent_win_counter/n)*100,"%")
print("tie percentage: ", (agent_tie_counter/n)*100,"%")
print("lose percentage: ", (agent_loss_counter/n)*100,"%")
print("move histogram: ", move_histogram)

In [None]:
print(len(move_histogram))

five_count = 0
six_count = 0
seven_count = 0
eight_count = 0
nine_count = 0

for i in range(len(move_histogram)):
  if move_histogram[i] == 5:
    five_count += 1
  elif move_histogram[i] == 6:
    six_count += 1
  elif move_histogram[i] == 7:
    seven_count += 1
  elif move_histogram[i] == 8:
    eight_count += 1
  elif move_histogram[i] == 9:
    nine_count += 1

print("five count: ", five_count)
print("six count: ", six_count)
print("seven count: ", seven_count)
print("eight count: ", eight_count)
print("nine count: ", nine_count)