<a href="https://colab.research.google.com/github/kaneelgit/ttt-DRL/blob/main/tictactoe_training_loop_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/kaneelgit/ttt-DRL.git && cd ttt-DRL && cp -r models tictactoe /content/


Cloning into 'ttt-DRL'...
remote: Enumerating objects: 54, done.[K
remote: Counting objects: 100% (54/54), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 54 (delta 16), reused 44 (delta 12), pack-reused 0[K
Receiving objects: 100% (54/54), 585.81 KiB | 5.09 MiB/s, done.
Resolving deltas: 100% (16/16), done.


In [2]:
#import libraries
import numpy as np
import pandas
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_probability as tfp

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Accuracy

tfd = tfp.distributions
tfpl = tfp.layers

from sklearn.preprocessing import MinMaxScaler
from collections import deque
import random
import sys, os
import pickle

sys.path.append('/content/tictactoe')
sys.path.append('/content/models')

In [3]:
from tictactoe import train
from tictactoe.game import TicTacToe

#import models
from models import RecurrentNetwork as rn

In [6]:
# Create a new model using 2D Convolutional layers
model = tf.keras.Sequential([
    rn.ConvBlock(3, 64, kernel_size = 3),
    rn.ConvBlock(3, 64, kernel_size = 3),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128),
    tf.keras.layers.Dense(64),
    tf.keras.layers.Dense(32),
    tf.keras.layers.Dense(9, activation='linear')   # Output layer with 9 units for the Q-values
])

# Compile the new 2D Convolutional model
learning_rate = 0.01
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                   loss='mse')  # Mean Squared Error is commonly used in Q-Learning
model.build(input_shape = (None, 3, 3, 3))
# Summary of the new model
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv_block_8 (ConvBlock)    (None, 3, 3, 64)          2048      
                                                                 
 conv_block_9 (ConvBlock)    (None, 3, 3, 64)          37184     
                                                                 
 flatten_1 (Flatten)         (None, 576)               0         
                                                                 
 dense_5 (Dense)             (None, 128)               73856     
                                                                 
 dense_6 (Dense)             (None, 64)                8256      
                                                                 
 dense_7 (Dense)             (None, 32)                2080      
                                                                 
 dense_8 (Dense)             (None, 9)                

In [7]:
model.load_weights('/content/model_weight_episode_3500.h5')

In [None]:
#print out some states and the responses by the model
#example state
ttt = TicTacToe()
ttt.play_move(0, 0, False)
ttt.play_move(0, 1, False)
ttt.play_move(1, 0, False)
ttt.play_move(1, 1, True)

sample_state_1 = train.board_state_int(ttt.board, 1)

| X | O |   |
| X | O |   |
|   |   |   |


In [None]:
ttt = TicTacToe()

ttt.play_move(0, 2, False)
ttt.play_move(0, 1, False)
ttt.play_move(1, 1, False)
ttt.play_move(2, 2)

sample_state_2 = train.board_state_int(ttt.board, 1)

|   | O | X |
|   | X |   |
|   |   | O |


In [None]:
model.predict(sample_state_2[np.newaxis, :])



array([[0.01002322, 0.11066268, 0.06397608, 0.09698179, 0.21046838,
        0.07374882, 0.04965578, 0.08246679, 0.10511748]], dtype=float32)

In [None]:
#variables
gamma = 0.9
reward_value = 1
epsilon = 0.5
epsilon_min = 0.1
epsilon_decay = 0.001
memory = deque(maxlen = 32)
batch_size = 32
episodes = 1000
# save_dir = os.path.join('model_weights', 'RecurrentNetwork')
# save_dir = os.path.join('content', 'models', 'model_weights', 'RecurrentNetwork')
save_dir = '/content'
device = tf.test.gpu_device_name()
device

'/device:GPU:0'

In [None]:
#function for the main training loop
batch_size = 32
current_episode = 0
num_of_episodes = 20000
save_model_every = 100
verbose = True

for episode in range(current_episode, num_of_episodes):

    #start game
    ttt = TicTacToe()

    #select current player
    rand_choice = np.random.randint(1, 3, size = 1) #if 1 computer plays first if 2 computer plays second
    if rand_choice == 1:
        computer = 'X'
    else:
        computer = 'O'

    #current state
    cp = 1 if ttt.current_player == 'X' else 2 #since this is the beginning current player is passed on to the first state representation
    state = train.board_state_int(ttt.board, cp)

    #append current states and moves
    current_game_states, moves = [], []
    current_game_states.append(state)


    #bool to start game and break the loop
    play_game = True

    while play_game:

        if np.random.rand() <= epsilon:
            all_avail = []
            for row, my_list in enumerate(ttt.board):
                all_avail.extend((row, index) for index, value in enumerate(my_list) if value == ' ')
            move = random.choice(all_avail)

        else:
            with tf.device(device):
                q_preds = model.predict(state[np.newaxis, :])
            move_int = np.argmax(q_preds)

            #get the cell from the move integer predicted by the model
            move = train.int_to_cell_dict[move_int]


        if ttt.make_move(move[0], move[1]):
            #store data
            next_player = 2 if ttt.current_player == 'X' else 1
            state = train.board_state_int(ttt.board, next_player)
            current_game_states.append(state)
            moves.append(move)

            if ttt.check_winner():
                winner = ttt.winner
                break
            if all(cell != ' ' for row in ttt.board for cell in row):
                winner = 'draw'
                break

            ttt.current_player = 'O' if ttt.current_player == 'X' else 'X'

    #training loop
    if len(memory) >= batch_size:

        memory_list = list(memory)
        random.shuffle(memory_list)

        #train the model over the current states
        for start_idx in range(0, len(memory_list), batch_size):

            end_idx = min(start_idx + batch_size, len(memory_list))
            mini_batch = memory_list[start_idx:end_idx]

            inputs = tf.zeros((end_idx - start_idx, state.shape[0], state.shape[1], state.shape[2]))
            outputs = tf.zeros((end_idx - start_idx, 9)) #9 is the number of values

            #get stuff from the mini batch and get qu values and stuff
            for i, (cs, ns, mv, r) in enumerate(mini_batch):
                with tf.device(device):
                    q_value = r + gamma * tf.reduce_max(model.predict(ns[np.newaxis, :]))
                    #predicted q values
                    pred_q_values = model.predict(cs[np.newaxis, :])

                #add the new q value
                move_int = train.cell_to_int_dict[mv]
                pred_q_values[0][move_int] = q_value

                inputs = tf.tensor_scatter_nd_update(inputs, [[i]], [cs])
                outputs = tf.tensor_scatter_nd_update(outputs, [[i]], [pred_q_values.ravel()])


            model.fit(inputs, outputs, verbose = 0, epochs = 1)

    # assign rewards. If match is drawn either way reward is '0'. If computer choose to play 'X' and winner is 'O' then reward is '-1' vise versa.
    if winner == 'draw':
        reward = 0
    else:
        if winner == computer:
            reward = reward_value
        else:
            reward = -reward_value

    for i in range(0, len(current_game_states) - 1):
        memory.append((current_game_states[i], current_game_states[i + 1], moves[i], reward))

    #save the model
    if episode % save_model_every == 0:

        #save model
        save_path = os.path.join(save_dir, f'model_weight_episode_{episode}.h5')
        model.save_weights(save_path)

        #save memory
        filename = os.path.join(save_dir, f"memory_episode_{episode}.pkl")

        with open(filename, 'wb') as file:
            pickle.dump(memory, file)

    if episode % 50 == 0 and verbose:
        print(f"current episode: {episode}")
        #predict using the model
        samp1_qs = model.predict(sample_state_1[np.newaxis, :])
        print(f"sample one: {np.argmax(samp1_qs)} max q val = {np.max(samp1_qs)}")
        samp2_qs = model.predict(sample_state_2[np.newaxis, :])
        print(f"sample two: {np.argmax(samp2_qs)} max q val = {np.max(samp2_qs)}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
current episode: 450
sample one: 2 max q val = 1.0
sample two: 2 max q val = 1.0


KeyboardInterrupt: 