In [1]:
#import libraries
import numpy as np
import pandas
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_probability as tfp

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Accuracy

tfd = tfp.distributions
tfpl = tfp.layers

from sklearn.preprocessing import MinMaxScaler
from collections import deque
import random
import sys

from tictactoe.game import TicTacToe
import os
import pickle

In [2]:
sys.path.append(os.path.join('tictactoe'))
from tictactoe import train

In [3]:
#import models
from models import RecurrentNetwork as rn

#initialize model
model = rn.RecurrentNN

In [4]:
# Define the learning rate schedule function
def lr_schedule(epoch):
    # Initially high learning rate, and decay it gradually
    initial_lr = initial_learning_rate
    decay_factor = 0.1
    decay_epochs = 10  # Adjust the number of epochs after which to decay the learning rate
    
    if epoch < decay_epochs:
        return initial_lr
    else:
        return initial_lr * (decay_factor ** (epoch // decay_epochs))

# lr_scheduler = tf.keras.callbacks.LearningRateScheduler(lr_schedule)
initial_learning_rate = 0.001
optimizer = Adam(learning_rate = initial_learning_rate)
model.compile(loss = 'mean_squared_error', optimizer = optimizer)
model.build(input_shape = (None, 3, 3, 3))

In [5]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv_block (ConvBlock)       (None, 3, 3, 64)          2048      
_________________________________________________________________
conv_block_1 (ConvBlock)     (None, 3, 3, 64)          37184     
_________________________________________________________________
residual_block (ResidualBloc (None, 3, 3, 64)          74368     
_________________________________________________________________
residual_block_1 (ResidualBl (None, 3, 3, 64)          74368     
_________________________________________________________________
global_average_pooling2d (Gl (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 9)                 585       
Total params: 188,553
Trainable params: 187,785
Non-trainable params: 768
________________________________________________

In [6]:
# #simpler model
# model = tf.keras.Sequential([
#     tf.keras.layers.Flatten(input_shape=(3, 3, 3)),  # Flatten the 3x3x3 input
#     tf.keras.layers.Dense(128, activation='relu'),
#     tf.keras.layers.Dense(64, activation='relu'),
#     tf.keras.layers.Dense(32, activation='relu'),
#     tf.keras.layers.Dense(16, activation='relu'),
#     tf.keras.layers.Dense(9, activation='linear')   # Output layer with 9 units for the Q-values
# ])

# learning_rate = 0.01

# # Compile the model
# model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
#                     loss='mse')  # Mean Squared Error is commonly used in Q-Learning

# model.summary()

In [7]:
# # Create a new model using 2D Convolutional layers
# model = tf.keras.Sequential([
#     rn.ConvBlock(3, 64, kernel_size = 3),
#     rn.ConvBlock(3, 64, kernel_size = 3),
#     tf.keras.layers.Flatten(),
#     tf.keras.layers.Dense(128),
#     tf.keras.layers.Dense(64),
#     tf.keras.layers.Dense(32),
#     tf.keras.layers.Dense(9, activation='linear')   # Output layer with 9 units for the Q-values
# ])

# # Compile the new 2D Convolutional model
# learning_rate = 0.01
# model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
#                    loss='mse')  # Mean Squared Error is commonly used in Q-Learning
# model.build(input_shape = (None, 3, 3, 3))
# # Summary of the new model
# model.summary()

In [8]:
#print out some states and the responses by the model
#example state
ttt = TicTacToe()
ttt.play_move(0, 0, False)
ttt.play_move(0, 1, False)
ttt.play_move(1, 0, False)
ttt.play_move(1, 1, True)

sample_state_1 = train.board_state_int(ttt.board, 1)

| X | O |   |
| X | O |   |
|   |   |   |


In [9]:
ttt = TicTacToe()

ttt.play_move(0, 2, False)
ttt.play_move(0, 1, False)
ttt.play_move(1, 1, False)
ttt.play_move(2, 2)

sample_state_2 = train.board_state_int(ttt.board, 1)

|   | O | X |
|   | X |   |
|   |   | O |


In [10]:
#variables
gamma = 0.9
reward_value = 1
epsilon = 0.5
epsilon_min = 0.1
epsilon_decay = 0.001
memory = deque(maxlen = 2000)
batch_size = 32
episodes = 1000
save_dir = os.path.join('model_weights', 'RecurrentNetwork')
device = tf.test.gpu_device_name()
device

''

In [11]:
#training loop
# if len(memory) > batch_size:

#     memory_list = list(memory)
#     mini_batch = random.sample(memory_list, batch_size)

#     inputs = tf.zeros((batch_size, state.shape[0], state.shape[1], state.shape[2]))
#     outputs = tf.zeros((batch_size, 9)) #9 is the number of values

#     #get stuff from the mini batch and get qu values and stuff
#     for i, (cs, ns, mv, r) in enumerate(mini_batch):
#         with tf.device(device):
#             q_value = r + gamma * tf.reduce_max(model.predict(ns[np.newaxis, :]))
#             #predicted q values
#             pred_q_values = model.predict(cs[np.newaxis, :])

#         #add the new q value
#         move_int = train.cell_to_int_dict[mv]
#         pred_q_values[0][move_int] = q_value

# #             print(move_int, pred_q_values, r)

#         inputs = tf.tensor_scatter_nd_update(inputs, [[i]], [cs])  
#         outputs = tf.tensor_scatter_nd_update(outputs, [[i]], [pred_q_values.ravel()]) 

#     model.fit(inputs, outputs, verbose = 0, epochs = 1, callbacks=[lr_scheduler])

In [12]:
#train model
# train.training_loop(model, memory, epsilon, gamma, save_dir = save_dir, device = device, save_model_every = 20)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv_block (ConvBlock)       (None, 3, 3, 64)          2048      
_________________________________________________________________
conv_block_1 (ConvBlock)     (None, 3, 3, 64)          37184     
_________________________________________________________________
residual_block (ResidualBloc (None, 3, 3, 64)          74368     
_________________________________________________________________
residual_block_1 (ResidualBl (None, 3, 3, 64)          74368     
_________________________________________________________________
global_average_pooling2d (Gl (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 9)                 585       
Total params: 188,553
Trainable params: 187,785
Non-trainable params: 768
________________________________________________

In [13]:
#function for the main training loop
batch_size = 32
current_episode = 0
num_of_episodes = 5000
save_model_every = 100
verbose = True

for episode in range(current_episode, num_of_episodes):

    #start game
    ttt = TicTacToe()

    #select current player 
    rand_choice = np.random.randint(1, 3, size = 1) #if 1 computer plays first if 2 computer plays second
    if rand_choice == 1:
        computer = 'X'
    else:
        computer = 'O'

    #current state
    cp = 1 if ttt.current_player == 'X' else 2 #since this is the beginning current player is passed on to the first state representation
    state = train.board_state_int(ttt.board, cp)

    #append current states and moves
    current_game_states, moves = [], []  
    current_game_states.append(state)


    #bool to start game and break the loop
    play_game = True

    while play_game:

        if np.random.rand() <= epsilon:
            all_avail = []
            for row, my_list in enumerate(ttt.board):
                all_avail.extend((row, index) for index, value in enumerate(my_list) if value == ' ')
            move = random.choice(all_avail)

        else:
            with tf.device(device):
                q_preds = model.predict(state[np.newaxis, :])
            move_int = np.argmax(q_preds)

            #get the cell from the move integer predicted by the model
            move = train.int_to_cell_dict[move_int]


        if ttt.make_move(move[0], move[1]):
            #store data
            next_player = 2 if ttt.current_player == 'X' else 1
            state = train.board_state_int(ttt.board, next_player)
            current_game_states.append(state)
            moves.append(move)

            if ttt.check_winner():
                winner = ttt.winner
                break
            if all(cell != ' ' for row in ttt.board for cell in row):
                winner = 'draw'
                break

            ttt.current_player = 'O' if ttt.current_player == 'X' else 'X'

    #training loop
    if len(memory) > batch_size:

        memory_list = list(memory)
        random.shuffle(memory_list)

        #train the model over the current states
        for start_idx in range(0, len(memory_list), batch_size):

            end_idx = min(start_idx + batch_size, len(memory_list))
            mini_batch = memory_list[start_idx:end_idx]

            inputs = tf.zeros((end_idx - start_idx, state.shape[0], state.shape[1], state.shape[2]))
            outputs = tf.zeros((end_idx - start_idx, 9)) #9 is the number of values

            #get stuff from the mini batch and get qu values and stuff
            for i, (cs, ns, mv, r) in enumerate(mini_batch):
                with tf.device(device):
                    q_value = r + gamma * tf.reduce_max(model.predict(ns[np.newaxis, :]))
                    #predicted q values
                    pred_q_values = model.predict(cs[np.newaxis, :])

                #add the new q value
                move_int = train.cell_to_int_dict[mv]
                pred_q_values[0][move_int] = q_value
                
                inputs = tf.tensor_scatter_nd_update(inputs, [[i]], [cs])  
                outputs = tf.tensor_scatter_nd_update(outputs, [[i]], [pred_q_values.ravel()]) 


            model.fit(inputs, outputs, verbose = 0, epochs = 1)
            
    # assign rewards. If match is drawn either way reward is '0'. If computer choose to play 'X' and winner is 'O' then reward is '-1' vise versa.
    if winner == 'draw':
        reward = 0
    else:
        if winner == computer:
            reward = reward_value
        else:
            reward = -reward_value

    for i in range(0, len(current_game_states) - 1):
        memory.append((current_game_states[i], current_game_states[i + 1], moves[i], reward))

    #save the model
    if episode % save_model_every == 0:

        #save model
        save_path = os.path.join(save_dir, f'model_weight_episode_{episode}.h5')
        model.save_weights(save_path)

        #save memory
        filename = os.path.join(save_dir, f"memory_episode_{episode}.pkl")

        with open(filename, 'wb') as file:
            pickle.dump(memory, file)

    if episode % 10 == 0 and verbose:
        print(f"current episode: {episode}")        
        #predict using the model
        samp1_qs = model.predict(sample_state_1[np.newaxis, :])
        print(f"sample one: {np.argmax(samp1_qs)} max q val = {np.max(samp1_qs)}")
        samp2_qs = model.predict(sample_state_2[np.newaxis, :])
        print(f"sample two: {np.argmax(samp2_qs)} max q val = {np.max(samp2_qs)}")
        
    
#     if epsilon > epsilon_min:
#         epsilon -= epsilon_decay
#     else:
#         epsilon = epsilon_min

current episode: 0
sample one: 7 max q val = 0.08599068224430084
sample two: 7 max q val = 0.08480316400527954
current episode: 10
sample one: 5 max q val = 0.06995653361082077
sample two: 0 max q val = 0.07423193752765656
current episode: 20
sample one: 0 max q val = 0.20960836112499237
sample two: 0 max q val = 0.24016760289669037
current episode: 30
sample one: 4 max q val = 0.26331835985183716
sample two: 4 max q val = 0.31146299839019775
current episode: 40
sample one: 4 max q val = 0.4242195188999176
sample two: 4 max q val = 0.5540342330932617
current episode: 50
sample one: 1 max q val = 0.5872981548309326
sample two: 2 max q val = 0.9113802909851074
current episode: 60
sample one: 7 max q val = 478.8515319824219
sample two: 7 max q val = 196.3170928955078
current episode: 70
sample one: 4 max q val = 335.6798400878906
sample two: 7 max q val = 11.12768840789795
current episode: 80
sample one: 1 max q val = 41.18072509765625
sample two: 4 max q val = 41.66743850708008
current e

KeyboardInterrupt: 

In [19]:
#print out some states and the responses by the model
#example state
ttt = TicTacToe()
ttt.play_move(0, 0, False)
ttt.play_move(0, 1, False)
ttt.play_move(1, 0, False)
ttt.play_move(1, 1, True)

sample_state_1 = train.board_state_int(ttt.board, 2)

| X | O |   |
| X | O |   |
|   |   |   |


In [18]:
model.predict(sample_state_1[np.newaxis, :])

array([[ 66.785446, -44.155228,  73.89841 ,  62.885788,  80.92904 ,
        -45.862656,  61.43506 ,  71.311646,  55.52045 ]], dtype=float32)