In [None]:
# Let the games begin

import copy
import wandb

from TicTacToe import TicTacToe
from Agent import RandomAgent, HumanAgent
from DeepQAgent import DeepQLearningAgent, DeepQPlayingAgent

# --- Experiment configuration -------------------------------------------------
# Board geometry and training length are defined once, up front, and fed straight
# into `params` so the dict never carries placeholder values that get overwritten.
rows = 4  # board is square: the same value is passed as rows and cols below
win_length = 4  # forwarded to TicTacToe(win_length=...); presumably marks in a line needed to win
nr_of_episodes = 250000  # number of self-play training episodes

params = {
    'nr_of_episodes' : nr_of_episodes, # number of episodes for training
    'rows' : rows, # rows of the board, rows = cols

    'epsilon_start' : 0.15,  # initial exploration rate
    'epsilon_min' : 0.005, # minimum exploration rate
    'learning_rate': 0.0001, # learning rate
    'gamma' : 0.95,  # discount factor

    'switching' : True, # switch between X and O
    'debug' : False, # print debug messages

    # Parameters for DeepQAgent
    'batch_size' : 16, # batch size for deep learning
    'target_update_frequency' : 20, # target network update frequency
    'evaluation' : True, # save data for evaluation
    'double_q_learning' : False, # flag to switch on double Q-learning
    'device' : 'cpu', # device to use, 'cpu' or 'mps' or 'cuda' 
    'replay_buffer_length' : 10000, # replay buffer length
    }

# Each agent gets its own deep copy so per-player settings never alias.
paramsX = copy.deepcopy(params)
paramsO = copy.deepcopy(params)
paramsX['player'] = 'X'
paramsO['player'] = 'O'

# Tally of game results, keyed by the value returned from game.play().
outcomes = {'X' : 0, 'O' : 0, 'D' : 0}

# --- Self-play training -------------------------------------------------------
learning_agent1 = DeepQLearningAgent(paramsX)
learning_agent2 = DeepQLearningAgent(paramsO)
# Alternative opponent: train X against a random O instead of self-play.
# random_agent2 = RandomAgent(player='O', switching=False)

# game = TicTacToe(learning_agent1, random_agent2, display=None, rows=rows, cols=rows, win_length=win_length)
game = TicTacToe(learning_agent1, learning_agent2, display=None, rows=rows, cols=rows, win_length=win_length)

try:
    for episode in range(nr_of_episodes):
        outcome = game.play()
        outcomes[outcome] += 1

    # Report outcome frequencies as fractions of all training episodes.
    print("Outcomes during learning:")
    print(f"X wins: {outcomes['X']/nr_of_episodes}, O wins: {outcomes['O']/nr_of_episodes}, draws: {outcomes['D']/nr_of_episodes}")
finally:
    # Always close the wandb run, even when the long training loop is interrupted
    # or raises; otherwise the run stays open in the kernel.
    # NOTE(review): the code that starts the run is not visible in this cell
    # (presumably inside DeepQLearningAgent) - confirm.
    wandb.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjakob_teraki[0m ([33mjakob_snn[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
def play_and_report(game, nr_of_episodes):
    """Play a fixed number of games and print how often each outcome occurred.

    Args:
        game: object whose ``play()`` method returns one of the keys
            ``'X'``, ``'O'`` or ``'D'`` for every finished game.
        nr_of_episodes: number of games to play; also the denominator of the
            printed frequencies.

    Returns:
        dict mapping ``'X'``, ``'O'`` and ``'D'`` to the number of games that
        ended with that outcome.

    Raises:
        KeyError: if ``play()`` returns a value other than ``'X'``, ``'O'``, ``'D'``.
    """
    outcomes = {'X' : 0, 'O' : 0, 'D' : 0}
    for _ in range(nr_of_episodes):
        outcomes[game.play()] += 1

    print("Outcomes during playing:")
    print(f"X wins: {outcomes['X']/nr_of_episodes}, O wins: {outcomes['O']/nr_of_episodes}, draws: {outcomes['D']/nr_of_episodes}")
    return outcomes


# Number of games per evaluation matchup.
# Note: this rebinds the name that the training cell used for its episode count.
nr_of_episodes = 1000

# Matchup 1: X plays from learning_agent1's network, O plays randomly.
q_network1 = learning_agent1.q_network
playing_agent1 = DeepQPlayingAgent(q_network1, player='X', switching=False)
random_agent2 = RandomAgent(player='O', switching=False)

game = TicTacToe(playing_agent1, random_agent2, display=None, rows=rows, cols=rows, win_length=win_length)
outcomes = play_and_report(game, nr_of_episodes)

# Matchup 2: X plays randomly, O plays from learning_agent2's network.
q_network2 = learning_agent2.q_network
playing_agent2 = DeepQPlayingAgent(q_network2, player='O', switching=False)
random_agent1 = RandomAgent(player='X', switching=False)

game = TicTacToe(random_agent1, playing_agent2, display=None, rows=rows, cols=rows, win_length=win_length)
outcomes = play_and_report(game, nr_of_episodes)

# Matchup 3: both network-backed playing agents against each other.
game = TicTacToe(playing_agent1, playing_agent2, display=None, rows=rows, cols=rows, win_length=win_length)
outcomes = play_and_report(game, nr_of_episodes)

# NOTE(review): the training cell already calls wandb.finish(); whether a run is
# still active at this point is not visible from this notebook - confirm this
# second call is needed.
wandb.finish()

In [None]:
# import torch

# torch.save(q_network1, 'models/q_network_4x4x4.pth')

In [None]:
from Evaluation import plot_evaluation_data, plot_valid_actions

# Evaluation plots for both learning agents, X's agent first.
for trained_agent in (learning_agent1, learning_agent2):
    plot_evaluation_data(trained_agent)

# Valid-action plots; the final call is kept as the cell's last bare expression
# so whatever the notebook displays for it stays the same.
plot_valid_actions(learning_agent1)
plot_valid_actions(learning_agent2)