In [1]:
# Let the games begin

from Matrix import Matrix, QMatrix
from SymmetricMatrix import SymmetricMatrix, QSymmetricMatrix
from SymmetricMatrix import TotallySymmetricMatrix, QTotallySymmetricMatrix
from playGame import play_game, initialize_board, get_empty_positions


# Parameters shared by every play_game call in this notebook.
params = {
    # Initial Q value.
    'Q_initial_value': 0.0,
    # Initial learning rate.
    'alpha_start': 0.1,
    # Minimum learning rate (same value as alpha_start here).
    'alpha_min': 0.1,
    # Discount factor.
    'gamma': 0.99,
    # Initial exploration rate.
    'epsilon_start': 0.5,
    # Minimum exploration rate.
    'epsilon_min': 0.25,
    # Waiting time in seconds for display.
    'waiting_time': 1.0,
    # Number of episodes played so far during training.
    'episode': 0,
    # Outcome counters per game result: 'X' = X wins, 'O' = O wins, 'D' = draws.
    'outcomes': {'X': 0, 'O': 0, 'D': 0},
    # Number of episodes for training; this entry is overwritten further down,
    # before the training loop starts.
    'nr_of_episodes': 500000,
    # Epsilon values used outside training; they determine how randomized
    # the computer plays.
    'eps': {'X': -1.0, 'O': -1.0},
    # Reward assigned to each game result.
    'rewards': {'X': 1.0, 'O': -1.0, 'D': 0.0},
}

# Initialize matrices
#
# Three matrix families are imported above; exactly one group below should be
# active at a time. The `file=...` variants presumably load a previously
# saved Q table instead of starting from the default value -- TODO confirm.

# Plain matrices:
# Q = QMatrix(file='Q.pkl')
# Q = QMatrix(default_value=params['Q_initial_value'])
# Visits = Matrix(default_value=0)
# Rewards = Matrix(default_value=0)

# Symmetric matrices:
# Q = QSymmetricMatrix(file='SymmetricQ.pkl')
# Q = QSymmetricMatrix(default_value=params['Q_initial_value'])
# Visits = SymmetricMatrix(default_value=0)
# Rewards = SymmetricMatrix(default_value=0.0)

# Totally symmetric matrices (the active choice):
# Q = QTotallySymmetricMatrix(file='TotallySymmetricQ.pkl')
matrices = (
    QTotallySymmetricMatrix(default_value=params['Q_initial_value']),
    TotallySymmetricMatrix(default_value=0),
    TotallySymmetricMatrix(default_value=0.0),
)

# Individual handles to the same three objects that play_game receives.
Q, Visits, Rewards = matrices

# Training run: play nr_of_episodes games with the 'training' flag switched on
# and count the finished episodes in params['episode'].
nr_of_episodes = 1_500_000
params.update(nr_of_episodes=nr_of_episodes, episode=0)

flags = dict(training=True, display=False, interactive=False)

for _ in range(nr_of_episodes):
    play_game(matrices, params, flags=flags)
    params['episode'] += 1

# Report each outcome as a fraction of all games played.
print("Outcomes:")
print(
    f"X wins: {params['outcomes']['X']/nr_of_episodes}, "
    f"O wins: {params['outcomes']['O']/nr_of_episodes}, "
    f"draws: {params['outcomes']['D']/nr_of_episodes}"
)

Outcomes:
X wins: 0.676696, O wins: 0.18572133333333332, draws: 0.13758266666666666


In [2]:
# Evaluation run: reset the outcome counters, switch the 'training' flag off
# and play a smaller batch of games with both 'eps' values set to -1.0.
nr_of_episodes = 5000
params.update(
    nr_of_episodes=nr_of_episodes,
    eps={'X': -1.0, 'O': -1.0},
    outcomes={'X': 0, 'O': 0, 'D': 0},
)
flags = dict(training=False, display=False, interactive=False)

for _ in range(nr_of_episodes):
    play_game(matrices, params, flags=flags)

# Report each outcome as a fraction of the evaluation games.
print("Outcomes with players choosing actions based on Q-values:")
print(
    f"X wins: {params['outcomes']['X']/nr_of_episodes}, "
    f"O wins: {params['outcomes']['O']/nr_of_episodes}, "
    f"draws: {params['outcomes']['D']/nr_of_episodes}"
)

Outcomes with players choosing actions based on Q-values:
X wins: 0.0, O wins: 0.0, draws: 1.0


In [None]:
# import dill

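# NOTE(review): the active Q in this notebook is a QTotallySymmetricMatrix, whose
# commented-out loader reads 'TotallySymmetricQ.pkl'; confirm the target file
# name before enabling this cell.
# with open('SymmetricQ.pkl', 'wb') as f: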
#     dill.dump(Q.get(), f)

In [None]:
# Flags for a single displayed, non-interactive game outside training.
flags = dict(training=False, display=True, interactive=False)
# play_game(matrices, params, flags=flags)

In [None]:
# Flags for a single displayed, interactive game outside training.
flags = dict(training=False, display=True, interactive=True)
# play_game(matrices, params, flags=flags)

In [3]:
import numpy as np

from playGame import get_empty_positions

def displayQ(Q, board):
    """Print the board as a 3x3 grid with Q-values written into the empty cells.

    Args:
        Q: object whose ``get(board, action)`` returns a number that can be
            formatted with five decimals.
        board: indexable sequence of nine cells. It is copied, not modified.

    Returns:
        None. The grid is written to standard output.
    """
    open_cells = get_empty_positions(board)
    grid = list(board)
    # Overwrite every empty position with its formatted Q-value; occupied
    # cells keep their original content.
    for cell in open_cells:
        grid[cell] = f"{Q.get(board, cell):.5f}"

    print(np.array(grid).reshape(3, 3))

# Walk through the recorded (board, action) pairs and show, for each one, the
# Q-value of the chosen action followed by the Q-values of all empty cells.
# NOTE(review): 'history' is not part of the params literal in this notebook;
# presumably play_game stores it there -- confirm before running on a fresh kernel.
history = params['history']
for board, action in history:
    print(f"q value for action {action} is {Q.get(board, action)}")
    displayQ(Q, board)

q value for action 4 is 0.9557137055148663
[['0.89094' '0.76624' '0.89094']
 ['0.76624' '0.95571' '0.76624']
 ['0.89094' '0.76624' '0.89094']]
q value for action 8 is 0.9345649010819989
[['0.93456' '0.96798' '0.93456']
 ['0.96798' 'X' '0.96798']
 ['0.93456' '0.96798' '0.93456']]
q value for action 2 is 0.9534166592776611
[['0.86742' '0.83043' '0.95342']
 ['0.83043' 'X' '0.93744']
 ['0.95342' '0.93744' 'O']]
q value for action 7 is 0.8461417979412137
[['0.98571' '0.99000' 'X']
 ['0.98809' 'X' '0.98978']
 ['0.84177' '0.84614' 'O']]
q value for action 6 is 0.9999999999999994
[['-0.95654' '-0.67337' 'X']
 ['-0.84545' 'X' '-0.62088']
 ['1.00000' 'O' 'O']]
