In [None]:
# Let the games begin

import copy

from Agent import RandomAgent
from QAgent import QLearningAgent
from TicTacToe import TicTacToe

# Base configuration for the agents. It is deep-copied into per-player dicts
# (paramsX / paramsO) further down, so edits here affect both players.
# NOTE: "nr_of_episodes" and "rows" are overwritten right after this dict.
params = {
    "nr_of_episodes": 500000,  # number of episodes for training
    "rows": 3,  # rows of the board, rows = cols
    "epsilon_start": 0.15,  # initial exploration rate
    "epsilon_min": 0.005,  # minimum exploration rate
    "alpha_start": 0.1,  # initial learning rate
    "alpha_min": 0.1,  # minimum learning rate (same as alpha_start; presumably a constant rate - confirm in QAgent)
    "gamma": 0.9,  # discount factor
    "switching": False,  # switch between X and O
    "debug": False,  # print debug messages
    # Parameters for QAgent
    "lazy_evaluation": True,  # use lazy evaluation
    "Q_initial_value": 0.0,  # initial Q value
    "terminal_q_updates": True,  # flag to switch between terminal and immediate Q updates
    # Parameters for DeepQAgent
    "batch_size": 32,  # batch size for deep learning
    # 'target_update_frequency' : 250, # target network update frequency
    "target_update_frequency": 150,  # target network update frequency
    "evaluation": True,  # save data for evaluation
    "double_q_learning": False,  # flag to switch on double Q-learning
}

# Experiment overrides: square board of side `rows`, `win_length` in a row wins.
rows = win_length = 4
nr_of_episodes = 750
params.update(nr_of_episodes=nr_of_episodes, rows=rows)

# Every agent receives its own independent copy of the configuration,
# tagged with the mark it plays.
paramsX = {**copy.deepcopy(params), "player": "X"}
paramsO = {**copy.deepcopy(params), "player": "O"}

# Tally of the values returned by game.play(), one counter per outcome code.
outcomes = dict.fromkeys(("X", "O", "D", "I"), 0)

learning_agent1, learning_agent2 = QLearningAgent(paramsX), QLearningAgent(paramsO)
random_agent1, random_agent2 = (
    RandomAgent(player="X", switching=False),
    RandomAgent(player="O", switching=False),
)

# Board settings shared by every pairing.
board_config = dict(display=None, rows=rows, cols=rows, win_length=win_length)

# Active pairing: the two Q-learning agents play each other.
# Alternative pairings, kept for quick switching:
#   game = TicTacToe(random_agent1, random_agent2, **board_config)
#   game = TicTacToe(learning_agent1, random_agent2, **board_config)
#   game = TicTacToe(random_agent1, learning_agent2, **board_config)
# Two-game variant (each learner against a random opponent):
#   game1 = TicTacToe(learning_agent1, random_agent2, **board_config)
#   game2 = TicTacToe(random_agent1, learning_agent2, **board_config)
game = TicTacToe(learning_agent1, learning_agent2, **board_config)

# Play all training episodes and count how each one ended.
for episode in range(nr_of_episodes):
    outcome = game.play()
    outcomes[outcome] = outcomes[outcome] + 1
    # Two-game variant:
    #   outcome1 = game1.play()
    #   outcome2 = game2.play()

# Report the share of episodes per result (the "I" counter is not reported).
print("Outcomes during learning:")
print(
    f"X wins: {outcomes['X']/nr_of_episodes}, O wins: {outcomes['O']/nr_of_episodes}, draws: {outcomes['D']/nr_of_episodes}"
)

In [None]:
from Evaluation import QAgent_plays_against_QAgent, QAgent_plays_against_RandomAgent

# Q-tables learned by the two agents during training.
Q1, Q2 = learning_agent1.Q, learning_agent2.Q

# Third positional argument of the evaluation helpers; presumably the number
# of evaluation games per match-up - confirm in Evaluation.
nr_of_evaluation_games = 2000
evaluation_board = dict(rows=rows, cols=rows, win_length=win_length)

# Each Q-table against a random opponent, then the two Q-tables against each other.
QAgent_plays_against_RandomAgent(Q1, "X", nr_of_evaluation_games, **evaluation_board)
QAgent_plays_against_RandomAgent(Q2, "O", nr_of_evaluation_games, **evaluation_board)
QAgent_plays_against_QAgent(Q1, "X", Q2, "O", nr_of_evaluation_games, **evaluation_board)

In [None]:
from Evaluation import plot_evaluation_data

# Plot the evaluation data of each trained agent: first the X player, then the O player.
# (Presumably this relies on params["evaluation"] being True during training - confirm in QAgent.)
plot_evaluation_data(learning_agent1)
plot_evaluation_data(learning_agent2)

In [None]:
# import dill

# with open('SymmetricQ_optimalX.pkl', 'wb') as f:
#     dill.dump(Q1.get(), f)

# with open('SymmetricQ_optimalO.pkl', 'wb') as f:
#     dill.dump(Q2.get(), f)

# with open('TotallySymmetricQ_optimalX.pkl', 'wb') as f:
#     dill.dump(Q1.get(), f)

# with open('TotallySymmetricQ_optimalO.pkl', 'wb') as f:
#     dill.dump(Q2.get(), f)

In [None]:
import numpy as np

from TicTacToe import TicTacToe

# Short alias for the TicTacToe helper; displayQ uses it to obtain the actions
# whose Q-values should be shown for a given board.
get_empty_positions = TicTacToe.get_valid_actions_from_board


def displayQ(Q, board):
    """Print ``board`` as a grid with Q-values shown on the available cells.

    Every action returned by ``get_empty_positions(board)`` is used as an index
    into the board; that cell is replaced by ``Q.get(tuple(board), action)``
    formatted with two decimals. All other cells are printed unchanged.

    Args:
        Q: Object whose ``get(state, action)`` returns a number.
        board: Flat sequence of cells. The grid side is ``int(len(board) ** 0.5)``,
            so the board is assumed to be square.
    """
    valid_actions = get_empty_positions(board)
    q_labels = {}
    for action in valid_actions:
        q_labels[action] = f"{Q.get(tuple(board), action):.2f}"

    side = int(len(board) ** 0.5)

    # Overlay the formatted Q-values on a copy of the board.
    cells = list(board)
    for action in q_labels:
        cells[action] = q_labels[action]

    width = 5  # every cell is centred in a field of this many characters

    # Each row joins `side` cells with " | " (3 characters), hence the rule length.
    rule_length = side * width + (side - 1) * 3
    for row_index in range(side):
        start = row_index * side
        padded = [str(cell).center(width) for cell in cells[start : start + side]]
        print(" | ".join(padded))
        if row_index != side - 1:
            print("-" * rule_length)

    print("\n")


def display_history(Q, history):
    """Show the Q-value grid for every board stored in ``history``.

    Args:
        Q: Q-value container forwarded unchanged to ``displayQ``.
        history: Sequence of ``(board, action)`` pairs; only the board of each
            pair is displayed, in order.
    """
    for board, _action in history:
        displayQ(Q, board)


# Q1 = learning_agent.Q
# historyX = paramsX['histories']
# Q2 = learning_agent2.Q
# historyO = paramsO['histories']
# for i in range(min((len(historyX), len(historyO)))):
#     board, action = historyO[i]
#     displayQ(Q2, board)
#     board, action = historyX[i]
#     displayQ(Q1, board)

# board, action = historyO[len(historyO) - 1]
# displayQ(Q1, board)

# if max(len(historyX), len(historyO)) == len(historyX):
#     board, action = historyX[len(historyX) - 1]
#     displayQ(Q1, board)
# else:
#     board, action = historyO[len(historyO) - 1]
#     displayQ(Q2, board)

In [None]:
import matplotlib.pyplot as plt


def extract_values(dictionary):
    """Flatten a (possibly nested) dictionary into a list of its leaf values.

    Values that are themselves dictionaries are expanded recursively, in
    iteration order; every other value is kept as-is (lists are not expanded).

    Args:
        dictionary: Mapping whose values are leaves or further dictionaries.

    Returns:
        A new list with all leaf values in depth-first order.
    """
    collected = []
    for _, item in dictionary.items():
        if not isinstance(item, dict):
            collected.append(item)
            continue
        # Nested level: splice its leaves in at this position.
        collected += extract_values(item)
    return collected


def evaluate_and_plot_Q(learning_agent, player):
    """Print summary statistics of an agent's Q-values and plot their histogram.

    Args:
        learning_agent: Object with a ``Q`` attribute whose ``get()`` (called
            without arguments) returns the Q-table as a possibly nested dict.
        player: Label used in the printed report and in the plot title.

    Returns:
        None. The statistics are printed and the histogram is shown.
    """
    qMatrix = learning_agent.Q.get()
    qValues = extract_values(qMatrix)
    # The full list of Q-values is intentionally not printed: it can contain a
    # very large number of entries and would bury the summary below.
    print(f"Total number of elements in Q for player {player}: {len(qValues)}")

    # np.min / np.max raise ValueError on empty input, so an empty Q-table
    # is reported explicitly instead of crashing.
    if len(qValues) == 0:
        print(f"No Q-values available for player {player}; skipping statistics and histogram.")
        return

    mean_q = np.mean(qValues)
    median_q = np.median(qValues)
    std_q = np.std(qValues)
    min_q = np.min(qValues)
    max_q = np.max(qValues)

    print(f"Q-value Statistics for player {player}:")
    print(f"Mean: {mean_q}")
    print(f"Median: {median_q}")
    print(f"Standard Deviation: {std_q}")
    print(f"Minimum: {min_q}")
    print(f"Maximum: {max_q}")

    plt.figure(figsize=(10, 6))
    plt.hist(qValues, bins=20, edgecolor="black", alpha=0.7)
    plt.title(f"Histogram of Q-values for player {player}")
    plt.xlabel("Q-value")
    plt.ylabel("Frequency")
    plt.grid(axis="y", linestyle="--", alpha=0.7)
    plt.show()


# Report Q-value statistics and a histogram for each trained agent in turn.
for trained_agent, mark in ((learning_agent1, "X"), (learning_agent2, "O")):
    evaluate_and_plot_Q(trained_agent, mark)