In [1]:
import numpy as np
from itertools import product

transformations = [
    lambda x: x,                         # Identity
    lambda x: np.rot90(x, 1),            # Rotate 90°
    lambda x: np.rot90(x, 2),            # Rotate 180°
    lambda x: np.rot90(x, 3),            # Rotate 270°
    lambda x: np.fliplr(x),              # Horizontal reflection
    lambda x: np.flipud(x),              # Vertical reflection
    lambda x: np.transpose(x),           # Diagonal reflection (TL-BR)
    # lambda x: np.fliplr(np.transpose(x)) # Diagonal reflection (TR-BL)
]

inverse_transformations = [
    lambda x: x,                         # Identity
    lambda x: np.rot90(x, 3),            # Rotate 90° inverse (rotate 270°)
    lambda x: np.rot90(x, 2),            # Rotate 180° inverse
    lambda x: np.rot90(x, 1),            # Rotate 270° inverse (rotate 90°)
    lambda x: np.fliplr(x),              # Horizontal reflection inverse
    lambda x: np.flipud(x),              # Vertical reflection inverse
    lambda x: np.transpose(x),           # Diagonal reflection (TL-BR) inverse
    # lambda x: np.transpose(np.fliplr(x))# Diagonal reflection (TR-BL) inverse
]

original_actions = np.array(range(9)).reshape(3, 3)
for i, transform in enumerate(transformations):
    assert inverse_transformations[i](transform(original_actions)).flatten().tolist() == original_actions.flatten().tolist()

transformed_actions = [transform(original_actions).flatten().tolist() for transform in transformations]
for i in range(len(transformed_actions)):
    for j in range(i + 1, len(transformed_actions)):
        assert transformed_actions[i] != transformed_actions[j]

def generate_all_valid_boards():
    symbols = [' ', 'X', 'O']
    all_boards = list(product(symbols, repeat=9))  # Generate all 3^9 combinations
    print(f"Total number of boards: {len(all_boards)}")
    all_valid_boards = []

    for board in all_boards:
        x_count = board.count('X')
        o_count = board.count('O')
        
        # Valid boards must satisfy these conditions:
        if x_count == o_count or x_count == o_count + 1:
            all_valid_boards.append(board)

    print(f"Number of valid boards: {len(all_valid_boards)}")    
    return all_valid_boards

def board_to_matrix(board):
    return np.array(board).reshape(3, 3)

def matrix_to_board(matrix):
    return matrix.flatten().tolist()

def generate_symmetries(board):
    matrix = board_to_matrix(board)
    symmetries = [transform(matrix) for transform in transformations]
    return [matrix_to_board(sym) for sym in symmetries]

def get_canonical_representation(board):
    symmetries = generate_symmetries(board)
    min_symmetry = min(symmetries)
    return tuple(min_symmetry), symmetries.index(min_symmetry)

# Generate all empty positions on the board
def get_empty_positions(board):
    return [i for i, cell in enumerate(board) if cell == ' ']

def display_board(board):
    print(f" {board[0]} | {board[1]} | {board[2]} ")
    print("---+---+---")
    print(f" {board[3]} | {board[4]} | {board[5]} ")
    print("---+---+---")
    print(f" {board[6]} | {board[7]} | {board[8]} ")
    print("\n")

# Generate all valid Tic-Tac-Toe boards
all_valid_boards = generate_all_valid_boards()
state_action_pairs = 0
for valid_board in all_valid_boards:
    empty_positions = get_empty_positions(valid_board)
    state_action_pairs += len(empty_positions)

print(f"Total number of valid state-action pairs: {state_action_pairs}")

# Generate all canonical Tic-Tac-Toe boards
all_canonical_boards = set()
get_canonical_boards = {}
get_transform = {}
get_inverse_transform = {}
get_canonical_actions = {}
get_inverse_canonical_actions = {}
for valid_board in all_valid_boards:
    canonical_board, transform_idx = get_canonical_representation(valid_board)
    all_canonical_boards.add(canonical_board)
    get_canonical_boards[valid_board] = canonical_board
    get_transform[valid_board] = transformations[transform_idx]
    get_inverse_transform[valid_board] = inverse_transformations[transform_idx]
    get_inverse_canonical_actions[valid_board] = get_transform[valid_board](original_actions).flatten().tolist()
    get_canonical_actions[valid_board] = get_inverse_transform[valid_board](original_actions).flatten().tolist()

all_canonical_boards = sorted(list(all_canonical_boards))
print(f"Number of canonical boards: {len(all_canonical_boards)}")

all_canonical_actions = {}
canonical_state_action_pairs = 0
for canonical_board in all_canonical_boards:
    empty_positions = get_empty_positions(canonical_board)
    all_canonical_actions[canonical_board] = empty_positions
    canonical_state_action_pairs += len(empty_positions)

print(f"Number of canonical state-action pairs: {canonical_state_action_pairs}")

def get_canonical_board(board):
    return get_canonical_boards[tuple(board)]

def get_canonical_action(board, action):
    return get_canonical_actions[board][action]

def get_inverse_canonical_action(board, canonical_action):
    return get_inverse_canonical_actions[board][canonical_action]

def canonicalize(board, action):
    canonical_board = get_canonical_board(board)
    canonical_action = get_canonical_action(board, action)
    return canonical_board, canonical_action

valid_board = all_valid_boards[5]
actions = get_empty_positions(valid_board)
canonical_actions = [get_canonical_action(valid_board, action) for action in actions]
print(f"Board:             {valid_board}")
print(f"Canonical board:   {get_canonical_board(valid_board)}")
print(f"Actions:           {actions}")
print(f"Canonical actions: {sorted(canonical_actions)}")
print(f"Canonical actions: {sorted(all_canonical_actions[get_canonical_board(valid_board)])}")

tot = 0
for i, valid_board in enumerate(all_valid_boards):
    actions = get_empty_positions(valid_board)
    canonical_board = get_canonical_board(valid_board)
    canonical_actions1 = sorted(all_canonical_actions[canonical_board])
    canonical_actions2 = sorted([get_canonical_action(valid_board, action) for action in actions])
    if not canonical_actions2 == canonical_actions1:
        tot += 1

print(f"Number of wrong canonical actions: {tot}/{len(all_valid_boards)}")

Total number of boards: 19683
Number of valid boards: 6046
Total number of valid state-action pairs: 19107
Number of canonical boards: 1520
Number of canonical state-action pairs: 4808
Board:             (' ', ' ', ' ', ' ', ' ', ' ', 'X', ' ', ' ')
Canonical board:   (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'X')
Actions:           [0, 1, 2, 3, 4, 5, 7, 8]
Canonical actions: [0, 1, 2, 3, 4, 5, 6, 7]
Canonical actions: [0, 1, 2, 3, 4, 5, 6, 7]
Number of wrong canonical actions: 0/6046


In [2]:
from collections import defaultdict

def get_next_player(board):
    x_count = board.count('X')
    o_count = board.count('O')
    return 'X' if x_count == o_count else 'O'

def get_next_board(board, action):
    new_board = list(board)
    next_player = get_next_player(board)
    if next_player == 'X':
        new_board[action] = 'X'
    else:
        new_board[action] = 'O'
    
    return tuple(new_board)

# canonical_board_to_next_canonical_board = defaultdict(lambda: defaultdict(lambda: -1))
canonical_board_to_next_canonical_board = {}
all_next_canonical_boards = set()
for canonical_board in all_canonical_boards:
    # display_board(canonical_board)
    # print(all_canonical_actions[canonical_board])
    canonical_actions_to_next_canonical_board = {}
    for canonical_action in all_canonical_actions[canonical_board]:
        next_board = get_next_board(canonical_board, canonical_action)
        next_canonical_board = get_canonical_board(next_board)
        all_next_canonical_boards.add(next_canonical_board)
        canonical_actions_to_next_canonical_board[canonical_action] = next_canonical_board
        # print(canonical_action)
        # display_board(next_canonical_board)

    canonical_board_to_next_canonical_board[canonical_board] = canonical_actions_to_next_canonical_board

# canonical_board_to_next_canonical_board[canonical_board]
# allzs = [v for states in canonical_board_to_next_canonical_board.values() for v in states.values()]
# len(allzs)
print(len(all_next_canonical_boards))
print(len(all_canonical_boards))

1073
1520


In [3]:
QQ = defaultdict(lambda: -1)
for i, next_canonical_board in enumerate(all_next_canonical_boards):
    QQ[next_canonical_board] = i

# valid_board = all_valid_boards[3]
valid_board = ('O', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'X')
valid_actions = get_empty_positions(valid_board)
# valid_actions = list(range(9))
canonical_board = get_canonical_board(valid_board)
canonical_actions = [get_canonical_action(valid_board, valid_action) for valid_action in valid_actions]
QQlist = [QQ[canonical_board_to_next_canonical_board[canonical_board][canonical_action]] for canonical_action in canonical_actions]
# print(canonical_board)
# print(canonical_actions)
# print(np.array(QQlist).reshape(3, 3))
# print(QQlist)
Qvalid_board = list(valid_board)
print(np.array(Qvalid_board).reshape(3,3))
for valid_action in valid_actions:
    Qvalid_board[valid_action] = QQ[canonical_board_to_next_canonical_board[canonical_board][get_canonical_action(valid_board, valid_action)]]

print(np.array(Qvalid_board).reshape(3,3))

valid_board = (' ', ' ', 'O', ' ', ' ', ' ', 'X', ' ', ' ')
valid_actions = get_empty_positions(valid_board)
# valid_actions = list(range(9))
canonical_board = get_canonical_board(valid_board)
canonical_actions = [get_canonical_action(valid_board, valid_action) for valid_action in valid_actions]
QQlist = [QQ[canonical_board_to_next_canonical_board[canonical_board][canonical_action]] for canonical_action in canonical_actions]
# print(canonical_board)
# print(canonical_actions)
# print(np.array(QQlist).reshape(3, 3))
# print(QQlist)
Qvalid_board = list(valid_board)
print(np.array(Qvalid_board).reshape(3,3))
for valid_action in valid_actions:
    Qvalid_board[valid_action] = QQ[canonical_board_to_next_canonical_board[canonical_board][get_canonical_action(valid_board, valid_action)]]

print(np.array(Qvalid_board).reshape(3,3))

[['O' ' ' ' ']
 [' ' ' ' ' ']
 [' ' ' ' 'X']]
[['O' '301' '125']
 ['63' '56' '964']
 ['355' '345' 'X']]
[[' ' ' ' 'O']
 [' ' ' ' ' ']
 ['X' ' ' ' ']]
[['355' '63' 'O']
 ['345' '56' '301']
 ['X' '964' '125']]


In [4]:
board = (' ', ' ', ' ', ' ', 'X', ' ', ' ', ' ', ' ')
# display_board(board)
canonical_board, _ = get_canonical_representation(board)
action = 1
canonical_action = get_canonical_action(board, action)
# display_board(canonical_board)
# print(canonical_action)
for inverse_transform in inverse_transformations:
    display_board(matrix_to_board(inverse_transform(board_to_matrix(canonical_board))))

   |   |   
---+---+---
   | X |   
---+---+---
   |   |   


   |   |   
---+---+---
   | X |   
---+---+---
   |   |   


   |   |   
---+---+---
   | X |   
---+---+---
   |   |   


   |   |   
---+---+---
   | X |   
---+---+---
   |   |   


   |   |   
---+---+---
   | X |   
---+---+---
   |   |   


   |   |   
---+---+---
   | X |   
---+---+---
   |   |   


   |   |   
---+---+---
   | X |   
---+---+---
   |   |   




In [5]:
for i in range(10):
    if i == 10 - 1:
        print(f"final i: {i}")
    else:
        print(f"{i}, {i + 1}")

for i in reversed(range(10)):
    if i == 10 - 1:
        print(f"final i: {i}")
    else:
        print(f"{i}, {i + 1}")

0, 1
1, 2
2, 3
3, 4
4, 5
5, 6
6, 7
7, 8
8, 9
final i: 9
final i: 9
8, 9
7, 8
6, 7
5, 6
4, 5
3, 4
2, 3
1, 2
0, 1


In [6]:
# Let the games begin

from Matrix import Matrix, QMatrix
from SymmetricMatrix import SymmetricMatrix, QSymmetricMatrix
from playGame import play_game

# Parameters
params = {
    'Q_initial_value' : 1.0, # initial Q value
    'alpha_start' : 0.1,  # initial learning rate
    'alpha_min' : 0.1, # minimum learning rate
    'gamma' : 0.9,  # discount factor
    'epsilon_start' : 0.5,  # initial exploration rate
    'epsilon_min' : 0.25, # minimum exploration rate
    'waiting_time' : 1.0, # waiting in seconds for display
    'episode' : 0, # number of episodes played during training
    'outcomes' : {'X' : 0, 'O' : 0, 'D' : 0}, # dictionary to save the outcomes for games ('X' = X wins, 'O' = O wins, 'D' = draws) 
    'nr_of_episodes' : 500000, # number of episodes for training
    'eps' : {'X' : -1.0, 'O' : -1.0}, # epsilon values for non-training determine how randomized the computer plays
}

# Initialize matrices
# # Q = QMatrix(file='Q.pkl')
# Q = QMatrix(default_value=params['Q_initial_value'])
# Visits = Matrix(default_value=0)
# Rewards = Matrix(default_value=0)

Q = QSymmetricMatrix(file='SymmetricQ.pkl')
# Q = QSymmetricMatrix(default_value=params['Q_initial_value'])
Visits = SymmetricMatrix(default_value=0)
Rewards = SymmetricMatrix(default_value=0.0)

matrices = (Q, Visits, Rewards)

nr_of_episodes = 5000
params['nr_of_episodes'] = nr_of_episodes
params['episode'] = 0

flags = {
    'training': True,
    'display': False,
    'interactive': False
}
for _ in range(nr_of_episodes):
    play_game(matrices, params, flags=flags)
    params['episode'] += 1

print("Outcomes:")
print(f"X wins: {params['outcomes']['X']/nr_of_episodes}, O wins: {params['outcomes']['O']/nr_of_episodes}, draws: {params['outcomes']['D']/nr_of_episodes}")

Outcomes:
X wins: 0.4542, O wins: 0.41, draws: 0.1358


In [7]:
nr_of_episodes = 5000
params['nr_of_episodes'] = nr_of_episodes
params['eps'] = {'X' : -0.1, 'O' : -0.1}
params['outcomes'] = {'X' : 0, 'O' : 0, 'D' : 0}
flags = {
    'training': False,
    'display': False,
    'interactive': False
}
for _ in range(nr_of_episodes):
    play_game(matrices, params, flags=flags)

print("Outcomes with players choosing actions based on Q-values:")
print(f"X wins: {params['outcomes']['X']/nr_of_episodes}, O wins: {params['outcomes']['O']/nr_of_episodes}, draws: {params['outcomes']['D']/nr_of_episodes}")

Outcomes with players choosing actions based on Q-values:
X wins: 0.1418, O wins: 0.6998, draws: 0.1584


Outcomes with players choosing action based on Q-values:
X wins: 0.138, O wins: 0.6906, draws: 0.1714

In [8]:
# import dill

# with open('SymmetricQ.pkl', 'wb') as f:
#     dill.dump(Q.get(), f)

In [9]:
flags = {
    'training': False,
    'display': True,
    'interactive': False
}
play_game(matrices, params, flags=flags)



 O  |  X  |  X 
----+-----+----
 O  |     |  O 
----+-----+----
 O  |  X  |  X 


Player O wins!



In [10]:
flags = {
    'training': False,
    'display': True,
    'interactive': True
}
# play_game(matrices, params, flags=flags)

In [None]:
import matplotlib.pyplot as plt

q_matrix = Q.get()
print(f"Total unique states encountered: {len(q_matrix.keys())}")

# Extract all Q-values from the nested dictionary
all_q_values = [q for actions in q_matrix.values() for q in actions.values()]
print(f"Total number of elements in Q: {len(all_q_values)}")

mean_q = np.mean(all_q_values)
median_q = np.median(all_q_values)
std_q = np.std(all_q_values)
min_q = np.min(all_q_values)
max_q = np.max(all_q_values)

print("Q-value Statistics:")
print(f"Mean: {mean_q}")
print(f"Median: {median_q}")
print(f"Standard Deviation: {std_q}")
print(f"Minimum: {min_q}")
print(f"Maximum: {max_q}")

plt.figure(figsize=(10, 6))
plt.hist(all_q_values, bins=20, edgecolor='black', alpha=0.7)
plt.title("Histogram of Q-values")
plt.xlabel("Q-value")
plt.ylabel("Frequency")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# with open('Q_optimal.pkl', 'wb') as f:
#     dill.dump(Q, f)

Number of canonical boards: 1520
Number of canonical state-action pairs: 4808

In [None]:
visits = Visits.get()
all_v = [v for states in visits.values() for v in states.values()]

print("Statistics of visited state-action pairs:")
print(f"Number of state-action pairs visited: {len(all_v)}")

mean_v = np.mean(all_v)
median_v = np.median(all_v)
std_v = np.std(all_v)
min_v = np.min(all_v)
max_v = np.max(all_v)

print(f"Mean: {mean_v}")
print(f"Median: {median_v}")
print(f"Standard Deviation: {std_v}")
print(f"Minimum: {min_v}")
print(f"Maximum: {max_v}")

plt.figure(figsize=(10, 6))
plt.hist(all_v, bins=20, edgecolor='black', alpha=0.7)
plt.title("Histogram of visited states")
plt.xlabel("Visits")
plt.ylabel("States")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
rewards = Rewards.get()
all_r = [r for states in rewards.values() for r in states.values()]
all_rX = [r for states in rewards.values() for r in states.values() if r == 1.0]
all_rO = [r for states in rewards.values() for r in states.values() if r == -1.0]
all_rD = [r for states in rewards.values() for r in states.values() if r == 0.5]

print("Statistics of rewards")
print(f"Number of state-action pairs with rewards: {len(all_r)}")
print(f"Number of state-action pairs with rewards for X: {len(all_rX)}")
print(f"Number of state-action pairs with rewards for O: {len(all_rO)}")
print(f"Number of state-action pairs with rewards for D: {len(all_rD)}")