In [1]:
from typing import Tuple, Dict, List
import numpy as np

In [2]:
EMPTY, X, O = 0, 1, -1
StateType = Tuple[int, ...]
ActionType = int
QTable = Dict[Tuple[StateType, ActionType], float]

In [8]:
def check_winner(state: StateType):
    wins = [(0,1,2), (3,4,5), (6,7,8),
            (0,3,6), (1,4,7), (2,5,8),
            (0,4,8), (2,4,6)]
    for i,j,k in wins:
        if state[i] == state[j] == state[k] != EMPTY:
            return state[i]
    return None

def get_available_actions(state: StateType) -> List[int]:
    return [i for i in range(9) if state[i] == EMPTY]

def choose_action(state: StateType, Q: QTable, epsilon: float) -> int:
    actions = get_available_actions(state)
    if np.random.rand() < epsilon:
        return int(np.random.choice(actions))
    # greedy
    best_a, best_q = actions[0], -np.inf
    for a in actions:
        q = Q.get((state, a), 0.0)
        if q > best_q:
            best_q, best_a = q, a
    return best_a

def make_a_move(state: StateType, action: ActionType, player: int) -> StateType:
    s = list(state)
    s[action] = player
    return tuple(s)

def iteration(ITERS=200000, alpha=0.1, gamma=0.9, epsilon=0.1):
    Q1: QTable = {}
    Q2: QTable = {}
    for _ in range(ITERS):
        state: StateType = tuple([EMPTY]*9)
        player = X
        while True:
            if player == X:
                action = choose_action(state, Q1, epsilon)
                next_state = make_a_move(state, action, X)
                winner = check_winner(next_state)
                if winner == X: reward = 1.0
                elif winner == O: reward = -1.0
                elif winner is None and not get_available_actions(next_state): reward = 0.5
                else: reward = 0.0

                max_next_q = 0.0
                if winner is None and get_available_actions(next_state):
                    max_next_q = max(Q1.get((next_state, a), 0.0) for a in get_available_actions(next_state))
                old = Q1.get((state, action), 0.0)
                Q1[(state, action)] = old + alpha*(reward + gamma*max_next_q - old)

            else:
                action = choose_action(state, Q2, epsilon)
                next_state = make_a_move(state, action, O)
                winner = check_winner(next_state)
                if winner == O: reward = 1.0
                elif winner == X: reward = -1.0
                elif winner is None and not get_available_actions(next_state): reward = 0.5
                else: reward = 0.0

                max_next_q = 0.0
                if winner is None and get_available_actions(next_state):
                    max_next_q = max(Q2.get((next_state, a), 0.0) for a in get_available_actions(next_state))
                old = Q2.get((state, action), 0.0)
                Q2[(state, action)] = old + alpha*(reward + gamma*max_next_q - old)

            state = next_state
            if check_winner(state) is not None or not get_available_actions(state):
                break
            player *= -1
    return Q1, Q2

In [9]:
Q1, Q2 = iteration(ITERS = 200000)

In [5]:
def simulate_game(Q1: QTable, Q2: QTable,
                  mode="agent_vs_random", agent_player=X, epsilon=0.0):
    state: StateType = tuple([EMPTY]*9)
    player = X
    while True:
        if mode == "agent_vs_random":
            if player == agent_player:
                Q = Q1 if agent_player == X else Q2
                action = choose_action(state, Q, epsilon)
            else:
                action = int(np.random.choice(get_available_actions(state)))
        elif mode == "Q1_vs_Q2":
            Q = Q1 if player == X else Q2
            action = choose_action(state, Q, epsilon)
        else:
            raise ValueError("mode must be 'agent_vs_random' or 'Q1_vs_Q2'")

        state = make_a_move(state, action, player)
        winner = check_winner(state)
        if winner is not None or not get_available_actions(state):
            return winner  # X, O, or None
        player *= -1

In [6]:
def get_statistics(Q1, Q2, num_games=1000):
    results = {X: 0, O: 0, None: 0} # None for draws
    for _ in range(num_games):
        winner = simulate_game(Q1, Q2, mode="agent_vs_random", agent_player=X)
        results[winner] += 1
    return results

In [16]:
# Assuming Q1 and Q2 are already trained from the previous step
game_stats = get_statistics(Q1, Q2, num_games=1000)
print("Game Statistics (1000 games):")
print(f"Player X wins: {game_stats[X]}")
print(f"Player O wins: {game_stats[O]}")
print(f"Draws: {game_stats[None]}")

Game Statistics (1000 games):
Player X wins: 860
Player O wins: 117
Draws: 23
