In [1]:
import numpy as np
from typing import List, Tuple, Optional
import random
from collections import defaultdict
import random
import pickle
import matplotlib.pyplot as plt

In [2]:
class TicTacToeEnv:
    """
    Two-player TicTacToe environment on a 3x3 integer board.

    Board representation:
    - 0: Empty cell
    - 1: Player X
    - -1: Player O (opponent)

    Player X (1) always moves first; `current_player` alternates after
    every successful call to `make_move`.
    """

    def __init__(self):
        """
        Initializes a new TicTacToe environment.

        The board is initialized as a 3x3 matrix of zeros.
        Player X starts the game (represented by 1).
        """
        self.board = np.zeros((3, 3), dtype=int)
        self.current_player = 1

    def reset(self) -> np.ndarray:
        """
        Resets the environment for a new game.

        Returns:
            np.ndarray: A copy of the clean 3x3 board filled with zeros
        """
        self.board = np.zeros((3, 3), dtype=int)
        self.current_player = 1
        return self.board.copy()

    def get_valid_moves(self) -> List[Tuple[int, int]]:
        """
        Returns all valid moves in the current board state.

        Returns:
            List[Tuple[int, int]]: List of (row, column) tuples representing empty cells
        """
        return [(i, j) for i in range(3) for j in range(3) if self.board[i, j] == 0]

    def make_move(self, position: Tuple[int, int]) -> Tuple[np.ndarray, float, bool]:
        """
        Executes a move for `current_player` and returns the new state.

        Args:
            position: Tuple[int, int] representing (row, column) of the move

        Returns:
            Tuple containing:
            - np.ndarray: Copy of the new board state
            - float: Reward for the player who just moved
              (-1 for loss, 0 for ongoing or draw, 1 for win)
            - bool: Whether the game is finished

        Raises:
            ValueError: If the position is not a cell of the 3x3 board, or
                the cell is already occupied. The board and the current
                player are left untouched in that case.
        """
        cell = self._locate(position)
        if cell is None:
            # Without this check NumPy would wrap negative indices, so
            # (-1, -1) would silently be played as (2, 2).
            raise ValueError(
                f"Invalid move: {position!r} is not a cell on the 3x3 board"
            )
        if self.board[cell] != 0:
            raise ValueError("Invalid move: Cell is already occupied")

        mover = self.current_player
        self.board[cell] = mover

        # Evaluate the board once and reuse the result for every branch.
        winner = self._check_winner()
        reward = 0
        done = False
        if winner == mover:
            reward = 1
            done = True
        elif winner is not None:
            reward = -1
            done = True
        elif not self.get_valid_moves():
            # Board is full and nobody won: draw.
            done = True

        self.current_player = -mover
        return self.board.copy(), reward, done

    @staticmethod
    def _locate(position) -> Optional[Tuple[int, int]]:
        """
        Converts `position` into an in-bounds (row, column) pair.

        Returns:
            Optional[Tuple[int, int]]: The pair as plain ints, or None when
            `position` is not a pair of integers within 0..2.
        """
        try:
            row, col = position
        except (TypeError, ValueError):
            return None
        if not all(isinstance(v, (int, np.integer)) for v in (row, col)):
            return None
        if 0 <= row < 3 and 0 <= col < 3:
            return int(row), int(col)
        return None

    def _is_valid_move(self, position: Tuple[int, int]) -> bool:
        """
        Checks if a move is valid.

        Args:
            position: Tuple[int, int] representing (row, column) of the move

        Returns:
            bool: True if the position is on the board and the cell is
            empty (0), False otherwise
        """
        cell = self._locate(position)
        return cell is not None and bool(self.board[cell] == 0)

    def _check_winner(self) -> Optional[int]:
        """
        Checks if there's a winner in the current board state.

        Winning conditions:
        - Three in a row (horizontally)
        - Three in a column (vertically)
        - Three in a diagonal

        Returns:
            Optional[int]:
            - 1 if X wins
            - -1 if O wins
            - None if no winner yet
        """
        # Same inspection order as before: rows, columns, main diagonal,
        # anti-diagonal.
        lines = [
            *self.board,
            *self.board.T,
            np.diag(self.board),
            np.diag(np.fliplr(self.board)),
        ]
        for line in lines:
            total = int(line.sum())
            # A line sums to +/-3 only when all three cells belong to the
            # same player; total // 3 recovers that player as a plain int.
            if abs(total) == 3:
                return total // 3
        return None

    def render(self):
        """
        Displays the current board state in a human-readable format.

        Output format:
        ---------
        | X | O | X |
        ---------
        | O | X | O |
        ---------
        | X | O | X |
        ---------
        """
        symbols = {0: ' ', 1: 'X', -1: 'O'}
        print('---------')
        for row in self.board:
            cells = ' | '.join(symbols[int(value)] for value in row)
            # Each row keeps the trailing "| " the cell-by-cell printing produced.
            print('| ' + cells + ' | ')
            print('---------')

In [3]:
def test_environment():
    """
    Walks the TicTacToe environment through a short scripted game.

    A fresh environment is created and rendered, then a fixed list of
    (row, column) moves is played in order:

    - (0,0): Top-left corner
    - (1,1): Center
    - (0,1): Top-middle
    - (0,2): Top-right
    - (2,2): Bottom-right

    For every move the function prints the position, renders the board and
    reports the reward and whether the game has ended. The walk-through
    stops as soon as the environment reports that the game is finished.
    """
    game = TicTacToeEnv()
    print("Initial state:")
    game.render()

    # Scripted (row, column) positions, played alternately by X and O
    scripted_moves = ((0, 0), (1, 1), (0, 1), (0, 2), (2, 2))

    for cell in scripted_moves:
        print(f"\nMove at position {cell}:")
        _, reward, finished = game.make_move(cell)
        game.render()
        print(f"Reward: {reward}, Game ended: {finished}")

        # Nothing left to demonstrate once the game is over
        if finished:
            return

In [40]:
import numpy as np
from typing import Tuple, Dict, List
import random
from collections import defaultdict
import datetime
import os

class TicTacToeQLearningAgent:
    """
    Tabular Q-learning agent for playing TicTacToe.

    Q-values are stored in a nested mapping `q_table[state_key][move]`,
    where `state_key` is the flattened board joined by commas and `move`
    is a (row, col) tuple.
    """

    def __init__(self, epsilon: float = 0.1, alpha: float = 0.2, gamma: float = 0.3):
        """
        Initialize Q-learning agent.

        Args:
            epsilon: Exploration rate
            alpha: Learning rate
            gamma: Discount factor
        """
        self.q_table = defaultdict(lambda: defaultdict(float))
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.training_stats = {'wins': 0, 'losses': 0, 'draws': 0}

    def _get_state_key(self, board: np.ndarray) -> str:
        """Convert board state to string key for Q-table."""
        return ','.join(map(str, board.flatten()))

    def _init_q_values(self, state_key: str, moves: List[Tuple[int, int]]) -> None:
        """Give every not-yet-seen (state, move) pair a small random Q-value."""
        for move in moves:
            if move not in self.q_table[state_key]:
                self.q_table[state_key][move] = np.random.uniform(0, 0.1)

    def get_action(self, board: np.ndarray, valid_moves: List[Tuple[int, int]]) -> Tuple[int, int]:
        """
        Choose action using epsilon-greedy policy.

        Args:
            board: Current game state
            valid_moves: List of available moves

        Returns:
            Chosen move as (row, col)

        Raises:
            ValueError: If `valid_moves` is empty
        """
        if not valid_moves:
            raise ValueError("No valid moves available")

        state = self._get_state_key(board)

        # Initialize Q-values for new state-action pairs
        self._init_q_values(state, valid_moves)

        # Exploration
        if random.random() < self.epsilon:
            return random.choice(valid_moves)

        # Exploitation with randomized tie-breaking
        q_values = {move: self.q_table[state][move] for move in valid_moves}
        max_q = max(q_values.values())
        best_moves = [move for move, q in q_values.items() if q == max_q]
        return random.choice(best_moves)

    def update(self, state: np.ndarray, action: Tuple[int, int],
               next_state: np.ndarray, reward: float, done: bool,
               next_valid_moves: List[Tuple[int, int]]):
        """
        Update Q-values using Q-learning update rule.

        Also increments `training_stats` when `done` is True: a reward of 1
        counts as a win, -1 as a loss, anything else as a draw.

        Args:
            state: Current state
            action: Chosen action
            next_state: Resulting state
            reward: Reward received
            done: Whether game ended
            next_valid_moves: Available moves in next state
        """
        state_key = self._get_state_key(state)
        next_state_key = self._get_state_key(next_state)

        # Initialize Q-values for next state
        self._init_q_values(next_state_key, next_valid_moves)

        # Get max Q-value for next state. A terminal state, or one with no
        # moves to evaluate, contributes nothing to the target.
        if done or not next_valid_moves:
            next_max_q = 0
        else:
            next_max_q = max(self.q_table[next_state_key][next_action]
                             for next_action in next_valid_moves)

        # Update Q-value
        current_q = self.q_table[state_key][action]
        new_q = current_q + self.alpha * (reward + self.gamma * next_max_q - current_q)
        self.q_table[state_key][action] = new_q

        # Update training statistics
        if done:
            if reward == 1:
                self.training_stats['wins'] += 1
            elif reward == -1:
                self.training_stats['losses'] += 1
            else:
                self.training_stats['draws'] += 1

    def save_policy(self, filepath: Optional[str] = None) -> str:
        """
        Save the Q-table policy to a pickle file.

        Args:
            filepath: Optional specific path to save the policy
                    If None, generates a timestamp-based filename

        Returns:
            str: Path where the policy was saved
        """
        # Convert the nested defaultdicts to regular dicts for saving; the
        # outer defaultdict uses a lambda factory, which cannot be pickled.
        policy_dict = {
            'q_table': {state: dict(actions) for state, actions in self.q_table.items()},
            'epsilon': self.epsilon,
            'alpha': self.alpha,
            'gamma': self.gamma
        }

        # Generate filename with timestamp if not provided
        if filepath is None:
            # Imported locally under an alias so this works whether the
            # notebook bound `datetime` to the module or to the class.
            import datetime as _datetime
            timestamp = _datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            filepath = f"tictactoe_policy_{timestamp}.pkl"

        # Ensure the directory exists
        os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)

        # Save the policy
        with open(filepath, 'wb') as f:
            pickle.dump(policy_dict, f)

        print(f"Policy saved to: {filepath}")
        return filepath

    @classmethod
    def load_policy(cls, filepath: str) -> 'TicTacToeQLearningAgent':
        """
        Load a Q-table policy from a pickle file.

        Args:
            filepath: Path to the saved policy file

        Returns:
            TicTacToeQLearningAgent: New agent instance with loaded policy

        Raises:
            FileNotFoundError: If policy file doesn't exist
        """
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"Policy file not found: {filepath}")

        # Load the policy
        # SECURITY: pickle.load can execute arbitrary code; only load policy
        # files that come from a trusted source.
        with open(filepath, 'rb') as f:
            policy_dict = pickle.load(f)

        # Create new agent with loaded parameters
        agent = cls(
            epsilon=policy_dict['epsilon'],
            alpha=policy_dict['alpha'],
            gamma=policy_dict['gamma']
        )

        # Convert regular dict back to defaultdict
        agent.q_table = defaultdict(lambda: defaultdict(float),
                                    {k: defaultdict(float, v)
                                     for k, v in policy_dict['q_table'].items()})

        print(f"Policy loaded from: {filepath}")
        return agent

In [41]:
def train_agent(episodes: int = 50000, save_path: Optional[str] = None) -> TicTacToeQLearningAgent:
    """
    Train the Q-learning agent through self-play and optionally save the policy.

    A single agent plays both sides. It always observes the environment's
    raw board (X = 1, O = -1); since X moves first, the board alone
    determines whose turn it is. The same encoding is used for the state
    the agent acts from and for the next state handed to `agent.update`.

    Args:
        episodes: Number of training episodes
        save_path: Optional path to save the trained policy
                  If None, the policy is not saved

    Returns:
        Trained agent
    """
    env = TicTacToeEnv()
    agent = TicTacToeQLearningAgent()

    # Training monitoring
    report_every = 1000
    episode_rewards = []
    win_rates = []

    for episode in range(episodes):
        state = env.reset()
        episode_reward = 0
        done = False

        while not done:
            valid_moves = env.get_valid_moves()
            action = agent.get_action(state, valid_moves)

            next_state, reward, done = env.make_move(action)
            next_valid_moves = env.get_valid_moves()

            # Penalize a loss more heavily than a win is rewarded
            if reward == -1:
                reward = -2

            # NOTE(review): `next_state` is the opponent's turn, yet
            # agent.update adds gamma * max Q(next_state) to the target.
            # A zero-sum formulation would subtract it; that rule lives in
            # the agent and is not changed here.
            agent.update(state, action, next_state, reward, done, next_valid_moves)
            episode_reward += reward

            # Keep the raw board as the state. Negating it here would make
            # the agent act from keys that differ from the `next_state`
            # keys it bootstraps from, so learned values would never be
            # read back.
            state = next_state

        episode_rewards.append(episode_reward)

        # Print progress every `report_every` episodes
        if (episode + 1) % report_every == 0:
            # 'wins' counts games that ended with a winning move by either
            # side, because the same agent plays X and O.
            win_rate = agent.training_stats['wins'] / report_every
            win_rates.append(win_rate)
            print(f"Episode {episode + 1}")
            print(f"Win Rate: {win_rate:.2%}")
            print(f"Average Reward: {np.mean(episode_rewards[-report_every:]):.2f}")
            print(f"Q-table size: {len(agent.q_table)}")
            agent.training_stats = {'wins': 0, 'losses': 0, 'draws': 0}

    # Save the trained policy if requested
    if save_path is not None:
        agent.save_policy(save_path)

    return agent

In [42]:
def play_game(agent: TicTacToeQLearningAgent, human_player: int = -1):
    """
    Play an interactive game against the trained agent.

    The human is prompted on stdin for a row and a column whenever it is
    their turn; the agent chooses every other move. The agent's exploration
    rate is set to 0 for the duration of the game so that it always plays
    its highest-valued move, and is restored afterwards.

    Args:
        agent: Trained Q-learning agent
        human_player: 1 for X, -1 for O. With any other value the human is
            never prompted and the agent plays both sides.
    """
    env = TicTacToeEnv()
    state = env.reset()
    done = False

    print("Game started! You are", "X" if human_player == 1 else "O")
    env.render()

    # Playing with the training epsilon would make a fraction of the
    # agent's moves random; disable exploration for the game only.
    training_epsilon = agent.epsilon
    agent.epsilon = 0.0
    try:
        while not done:
            current_player = env.current_player
            valid_moves = env.get_valid_moves()

            if current_player == human_player:
                print("\nValid moves:", valid_moves)
                while True:
                    try:
                        row = int(input("Enter row (0-2): "))
                        col = int(input("Enter column (0-2): "))
                        if (row, col) in valid_moves:
                            break
                        print("Invalid move, try again.")
                    except ValueError:
                        print("Invalid input, try again.")
                action = (row, col)
            else:
                print("\nAgent's turn...")
                action = agent.get_action(state, valid_moves)

            state, reward, done = env.make_move(action)
            env.render()

            if done:
                # The reward is from the point of view of the player who
                # just moved (`current_player` was read before the move).
                if reward == 1:
                    winner = "X" if current_player == 1 else "O"
                    print(f"\nGame Over! {winner} wins!")
                elif reward == -1:
                    winner = "O" if current_player == 1 else "X"
                    print(f"\nGame Over! {winner} wins!")
                else:
                    print("\nGame Over! It's a draw!")
    finally:
        # Restore the exploration rate even if the game is interrupted.
        agent.epsilon = training_epsilon

In [43]:
def test_environment():
    """
    Smoke-test the TicTacToe environment with a fixed move sequence.

    Note: this definition has the same name as the `test_environment`
    defined in an earlier cell and replaces it once this cell is run.

    For every scripted move the function prints:
    - The move position
    - The board state after the move
    - The reward received
    - The game status
    """
    env = TicTacToeEnv()
    print("Initial state:")
    env.render()

    # Scripted (row, column) positions to play in order
    script = iter([(0, 0), (1, 1), (0, 1), (0, 2), (2, 2)])

    game_over = False
    while not game_over:
        position = next(script, None)
        if position is None:
            # Script exhausted before the game ended
            break
        print(f"\nMove at position {position}:")
        outcome = env.make_move(position)
        reward, game_over = outcome[1], outcome[2]
        env.render()
        print(f"Reward: {reward}, Game ended: {game_over}")

In [None]:
# Entry point: train, persist, reload, then play interactively.
# The names bound here (save_path, trained_agent, loaded_agent) become
# notebook globals.
if __name__ == "__main__":
    # Train the agent and save the policy
    # (save_policy creates the "policies" directory if it does not exist)
    save_path = "policies/best_policy.pkl"
    # train_agent only prints progress every 1000 episodes, so a 500-episode
    # run produces no progress output.
    trained_agent = train_agent(episodes=500, save_path=save_path)

    # Example of loading a saved policy
    loaded_agent = TicTacToeQLearningAgent.load_policy(save_path)

    # Play against the loaded agent
    # (human plays O; this call blocks waiting for keyboard input)
    play_game(loaded_agent, human_player=-1)

Policy saved to: policies/best_policy.pkl
Policy loaded from: policies/best_policy.pkl
Game started! You are O
---------
|   |   |   | 
---------
|   |   |   | 
---------
|   |   |   | 
---------

Agent's turn...
---------
|   |   |   | 
---------
|   |   |   | 
---------
|   |   | X | 
---------

Valid moves: [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2), (2, 0), (2, 1)]
