In [14]:
import numpy as np
import random
import pickle
from collections import defaultdict

# --- I. Tic-Tac-Toe Environment Implementation ---
class TicTacToe:
    """Minimal 3x3 Tic-Tac-Toe environment.

    The board is a 3x3 NumPy array of one-character strings: ``' '`` marks an
    empty cell, any other value is the symbol of the player occupying it.
    """

    def __init__(self):
        self.board = np.full((3, 3), ' ')
        # Every line of three cells that wins the game, as (row, col) triples.
        self.winning_combinations = [
            # Rows
            [(0, 0), (0, 1), (0, 2)], [(1, 0), (1, 1), (1, 2)], [(2, 0), (2, 1), (2, 2)],
            # Columns
            [(0, 0), (1, 0), (2, 0)], [(0, 1), (1, 1), (2, 1)], [(0, 2), (1, 2), (2, 2)],
            # Diagonals
            [(0, 0), (1, 1), (2, 2)], [(0, 2), (1, 1), (2, 0)]
        ]
        # Board filled with 'T', available as a stand-in "state" for a
        # finished game.
        self.termination_state = np.full((3, 3), 'T')

    def reset_board(self):
        """Clear the board and return the hash of the empty position."""
        self.board = np.full((3, 3), ' ')
        return self.get_board_hash()

    def get_board_hash(self, board=None):
        """Return a hashable representation (flat 9-tuple) of a board.

        Args:
            board: Board array to hash; defaults to the current board.
        """
        current_board = self.board if board is None else board
        return tuple(current_board.flatten())

    def get_available_moves(self):
        """Return the (row, col) tuples of all empty cells, in row-major order."""
        return [
            (r, c)
            for r in range(3)
            for c in range(3)
            if self.board[r, c] == ' '
        ]

    def make_move(self, move, player_symbol):
        """Place ``player_symbol`` at ``move`` if that cell is a free board cell.

        Args:
            move: ``(row, col)`` pair.
            player_symbol: Symbol written into the cell.

        Returns:
            True if the symbol was placed; False if the cell is occupied or
            the coordinates fall outside the 3x3 board.
        """
        row, col = move[0], move[1]
        # Explicit range check: NumPy would raise IndexError for 3+ and would
        # silently wrap negative indices onto the opposite edge.
        if not (0 <= row < 3 and 0 <= col < 3):
            return False
        if self.board[row, col] != ' ':
            return False
        self.board[row, col] = player_symbol
        return True

    def check_winner(self):
        """
        Checks for a winner or a draw.
        Returns:
            'X' if X wins
            'O' if O wins
            'draw' if it's a draw
            None if the game is ongoing
        """
        for player_symbol in ['X', 'O']:
            for combo in self.winning_combinations:
                if all(self.board[r, c] == player_symbol for r, c in combo):
                    return player_symbol
        if not self.get_available_moves():  # Board full, nobody completed a line
            return 'draw'
        return None  # Game is ongoing

    def get_reward(self, player_symbol, winner):
        """
        Calculates reward for the player.
        Args:
            player_symbol (str): The symbol of the agent ('X' or 'O').
            winner (str/None): Result from check_winner().
        Returns:
            int: 1 for a win, -1 for a loss, 0 for a draw or an unfinished game.
        """
        if winner == player_symbol:
            return 1
        if winner is None or winner == 'draw':
            return 0
        return -1  # Some other symbol won

    def is_game_over(self):
        """Return True once the game has a winner or is drawn."""
        return self.check_winner() is not None

    def render_board(self):
        """Print the board as an ASCII grid followed by a blank line."""
        print("-------------")
        for row in self.board:
            print(f"| {' | '.join(row)} |")
            print("-------------")
        print()

# --- II. Reinforcement Learning Agent ---
class RLAgent:
    """Tabular Q-learning agent with epsilon-greedy action selection.

    Q-values live in ``q_table[state][action]``, where ``state`` is a board
    hash and ``action`` a ``(row, col)`` tuple; unseen entries default to 0.0.
    """

    def __init__(self, player_symbol, learning_rate=0.1, discount_factor=0.9, exploration_rate=1.0, exploration_decay_rate=0.999, min_exploration_rate=0.01, lr_decay=0.9995):
        self.player_symbol = player_symbol
        self.opponent_symbol = 'O' if player_symbol == 'X' else 'X'
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = exploration_rate
        self.epsilon_decay = exploration_decay_rate
        self.min_epsilon = min_exploration_rate
        self.q_table = defaultdict(lambda: defaultdict(float)) # Q(s,a) -> value. state is board hash, action is (r,c)
        self.lr_decay = lr_decay

    def get_action(self, board_hash, available_moves):
        """Pick a move epsilon-greedily.

        Args:
            board_hash: Hashable state key.
            available_moves: Legal ``(row, col)`` moves; not modified.

        Returns:
            One of ``available_moves``, or None when the list is empty.
        """
        if not available_moves:
            return None

        if random.uniform(0, 1) < self.epsilon:
            return random.choice(available_moves)  # Explore

        # Exploit. Indexing the defaultdicts creates the state entry and a
        # 0.0 entry for every move examined below.
        q_values_for_state = self.q_table[board_hash]
        if not q_values_for_state:  # Nothing learned for this state yet
            return random.choice(available_moves)

        # Shuffle a copy so ties are broken randomly without reordering the
        # caller's list.
        candidates = list(available_moves)
        random.shuffle(candidates)

        max_q = -float('inf')
        best_action = None
        for move in candidates:
            value = q_values_for_state[move]
            if value > max_q:
                max_q = value
                best_action = move
            elif value == -float('inf') and best_action is None:
                # Every value so far is -inf: still keep one candidate.
                best_action = move

        if best_action is None:  # No comparison succeeded (e.g. NaN values)
            return random.choice(candidates)
        return best_action

    def update_q_table(self, state_hash, action, reward, next_state_hash, next_available_moves, is_terminal):
        """Apply one Q-learning update to ``Q(state_hash, action)``.

        The bootstrap term is the largest Q-value already stored for
        ``next_state_hash`` among ``next_available_moves``; it is 0.0 when the
        transition is terminal or no such value is stored.

        Args:
            state_hash: State in which ``action`` was taken.
            action: The move taken; the call is a no-op when it is None.
            reward: Reward observed for the transition.
            next_state_hash: State the agent faces next (ignored if terminal).
            next_available_moves: Legal moves in the next state.
            is_terminal: True when the game ended with this transition.
        """
        if action is None:
            return

        current_q = self.q_table[state_hash][action]

        max_next_q = 0.0
        if not is_terminal and next_available_moves:
            q_values_for_next_state = self.q_table[next_state_hash]
            # default=0.0 covers the case where none of the legal moves has a
            # stored value; a bare max() would raise ValueError there.
            max_next_q = max(
                (
                    q_values_for_next_state[move]
                    for move in next_available_moves
                    if move in q_values_for_next_state
                ),
                default=0.0,
            )

        # Q-learning update rule
        new_q = current_q + self.lr * (reward + self.gamma * max_next_q - current_q)
        self.q_table[state_hash][action] = new_q

    def decay_epsilon(self):
        """Multiply epsilon by its decay rate, never going below ``min_epsilon``."""
        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

    def decay_learning_rate(self):
        """Multiply the learning rate by ``lr_decay``."""
        self.lr = self.lr * self.lr_decay

    def save_q_table(self, filename="q_table.pkl"):
        """Pickle the Q-table to ``filename``."""
        with open(filename, 'wb') as f:
            # The outer defaultdict uses a lambda factory, which pickle cannot
            # serialise, so it is converted to a plain dict first.
            pickle.dump(dict(self.q_table), f)
        print(f"Q-table saved to {filename}")

    def load_q_table(self, filename="q_table.pkl"):
        """Replace the Q-table with the one pickled in ``filename``.

        On any failure a message is printed and ``self.q_table`` is left as it
        was before the call (empty for a freshly constructed agent).

        NOTE: ``pickle.load`` can execute arbitrary code embedded in the file;
        only load Q-table files you created yourself.
        """
        try:
            with open(filename, 'rb') as f:
                loaded_q_table = pickle.load(f)
            # Build the complete table before assigning it, so an error while
            # converting cannot leave the agent with a half-converted table.
            restored = defaultdict(lambda: defaultdict(float))
            for state_key, action_values in loaded_q_table.items():
                restored[state_key] = defaultdict(float, action_values)
            self.q_table = restored
            print(f"Q-table loaded from {filename}")
        except FileNotFoundError:
            print(f"No Q-table found at {filename}, starting with an empty one.")
        except Exception as e:
            print(f"Error loading Q-table: {e}. Starting with an empty one.")


# --- III. Training the Agent ---
def _flush_pending_update(agent, pending_moves, env, winner, next_state_hash,
                          next_available_moves, game_over):
    """Apply and clear the Q-update waiting on ``agent``'s last move, if any."""
    pending = pending_moves[agent.player_symbol]
    if not pending:
        return
    reward = env.get_reward(agent.player_symbol, winner)
    agent.update_q_table(pending['state'], pending['action'], reward,
                         next_state_hash, next_available_moves, game_over)
    pending_moves[agent.player_symbol] = None


def train_agent(agent1, agent2_opponent, env, num_episodes=10000):
    """Train ``agent1`` (and ``agent2_opponent`` if it is an RLAgent) by playing games.

    ``agent1`` always moves first. A learner's move is updated only after the
    other side has replied, so the "next state" it bootstraps from is a
    position in which it is to move again; when the game ends, any update
    still pending is applied as a terminal transition.

    Args:
        agent1: Learning agent; must provide ``get_action`` and
            ``update_q_table``.
        agent2_opponent: Either another RLAgent (self-play, it learns too) or
            any object with a ``player_symbol``, which then plays uniformly
            random moves.
        env: Game environment.
        num_episodes: Number of games to play.
    """
    print(f"Starting training for {num_episodes} episodes...")
    wins_agent1 = 0
    draws = 0
    losses_agent1 = 0 # Agent1 losses / Agent2 wins

    opponent_learns = isinstance(agent2_opponent, RLAgent)
    # Progress is reported ten times per run; clamp to 1 so that runs shorter
    # than ten episodes do not take a modulo by zero.
    report_every = max(1, num_episodes // 10)

    for episode in range(num_episodes):
        current_state_hash = env.reset_board()
        game_over = False
        # Per symbol: the (state, action) whose Q-update is still outstanding.
        last_move_info = {agent1.player_symbol: None, agent2_opponent.player_symbol: None}

        current_player = agent1 # Agent1 starts

        while not game_over:
            available_moves = env.get_available_moves()
            if not available_moves: # Should be caught by game_over, but as a safeguard
                break

            is_agent1_turn = current_player == agent1

            if is_agent1_turn:
                action = agent1.get_action(current_state_hash, available_moves)
                if action is None: # Should not happen while moves are available
                    game_over = True
                    break
                last_move_info[agent1.player_symbol] = {'state': current_state_hash, 'action': action}
                env.make_move(action, agent1.player_symbol)
            elif opponent_learns:
                action = agent2_opponent.get_action(current_state_hash, available_moves)
                if action is None:
                    game_over = True
                    break
                last_move_info[agent2_opponent.player_symbol] = {'state': current_state_hash, 'action': action}
                env.make_move(action, agent2_opponent.player_symbol)
            else: # Random opponent
                action = random.choice(available_moves)
                env.make_move(action, agent2_opponent.player_symbol)

            winner = env.check_winner()
            game_over = (winner is not None)
            next_state_hash = env.get_board_hash()
            next_available_moves = env.get_available_moves()

            # The side that did NOT just move now sees the reply to its own
            # previous move, so its pending update can be applied.
            if is_agent1_turn:
                if opponent_learns:
                    _flush_pending_update(agent2_opponent, last_move_info, env, winner,
                                          next_state_hash, next_available_moves, game_over)
            else:
                _flush_pending_update(agent1, last_move_info, env, winner,
                                      next_state_hash, next_available_moves, game_over)

            current_state_hash = next_state_hash

            if game_over:
                # The side that made the final move still has an update
                # pending; apply it as a terminal transition.
                termination_state_hash = env.get_board_hash(env.termination_state)
                _flush_pending_update(agent1, last_move_info, env, winner,
                                      termination_state_hash, next_available_moves, game_over)
                if opponent_learns:
                    _flush_pending_update(agent2_opponent, last_move_info, env, winner,
                                          termination_state_hash, next_available_moves, game_over)

                if winner == agent1.player_symbol:
                    wins_agent1 += 1
                elif winner == agent2_opponent.player_symbol:
                    losses_agent1 +=1
                elif winner == 'draw':
                    draws += 1
            else:
                # Switch player
                current_player = agent2_opponent if is_agent1_turn else agent1

        if isinstance(agent1, RLAgent): agent1.decay_epsilon()
        if opponent_learns: agent2_opponent.decay_epsilon()
        if isinstance(agent1, RLAgent): agent1.decay_learning_rate()
        if opponent_learns: agent2_opponent.decay_learning_rate()

        if (episode + 1) % report_every == 0:
            print(f"Episode {episode + 1}/{num_episodes} completed. Agent1 Wins: {wins_agent1}, Losses: {losses_agent1}, Draws: {draws}")
            if isinstance(agent1, RLAgent) : print(f"  Agent1 Epsilon: {agent1.epsilon:.4f}")
            if opponent_learns : print(f"  Agent2 Epsilon: {agent2_opponent.epsilon:.4f}")

    def percent(count):
        # Guard the summary against an empty run (num_episodes == 0).
        return (count / num_episodes) * 100 if num_episodes else 0.0

    print("\nTraining finished.")
    print(f"Agent1 ('{agent1.player_symbol}') Wins: {wins_agent1} ({percent(wins_agent1):.1f}%)")
    print(f"Opponent ('{agent2_opponent.player_symbol}') Wins: {losses_agent1} ({percent(losses_agent1):.1f}%)")
    print(f"Draws: {draws} ({percent(draws):.1f}%)")
    if isinstance(agent1, RLAgent): print(f"Final Agent1 Epsilon: {agent1.epsilon:.4f}")
    if opponent_learns: print(f"Final Agent2 Epsilon: {agent2_opponent.epsilon:.4f}")
    if isinstance(agent1, RLAgent): print(f"Agent1 Q-table size: {len(agent1.q_table)}")
    if opponent_learns: print(f"Agent2 Q-table size: {len(agent2_opponent.q_table)}")


# Simple Random Player for training or comparison
class RandomPlayer:
    """Baseline opponent that picks uniformly among the legal moves."""

    def __init__(self, player_symbol):
        self.player_symbol = player_symbol

    def get_action(self, board_hash, available_moves):
        """Return a random legal move, or None when there is none.

        ``board_hash`` is accepted only so the signature matches the learning
        agent's ``get_action``; it is not used.
        """
        return random.choice(available_moves) if available_moves else None

# --- IV. Qualitative Performance Check: Play Against the Trained Agent ---
def play_against_agent(agent, env):
    """Play one interactive console game between a human and ``agent``.

    The human takes whichever of 'X'/'O' the agent does not use, and 'X'
    always moves first. The agent plays greedily (epsilon forced to 0) for the
    duration of the game; its previous epsilon is restored afterwards.

    Args:
        agent: Agent exposing ``player_symbol`` and ``get_action``.
        env: Game environment; it is reset at the start.
    """
    # Derive the human's symbol from the agent's so the turn switch below
    # alternates correctly for an 'O' agent as well.
    human_player = 'O' if agent.player_symbol == 'X' else 'X'
    print(f"\nStarting game: Human ({human_player}) vs Trained Agent ({agent.player_symbol})")

    previous_epsilon = getattr(agent, 'epsilon', None)
    agent.epsilon = 0 # Ensure agent plays greedily
    current_player_symbol = 'X' # 'X' opens the game

    try:
        env.reset_board()
        env.render_board()

        while not env.is_game_over():
            available_moves = env.get_available_moves()
            if not available_moves:
                break

            if current_player_symbol == agent.player_symbol:
                print(f"Agent '{agent.player_symbol}' is thinking...")
                move = agent.get_action(env.get_board_hash(), available_moves)
                if move is None: # Should not happen if there are available moves
                    print("Agent could not find a move.")
                    break
                print(f"Agent '{agent.player_symbol}' plays at {move}")
            else: # Human's turn
                move = None
                while move is None:
                    print(f"Available moves: {available_moves}")
                    try:
                        row, col = map(int, input(f"Your turn ({human_player}). Enter row,col (0-2): ").split(','))
                    except ValueError:
                        # Only malformed text is retried; EOFError from a
                        # closed stdin propagates instead of looping forever.
                        print("Invalid input. Please enter row,col (e.g., 1,1).")
                        continue
                    if (row, col) in available_moves:
                        move = (row, col)
                    else:
                        print("Invalid move. Cell occupied or out of bounds.")

            env.make_move(move, current_player_symbol)
            env.render_board()
            winner = env.check_winner()

            if winner:
                if winner == 'draw':
                    print("It's a DRAW!")
                else:
                    print(f"Player '{winner}' WINS!")
                break

            current_player_symbol = human_player if current_player_symbol == agent.player_symbol else agent.player_symbol

        if not env.is_game_over(): # If loop broke for other reasons
            print("Game ended unexpectedly.")
    finally:
        # Hand the agent back with the exploration rate it came in with.
        if previous_epsilon is not None:
            agent.epsilon = previous_epsilon

# --- V. Quantitative Performance Measure: Two Agents Play Each Other ---
def evaluate_agents(agent1, agent2, env, num_games=1000):
    """Play ``num_games`` games between two agents and tally the results.

    RLAgent instances play greedily (epsilon set to 0) during the evaluation;
    their original epsilon is restored afterwards, even if a game raises.
    ``agent1`` makes the first move in every game.

    Args:
        agent1: First agent; needs ``player_symbol`` and ``get_action``.
        agent2: Second agent; same requirements.
        env: Game environment; reset before every game.
        num_games: Number of games to play.

    Returns:
        dict: win counts keyed by each agent's symbol, plus a ``'draw'`` count.
    """
    print(f"\nEvaluating Agent '{agent1.player_symbol}' vs Agent '{agent2.player_symbol}' for {num_games} games...")
    # Set agents to evaluation mode (no exploration)
    original_eps1 = getattr(agent1, 'epsilon', None)
    original_eps2 = getattr(agent2, 'epsilon', None)
    if isinstance(agent1, RLAgent): agent1.epsilon = 0
    if isinstance(agent2, RLAgent): agent2.epsilon = 0

    scores = {agent1.player_symbol: 0, agent2.player_symbol: 0, 'draw': 0}

    # agent1 opens every game; alternating the starting side is not enabled.
    starts_player1 = True

    try:
        for game_num in range(num_games):
            env.reset_board()
            game_over = False

            if starts_player1:
                current_player, other_player = agent1, agent2
            else:
                current_player, other_player = agent2, agent1

            while not game_over:
                available_moves = env.get_available_moves()
                if not available_moves:
                    break # Should be handled by check_winner draw condition

                move = current_player.get_action(env.get_board_hash(), available_moves)

                if not move: # Should not happen while moves are available
                    print(f"Warning: Agent {current_player.player_symbol} could not make a move in evaluation game {game_num+1}.")
                    scores['draw'] +=1 # A failed move is counted as a draw
                    game_over = True
                    break
                env.make_move(move, current_player.player_symbol)

                winner = env.check_winner()
                if winner:
                    if winner == agent1.player_symbol:
                        scores[agent1.player_symbol] += 1
                    elif winner == agent2.player_symbol:
                        scores[agent2.player_symbol] += 1
                    else: # draw
                        scores['draw'] += 1
                    game_over = True

                # Switch players
                current_player, other_player = other_player, current_player
    finally:
        # Restore epsilon if they were RLAgents
        if isinstance(agent1, RLAgent) and original_eps1 is not None: agent1.epsilon = original_eps1
        if isinstance(agent2, RLAgent) and original_eps2 is not None: agent2.epsilon = original_eps2

    def percent(count):
        # Guard the summary against num_games == 0.
        return (count / num_games) * 100 if num_games else 0.0

    print("\nEvaluation Results:")
    print(f"Agent '{agent1.player_symbol}' Wins: {scores[agent1.player_symbol]} ({percent(scores[agent1.player_symbol]):.1f}%)")
    print(f"Agent '{agent2.player_symbol}' Wins: {scores[agent2.player_symbol]} ({percent(scores[agent2.player_symbol]):.1f}%)")
    print(f"Draws: {scores['draw']} ({percent(scores['draw']):.1f}%)")
    return scores


# --- Example Usage ---
if __name__ == "__main__":
    # Initialize Environment
    game_env = TicTacToe()

    # Hyperparameters shared by both learning agents. lr_decay=1 keeps the
    # learning rate constant.
    # NOTE(review): with a per-episode decay of 0.99999, epsilon only falls to
    # about 0.5 after 70000 episodes (0.99999 ** 70000 ~= 0.497), so the
    # agents still explore roughly half the time when training ends.
    agent_settings = dict(
        learning_rate=0.1,
        discount_factor=1.0,
        exploration_rate=1.0,
        exploration_decay_rate=0.99999,
        min_exploration_rate=0.01,
        lr_decay=1,
    )

    # Agent 1 is an RL Agent playing 'X'
    rl_agent_X = RLAgent(player_symbol='X', **agent_settings)

    # Option 1 (disabled): train against a Random Player. The banner is kept
    # with the call so it is only printed when that run actually happens.
    random_opponent_O = RandomPlayer(player_symbol='O')
    # print("--- Training RL Agent (X) vs Random Player (O) ---")
    # train_agent(rl_agent_X, random_opponent_O, game_env, num_episodes=50000) # More episodes for better learning
    # rl_agent_X.save_q_table("agent_X_vs_random.pkl")

    # Option 2: Train two RL agents against each other (self-play)
    rl_agent_O = RLAgent(player_symbol='O', **agent_settings)
    print("\n--- Training RL Agent (X) vs RL Agent (O) (Self-Play) ---")
    train_agent(rl_agent_X, rl_agent_O, game_env, num_episodes=70000) # Self-play often requires more episodes
    rl_agent_X.save_q_table("agent_X_selfplay.pkl")
    rl_agent_O.save_q_table("agent_O_selfplay.pkl")

    # --- Qualitative Check: Play against the trained agent X ---
    # Load the agent you want to play against
    # rl_agent_X.load_q_table("agent_X_vs_random.pkl") # if trained against random
    rl_agent_X.load_q_table("agent_X_selfplay.pkl") # if trained via self-play
    # play_against_agent(rl_agent_X, game_env)

    # --- Quantitative Evaluation ---
    # Example 1: Trained Agent X vs Random Agent O
    print("\n--- Evaluating Trained RL Agent (X) vs Random Player (O) ---")
    random_eval_opponent = RandomPlayer(player_symbol='O')
    evaluate_agents(rl_agent_X, random_eval_opponent, game_env, num_games=1000)

    # Example 2: Trained Agent X vs Trained Agent O (from self-play)
    print("\n--- Evaluating Trained RL Agent (X) vs Trained RL Agent (O) ---")
    evaluate_agents(rl_agent_X, rl_agent_O, game_env, num_games=1000)

    # Example 3: Trained Agent X vs a newly initialized (untrained) RL Agent O
    print("\n--- Evaluating Trained RL Agent (X) vs Untrained RL Agent (O) ---")
    untrained_agent_O = RLAgent(player_symbol='O') # Fresh, untrained agent
    evaluate_agents(rl_agent_X, untrained_agent_O, game_env, num_games=1000)

--- Training RL Agent (X) vs Random Player (O) ---

--- Training RL Agent (X) vs RL Agent (O) (Self-Play) ---
Starting training for 70000 episodes...
Episode 7000/70000 completed. Agent1 Wins: 4144, Losses: 2004, Draws: 852
  Agent1 Epsilon: 0.9324
  Agent2 Epsilon: 0.9324
Episode 14000/70000 completed. Agent1 Wins: 8283, Losses: 4042, Draws: 1675
  Agent1 Epsilon: 0.8694
  Agent2 Epsilon: 0.8694
Episode 21000/70000 completed. Agent1 Wins: 12327, Losses: 6126, Draws: 2547
  Agent1 Epsilon: 0.8106
  Agent2 Epsilon: 0.8106
Episode 28000/70000 completed. Agent1 Wins: 16360, Losses: 8179, Draws: 3461
  Agent1 Epsilon: 0.7558
  Agent2 Epsilon: 0.7558
Episode 35000/70000 completed. Agent1 Wins: 20368, Losses: 10179, Draws: 4453
  Agent1 Epsilon: 0.7047
  Agent2 Epsilon: 0.7047
Episode 42000/70000 completed. Agent1 Wins: 24356, Losses: 12192, Draws: 5452
  Agent1 Epsilon: 0.6570
  Agent2 Epsilon: 0.6570
Episode 49000/70000 completed. Agent1 Wins: 28224, Losses: 14191, Draws: 6585
  Agent1 Eps

In [12]:
# Interactive console game against the trained 'X' agent; waits for keyboard input.
play_against_agent(rl_agent_X, game_env) # Comment this line out to skip the interactive game


Starting game: Human (O) vs Trained Agent (X)
-------------
|   |   |   |
-------------
|   |   |   |
-------------
|   |   |   |
-------------

Agent 'X' is thinking...
Agent 'X' plays at (1, 1)
-------------
|   |   |   |
-------------
|   | X |   |
-------------
|   |   |   |
-------------

Available moves: [(0, 0), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1), (2, 2)]
-------------
| O |   |   |
-------------
|   | X |   |
-------------
|   |   |   |
-------------

Agent 'X' is thinking...
Agent 'X' plays at (0, 1)
-------------
| O | X |   |
-------------
|   | X |   |
-------------
|   |   |   |
-------------

Available moves: [(0, 2), (1, 0), (1, 2), (2, 0), (2, 1), (2, 2)]
Invalid move. Cell occupied or out of bounds.
Available moves: [(0, 2), (1, 0), (1, 2), (2, 0), (2, 1), (2, 2)]
Invalid move. Cell occupied or out of bounds.
Available moves: [(0, 2), (1, 0), (1, 2), (2, 0), (2, 1), (2, 2)]
-------------
| O | X |   |
-------------
| O | X |   |
-------------
|   |   |   |

In [5]:
# Dump every state stored in agent X's Q-table together with its per-action values.
for state_key, action_values in rl_agent_X.q_table.items():
    print(state_key)
    for action, value in action_values.items():
        print(f"  Action {action}: Q-value {value}")

(' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ')
  Action (0, 2): Q-value 0.02766648944879229
  Action (2, 1): Q-value 0.08629022300923389
  Action (1, 2): Q-value 0.07575579682711184
  Action (1, 0): Q-value 0.08803184469369554
  Action (0, 0): Q-value 0.06939957127652835
  Action (2, 2): Q-value 0.042297328846227035
  Action (0, 1): Q-value 0.07322895400024461
  Action (2, 0): Q-value 0.07383708101840414
  Action (1, 1): Q-value 0.18977449703715726
('O', ' ', 'X', ' ', ' ', ' ', ' ', ' ', ' ')
  Action (2, 2): Q-value 0.0
  Action (0, 1): Q-value 0.0017826678099500573
  Action (1, 0): Q-value 0.00275199228545309
  Action (2, 0): Q-value 0.0009000000000000002
  Action (2, 1): Q-value 0.010667682990000003
  Action (1, 1): Q-value 0.20629087746523744
  Action (1, 2): Q-value -0.0006390000000000007
('O', ' ', 'X', ' ', ' ', ' ', 'O', ' ', 'X')
  Action (1, 1): Q-value 0.0
  Action (0, 1): Q-value 0.0
  Action (1, 2): Q-value 0.19
  Action (1, 0): Q-value 0.0
  Action (2, 1): Q-value 0.0
('O

In [None]:
import numpy as np
import random
import pickle
from collections import defaultdict
import matplotlib.pyplot as plt
from matplotlib.patches import Circle, Rectangle # Not used in current text drawing, but good for future
import ipywidgets as widgets
from IPython.display import display, clear_output
import asyncio # <-- Import asyncio



# --- IV. Interactive UI for Playing Against Agent (Corrected) ---
class TicTacToeGUI:
    """Interactive ipywidgets board for playing Tic-Tac-Toe against an agent.

    The human clicks one of nine buttons; the agent answers with the move
    returned by ``agent.get_action(board_hash, available_moves)``. 'X' always
    moves first, so the agent opens the game when the human plays 'O'.

    Args:
        agent: Object exposing a writable ``epsilon`` attribute and a
            ``get_action(board_hash, available_moves)`` method. Its
            ``epsilon`` is set to 0 here (a side effect on the caller's
            object) so that it plays greedily.
        human_player_symbol: 'X' or 'O'; the agent gets the other symbol.

    Raises:
        ValueError: If ``human_player_symbol`` is neither 'X' nor 'O'.
    """

    def __init__(self, agent, human_player_symbol='O'):
        # Any other symbol would never be recognised by the environment's
        # winner check, producing a game nobody can win.
        if human_player_symbol not in ('X', 'O'):
            raise ValueError(
                f"human_player_symbol must be 'X' or 'O', got {human_player_symbol!r}"
            )

        self.env = TicTacToe()
        self.agent = agent
        self.agent.epsilon = 0  # Agent plays greedily
        self.human_player_symbol = human_player_symbol
        self.agent_player_symbol = 'X' if human_player_symbol == 'O' else 'O'

        # Event loop used to defer the agent's move until the current widget
        # callback has returned.
        self.loop = asyncio.get_event_loop()

        self.buttons = [
            [
                widgets.Button(
                    description=' ',
                    layout=widgets.Layout(width='60px', height='60px', margin='2px'),
                )
                for _ in range(3)
            ]
            for _ in range(3)
        ]
        self.status_label = widgets.Label(value="")  # Filled in by setup_new_game
        self.reset_button = widgets.Button(
            description="Reset Game", layout=widgets.Layout(margin='10px 0 0 0')
        )
        self.output_area = widgets.Output()

        for row in self.buttons:
            for button in row:
                button.on_click(self.on_button_clicked)
                button.style.font_weight = 'bold'
                button.style.font_size = '20px'
                button.style.button_color = 'lightgray'

        self.reset_button.on_click(self.reset_game_clicked)

        # Determine initial player and set up the board.
        self.setup_new_game()

    def setup_new_game(self):
        """Clear the board and widgets, then hand the first turn to 'X'."""
        self.env.reset_board()
        self.draw_board_on_output()
        for row in self.buttons:
            for button in row:
                button.description = ' '
                button.disabled = False
                button.style.button_color = 'lightgray'

        if self.agent_player_symbol == 'X':  # Agent 'X' starts
            self.current_player_symbol = self.agent_player_symbol
            self.status_label.value = f"Agent ({self.agent_player_symbol}) is thinking..."
            self.disable_all_buttons()
            self._schedule_agent_turn()
        else:  # Human is 'X' and starts
            self.current_player_symbol = self.human_player_symbol
            self.status_label.value = f"Your turn ({self.human_player_symbol})."
            self.enable_available_buttons()

    def _schedule_agent_turn(self):
        """Run ``agent_turn`` via the event loop, or directly if it is idle."""
        if self.loop.is_running():
            self.loop.call_soon(self.agent_turn)
        else:
            # call_soon on a loop that is not running would never fire and
            # the game would hang on "Agent is thinking...".
            self.agent_turn()

    def _locate_button(self, button):
        """Return the (row, col) of ``button`` in the grid, or None."""
        for r, row in enumerate(self.buttons):
            for c, candidate in enumerate(row):
                if candidate is button:
                    return (r, c)
        return None

    def _mark_button(self, move, symbol, color):
        """Show ``symbol`` on the button at ``move`` and lock that button."""
        button = self.buttons[move[0]][move[1]]
        button.description = symbol
        button.disabled = True
        button.style.button_color = color

    def draw_board_on_output(self):
        """Redraw the board as a matplotlib figure inside the output widget."""
        mark_colors = {'X': 'dodgerblue', 'O': 'orangered'}
        with self.output_area:
            clear_output(wait=True)
            fig, ax = plt.subplots(figsize=(3.5, 3.5))
            ax.set_xlim(0, 3)
            ax.set_ylim(0, 3)
            # Grid lines
            for i in range(4):
                ax.plot([i, i], [0, 3], color='black', lw=2)  # Vertical
                ax.plot([0, 3], [i, i], color='black', lw=2)  # Horizontal

            ax.tick_params(axis='both', which='both', bottom=False, top=False,
                           left=False, right=False,
                           labelbottom=False, labelleft=False)
            ax.set_aspect('equal')  # Ensure squares are square

            for r in range(3):
                for c in range(3):
                    symbol = str(self.env.board[r, c])
                    if symbol in mark_colors:
                        # Row 0 is drawn at the top, hence the flipped y.
                        ax.text(c + 0.5, 2.5 - r, symbol, ha='center', va='center',
                                fontsize=40, color=mark_colors[symbol])
            plt.show()
            # pyplot keeps every figure alive until it is closed; release it
            # so repeated redraws do not accumulate open figures.
            # NOTE(review): assumes a static (inline-style) backend - confirm
            # before using an interactive widget backend.
            plt.close(fig)

    def on_button_clicked(self, b):
        """Handle a click on a board button as the human's move.

        Clicks are ignored when the game is over or it is the agent's turn.
        """
        if self.env.is_game_over() or self.current_player_symbol == self.agent_player_symbol:
            return

        move = self._locate_button(b)
        if move is None or self.env.board[move[0], move[1]] != ' ':
            self.status_label.value = "Invalid move. Cell occupied or error."
            return

        self.env.make_move(move, self.human_player_symbol)
        self._mark_button(
            move,
            self.human_player_symbol,
            'lightgreen' if self.human_player_symbol == 'O' else 'lightblue',
        )

        self.draw_board_on_output()
        winner = self.env.check_winner()
        if winner:
            self.end_game(winner)
            return

        self.current_player_symbol = self.agent_player_symbol
        self.status_label.value = f"Agent ({self.agent_player_symbol}) is thinking..."
        self.disable_all_buttons()
        self._schedule_agent_turn()

    def agent_turn(self):
        """Let the agent pick and play its move, then return control."""
        # A callback scheduled before a reset (or scheduled twice) may fire
        # when it is no longer the agent's turn; it must not play a move.
        if self.current_player_symbol != self.agent_player_symbol:
            return

        if self.env.is_game_over():
            # Decide from the board itself rather than from the status text.
            finished_with = self.env.check_winner()
            if finished_with:
                self.end_game(finished_with)
            else:  # Should not occur if the game is over
                self.status_label.value = "Game Over (Agent Turn Check)"
                self.disable_all_buttons()
            return

        board_hash = self.env.get_board_hash()
        available_moves = self.env.get_available_moves()
        agent_move = self.agent.get_action(board_hash, available_moves)

        if not agent_move:
            # Agent produced no move although the game is not over: report
            # the real outcome if there is one, otherwise declare a draw.
            self.end_game(self.env.check_winner() or 'draw')
            return

        self.env.make_move(agent_move, self.agent_player_symbol)
        self._mark_button(
            agent_move,
            self.agent_player_symbol,
            'lightcoral' if self.agent_player_symbol == 'O' else 'lightskyblue',
        )

        self.draw_board_on_output()
        winner = self.env.check_winner()
        if winner:
            self.end_game(winner)
        else:
            self.current_player_symbol = self.human_player_symbol
            self.status_label.value = f"Your turn ({self.human_player_symbol})."
            self.enable_available_buttons()

    def end_game(self, winner):
        """Lock the board and show the result.

        Args:
            winner: 'X', 'O' or 'draw'; anything else shows "Game Over.".
        """
        self.disable_all_buttons()
        if winner == 'draw':
            self.status_label.value = "It's a DRAW!"
        elif winner == self.human_player_symbol:
            self.status_label.value = f"Congratulations! You ({winner}) WIN!"
        elif winner == self.agent_player_symbol:
            self.status_label.value = f"Agent ({winner}) WINS! Better luck next time."
        else:  # Not expected for 'X', 'O' or 'draw'
            self.status_label.value = "Game Over."

    def disable_all_buttons(self):
        """Disable every board button."""
        for row in self.buttons:
            for button in row:
                button.disabled = True

    def enable_available_buttons(self):
        """Enable the buttons of empty cells; disable all when game is over."""
        if self.env.is_game_over():
            self.disable_all_buttons()
            return
        for r in range(3):
            for c in range(3):
                # bool(): the comparison yields a NumPy boolean, while the
                # widget trait expects a plain Python bool.
                self.buttons[r][c].disabled = bool(self.env.board[r, c] != ' ')

    def reset_game_clicked(self, b=None):
        """Reset-button callback: start a fresh game."""
        self.setup_new_game()

    def display_game(self):
        """Render status label, button grid, board drawing and reset button."""
        grid_buttons = [button for row in self.buttons for button in row]
        grid = widgets.GridBox(
            grid_buttons,
            layout=widgets.Layout(grid_template_columns="repeat(3, 70px)"),
        )
        display(widgets.VBox([self.status_label, grid, self.output_area, self.reset_button]))


# Launch the interactive game: the human plays 'O', so the agent plays 'X'
# and makes the opening move.
print("\n--- Starting Interactive Game: Human (O) vs Agent (X) ---")
# rl_agent_X is expected to be the already-trained agent from the earlier
# cells; note that TicTacToeGUI sets its epsilon to 0.
interactive_game = TicTacToeGUI(agent=rl_agent_X, human_player_symbol='O')
interactive_game.display_game()

# Example: to play as 'X' against an agent playing 'O' instead
# (assumes a Q-table for 'O' was saved as "agent_O_selfplay.pkl"):
# agent_o_for_play = RLAgent(player_symbol='O')
# try:
#    agent_o_for_play.load_q_table("agent_O_selfplay.pkl") # Assuming you trained an agent 'O'
#    print("Successfully loaded 'agent_O_selfplay.pkl'.")
# except FileNotFoundError:
#    print("Could not load 'agent_O_selfplay.pkl'. Agent O will be untrained.")
#
# print("\n--- Starting Interactive Game: Human (X) vs Agent (O) ---")
# interactive_game_human_X = TicTacToeGUI(agent=agent_o_for_play, human_player_symbol='X')
# interactive_game_human_X.display_game()


--- Starting Interactive Game: Human (O) vs Agent (X) ---


VBox(children=(Label(value='Agent (X) is thinking...'), GridBox(children=(Button(description=' ', disabled=Tru…

Findings

1. Problem: TD(0) with Q-learning fails to learn after 7e5 iterations.
Why?:
hypothesis 1: One-step TD learning (TD(0)) is very slow because value updates focus on the end of the trajectory. As a result, the beginning of the trajectory receives meaningful value estimates only after a long time.
-> This might be true. But the real reason for the training failure is a critical bug in the current version of the code.


# Excerpt of the training loop (quoted as-is, including its bug). Each
# iteration plays ONE move by whichever agent is to move and then immediately
# runs the Q-table updates. Switching current_player and refreshing
# current_state_hash are not shown in this excerpt.
while not game_over:
            available_moves = env.get_available_moves()
            if not available_moves: # Should be caught by game_over, but as a safeguard
                break

            is_agent1_turn = current_player == agent1

            if is_agent1_turn:
                action = agent1.get_action(current_state_hash, available_moves)
                if action:
                    # Store state and action for agent1 before making the move
                    last_move_info[agent1.player_symbol] = {'state': current_state_hash, 'action': action}
                    env.make_move(action, agent1.player_symbol)
                else: # No valid action (should not happen with available_moves check)
                    game_over = True # End game if agent can't move
                    break
            else: # Opponent's turn (could be another RLAgent or a random one)
                if isinstance(agent2_opponent, RLAgent):
                    action = agent2_opponent.get_action(current_state_hash, available_moves)
                    if action:
                        # Store state and action for agent2 if it's an RL agent
                        last_move_info[agent2_opponent.player_symbol] = {'state': current_state_hash, 'action': action}
                        env.make_move(action, agent2_opponent.player_symbol)
                    else:
                        game_over = True
                        break
                else: # Random opponent
                    action = random.choice(available_moves)
                    env.make_move(action, agent2_opponent.player_symbol) # Opponent symbol

            # Snapshot taken right after the move above: next_state_hash is the
            # position the OTHER player now faces, not the mover's next decision.
            winner = env.check_winner()
            game_over = (winner is not None)
            next_state_hash = env.get_board_hash()
            next_available_moves = env.get_available_moves()

            # --- THIS IS WHERE THE TRAINING ALGORITHM IS APPLIED ---
            # BUG: within this excerpt an agent's pending move is updated (and
            # cleared) in the same iteration in which it was played. The update
            # therefore never sees the opponent's reply: the "next state" is the
            # opponent's position, and a game that ends on the opponent's move
            # produces no update for this agent's last move.
            # Update Q-table for agent1 based on its last move
            if last_move_info[agent1.player_symbol]:
                s, a = last_move_info[agent1.player_symbol]['state'], last_move_info[agent1.player_symbol]['action']
                reward_agent1 = env.get_reward(agent1.player_symbol, winner)
                # presumably bootstraps from the best Q-value over
                # next_available_moves in next_state_hash - verify in update_q_table
                agent1.update_q_table(s, a, reward_agent1, next_state_hash, next_available_moves, game_over)
                last_move_info[agent1.player_symbol] = None # Clear after update

            # Update Q-table for agent2 if it's an RLAgent and made a move
            if isinstance(agent2_opponent, RLAgent) and last_move_info[agent2_opponent.player_symbol]:
                s, a = last_move_info[agent2_opponent.player_symbol]['state'], last_move_info[agent2_opponent.player_symbol]['action']
                reward_agent2 = env.get_reward(agent2_opponent.player_symbol, winner)
                # Note: The reward for agent2 is based on the same game outcome
                agent2_opponent.update_q_table(s, a, reward_agent2, next_state_hash, next_available_moves, game_over)
                last_move_info[agent2_opponent.player_symbol] = None


        => Here, when it is agent 1's turn, next_state_hash is the board immediately after agent 1's own action. Do you see the problem? The next state handed to agent 1, the one used to pick the maximum-value action agent 1 can take, should be the state after agent 2's reply has been played. As written, the Q-learning target is always 0 unless the move comes right before a terminal state (where the reward is non-zero), because the update bootstraps from the value estimate of a state in which agent 1 never acts, i.e. a state that does not exist from agent 1's point of view.