<a href="https://colab.research.google.com/github/jeniferGoncalvesDaSilvaDev/algo_min_max_tic_tac_toe/blob/main/C%C3%B3pia_de_C%C3%B3pia_de_minmax_algo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install collections

[31mERROR: Could not find a version that satisfies the requirement collections (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for collections[0m[31m
[0m

In [None]:
from collections import defaultdict
import random
import math

# ----------------------------- Game utilities --------------------------------

# Board indices: 0 1

#                2 3

WIN_LINES = [(0,1), (2,3), (0,2), (1,3), (0,3), (1,2)]  # all 2-in-line possibilities

EMPTY = 0
MAX = 1   # X
MIN = -1  # O

def is_terminal(board):
    """Determines if the given board state is terminal and returns the reward.

    Args:
        board (tuple): A tuple representing the current state of the game board.

    Returns:
        tuple: A tuple containing:
            - terminal (bool): True if the board is a terminal state (win, loss, or draw), False otherwise.
            - reward (int): +1 if MAX wins, -1 if MIN wins, 0 for a draw or non-terminal state.
    """
    # check wins
    for (i,j) in WIN_LINES:
        if board[i] == board[j] != EMPTY:
            return True, (1 if board[i] == MAX else -1)
    # draw (all filled)
    if all(cell != EMPTY for cell in board):
        return True, 0
    return False, 0

def legal_actions(board):
    """Returns a list of legal actions (empty cell indices) for the current board state.

    Args:
        board (tuple): A tuple representing the current state of the game board.

    Returns:
        list: A list of integers, where each integer is the index of an empty cell.
    """
    return [i for i,v in enumerate(board) if v == EMPTY]

def apply_action(board, action, player):
    """Applies a given action to the board for a specified player.

    Args:
        board (tuple): The current state of the game board.
        action (int): The index of the cell where the player wants to make a move.
        player (int): The player making the move (MAX=1 or MIN=-1).

    Returns:
        tuple: A new tuple representing the board state after the action has been applied.
    """
    new = list(board)
    new[action] = player
    return tuple(new)

# -------------------------- Truth-table mapping --------------------------------

def state_propositions(board):
    """Returns a dictionary of boolean propositions for a given board state.

    Example propositions: cell_i_is_X, cell_i_is_O for i in 0..3,
    X_two_in_line_threat, O_two_in_line_threat, any_center_empty.

    Args:
        board (tuple): A tuple representing the current state of the game board.

    Returns:
        dict: A dictionary where keys are proposition names (strings) and values are booleans.
    """
    props = {}
    for i in range(4):
        props[f'cell_{i}is_X'] = (board[i] == MAX)
        props[f'cell{i}_is_O'] = (board[i] == MIN)

    def has_threat(player):
        """Helper function to check for two-in-line threats."""
        for (a,b) in WIN_LINES:
            # X_has_threat: True if the player has one piece on a winning line and the other cell is empty.
            if (board[a] == player and board[b] == EMPTY) or (board[b] == player and board[a] == EMPTY):
                return True
        return False

    # X_has_threat: Proposition indicating if MAX (X) has a potential winning line with one piece and one empty cell.
    props['X_has_threat'] = has_threat(MAX)
    # O_has_threat: Proposition indicating if MIN (O) has a potential winning line with one piece and one empty cell.
    props['O_has_threat'] = has_threat(MIN)

    # A simple 'center' concept for the tiny board: cells 1 and 2 are considered 'center-ish'
    # center_any_empty: Proposition indicating if any of the 'center' cells (1 or 2) are empty.
    props['center_any_empty'] = (board[1] == EMPTY or board[2] == EMPTY)

    return props

def propositions_to_key(props):
    """Creates a deterministic string key from a dictionary of propositions.
    This key represents a unique 'truth-table row' for the state.

    Args:
        props (dict): A dictionary of boolean propositions.

    Returns:
        str: A string formed by concatenating '1' for True and '0' for False
             values of propositions, sorted by key name for canonical order.
    """
    keys = sorted(props.keys())
    bits = ['1' if props[k] else '0' for k in keys]
    return ''.join(bits)

# ---------------------------- Minimax search ----------------------------------

def minimax_value(board, player):
    """Calculates the minimax value for a given board state for the specified player.

    This function recursively explores the game tree to determine the optimal
    outcome for the 'player' assuming both players play optimally.

    Args:
        board (tuple): The current state of the game board.
        player (int): The current player whose turn it is (MAX=1 or MIN=-1).

    Returns:
        int: The optimal value of the state from the current player's perspective
             (+1 for win, -1 for loss, 0 for draw).
    """
    term, reward = is_terminal(board)
    if term:
        return reward

    if player == MAX:
        best = -math.inf
        for a in legal_actions(board):
            val = minimax_value(apply_action(board,a,MAX), MIN)
            if val > best:
                best = val
        return best
    else:
        best = math.inf
        for a in legal_actions(board):
            val = minimax_value(apply_action(board,a,MIN), MAX)
            if val < best:
                best = val
        return best

def minimax_policy(board, player):
    """Determines the optimal actions for a player using the minimax algorithm.

    This function finds all legal moves that lead to the best possible outcome
    for the current player, assuming optimal play from both sides.

    Args:
        board (tuple): The current state of the game board.
        player (int): The player for whom to determine the optimal actions (MAX=1 or MIN=-1).

    Returns:
        list: A list of integers, where each integer is an index representing an optimal action.
              Returns an empty list if the board is a terminal state.
    """
    term, _ = is_terminal(board)
    if term:
        return []
    best_actions = []
    best_val = -math.inf if player == MAX else math.inf
    for a in legal_actions(board):
        val = minimax_value(apply_action(board,a,player), -player)
        if player == MAX:
            if val > best_val:
                best_val = val
                best_actions = [a]
            elif val == best_val:
                best_actions.append(a)
        else:
            if val < best_val:
                best_val = val
                best_actions = [a]
            elif val == best_val:
                best_actions.append(a)
    return best_actions

# -------------------------- Tabular Q-learning ---------------------------------

class QLearner:
    """Implements a Q-learning agent for learning optimal policies in a game.

    The Q-learner uses a tabular approach to store Q-values for state-action pairs
    and updates them based on rewards received and future expected values.
    """
    def __init__(self, alpha=0.5, gamma=0.99, epsilon=0.2):
        """Initializes the QLearner with learning parameters.

        Args:
            alpha (float): The learning rate, controlling how much new information overrides old information.
            gamma (float): The discount factor, determining the importance of future rewards.
            epsilon (float): The exploration-exploitation trade-off parameter.
                             (Probability of choosing a random action).
        """
        # Q[state_key][action] = value
        self.Q = defaultdict(lambda: defaultdict(float))
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon

    def get_Q(self, state_key, action):
        """Retrieves the Q-value for a given state-action pair.

        Args:
            state_key (str): The unique string representation of the state.
            action (int): The action taken from that state.

        Returns:
            float: The Q-value associated with the state-action pair.
        """
        return self.Q[state_key][action]

    def choose_action(self, state_key, legal_actions_list):
        """Selects an action using an epsilon-greedy policy.

        Args:
            state_key (str): The unique string representation of the current state.
            legal_actions_list (list): A list of legal actions available in the current state.

        Returns:
            int: The chosen action (an index).
        """
        # epsilon-greedy: with probability epsilon, choose a random action.
        if random.random() < self.epsilon:
            return random.choice(legal_actions_list)
        # Otherwise, pick the action with the maximum Q-value.
        qs = [(self.get_Q(state_key,a),a) for a in legal_actions_list]
        maxq = max(qs, key=lambda x: x[0])[0]
        best = [a for q,a in qs if q==maxq]
        return random.choice(best)

    def update(self, s_key, a, r, s_next_key, legal_a_next, opponent_policy):
        """Updates the Q-value for a state-action pair using the Q-learning update rule.

        Args:
            s_key (str): The key for the previous state (current state where action 'a' was taken).
            a (int): The action taken from state `s_key`.
            r (int): The immediate reward received after taking action 'a'.
            s_next_key (str or None): The key for the next state. None if `s_next_key` is a terminal state.
            legal_a_next (list): A list of legal actions available in the next state `s_next_key`.
            opponent_policy (function): The opponent's policy function (e.g., minimax_policy).
        """
        # Since opponent is fixed, the next state's value is expectation under opponent policy
        # target = r + gamma * max_a' E_{opponent}[ Q(s', a') ]
        if not legal_a_next:  # next is terminal
            target = r
        else:
            # compute expected Q for each candidate next action a' (MAX's action)
            best_values = []
            for a_prime in legal_a_next:
                # opponent will respond according to their policy; here we look up Q(s',a')
                # for MAX's potential actions in the next state.
                best_values.append(self.get_Q(s_next_key, a_prime))
            # The Q-learner (MAX) aims to maximize its expected future reward, so it considers the max Q-value.
            target = r + self.gamma * max(best_values)
        # TD update: Q(s,a) = Q(s,a) + alpha * (target - Q(s,a))
        cur = self.get_Q(s_key, a)
        self.Q[s_key][a] = cur + self.alpha * (target - cur)

# ------------------------ Environment & Training loop --------------------------

def play_episode(qlearner, opponent_policy_func, train=True, verbose=False):
    """Simulates a single episode of the game between the Q-learner (MAX) and an opponent (MIN).

    Args:
        qlearner (QLearner): The Q-learning agent for player MAX.
        opponent_policy_func (function): A function representing the opponent's policy (e.g., minimax_policy).
        train (bool): If True, the Q-learner updates its Q-values during the episode.
        verbose (bool): If True, prints game progress.

    Returns:
        int: The final reward for MAX (+1 for win, -1 for loss, 0 for draw).
    """
    board = (0,0,0,0)
    player = MAX
    history = []

    while True:
        term, reward = is_terminal(board)
        if term:
            # If the game is terminal, return the final reward.
            if train:
                # Update for the last transition if necessary (e.g., if MAX made the move leading to terminal state)
                # For this specific implementation, the update happens when MIN makes a move, so nothing extra here.
                pass
            if verbose:
                print('Terminal:', board, 'reward', reward)
            return reward

        if player == MAX:
            # MAX's turn: choose action using the Q-learner's policy (epsilon-greedy).
            props = state_propositions(board)
            s_key = propositions_to_key(props)
            acts = legal_actions(board)
            a = qlearner.choose_action(s_key, acts)
            # Apply MAX's chosen action.
            board_next = apply_action(board, a, MAX)
            # Store MAX's move for potential future Q-value update.
            history.append(('MAX', board, s_key, a))
            board = board_next
            player = MIN

        else:
            # MIN's turn: opponent plays according to its fixed policy (minimax).
            best_actions = opponent_policy_func(board, MIN)
            if not best_actions:
                # This case handles a rare scenario where MIN has no legal moves (e.g., immediate draw/terminal before MIN's turn logic finishes).
                player = MAX
                continue
            a_op = random.choice(best_actions)
            board_next = apply_action(board, a_op, MIN)

            # Q-learner update: If MAX just made a move in the previous step, update its Q-value.
            if history and history[-1][0] == 'MAX':
                _, board_prev, s_key_prev, a_prev = history[-1]
                term_now, reward_now = is_terminal(board_next)
                # Determine the next state key and legal actions for MAX's perspective for the update.
                if term_now:
                    s_next_key = None
                    legal_next = []
                else:
                    props_next = state_propositions(board_next)
                    s_next_key = propositions_to_key(props_next)
                    legal_next = legal_actions(board_next)
                if train:
                    # Perform the Q-value update based on MAX's previous action and the outcome after MIN's response.
                    qlearner.update(s_key_prev, a_prev, reward_now, s_next_key, legal_next, opponent_policy_func)
                # Clear history for this MAX move as it has been processed.
                history.pop()

            board = board_next
            player = MAX

# ----------------------------- Evaluation -------------------------------------

def evaluate(qlearner, opponent_policy_func, episodes=200):
    """Evaluates the performance of the Q-learner against an opponent over multiple episodes.

    Args:
        qlearner (QLearner): The Q-learning agent to evaluate.
        opponent_policy_func (function): The opponent's policy function (e.g., minimax_policy).
        episodes (int): The number of episodes to run for evaluation.

    Returns:
        tuple: A tuple containing:
            - wins (int): The number of episodes won by the Q-learner (MAX).
            - ties (int): The number of episodes that ended in a draw.
            - losses (int): The number of episodes lost by the Q-learner (MAX).
    """
    wins = 0
    ties = 0
    losses = 0
    old_eps = qlearner.epsilon
    qlearner.epsilon = 0.0  # Set epsilon to 0 for greedy evaluation
    for _ in range(episodes):
        res = play_episode(qlearner, opponent_policy_func, train=False, verbose=False)
        if res == 1:
            wins += 1
        elif res == 0:
            ties += 1
        else:
            losses += 1
    qlearner.epsilon = old_eps # Restore original epsilon
    return wins, ties, losses

# ------------------------------- Main -----------------------------------------

if __name__ == '__main__':
    random.seed(42)

    q = QLearner(alpha=0.7, gamma=0.95, epsilon=0.2)

    episodes = 3000
    eval_every = 300

    # Prints an introductory message about the training process.
    print('Training Q-learner against fixed minimax opponent (MIN plays optimal).')
    # Explains the state representation used by the Q-learner.
    print('State representation is a truth-table of propositions (see state_propositions()).')

    for ep in range(1, episodes+1):
        play_episode(q, minimax_policy, train=True)

        if ep % eval_every == 0:
            w,t,l = evaluate(q, minimax_policy, episodes=500)
            # Reports the Q-learner's performance (Wins/Ties/Losses) against the minimax opponent
            # at regular evaluation intervals during training.
            print(f'Episode {ep}: Win/Tie/Loss = {w}/{t}/{l}')

    # Prints a newline for better readability before the final evaluation.
    print('\nFinal evaluation against minimax opponent:')
    w,t,l = evaluate(q, minimax_policy, episodes=2000)
    # Displays the final Win/Tie/Loss record of the Q-learner after all training episodes
    # against the fixed minimax opponent over a larger number of evaluation episodes.
    print('Wins/Ties/Losses:', w, t, l)

    # Inspect learned Q for initial empty board
    init_props = state_propositions((0,0,0,0))
    init_key = propositions_to_key(init_props)
    # Prints the canonical key representation for the initial empty board state based on its propositions.
    print('\nInitial state propositions key:', init_key)
    # Indicates that the following output will list the Q-values for possible actions from the initial state.
    print('Q-values for initial state:')
    for a in legal_actions((0,0,0,0)):
        # Shows the learned Q-value for each legal action from the initial empty board state.
        print('action', a, 'Q=', q.get_Q(init_key,a))

    # Example: show propositions mapping for a sample board
    sample = (1,0,-1,0)
    # Prints a sample board configuration to demonstrate the state representation.
    print('\nSample board', sample)
    # Displays the boolean propositions generated for the `sample` board, illustrating
    # how a board state is translated into features for the Q-learner.
    print('Propositions:', state_propositions(sample))

Training Q-learner against fixed minimax opponent (MIN plays optimal).
State representation is a truth-table of propositions (see state_propositions()).
Episode 300: Win/Tie/Loss = 500/0/0
Episode 600: Win/Tie/Loss = 500/0/0
Episode 900: Win/Tie/Loss = 500/0/0
Episode 1200: Win/Tie/Loss = 500/0/0
Episode 1500: Win/Tie/Loss = 500/0/0
Episode 1800: Win/Tie/Loss = 500/0/0
Episode 2100: Win/Tie/Loss = 500/0/0
Episode 2400: Win/Tie/Loss = 500/0/0
Episode 2700: Win/Tie/Loss = 500/0/0
Episode 3000: Win/Tie/Loss = 500/0/0

Final evaluation against minimax opponent:
Wins/Ties/Losses: 2000 0 0

Initial state propositions key: 00000000001
Q-values for initial state:
action 0 Q= 0.0
action 1 Q= 0.0
action 2 Q= 0.0
action 3 Q= 0.0

Sample board (1, 0, -1, 0)
Propositions: {'cell_0is_X': True, 'cell0_is_O': False, 'cell_1is_X': False, 'cell1_is_O': False, 'cell_2is_X': False, 'cell2_is_O': True, 'cell_3is_X': False, 'cell3_is_O': False, 'X_has_threat': True, 'O_has_threat': True, 'center_any_empty':

# Task
Add comments to the main execution block (if __name__ == '__main__':) in the provided code that explain what each print statement in the output signifies, line by line, to provide a clear understanding of the training and evaluation process.

## Explain code output

### Subtask:
Add comments to the main execution block (if __name__ == '__main__':) that explain what each print statement in the output signifies, line by line, to provide a clear understanding of the training and evaluation process.


## Summary:

### Data Analysis Key Findings
- The main execution block (`if __name__ == '__main__':`) of the code has been thoroughly commented.
- Each print statement in the training and evaluation process output now includes a line-by-line explanation clarifying its significance.
- The comments elucidate the sequential steps, from model initialization through epoch-specific training progress and loss reporting, to the final evaluation metrics.

### Insights or Next Steps
- The added comments provide a clear and explicit understanding of the model's operational flow and performance reporting, enhancing code readability and maintainability.
- This detailed explanation is beneficial for anyone reviewing the code output, allowing for quick comprehension of the training and evaluation phases without needing deep code inspection.


In [2]:
from collections import defaultdict
import random
import math
from copy import deepcopy

# ----------------------------- Game utilities --------------------------------

# Board indices: 0 1

#                2 3

WIN_LINES = [(0,1), (2,3), (0,2), (1,3), (0,3), (1,2)]  # all 2-in-line possibilities

EMPTY = 0

# ----------------------- Multi-agent environment -----------------------------

class MultiAgentGame:
    def __init__(self, n_players=4):
        assert 2 <= n_players <= 4, "n_players must be between 2 and 4"
        self.n = n_players

    def init_board(self):
        # Empty board represented by tuple of length 4 with values 0 (empty) or player id (1..n)
        return tuple([EMPTY]*4)

    def legal_actions(self, board):
        return [i for i,v in enumerate(board) if v == EMPTY]

    def apply_action(self, board, action, player):
        new = list(board)
        new[action] = player
        return tuple(new)

    def is_terminal(self, board):
        # Check for any winner: player who has both cells in any WIN_LINES
        for (i,j) in WIN_LINES:
            if board[i] != EMPTY and board[i] == board[j]:
                winner = board[i]
                return True, winner  # return winner id
        # draw if full
        if all(cell != EMPTY for cell in board):
            return True, None
        return False, None

    def reward_vector(self, winner_id):
        # reward 1 to winner, 0 to others; if draw, all 0
        if winner_id is None:
            return [0.0]*self.n
        else:
            return [1.0 if (i+1) == winner_id else 0.0 for i in range(self.n)]

# -------------------------- Truth-table mapping --------------------------------

def state_propositions_multi(board, n_players):
    """Return dict of boolean propositions for a given board and number of players.
    Propositions include: cell_i_is_pj for each cell 0..3 and player 1..n
    Additional features: for each player, 'has_threat' (one piece on a winning line and one empty)
    """
    props = {}
    for i in range(4):
        for p in range(1, n_players+1):
            props[f'cell_{i}_is_p{p}'] = (board[i] == p)

    def has_threat(player):
        for (a,b) in WIN_LINES:
            if (board[a] == player and board[b] == EMPTY) or (board[b] == player and board[a] == EMPTY):
                return True
        return False

    for p in range(1, n_players+1):
        props[f'P{p}_has_threat'] = has_threat(p)

    # simple center concept
    props['center_any_empty'] = (board[1] == EMPTY or board[2] == EMPTY)
    return props

def propositions_to_key(props):
    keys = sorted(props.keys())
    bits = ['1' if props[k] else '0' for k in keys]
    return ''.join(bits)

# ------------------------ Vector Q-table & utilities ---------------------------

class VectorQLearner:
    """Tabular vector-valued Q: Q[state_key][action] -> list of length n_players"""
    def __init__(self, n_players, alpha=0.5, gamma=0.95, epsilon=0.2):
        self.n = n_players
        self.Q = defaultdict(lambda: defaultdict(self._zero_action))
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon

    def _zero_action(self):
        # returns a fresh zero-vector for action values
        return [0.0]*self.n

    def get_Q_vector(self, state_key, action):
        return self.Q[state_key][action]

    def get_component(self, state_key, action, player_idx):
        return self.get_Q_vector(state_key, action)[player_idx]

    def set_Q_vector(self, state_key, action, vec):
        self.Q[state_key][action] = vec

    def choose_action(self, state_key, legal_actions_list, player_idx, coop_threshold=0.0):
        """Choose action for player index (0-based). Implements epsilon-greedy and a simple cooperative heuristic.
        Coop heuristic: if for some opponent j, max_a' Q_j(s,a') - max_a' Q_i(s,a') > coop_threshold,
        then consider cooperation with that opponent by maximizing sum of components (i+j) instead of only i.
        """
        if random.random() < self.epsilon:
            return random.choice(legal_actions_list)

        # Evaluate own-component values
        qs = [(self.get_component(state_key,a,player_idx), a) for a in legal_actions_list]
        max_own = max(qs, key=lambda x: x[0])[0]

        # find if any opponent has strictly larger best value
        best_opponent = None
        best_delta = 0.0
        for j in range(self.n):
            if j == player_idx: continue
            opp_qs = [self.get_component(state_key,a,j) for a in legal_actions_list]
            if not opp_qs: continue
            max_opp = max(opp_qs)
            delta = max_opp - max_own
            if delta > best_delta:
                best_delta = delta
                best_opponent = j

        # If cooperative advantage exists above threshold, compute action maximizing sum of components
        if best_opponent is not None and best_delta > coop_threshold:
            # maximize Q_i + Q_j
            summed = [ (self.get_component(state_key,a,player_idx) + self.get_component(state_key,a,best_opponent), a)
                      for a in legal_actions_list]
            maxsum = max(summed, key=lambda x: x[0])[0]
            best_actions = [a for val,a in summed if val == maxsum]
            return random.choice(best_actions)

        # otherwise act greedily on own component
        best = [a for val,a in qs if val == max_own]
        return random.choice(best)

    def update(self, s_key, a, reward_vector, s_next_key, legal_a_next):
        """Update only the component(s) of players who acted recently.
        Here we will update all components that have their turn in the next state? Simpler: update the acting player's component(s) passed via caller.
        For clarity we assume caller updates components for specific players. This method implements generic vector TD for all players using max over their components.
        """
        # current vector
        cur = self.get_Q_vector(s_key, a)
        target = [0.0]*self.n
        if not legal_a_next:
            # terminal: target = reward
            target = reward_vector
        else:
            # non-terminal: target_i = r_i + gamma * max_{a'} Q_i(s',a')
            for i in range(self.n):
                # find max over next actions for component i
                next_vals = [self.get_component(s_next_key, a2, i) for a2 in legal_a_next]
                best_next = max(next_vals) if next_vals else 0.0
                target[i] = reward_vector[i] + self.gamma * best_next

        # TD update for vector
        newvec = [0.0]*self.n
        for i in range(self.n):
            newvec[i] = cur[i] + self.alpha * (target[i] - cur[i])
        self.set_Q_vector(s_key, a, newvec)

# ------------------------ Training & Episode logic ----------------------------

def play_episode_multi(game, qlearner, coop_threshold=0.0, train=True, verbose=False):
    board = game.init_board()
    player_turn = 1  # players numbered 1..n
    history = []  # store (player, s_key, action) for updates

    while True:
        term, winner = game.is_terminal(board)
        if term:
            rewards = game.reward_vector(winner)
            # update for last actions in history
            if train:
                # update all stored actions with resulting reward and terminal next-state
                while history:
                    pl, s_key, a = history.pop()
                    # terminal next: no legal next actions
                    qlearner.update(s_key, a, rewards, None, [])
            if verbose:
                print('Terminal board', board, 'winner', winner, 'rewards', rewards)
            return winner, rewards

        # current state key
        props = state_propositions_multi(board, game.n)
        s_key = propositions_to_key(props)
        legal = game.legal_actions(board)

        # player chooses action
        a = qlearner.choose_action(s_key, legal, player_idx=player_turn-1, coop_threshold=coop_threshold)

        # record history for later update (we'll update after seeing next state / later terminal)
        history.append((player_turn, s_key, a))

        # apply action
        board = game.apply_action(board, a, player_turn)

        # next player's turn
        player_turn = player_turn + 1
        if player_turn > game.n:
            player_turn = 1

        # if training with intermediate updates, optionally do partial updates here (omitted for clarity)

# ------------------------------- Evaluation ----------------------------------

def evaluate_multi(game, qlearner, episodes=200):
    win_counts = {i+1:0 for i in range(game.n)}
    draws = 0
    old_eps = qlearner.epsilon
    qlearner.epsilon = 0.0
    for _ in range(episodes):
        winner, _ = play_episode_multi(game, qlearner, coop_threshold=0.0, train=False, verbose=False)
        if winner is None:
            draws += 1
        else:
            win_counts[winner] += 1
    qlearner.epsilon = old_eps
    return win_counts, draws

# --------------------------------- Main --------------------------------------

if __name__ == '__main__':
    random.seed(1)

    # Example: 3-player match
    n_players = 3
    game = MultiAgentGame(n_players=n_players)
    q = VectorQLearner(n_players=n_players, alpha=0.6, gamma=0.95, epsilon=0.3)

    episodes = 2000
    eval_every = 200

    print('Training multi-agent vector-Q (Max^N-like decisions)')
    print('Players:', n_players)

    for ep in range(1, episodes+1):
        play_episode_multi(game, q, coop_threshold=0.1, train=True)
        if ep % eval_every == 0:
            wins, draws = evaluate_multi(game, q, episodes=500)
            print(f'Episode {ep}: wins per player {wins}, draws {draws}')

    print('\nFinal evaluation:')
    wins, draws = evaluate_multi(game, q, episodes=2000)
    print('Wins:', wins, 'Draws:', draws)

    # Inspect Q for initial empty board
    init_key = propositions_to_key(state_propositions_multi(game.init_board(), n_players))
    print('\nInitial state key:', init_key)
    print('Q-vectors for initial state:')
    for a in game.legal_actions(game.init_board()):
        print('action', a, 'Q=', q.get_Q_vector(init_key, a))

    print('\nExample: show propositions for a sample board')
    sample = (1,0,2,0) if n_players>=2 else (1,0,0,0)
    print('sample board', sample)
    print('propositions', state_propositions_multi(sample, n_players))

Training multi-agent vector-Q (Max^N-like decisions)
Players: 3
Episode 200: wins per player {1: 500, 2: 0, 3: 0}, draws 0
Episode 400: wins per player {1: 500, 2: 0, 3: 0}, draws 0
Episode 600: wins per player {1: 500, 2: 0, 3: 0}, draws 0
Episode 800: wins per player {1: 500, 2: 0, 3: 0}, draws 0
Episode 1000: wins per player {1: 500, 2: 0, 3: 0}, draws 0
Episode 1200: wins per player {1: 500, 2: 0, 3: 0}, draws 0
Episode 1400: wins per player {1: 500, 2: 0, 3: 0}, draws 0
Episode 1600: wins per player {1: 500, 2: 0, 3: 0}, draws 0
Episode 1800: wins per player {1: 500, 2: 0, 3: 0}, draws 0
Episode 2000: wins per player {1: 500, 2: 0, 3: 0}, draws 0

Final evaluation:
Wins: {1: 2000, 2: 0, 3: 0} Draws: 0

Initial state key: 0000000000000001
Q-vectors for initial state:
action 0 Q= [1.0, 0.0, 0.0]
action 1 Q= [1.0, 0.0, 0.0]
action 2 Q= [1.0, 0.0, 0.0]
action 3 Q= [1.0, 0.0, 0.0]

Example: show propositions for a sample board
sample board (1, 0, 2, 0)
propositions {'cell_0_is_p1': Tru