In [1]:
import ast
import random
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

In [6]:
class GameBoard:
    """Square n-in-a-row board for two players.

    Cells hold ``0`` (empty), ``1`` (X) or ``-1`` (O).  A player wins by
    filling an entire row, column or main/anti diagonal.
    """

    def __init__(self, board_size=4):
        """Create a ``board_size`` x ``board_size`` board and start a new game."""
        self.board_size = board_size
        self.reset()

    def reset(self):
        """Clear the board, give the first turn to X (``1``) and return the state."""
        side = self.board_size
        self.current_player = 1  # 1 for X, -1 for O
        self.board = np.zeros((side, side))
        return self.get_state()

    def get_state(self):
        """Return the board serialised as the string form of a nested list."""
        nested = self.board.tolist()
        return str(nested)

    def is_valid_move(self, row, col):
        """Tell whether ``(row, col)`` lies on the board and is still empty."""
        side = self.board_size
        if not (0 <= row < side):
            return False
        if not (0 <= col < side):
            return False
        return self.board[row][col] == 0

    def get_valid_moves(self):
        """List every empty cell as a ``(row, col)`` tuple in row-major order."""
        empty_cells = []
        for r in range(self.board_size):
            for c in range(self.board_size):
                if self.board[r][c] == 0:
                    empty_cells.append((r, c))
        return empty_cells

    def make_move(self, row, col):
        """Place the current player's mark and return ``(state, reward, done)``.

        * invalid move: ``(None, -10, True)``; the board is left untouched
        * winning move: ``(state, 1, True)``; the player is NOT switched
        * board full:   ``(state, 0, True)``; the player is NOT switched
        * otherwise:    ``(state, 0, False)`` after handing the turn over
        """
        if not self.is_valid_move(row, col):
            return None, -10, True  # Invalid move penalty

        self.board[row][col] = self.current_player

        # The draw test is only evaluated when nobody has won.
        has_winner = self.check_win()
        is_draw = not has_winner and len(self.get_valid_moves()) == 0
        if has_winner or is_draw:
            return self.get_state(), (1 if has_winner else 0), True

        self.current_player *= -1  # Switch player
        return self.get_state(), 0, False

    def _lines(self):
        """Lazily yield each row/column pair, then the two diagonals."""
        side = self.board_size
        for k in range(side):
            yield self.board[k]
            yield self.board[:, k]
        yield [self.board[k][k] for k in range(side)]
        yield [self.board[k][side - 1 - k] for k in range(side)]

    def check_win(self):
        """Return True when any line is completely filled by a single player."""
        # A line owned by one player sums to +board_size or -board_size.
        return any(abs(sum(line)) == self.board_size for line in self._lines())

In [3]:
class ValueIteration:
    """Negamax-style value table for a two-player, zero-sum board game.

    ``values[state]`` is the discounted game value of ``state`` as seen by the
    player who is about to move in it.  States are the strings returned by
    ``env.get_state()`` (the ``str`` of a nested list describing the board).

    The environment is used as a scratch simulator.  The state string carries
    the board only, so the side to move is recomputed from the board every
    time a position is loaded, assuming player ``1`` moves first and the
    players strictly alternate.  Public entry points put the environment back
    the way they found it, so they are safe to call in the middle of a game.
    """

    def __init__(self, env, gamma=0.9):
        """
        Args:
            env: game simulator exposing ``board``, ``current_player``,
                ``reset``, ``get_state``, ``get_valid_moves``, ``make_move``
                and ``check_win``.
            gamma: per-move discount factor.
        """
        self.env = env
        self.gamma = gamma
        self.values = defaultdict(float)
        self.policy = {}  # kept for interface compatibility; never filled in here
        self._memo = {}  # state -> exact value, valid for ``_memo_gamma`` only
        self._memo_gamma = gamma

    # ------------------------------------------------------------------ helpers
    @staticmethod
    def _parse(state):
        """Turn a state string back into a board array without using ``eval``."""
        return np.array(ast.literal_eval(state))

    def _set_position(self, board):
        """Load a private copy of ``board`` into the simulator and sync the mover."""
        self.env.board = board.copy()  # make_move mutates the board in place
        # X (1) starts, so X is to move exactly when both sides have equal counts.
        self.env.current_player = 1 if board.sum() <= 0 else -1

    def _snapshot(self):
        """Copy the simulator's board and side to move for a later ``_restore``."""
        return np.array(self.env.board), self.env.current_player

    def _restore(self, snapshot):
        """Put the simulator back into the position captured by ``_snapshot``."""
        self.env.board, self.env.current_player = snapshot

    # ----------------------------------------------------------------- training
    def train(self, num_iterations=1000):
        """Fill ``self.values`` for every state reachable from the empty board.

        Sweeps are repeated at most ``num_iterations`` times and stop as soon
        as a sweep changes no value.  The environment is restored on exit.
        """
        if num_iterations <= 0:
            return

        snapshot = self._snapshot()
        try:
            # The reachable set does not depend on the sweep: enumerate it once.
            self.env.reset()
            states_to_update = set()
            self._generate_states(self.env.get_state(), states_to_update)

            for _ in range(num_iterations):
                delta = 0
                for state in states_to_update:
                    old_value = self.values[state]
                    new_value = self._get_max_value(state)
                    self.values[state] = new_value
                    delta = max(delta, abs(old_value - new_value))
                if delta == 0:  # converged: further sweeps would be no-ops
                    break
        finally:
            self._restore(snapshot)

    def _generate_states(self, state, states):
        """Add ``state`` and everything reachable from it to the set ``states``."""
        states.add(state)
        board = self._parse(state)
        self._set_position(board)

        if self.env.check_win():
            return

        for move in self.env.get_valid_moves():
            # The previous trial move (and its recursion) changed the simulator.
            self._set_position(board)
            next_state, _, _ = self.env.make_move(move[0], move[1])
            if next_state not in states:
                self._generate_states(next_state, states)

    def _get_max_value(self, state):
        """Return the exact negamax value of ``state`` for the player to move.

        A position in which a line is already complete is worth ``-1`` (the
        player to move has lost), a full board without a winner is worth
        ``0``.  Results are memoised so each state is solved only once.
        """
        if self._memo_gamma != self.gamma:
            # Cached values were computed with another discount factor.
            self._memo.clear()
            self._memo_gamma = self.gamma
        if state in self._memo:
            return self._memo[state]

        board = self._parse(state)
        self._set_position(board)

        if self.env.check_win():
            value = -1
        else:
            valid_moves = self.env.get_valid_moves()
            if not valid_moves:
                value = 0
            else:
                candidates = []
                for move in valid_moves:
                    self._set_position(board)  # undo the previous trial move
                    next_state, reward, done = self.env.make_move(move[0], move[1])
                    if done:
                        candidates.append(reward)
                    else:
                        # Negative because the next state is valued for the opponent.
                        candidates.append(reward + self.gamma * -self._get_max_value(next_state))
                value = max(candidates)

        self._memo[state] = value
        return value

    # ------------------------------------------------------------------- acting
    def get_action(self, state):
        """Pick the valid move with the best one-step look-ahead value.

        Returns ``None`` when ``state`` has no empty cell.  Ties go to the
        first move in ``get_valid_moves`` order.  Trial moves are played on
        the simulator, which is restored before returning.
        """
        snapshot = self._snapshot()
        try:
            board = self._parse(state)
            self._set_position(board)
            valid_moves = self.env.get_valid_moves()

            if not valid_moves:
                return None

            best_value = float('-inf')
            best_move = None

            for move in valid_moves:
                self._set_position(board)  # undo the previous trial move
                next_state, reward, done = self.env.make_move(move[0], move[1])
                if done:
                    value = reward
                else:
                    # Negative because the next state is valued for the opponent;
                    # .get avoids inserting unseen states into the table.
                    value = reward + self.gamma * -self.values.get(next_state, 0.0)

                if value > best_value:
                    best_value = value
                    best_move = move

            return best_move
        finally:
            self._restore(snapshot)

In [4]:
def play_game(env, agent, opponent_policy='random'):
    """Play one full game with ``agent`` as player ``1`` (X, moving first).

    Args:
        env: game environment; it is reset at the start of the game.
        agent: object whose ``get_action(state)`` returns a ``(row, col)``
            move or ``None``.
        opponent_policy: how player ``-1`` moves.  Only ``'random'`` (uniform
            over the valid moves) is supported.

    Returns:
        ``1`` if the agent wins, ``-1`` if the opponent wins, ``0`` otherwise.

    Raises:
        ValueError: if ``opponent_policy`` is not supported.
    """
    # Fail early: otherwise the opponent's turn would reuse a stale action or
    # hit an unbound local.
    if opponent_policy != 'random':
        raise ValueError(f"Unsupported opponent_policy: {opponent_policy!r}")

    state = env.reset()
    done = False

    while not done:
        if env.current_player == 1:  # Agent's turn
            action = agent.get_action(state)
        else:  # Opponent's turn
            valid_moves = env.get_valid_moves()
            action = random.choice(valid_moves) if valid_moves else None

        if action is None:
            break

        state, reward, done = env.make_move(action[0], action[1])

    if env.check_win():
        # The environment does not hand the turn over after a winning move,
        # so current_player is still the player who just won.
        return 1 if env.current_player == 1 else -1
    return 0  # Draw

In [5]:
def evaluate_agent(env, agent, num_games=100):
    """Play ``num_games`` games and report the outcome frequencies.

    Returns:
        ``(win_rate, loss_rate, draw_rate)``, each a fraction of
        ``num_games``.  Any result other than ``1`` or ``-1`` counts as a draw.
    """
    tally = {1: 0, -1: 0, 0: 0}  # keyed by play_game's result code

    for _ in range(num_games):
        outcome = play_game(env, agent)
        bucket = outcome if outcome in (1, -1) else 0
        tally[bucket] += 1

    return tuple(tally[code] / num_games for code in (1, -1, 0))

In [None]:
# Training and evaluation for both board sizes
#
# The agent is evaluated BEFORE any training (x = 0) and then after every
# additional TRAIN_STEP iterations, so the x-axis of the learning curve is the
# real number of training iterations the agent has seen.
#
# NOTE(review): a board with n*n cells has up to 3**(n*n) configurations, so
# the larger board sizes may take a very long time - confirm this is intended.
SEED = 0                 # makes the random opponent reproducible
TRAIN_STEP = 100         # training iterations between two evaluations
TOTAL_ITERATIONS = 1000  # training iterations at the last evaluation

random.seed(SEED)

board_sizes = [4, 5]
results = {}

for size in board_sizes:
    env = GameBoard(board_size=size)
    agent = ValueIteration(env)

    win_rates = []
    eval_intervals = list(range(0, TOTAL_ITERATIONS + 1, TRAIN_STEP))

    for i in eval_intervals:
        if i > 0:
            agent.train(num_iterations=TRAIN_STEP)
        win_rate, loss_rate, draw_rate = evaluate_agent(env, agent)
        win_rates.append(win_rate)

    results[size] = (eval_intervals, win_rates)



In [None]:
# Plotting: one win-rate curve per board size on a shared set of axes
fig, ax = plt.subplots(figsize=(10, 6))
for size in board_sizes:
    eval_intervals, win_rates = results[size]
    ax.plot(eval_intervals, win_rates, label=f'{size}x{size} board')

ax.set_xlabel('Training Iterations')
ax.set_ylabel('Win Rate')
ax.set_title('RL Agent Win Rate vs Training Iterations')
ax.legend()
ax.grid(True)

# Display final win rates (last evaluation point of each curve)
for size in board_sizes:
    _, win_rates = results[size]
    final_rate = win_rates[-1]
    print(f"Final win rate for {size}x{size} board: {final_rate:.2f}")