In [15]:
%pip install numpy ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [16]:
import numpy as np
import ipywidgets as widgets 
import random

In [17]:
class TicTacToe:
    def __init__(self):
        self.board = np.zeros((3, 3), dtype=int)  # 0: empty, 1: player 1, -1: player 2
        self.current_player = 1  # Player 1 starts
        self.game_over = False
        self.winner = None
        self.history = []

    def reset(self):
        self.board.fill(0)
        self.current_player = 1
        self.game_over = False
        self.winner = None
        self.history = []
        return self.board.copy()

    def step(self, action):
        row, col = divmod(action, 3)
        if self.board[row, col] == 0 and not self.game_over:
            hist = {
                "player": self.current_player,
                "action": action,
                "state": self.board.copy()
            }
            
            self.board[row, col] = self.current_player
            hist["next_state"] = self.board.copy()
            self.history.append(hist)
            self.check_game_over(row, col)
            if not self.game_over:
                self.current_player *= -1  # Switch player
            return self.board.copy(), self.calculate_reward(), self.game_over, {}
        return self.board.copy(), 0, self.game_over, {"message": "Invalid move"}

    def calculate_reward(self):
        if self.game_over:
            if self.winner is not None:
                return 1 if self.winner == 1 else -1
            else:
                return 0.5  # Consider a draw as a neutral outcome
        return 0  # No reward if the game is not over

    def check_game_over(self, row, col):
        # Check for win conditions: rows, columns, diagonals
        if np.all(self.board[row, :] == self.current_player) or \
           np.all(self.board[:, col] == self.current_player) or \
           (row == col and np.all(np.diag(self.board) == self.current_player)) or \
           (row + col == 2 and np.all(np.diag(np.fliplr(self.board)) == self.current_player)):
            self.game_over = True
            self.winner = self.current_player
        elif not np.any(self.board == 0):
            self.game_over = True  # Draw

    def available_moves(self):
        return [i * 3 + j for i, j in zip(*np.where(self.board == 0))]

In [18]:
class QLearningAgent:
    def __init__(self, learning_rate=0.1, discount_factor=0.95, exploration_rate=1.0, exploration_decay=0.99):
        self.q_table = {}  # Initialize Q-table with an empty dictionary
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay

    def get_q_values(self, state):
        # Convert state to a tuple to use as a dictionary key
        state_key = tuple(state.reshape(-1))
        if state_key not in self.q_table:
            self.q_table[state_key] = np.zeros(9)
        return self.q_table[state_key]

    def update_q_values(self, state, action, reward, next_state):
        # Basic Q-learning formula to update Q-values
        
        current_q = self.get_q_values(state)[action]
        max_future_q = np.max(self.get_q_values(next_state))
        new_q = (1 - self.learning_rate) * current_q + self.learning_rate * (reward + self.discount_factor * max_future_q)
        self.q_table[tuple(state.reshape(-1))][action] = new_q

    def choose_action(self, state, available_actions):
        if random.uniform(0, 1) < self.exploration_rate:
            return random.choice(available_actions)  # Explore
        else:
            q_values = self.get_q_values(state)
            # Filter q_values by available actions
            q_values_filtered = np.array([q_values[i] if i in available_actions else -np.inf for i in range(9)])
            return np.argmax(q_values_filtered)  # Exploit

    def decay_exploration(self):
        self.exploration_rate *= self.exploration_decay
        self.exploration_rate = max(self.exploration_rate, 0.1)  # Ensure some level of exploration continues

    
def assign_p_per_agent(game, player1, player2):
    for history in game.history:
        action, state, next_state, player = history["action"], history["state"], history["next_state"], history["player"]
        
        if player == 1:
            if game.winner == 1:
                player1.update_q_values(state, action, 1, next_state)
            elif game.winner == -1:
                player1.update_q_values(state, action, -1, next_state)
            else:
                player1.update_q_values(state, action, 0.1, next_state)
        else:
            if game.winner == -1:
                player2.update_q_values(state, action, 1, next_state)
            elif game.winner == 1:
                player2.update_q_values(state, action, -1, next_state)
            else:
                player2.update_q_values(state, action, 0.1, next_state)
# Updating the QLearningAgent's train_q_learning function to accommodate the TicTacToe class updates
def train_q_learning(agent, num_episodes=1000):
    results = {"wins": 0, "losses": 0, "draws": 0}
    game = TicTacToe()
    for episode in range(num_episodes):
        state = game.reset()
        done = False
        states = []
        actions = []
        while not done:
            available_actions = game.available_moves()
            action = agent.choose_action(state, available_actions)
            next_state, reward, done, info = game.step(action)
            states.append(state)
            actions.append(action)
            state = next_state

        # Update Q-values after the game is over
        final_reward = reward
        assign_p_per_agent(game, agent, agent)

        # Update exploration rate
        agent.decay_exploration()
        
        # Log results
        if game.winner == 1:
            results["wins"] += 1
        elif game.winner == -1:
            results["losses"] += 1
        else:
            results["draws"] += 1


    print(f"Training completed. Results: {results} {agent.exploration_rate}")
    return agent

In [19]:
# train multiple battle royal style, return the best agent
def train_q_learning_battle_royal(num_agents=10, num_episodes=1000):
    # randomize the parameters of each agent
    agents = [QLearningAgent(learning_rate=random.uniform(0.1, 0.5), discount_factor=random.uniform(0.9, 0.99), exploration_rate=random.uniform(0.5, 1.0), exploration_decay=random.uniform(0.95, 0.99)) for _ in range(num_agents)]
    # add random agent
    #agents.append(QLearningAgent(exploration_decay=0.0))
    best_agent = None
    agent_wins = [0] * num_agents
    game = TicTacToe()
    for episode in range(num_episodes):
        for i, agent in enumerate(agents):
            # match up each agent with each other
            for j, opponent in enumerate(agents):
                if i == j:
                    continue
                
                state = game.reset()
                done = False
                states = []
                actions = []
                movenum = 0
                while not done:
                    available_actions = game.available_moves()
                    if movenum % 2 == 0: # opponent goes first
                        action = opponent.choose_action(state, available_actions)
                    else:
                        action = agent.choose_action(state, available_actions)
                    next_state, reward, done, info = game.step(action)
                    states.append(state)
                    actions.append(action)
                    state = next_state
                    movenum += 1
                count = 0
                for state, action in zip(states, actions):
                    if count % 2 == 0: # Player 1(opponent)
                        if game.winner == 1: # Player 1 won
                            opponent.update_q_values(state, action, 1, next_state)
                        elif game.winner == 0:
                            opponent.update_q_values(state, action, 0.5, next_state)
                        elif game.winner == -1: # Player 2 won so player 1 lost
                            opponent.update_q_values(state, action, -1, next_state)
                    else: # Player 2(agent)
                        if game.winner == -1:
                            agent.update_q_values(state, action, 1, next_state)
                        elif game.winner == 0:
                            agent.update_q_values(state, action, 0.5, next_state)
                        elif game.winner == 1: # Player 1 won so player 2 lost
                            agent.update_q_values(state, action, -1, next_state)

                    count += 1
                if game.winner == 1:
                    agent_wins[j] += 1
                    agent_wins[i] -= 1
                elif game.winner == -1:
                    agent_wins[i] += 1
                    agent_wins[j] -= 1
                else:
                    agent_wins[i] += 0.5
                    agent_wins[j] += 0.5
    
    best_agent = agents[np.argmax(agent_wins)]
    print(f"Training completed. Results: {agent_wins} {best_agent.exploration_rate}")
    # other wins
    print(agent_wins)
    # set best agent's exploration rate to 0.0
    best_agent.exploration_rate = 0.0
    return best_agent


In [20]:
battle_royal_agent = train_q_learning_battle_royal(num_agents=2, num_episodes=1000)

Training completed. Results: [23.0, 223.0] 0.5770343067682315
[23.0, 223.0]


In [21]:
game = TicTacToe()
agent = QLearningAgent()

# Train the agent
solo_trained_agent = train_q_learning(agent, 10000)

Training completed. Results: {'wins': 1752, 'losses': 924, 'draws': 7324} 0.1


In [22]:
def choose_action_q_learning(game, available_actions):
    state = game.board.copy()
    print(f"Board: {state}")
    
    agent = dropdown_dict[agent_dropdown.value]
    print(f"Agent: {agent_dropdown.value}")
    action = agent.choose_action(state, available_actions)
    row, col = divmod(action, 3)
    #print board
    
    print(f"Q-learning agent chooses action: {row}, {col}")
    return row, col

def random_action(game, available_actions):
    action = random.choice(available_actions)
    row, col = divmod(action, 3)
    print(f"Random agent chooses action: {row}, {col}")
    return row, col

random_actor = {"name": "Random", "choose_action": random_action}

In [24]:
wins = 0
losses = 0
import time

# Play against the Q-learning agent with jupyter widgets
from ipywidgets import GridBox, Button, Layout, ButtonStyle
def play_game_q_learning():
    game.reset()
    buttons = [Button(layout=Layout(width="45px", height="45px")) for _ in range(9)]
    grid = GridBox(buttons, layout=Layout(
    
        grid_template_columns="repeat(3, 50px)",
        grid_template_rows="repeat(3, 50px)"
    ))

    def on_button_clicked(button, row, col):
        make_move(row, col)
        if not game.game_over:
            available_actions = game.available_moves()
            row, col = choose_action_q_learning(game, available_actions)
            make_move(row, col)
        if game.game_over:
            
            reset()
        
    def make_move(row, col):
        global wins, losses
        if game.board[row, col] == 0 and not game.game_over:
            game.board[row, col] = game.current_player
            buttons[row * 3 + col].description = "X" if game.current_player == 1 else "O"
            game.check_game_over(row, col)
            if game.game_over:
                if game.winner is not None:
                    print(f"Player {game.winner} wins!")
                    if game.winner == 1:
                        wins += 1
                    else:
                        losses += 1
                else:
                    print("It's a draw!")
            else:
                game.current_player *= -1
    def reset():
        global label
        print(f"Resetting game...{wins} wins, {losses} losses")
        label.value = f"Player {game.winner} wins!" if game.winner is not None else "It's a draw!"
        time.sleep(1)
        for i, button in enumerate(buttons):
            button.description = " " 
        label.value = f"Tic-Tac-Toe: {wins} wins, {losses} losses"
        game.reset()
    
    for i, button in enumerate(buttons):
        row, col = divmod(i, 3)
        button.on_click(lambda _, row=row, col=col: on_button_clicked(_, row, col))
    return grid

game.reset()
label = widgets.Label(f"Tic-Tac-Toe: {wins} wins, {losses} losses")
dropdown_dict = {"battle royal trained": battle_royal_agent, "solo trained": solo_trained_agent, "random": random_actor}
agent_dropdown = widgets.Dropdown(
    options=["battle royal trained", "solo trained", "random"],
    value="solo trained",
    description='Agent:',
    disabled=False,
)
checkbox = widgets.Checkbox(description="Allow learning", value=False)
footer = widgets.HBox([agent_dropdown, checkbox])
widgets.VBox([
    label,
    play_game_q_learning(),
    footer],
    layout=widgets.Layout(align_items="center")
    )


VBox(children=(Label(value='Tic-Tac-Toe: 0 wins, 0 losses'), GridBox(children=(Button(layout=Layout(height='45…

Board: [[1 0 0]
 [0 0 0]
 [0 0 0]]
Agent: solo trained
Q-learning agent chooses action: 1, 1
Board: [[ 1  1  0]
 [ 0 -1  0]
 [ 0  0  0]]
Agent: solo trained
Q-learning agent chooses action: 0, 2
Board: [[ 1  1 -1]
 [ 1 -1  0]
 [ 0  0  0]]
Agent: solo trained
Q-learning agent chooses action: 2, 2
Board: [[ 1  1 -1]
 [ 1 -1  1]
 [ 0  0 -1]]
Agent: solo trained
Q-learning agent chooses action: 2, 0
Player -1 wins!
Resetting game...0 wins, 1 losses
