In [1]:
# Install the third-party packages this notebook imports (numpy, ipywidgets).
%pip install numpy ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np
import ipywidgets as widgets 
import random

In [2]:
class TicTacToe:
    """Minimal 3x3 tic-tac-toe game state.

    Board cells hold 0 (empty), 1 (player 1) or -1 (player 2).
    Player 1 always moves first.
    """

    def __init__(self):
        """Create an empty board with player 1 to move."""
        self.board = np.zeros((3, 3), dtype=int)
        self.current_player = 1
        self.game_over = False
        # Set to 1 or -1 when a player wins; stays None for a draw.
        self.winner = None

    def reset(self):
        """Clear the board in place and restore the initial turn and status."""
        self.board[...] = 0
        self.current_player, self.game_over, self.winner = 1, False, None

    def make_move(self, row, col):
        """Place the current player's mark at (row, col).

        Returns True when the move was applied, and False when the cell is
        already occupied or the game has finished. The turn passes to the
        other player after every applied move, including the move that ends
        the game.
        """
        if self.board[row, col] != 0 or self.game_over:
            return False
        self.board[row, col] = self.current_player
        self.check_game_over(row, col)
        self.current_player = -self.current_player
        return True

    def check_game_over(self, row, col):
        """Update `game_over` / `winner` after a mark was placed at (row, col).

        Only the lines passing through (row, col) are inspected: its row, its
        column, and whichever diagonals contain it. If none is complete and no
        empty cell remains, the game ends as a draw (winner stays None).
        """
        mark = self.current_player

        def lines_through_cell():
            # Generated lazily so lines are examined in order and only as needed.
            yield self.board[row, :]
            yield self.board[:, col]
            if row == col:
                yield np.diag(self.board)
            if row + col == 2:
                yield np.diag(np.fliplr(self.board))

        if any(np.all(line == mark) for line in lines_through_cell()):
            self.game_over = True
            self.winner = mark
        elif not np.any(self.board == 0):
            self.game_over = True  # Draw

    def available_moves(self):
        """Return the (row, col) pairs of all empty cells in row-major order."""
        empty_cells = np.argwhere(self.board == 0).tolist()
        return [(r, c) for r, c in empty_cells]

In [5]:
class QLearningAgent:
    """Tabular, epsilon-greedy Q-learning agent with nine discrete actions.

    States are array-like boards; each one is flattened into a tuple that keys
    `q_table`, whose values are length-9 arrays of action values.

    Args:
        learning_rate: Weight given to the new estimate in each Q update.
        discount_factor: Multiplier applied to the best next-state value.
        exploration_rate: Probability of picking a random available action.
        exploration_decay: Factor applied to `exploration_rate` by
            `decay_exploration()`.
    """

    def __init__(self, learning_rate=0.1, discount_factor=0.95, exploration_rate=1.0, exploration_decay=0.99):
        self.q_table = {}  # flattened-state tuple -> np.ndarray of 9 action values
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay

    @staticmethod
    def _state_key(state):
        """Flatten `state` into a hashable tuple usable as a `q_table` key."""
        return tuple(np.asarray(state).reshape(-1))

    def get_q_values(self, state):
        """Return the stored action-value array for `state`.

        An all-zero entry is created the first time a state is seen. The
        returned array is the stored one, so in-place edits change the table.
        """
        state_key = self._state_key(state)
        if state_key not in self.q_table:
            self.q_table[state_key] = np.zeros(9)
        return self.q_table[state_key]

    def update_q_values(self, state, action, reward, next_state):
        """Apply one Q-learning update for the transition (state, action) -> next_state.

        new_q = (1 - lr) * old_q + lr * (reward + discount * max(Q[next_state]))
        """
        q_values = self.get_q_values(state)
        # NOTE(review): the max runs over all nine entries of next_state, not only
        # over the actions that are available there.
        max_future_q = np.max(self.get_q_values(next_state))
        target = reward + self.discount_factor * max_future_q
        q_values[action] = (1 - self.learning_rate) * q_values[action] + self.learning_rate * target

    def choose_action(self, state, available_actions):
        """Pick an action epsilon-greedily among `available_actions`.

        Args:
            state: Current board.
            available_actions: Flat action indices (0-8) that may be chosen.

        Returns:
            A random element of `available_actions` when exploring, otherwise
            the available index with the highest Q-value as a builtin int
            (ties resolve to the lowest index).

        Raises:
            IndexError: If `available_actions` is empty. This matches what
                `random.choice` raises on the exploration path, so both paths
                fail the same way instead of the greedy path returning 0.
        """
        actions = list(available_actions)
        if not actions:
            raise IndexError("available_actions is empty; there is no action to choose")

        if random.uniform(0, 1) < self.exploration_rate:
            return random.choice(actions)  # Explore

        q_values = self.get_q_values(state)
        # Unavailable actions are masked with -inf so argmax can never select them.
        masked_q = np.array([q_values[i] if i in actions else -np.inf for i in range(9)])
        return int(np.argmax(masked_q))  # Exploit

    def decay_exploration(self):
        """Multiply the exploration rate by the decay factor."""
        self.exploration_rate *= self.exploration_decay

# Self-play training loop: the same agent chooses every move and learns after each one
def train_q_learning(agent, num_episodes=1000):
    """Train `agent` by letting it play both sides of `num_episodes` games.

    Each move is followed by one Q update whose reward is judged from the
    point of view of the player who made that move: +1 if the move won the
    game, -1 if the game was won by the other player, 0 otherwise (game still
    running or drawn). The exploration rate is decayed once per episode.

    Args:
        agent: Object providing `choose_action`, `update_q_values` and
            `decay_exploration`.
        num_episodes: Number of complete games to play.
    """
    for _ in range(num_episodes):
        game = TicTacToe()
        while not game.game_over:
            state = game.board.copy()
            available_actions = [i * 3 + j for i, j in game.available_moves()]
            action = agent.choose_action(state, available_actions)
            # Convert the flat action index back to board coordinates.
            row, col = divmod(action, 3)
            # Remember who is moving: make_move() hands the turn to the other
            # player even on the final move, so game.current_player no longer
            # identifies the mover once it returns.
            mover = game.current_player
            game.make_move(row, col)
            next_state = game.board.copy()
            if game.winner is None:
                reward = 0
            elif game.winner == mover:
                reward = 1
            else:
                reward = -1
            # NOTE(review): next_state is the board with the opponent to move;
            # confirm that bootstrapping from it is intended for self-play.
            agent.update_q_values(state, action, reward, next_state)
        agent.decay_exploration()

In [31]:
# Create a Q-learning agent with its default hyperparameters and train it for
# train_q_learning's default number of episodes.
# `agent` is kept as a notebook-level global: choose_action_q_learning() reads it
# from the global namespace, so this cell must run before that function is called.
agent = QLearningAgent()
train_q_learning(agent)

In [6]:
def choose_action_q_learning(game, available_actions):
    """Ask the notebook-level `agent` for a move and return it as (row, col).

    The board snapshot and the chosen cell are printed before returning.

    Args:
        game: Object exposing a `board` with a `copy()` method.
        available_actions: Passed unchanged to `agent.choose_action`.

    Returns:
        (row, col) obtained by splitting the agent's flat action index with
        `divmod(index, 3)`.
    """
    # Snapshot the board so the agent and the log see the same position.
    state = game.board.copy()
    flat_index = agent.choose_action(state, available_actions)
    chosen_cell = divmod(flat_index, 3)
    row, col = chosen_cell
    print(f"Board: {state}")
    print(f"Q-learning agent chooses action: {row}, {col}")
    return row, col