# Atelier d'apprentissage par renforcement : Agent Q-learning pour le Morpion

Dans cet atelier, nous allons développer un agent d'apprentissage par renforcement pour jouer au Morpion en utilisant la méthode Q-learning. Nous utiliserons la bibliothèque NumPy pour gérer les tableaux et les calculs.

## 1. Création de l'environnement de Morpion

In [1]:
import numpy as np

class TicTacToe:
    def __init__(self):
        # Initialize the board (3x3 matrix filled with zeros)
        self.board = np.zeros((3, 3))

    def reset(self):
        # Reset the board to its initial state
        self.board = np.zeros((3, 3))
        return self.board.flatten()

    def available_actions(self):
        # Get the indices of the available actions (empty cells)
        return np.argwhere(self.board == 0)

    def step(self, action, player):
        # Place the player's marker (1 for player 1, -1 for player 2) at the chosen position
        self.board[action] = player

        # Check if the player has won
        if np.any(np.sum(self.board, axis=0) == 3 * player) or np.any(np.sum(self.board, axis=1) == 3 * player) or np.sum(np.diagonal(self.board)) == 3 * player or np.sum(np.diagonal(np.fliplr(self.board))) == 3 * player:
            reward = 1
            done = True
        # Check if the player has lost
        elif np.any(np.sum(self.board, axis=0) == 3 * -player) or np.any(np.sum(self.board, axis=1) == 3 * -player) or np.sum(np.diagonal(self.board)) == 3 * -player or np.sum(np.diagonal(np.fliplr(self.board))) == 3 * -player:
            reward = -1
            done = True
        # Check if the game is a draw
        elif np.all(self.board != 0):
            reward = 0
            done = True
        else:
            reward = 0
            done = False

        return self.board.flatten(), reward, done


## 2. Initialisation du Q-table

In [2]:
# Q-table initialization
state_size = 3**9
action_size = 9
q_table = np.zeros((state_size, action_size))

## 3. Configuration des paramètres d'apprentissage et entraînement de l'agent

In [None]:
# Hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.99  # Discount factor
epsilon = 1  # Exploration rate
epsilon_decay = 0.9999
min_epsilon = 0.01

In [3]:
# Training loop
n_episodes = 50000

for episode in range(n_episodes):
    env = TicTacToe()
    state = env.reset()
    state_idx = int(''.join(map(str, state.astype(int) + 1)), 3)
    done = False

    while not done:
        # Choose an action (exploration or exploitation)
        if np.random.uniform(0, 1) < epsilon:
            action = np.random.choice(action_size)
        else:
            action = np.argmax(q_table[state_idx])

        # Take the action and get the next state, reward, and done flag
        next_state, reward, done = env.step(np.unravel_index(action, (3, 3)), 1)

        # Update the Q-table
        next_state_idx = int(''.join(map(str, next_state.astype(int) + 1)), 3)
        q_table[state_idx, action] += alpha * (reward + gamma * np.max(q_table[next_state_idx]) - q_table[state_idx, action])

        # Update the state and state_idx
        state = next_state
        state_idx = next_state_idx

    # Decay the exploration rate
    epsilon = max(epsilon * epsilon_decay, min_epsilon)

    # Print the episode number and epsilon value every 1000 episodes
    if episode % 1000 == 0:
        print(f"Episode: {episode}, Epsilon: {epsilon}")



Episode: 0, Epsilon: 0.9999
Episode: 1000, Epsilon: 0.9047424102692004
Episode: 2000, Epsilon: 0.8186406930090225
Episode: 3000, Epsilon: 0.7407330270401349
Episode: 4000, Epsilon: 0.6702396082111141
Episode: 5000, Epsilon: 0.6064548440752141
Episode: 6000, Epsilon: 0.548740291377179
Episode: 7000, Epsilon: 0.4965182656589779
Episode: 8000, Epsilon: 0.4492660590208893
Episode: 9000, Epsilon: 0.406510708161521
Episode: 10000, Epsilon: 0.3678242603283259
Episode: 11000, Epsilon: 0.332819489793915
Episode: 12000, Epsilon: 0.3011460219829101
Episode: 13000, Epsilon: 0.27248682645444433
Episode: 14000, Epsilon: 0.24655504363736244
Episode: 15000, Epsilon: 0.22309111355585096
Episode: 16000, Epsilon: 0.20186017780594118
Episode: 17000, Epsilon: 0.1826497287783945
Episode: 18000, Epsilon: 0.16526748259824006
Episode: 19000, Epsilon: 0.14953945449050376
Episode: 20000, Epsilon: 0.13530821730781062
Episode: 21000, Epsilon: 0.12243132578887629
Episode: 22000, Epsilon: 0.11077989077575923
Episode

## 4. Test de l'agent entraîné

In [4]:
n_test_episodes = 100
wins = 0
draws = 0
losses = 0

for episode in range(n_test_episodes):
    env = TicTacToe()
    state = env.reset()
    state_idx = int(''.join(map(str, state.astype(int) + 1)), 3)
    done = False

    while not done:
        # Choose the best action according to the Q-table
        action = np.argmax(q_table[state_idx])

        # Take the action and get the next state, reward, and done flag
        next_state, reward, done = env.step(np.unravel_index(action, (3, 3)), 1)

        # Update the state and state_idx
        state = next_state
        state_idx = int(''.join(map(str, state.astype(int) + 1)), 3)

        if not done:
            # Random opponent's turn
            opponent_action = np.random.choice(len(env.available_actions()))
            next_state, _, done = env.step(env.available_actions()[opponent_action], -1)

            # Update the state and state_idx
            state = next_state
            state_idx = int(''.join(map(str, state.astype(int) + 1)), 3)

    if reward == 1:
        wins += 1
    elif reward == 0:
        draws += 1
    else:
        losses += 1

print(f"Wins: {wins}, Draws: {draws}, Losses: {losses}")


Wins: 0, Draws: 100, Losses: 0


## 5. Jouer contre l'agent

In [5]:
# fonction d'affichage du plateau de jeu
def render(board):
    symbols = {0: ' ', 1: 'X', -1: 'O'}
    print('---------')
    for i in range(3):
        print('|', end='')
        for j in range(3):
            print(symbols[board[i, j]], end='|')
        print('\n---------')


In [6]:
def check_winner(board):
    for player in [1, -1]:
        if np.any(np.sum(board, axis=0) == 3 * player) or np.any(np.sum(board, axis=1) == 3 * player) or np.sum(np.diagonal(board)) == 3 * player or np.sum(np.diagonal(np.fliplr(board))) == 3 * player:
            return player
    return None

def valid_move(board, action):
    return board[np.unravel_index(action, (3, 3))] == 0

env = TicTacToe()
state = env.reset()
done = False

while not done:
    # Agent's turn
    state_idx = int(''.join(map(str, state.astype(int) + 1)), 3)
    
    # Make sure the agent picks a valid move
    valid_actions = np.where(q_table[state_idx] != 0)[0]
    
    if valid_actions.size > 0:
        action = valid_actions[np.argmax(q_table[state_idx, valid_actions])]
    else:
        action = np.random.choice(np.ravel_multi_index(env.available_actions().T, (3, 3)))
    
    if not valid_move(env.board, action):
        action = np.random.choice(np.ravel_multi_index(env.available_actions().T, (3, 3)))
    
    state, _, done = env.step(np.unravel_index(action, (3, 3)), 1)
    
    render(env.board)
    
    winner = check_winner(env.board)
    if winner is not None:
        break
    
    if not done:
        # Player's turn
        player_action = int(input("Enter your move (0-8): "))
        state, _, done = env.step(np.unravel_index(player_action, (3, 3)), -1)

        render(env.board)

        winner = check_winner(env.board)
        if winner is not None:
            break

# Display the game result
if winner == 1:
    print("You lost!")
elif winner == -1:
    print("You won!")
else:
    print("It's a draw!")


---------
|X| | |
---------
| | | |
---------
| | | |
---------
Enter your move (0-8): 1
---------
|X|O| |
---------
| | | |
---------
| | | |
---------
---------
|X|O| |
---------
| | | |
---------
|X| | |
---------
Enter your move (0-8): 3
---------
|X|O| |
---------
|O| | |
---------
|X| | |
---------
---------
|X|O| |
---------
|O| | |
---------
|X| |X|
---------
Enter your move (0-8): 7
---------
|X|O| |
---------
|O| | |
---------
|X|O|X|
---------
---------
|X|O| |
---------
|O| |X|
---------
|X|O|X|
---------
Enter your move (0-8): 2
---------
|X|O|O|
---------
|O| |X|
---------
|X|O|X|
---------
---------
|X|O|O|
---------
|O|X|X|
---------
|X|O|X|
---------
You lost!
