Creating Connect-4 AIs using Reinforcement Learning

This file contains two models, Model 1 and Model 2. Here are their key differences:

Architecture:
Model 1 does not use DQN, while Model 2 introduces a Dueling DQN, which separates value and advantage streams, leading to more efficient learning.

Training Enhancements:
Model 2 explicitly uses target network updates for better training stability. Model 1 has no target network: it bootstraps its targets from the same network it is training.

Exploration Strategy:
Model 2 formalizes epsilon-greedy exploration with structured decay, allowing smoother transitions between exploration and exploitation.

In [None]:
import torch
import random
import torch.optim as optim
import torch.nn as nn
from collections import deque
import numpy as np

In [None]:
# Constants

# Board geometry: number of rows and columns of the Connect-4 grid.
BOARD_ROWS = 6
BOARD_COLS = 7

# Replay-buffer and target-network settings.
BUFFER_SIZE = 10000
BATCH_SIZE = 64
UPDATE_TARGET_FREQUENCY = 100

# Reward-shaping values.
ALIGN_THREE_REWARD = 0.5
BLOCK_OPPONENT_REWARD = 1.0
WIN_REWARD = 10.0
LOSS_PENALTY = -10.0
DRAW_REWARD = 0.5
DRAW_PENALTY = -1

Create and Play against Model #1

In [None]:
# Model 1 Environment
class Connect4Env:
    """Connect-4 environment used by Model 1.

    The board is a 6x7 integer array: 0 marks an empty cell and 1 / 2 mark the
    two players. Row 0 is the top row, so a dropped piece settles in the
    highest-index empty row of its column.
    """

    def __init__(self):
        self.board = np.zeros((6, 7), dtype=int)
        self.current_player = 1

    def reset(self):
        """Clear the board, give the move to player 1 and return the board."""
        self.board = np.zeros((6, 7), dtype=int)
        self.current_player = 1
        return self.board

    def _landing_row(self, col):
        """Return the row in which a piece dropped into `col` comes to rest."""
        return np.max(np.where(self.board[:, col] == 0))

    def _has_four(self, row, col, player):
        """Return True if `player` has four or more in a line through (row, col).

        The cell (row, col) itself is assumed to belong to `player`.
        """
        # Horizontal, vertical and the two diagonal orientations.
        for dr, dc in ((0, 1), (1, 0), (1, 1), (1, -1)):
            count = 1
            # Walk outwards from the cell in both senses of the orientation.
            for sign in (1, -1):
                r, c = row + dr * sign, col + dc * sign
                while 0 <= r < 6 and 0 <= c < 7 and self.board[r, c] == player:
                    count += 1
                    r += dr * sign
                    c += dc * sign
            if count >= 4:
                return True
        return False

    def step(self, action):
        """Drop the current player's piece into column `action`.

        Returns:
            (board, reward, done). The turn passes to the other player only
            when the game is not over.

        Raises:
            ValueError: if the column is already full.
        """
        if self.board[0, action] != 0:
            raise ValueError("Column is full!")

        row = self._landing_row(action)
        self.board[row, action] = self.current_player

        done, reward = self.check_winner(row, action)
        if not done:
            self.current_player = 3 - self.current_player
        return self.board, reward, done

    def check_winner(self, row, col, player=None):
        """Evaluate the position after a piece was placed at (row, col).

        Args:
            row, col: coordinates of the piece that was just placed.
            player: owner of that piece; defaults to `self.current_player`.

        Returns:
            (done, reward): (True, 10) for four in a row, (True, 5) when the
            board is full without a winner, otherwise (False, 0).
        """
        if player is None:
            player = self.current_player
        if self._has_four(row, col, player):
            return True, 10
        if np.all(self.board != 0):
            return True, 5
        return False, 0

    def available_actions(self):
        """Return the columns that still have room for a piece."""
        return [col for col in range(7) if self.board[0, col] == 0]

    def opponent_about_to_win(self):
        """Report whether the opponent could complete four in a row next move.

        Returns:
            (True, 5) if some column gives the opponent an immediate win,
            otherwise (False, 0). The board is left unchanged.
        """
        opponent = 3 - self.current_player
        for col in self.available_actions():
            row = self._landing_row(col)
            # Try the move for the opponent, then undo it. The line check must
            # count the opponent's pieces, not the current player's.
            self.board[row, col] = opponent
            wins = self._has_four(row, col, opponent)
            self.board[row, col] = 0
            if wins:
                return True, 5
        return False, 0


In [None]:
# Model 1 Agent
class DQNAgent(nn.Module):
    """Fully connected Q-network: 42 board cells in, one Q-value per column out."""

    def __init__(self):
        super().__init__()
        # Layer names are part of the state_dict keys, so they are kept as-is.
        self.fc1 = nn.Linear(42, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 7)

    def forward(self, x):
        """Return Q-values of shape (batch, 7) for input viewable as (batch, 42)."""
        hidden = x.view(-1, 42)
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        # The output layer is linear: Q-values are unbounded.
        return self.fc3(hidden)

In [None]:
# Replay Buffer for all Models
class ReplayBuffer:
    """Bounded store of experiences with uniform sampling without replacement."""

    def __init__(self, size):
        # A deque with maxlen drops its oldest entry once `size` items are held.
        self.buffer = deque(maxlen=size)

    def __len__(self):
        return len(self.buffer)

    def push(self, experience):
        """Append one experience to the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return `batch_size` distinct stored experiences chosen at random.

        np.random.choice raises ValueError when more items are requested than
        are stored.
        """
        stored = self.buffer
        picked = np.random.choice(len(stored), batch_size, replace=False)
        return [stored[position] for position in picked]

In [None]:
# Model 1 Train
def train(episodes=1000, gamma=0.9, epsilon=0.1, lr=0.001):
    """Train the Model 1 Q-network by letting it play both sides of the game.

    Each move is followed by a single online TD(0) update; no replay buffer or
    target network is involved.

    Args:
        episodes: number of games to play.
        gamma: discount factor applied to the bootstrapped next-state value.
        epsilon: probability of playing a random legal column.
        lr: Adam learning rate.

    Returns:
        The trained DQNAgent network.
    """
    agent = DQNAgent()
    env = Connect4Env()
    optimizer = optim.Adam(agent.parameters(), lr=lr)
    criterion = nn.MSELoss()

    for episode in range(episodes):
        state = torch.tensor(env.reset(), dtype=torch.float32).flatten()
        done = False

        while not done:
            q_values = agent(state.unsqueeze(0))
            available_actions = env.available_actions()

            if np.random.rand() < epsilon:
                action = int(np.random.choice(available_actions))
            else:
                # Full columns get -inf so the greedy choice is always legal.
                masked_q_values = torch.full((7,), float('-inf'))
                masked_q_values[available_actions] = q_values[0, available_actions].detach()
                action = torch.argmax(masked_q_values).item()

            next_state, reward, done = env.step(action)
            next_state = torch.tensor(next_state, dtype=torch.float32).flatten()

            if done:
                target_q_value = float(reward)
            else:
                with torch.no_grad():
                    next_q_values = agent(next_state.unsqueeze(0))[0]
                # Bootstrap only from columns that can still be played; a
                # non-terminal board always has at least one.
                best_next = next_q_values[env.available_actions()].max().item()
                target_q_value = reward + gamma * best_next

            loss = criterion(q_values[0, action], torch.tensor(target_q_value, dtype=torch.float32))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Advance the observation; otherwise every decision and update
            # would be made from the empty starting board.
            state = next_state

        if (episode + 1) % 100 == 0:
            print(f"Episode {episode + 1}/{episodes} completed.")

    return agent


In [None]:
# Train Model 1: call train() with its defaults and keep the returned network in `agent`.
agent = train()
print("done")

Episode 100/1000 completed.
Episode 200/1000 completed.
Episode 300/1000 completed.
Episode 400/1000 completed.
Episode 500/1000 completed.
Episode 600/1000 completed.
Episode 700/1000 completed.
Episode 800/1000 completed.
Episode 900/1000 completed.
Episode 1000/1000 completed.
done


In [None]:
# Model 1 Play Game Function
def play_game(agent):
    """Play an interactive console game: the human is player 1, `agent` is player 2.

    Args:
        agent: callable network mapping a float board tensor to seven
            Q-values (one per column).
    """
    env = Connect4Env()
    env.reset()
    done = False

    print("Welcome to Connect 4! You're Player 1, and the AI is Player 2.\n")

    while not done:
        print("Current board:")
        print(env.board)

        valid_move = False
        while not valid_move:
            try:
                col = int(input("Enter the column (0-6) where you want to drop your piece: "))
                if col in env.available_actions():
                    valid_move = True
                else:
                    print("Column is full or invalid. Choose another column.")
            except ValueError:
                print("Invalid input. Please enter an integer between 0 and 6.")

        state, reward, done = env.step(col)
        if done:
            # The environment signals a win with WIN_REWARD; any other
            # terminal reward means the board filled up without a winner.
            print("You won!" if reward == WIN_REWARD else "It's a draw!")
            break

        state = torch.tensor(state, dtype=torch.float32)
        with torch.no_grad():
            # The network returns shape (1, 7); flatten to index by column.
            q_values = agent(state).flatten()

        # Mask full columns so the greedy pick is always a legal move.
        legal_columns = env.available_actions()
        masked_q_values = torch.full((7,), float('-inf'))
        masked_q_values[legal_columns] = q_values[legal_columns]
        action = torch.argmax(masked_q_values).item()

        state, reward, done = env.step(action)
        print(f"AI placed its piece in column {action}.")

        if done:
            print("AI won!" if reward == WIN_REWARD else "It's a draw!")
            break

    print("Final board:")
    print(env.board)


In [None]:
# Play an interactive game against Model 1 (the network currently bound to `agent`).
play_game(agent)

Welcome to Connect 4! You're Player 1, and the AI is Player 2.

Current board:
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
Enter the column (0-6) where you want to drop your piece: 3
AI placed its piece in column 2.
Current board:
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 2 1 0 0 0]]
Enter the column (0-6) where you want to drop your piece: 3
AI placed its piece in column 2.
Current board:
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 2 1 0 0 0]
 [0 0 2 1 0 0 0]]
Enter the column (0-6) where you want to drop your piece: 3
AI placed its piece in column 2.
Current board:
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 2 1 0 0 0]
 [0 0 2 1 0 0 0]
 [0 0 2 1 0 0 0]]
Enter the column (0-6) where you want to drop your piece: 3
You won!
Final board:
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0]
 [0 0 2 1 0 0 0]
 [0 0 2 1 0 0 0]
 [0 0 2 1 0 0 0]]


In [None]:
# Model 2 Environment
class Connect4Env:
    """Connect-4 environment for Model 2: players are +1 and -1, empty cells are 0."""

    def __init__(self):
        self.board = np.zeros((6, 7), dtype=int)
        self.current_player = 1

    def reset(self):
        """Start a fresh game with player 1 to move and return the board."""
        self.current_player = 1
        self.board = np.zeros((6, 7), dtype=int)
        return self.board

    def step(self, action):
        """Drop the current player's piece into column `action`.

        Returns (board, reward, done). A full column ends the game with
        reward -1 and leaves the board and the player to move untouched.
        """
        open_rows = [r for r in range(6) if self.board[r, action] == 0]
        if not open_rows:
            return self.board, -1, True

        # The piece settles in the lowest empty cell (largest row index).
        self.board[open_rows[-1], action] = self.current_player
        reward, done = self.check_game_status()
        self.current_player = -self.current_player
        return self.board, reward, done

    def check_game_status(self):
        """Return (reward, done): (10, True) on four in a row, (0, True) on a full board."""
        grid = self.board
        # (row step, column step, start rows, start columns) for each orientation.
        line_specs = (
            (0, 1, range(6), range(4)),      # horizontal
            (1, 0, range(3), range(7)),      # vertical
            (1, 1, range(3), range(4)),      # diagonal, down-right
            (1, -1, range(3), range(3, 7)),  # diagonal, down-left
        )
        for d_row, d_col, start_rows, start_cols in line_specs:
            for r0 in start_rows:
                for c0 in start_cols:
                    line_total = sum(grid[r0 + k * d_row, c0 + k * d_col] for k in range(4))
                    # Four equal +1 / -1 pieces are the only way to reach +/-4.
                    if abs(line_total) == 4:
                        return 10, True

        if np.any(grid == 0):
            return 0, False
        return 0, True

    def get_valid_actions(self):
        """Return the columns whose top cell is still empty."""
        return [col for col in range(7) if self.board[0, col] == 0]

In [None]:
# # Model 2 Dueling DQN Agent
class DuelingDQN(nn.Module):
    """Dueling Q-network with separate state-value and action-advantage heads.

    Args:
        input_dim: size of the flattened state vector.
        output_dim: number of actions.
    """

    def __init__(self, input_dim, output_dim):
        super(DuelingDQN, self).__init__()
        self.feature_layer = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU()
        )
        self.value_stream = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        self.advantage_stream = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, output_dim)
        )

    def forward(self, state):
        """Return Q-values with the same leading shape as `state`.

        Accepts a single state of shape (input_dim,) or a batch of shape
        (batch, input_dim).
        """
        features = self.feature_layer(state)
        value = self.value_stream(features)
        advantages = self.advantage_stream(features)
        # Centre the advantages per state, over the action axis only. A mean
        # over the whole tensor would mix different states of a batch and make
        # each Q-value depend on the other samples.
        q_values = value + (advantages - advantages.mean(dim=-1, keepdim=True))
        return q_values

In [None]:
# Model 2 Agent
class DQNAgent:
    """Epsilon-greedy agent for Model 2 with an online and a target DuelingDQN.

    Args:
        state_size: length of the flattened state vector.
        action_size: number of actions (columns).
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=10000)
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.1
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.batch_size = 64

        self.model = DuelingDQN(state_size, action_size)
        self.target_model = DuelingDQN(state_size, action_size)
        self.update_target_model()
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        # Created once instead of on every replay step.
        self.loss_fn = nn.MSELoss()

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())

    def act(self, state):
        """Return a column index: random with probability epsilon, else greedy.

        The choice is not restricted to playable columns; callers must check
        the result against the environment's valid actions.
        """
        if np.random.rand() <= self.epsilon:
            return random.choice(range(self.action_size))
        state = torch.FloatTensor(state).unsqueeze(0)
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            act_values = self.model(state)
        return torch.argmax(act_values).item()

    def remember(self, state, action, reward, next_state, done):
        """Store one transition in the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        """Fit the online network on one random minibatch, then decay epsilon.

        Does nothing until the memory holds at least `batch_size` transitions.
        Each sampled transition gets its own optimizer step.
        """
        if len(self.memory) < self.batch_size:
            return

        minibatch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, done in minibatch:
            state = torch.FloatTensor(state)

            target = reward
            if not done:
                # The bootstrap value comes from the target network and must
                # not contribute gradients.
                with torch.no_grad():
                    next_q_values = self.target_model(torch.FloatTensor(next_state))
                target += self.gamma * torch.max(next_q_values).item()

            self.optimizer.zero_grad()
            output = self.model(state)[action]
            loss = self.loss_fn(output, torch.tensor(target, dtype=torch.float32))
            loss.backward()
            self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [None]:
# Model 2 Train Function
def train_dqn(episodes=1000):
    """Train a Model 2 agent that plays both sides of every game.

    After each game the agent replays one minibatch and the target network is
    synchronised with the online network.

    Args:
        episodes: number of games to play.

    Returns:
        The trained DQNAgent.
    """
    env = Connect4Env()
    agent = DQNAgent(state_size=42, action_size=7)

    for episode_index in range(episodes):
        observation = env.reset().flatten()
        episode_return = 0
        finished = False

        while not finished:
            chosen = agent.act(observation)
            legal = env.get_valid_actions()
            # The agent does not know about full columns, so an illegal pick
            # is replaced by a random legal column.
            if chosen not in legal:
                chosen = random.choice(legal)

            board, reward, finished = env.step(chosen)
            successor = board.flatten()

            agent.remember(observation, chosen, reward, successor, finished)
            episode_return += reward
            observation = successor

        agent.replay()
        agent.update_target_model()

        print(f"Episode {episode_index+1}/{episodes}, Total Reward: {episode_return}, Epsilon: {agent.epsilon:.2f}")

    return agent


In [None]:
# Train Model 2 with the default number of episodes; rebinds `agent` to the result.
agent = train_dqn()

Episode 1/1000, Total Reward: 10, Epsilon: 1.00
Episode 2/1000, Total Reward: 10, Epsilon: 1.00
Episode 3/1000, Total Reward: 10, Epsilon: 1.00
Episode 4/1000, Total Reward: 10, Epsilon: 0.99
Episode 5/1000, Total Reward: 10, Epsilon: 0.99
Episode 6/1000, Total Reward: 10, Epsilon: 0.99
Episode 7/1000, Total Reward: 10, Epsilon: 0.98
Episode 8/1000, Total Reward: 10, Epsilon: 0.98
Episode 9/1000, Total Reward: 10, Epsilon: 0.97
Episode 10/1000, Total Reward: 10, Epsilon: 0.97
Episode 11/1000, Total Reward: 10, Epsilon: 0.96
Episode 12/1000, Total Reward: 10, Epsilon: 0.96
Episode 13/1000, Total Reward: 10, Epsilon: 0.95
Episode 14/1000, Total Reward: 10, Epsilon: 0.95
Episode 15/1000, Total Reward: 10, Epsilon: 0.94
Episode 16/1000, Total Reward: 10, Epsilon: 0.94
Episode 17/1000, Total Reward: 10, Epsilon: 0.93
Episode 18/1000, Total Reward: 10, Epsilon: 0.93
Episode 19/1000, Total Reward: 10, Epsilon: 0.92
Episode 20/1000, Total Reward: 10, Epsilon: 0.92
Episode 21/1000, Total Reward

In [None]:
# Model 2 Play Game Function
def play_game(agent):
    """Play an interactive console game against a Model 2 agent.

    The human is player 1 and moves first. The agent is player -1 and always
    plays its highest-valued legal column, with no exploration.

    Args:
        agent: object exposing a `model` network that maps a (1, 42) float
            tensor to seven Q-values.
    """
    env = Connect4Env()
    state = env.reset().flatten()
    done = False

    print("\nYou are Player 1 (1), and the Agent is Player -1 (-1).")

    while not done:
        print("\nCurrent Board:")
        print(env.board)

        # Capture whose turn it is now: env.step() flips current_player.
        human_turn = env.current_player == 1
        valid_actions = env.get_valid_actions()

        if human_turn:
            action = -1
            while action not in valid_actions:
                try:
                    action = int(input("Choose a column (0-6): "))
                except ValueError:
                    print("Invalid input. Please enter a number between 0 and 6.")
                    continue
                if action not in valid_actions:
                    print("Column is full or out of range. Choose another column.")
        else:
            # Query the network directly so that training-time exploration
            # cannot inject random moves, and mask full columns so the greedy
            # pick is always legal.
            with torch.no_grad():
                q_values = agent.model(torch.FloatTensor(state).unsqueeze(0))[0]
            masked_q_values = torch.full((len(q_values),), float('-inf'))
            masked_q_values[valid_actions] = q_values[valid_actions]
            action = torch.argmax(masked_q_values).item()
            print(f"Agent chooses column: {action}")

        next_state, reward, done = env.step(action)
        state = next_state.flatten()

        if done:
            print("\nFinal Board:")
            print(env.board)
            if reward == 10:
                print("You win!" if human_turn else "Agent wins!")
            elif reward == 0:
                print("It's a draw!")

In [None]:
# Play an interactive game against Model 2 (the agent currently bound to `agent`).
play_game(agent)


You are Player 1 (1), and the Agent is Player -1 (-1).

Current Board:
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
Choose a column (0-6): 2

Current Board:
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0]]
Agent chooses column: 2

Current Board:
[[ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0 -1  0  0  0  0]
 [ 0  0  1  0  0  0  0]]
Choose a column (0-6): 3

Current Board:
[[ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0 -1  0  0  0  0]
 [ 0  0  1  1  0  0  0]]
Agent chooses column: 2

Current Board:
[[ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0 -1  0  0  0  0]
 [ 0  0 -1  0  0  0  0]
 [ 0  0  1  1  0  0  0]]
Choose a column (0-6): 2

Current Board:
[[ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  1  0  0  0  0]
 [ 0  0 -1  

Both models that learn via reinforcement learning have a tendency to stack pieces repeatedly in the same column. This is likely because they learn by playing against random moves, which make no effort to break this pattern. Additionally, even if a random move does happen to block the algorithm, it is unlikely that the opponent will win or block again before the AI gets 4 in a row. As such, the AI needs either a different incentive in order to learn better, or a better opponent that will block a 4 in a row when one is threatened.

Model 3:

Learning by playing against a smarter bot.

Changes to the opposing bot:
  - Will make a move to block the opponent's 3 in a row
  - Will make a move to win if available
  - If neither are available it will do a random move

In [None]:
class Connect4Env:
    """Connect-4 environment for Models 3 and 4 (players +1 / -1, 0 = empty)."""

    def __init__(self):
        self.rows = 6
        self.cols = 7
        self.board = np.zeros((self.rows, self.cols), dtype=int)
        self.current_player = 1  # Player 1 starts

    def reset(self):
        """Empty the board, give the move to player 1 and return the board."""
        self.current_player = 1
        self.board = np.zeros((self.rows, self.cols), dtype=int)
        return self.board

    def step(self, action, player=None):
        """Drop a piece into column `action` and return (board, reward, done).

        The piece belongs to `player` when given, otherwise to the player to
        move. The turn indicator flips in either case. Reward is 10 for a
        winning move and 0 otherwise. Raises ValueError if the column is full.
        """
        mover = player if player else self.current_player
        open_rows = [r for r in range(self.rows) if self.board[r, action] == 0]
        if not open_rows:
            raise ValueError("Invalid action: Column is full.")

        # Pieces fall to the lowest empty cell, i.e. the largest row index.
        self.board[open_rows[-1], action] = mover
        self.current_player *= -1

        if self.check_win(mover):
            return self.board, 10, True
        board_is_full = bool(np.all(self.board != 0))
        return self.board, 0, board_is_full

    def get_valid_actions(self):
        """List the columns whose top cell is still empty."""
        return [col for col in range(self.cols) if self.board[0, col] == 0]

    def check_win(self, player):
        """Return True if `player` owns four connected cells anywhere on the board."""
        # Right, down, down-right and down-left cover every line exactly once.
        offsets = ((0, 1), (1, 0), (1, 1), (1, -1))
        for row in range(self.rows):
            for col in range(self.cols):
                for d_row, d_col in offsets:
                    end_row, end_col = row + 3 * d_row, col + 3 * d_col
                    if not (0 <= end_row < self.rows and 0 <= end_col < self.cols):
                        continue  # the four-cell line would leave the board
                    if all(
                        self.board[row + k * d_row, col + k * d_col] == player
                        for k in range(4)
                    ):
                        return True
        return False

    def clone(self):
        """Return an independent copy of the board and the turn indicator."""
        twin = Connect4Env()
        twin.board = self.board.copy()
        twin.current_player = self.current_player
        return twin


In [None]:
class RLAgent:
    """Tabular Q-learning agent for a 6x7 Connect-4 board.

    States are boards whose cells are 0 (empty), +1 or -1. Everything the
    agent needs (legal columns, win detection) is derived from the state it
    is handed, so it does not depend on any global environment object.

    Args:
        player_id: the piece value (+1 or -1) this agent plays.
        epsilon: probability of choosing a random legal column.
        alpha: learning rate of the Q-update.
        gamma: discount factor.
    """

    ROWS = 6
    COLS = 7

    def __init__(self, player_id, epsilon=0.1, alpha=0.5, gamma=0.9):
        self.player_id = player_id
        self.q_table = {}
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma

    def state_to_key(self, state):
        """Convert state to a hashable key for Q-table."""
        return tuple(state.flatten())

    def _valid_actions(self, state):
        """Return the columns of `state` whose top cell is empty."""
        cells = state.flatten()
        # Row 0 is the top row, so its cells are the first COLS entries.
        return [col for col in range(self.COLS) if cells[col] == 0]

    def _has_won(self, state, player):
        """Return True if `player` has four in a row anywhere in `state`."""
        cells = state.flatten()
        for row in range(self.ROWS):
            for col in range(self.COLS):
                for d_row, d_col in ((0, 1), (1, 0), (1, 1), (1, -1)):
                    end_row, end_col = row + 3 * d_row, col + 3 * d_col
                    if not (0 <= end_row < self.ROWS and 0 <= end_col < self.COLS):
                        continue
                    if all(
                        cells[(row + k * d_row) * self.COLS + col + k * d_col] == player
                        for k in range(4)
                    ):
                        return True
        return False

    def act(self, state):
        """Choose a legal column with an epsilon-greedy policy.

        Unknown states, and exploration steps, get a uniformly random legal
        column; otherwise the legal column with the highest Q-value is returned.
        """
        key = self.state_to_key(state)
        valid_actions = self._valid_actions(state)
        known = self.q_table.get(key)
        if random.random() < self.epsilon or not known:
            return random.choice(valid_actions)
        # Only consider columns that are playable in this state.
        candidates = [a for a in valid_actions if a in known]
        if not candidates:
            return random.choice(valid_actions)
        return max(candidates, key=known.get)

    def update(self, state, action, reward, next_state, done):
        """Apply one Q-learning update for the transition (state, action).

        The reward is shaped with +5 if this agent has four in a row in
        `next_state` and -5 if the other player does.
        """
        key = self.state_to_key(state)
        next_key = self.state_to_key(next_state)

        # Each Q-row must list the columns that are legal in *that* state.
        # Using the post-move board for `state` would omit a column the move
        # just filled and crash the update below.
        if key not in self.q_table:
            self.q_table[key] = {a: 0 for a in self._valid_actions(state)}
        self.q_table[key].setdefault(action, 0)

        if next_key not in self.q_table:
            self.q_table[next_key] = {a: 0 for a in self._valid_actions(next_state)}

        reward += 5 if self._has_won(next_state, self.player_id) else 0
        reward -= 5 if self._has_won(next_state, -self.player_id) else 0

        # Terminal states have no future value; the default guards a full board.
        next_max = 0 if done else max(self.q_table[next_key].values(), default=0)
        self.q_table[key][action] += self.alpha * (
            reward + self.gamma * next_max - self.q_table[key][action]
        )

    def decay_epsilon(self):
        """Reduce exploration over time."""
        self.epsilon = max(0.01, self.epsilon * 0.995)


In [None]:
class HeuristicOpponent:
    """Rule-based player: win now if possible, else block, else move at random."""

    def __init__(self, player_id):
        self.player_id = player_id

    def choose_action(self, env):
        """Pick a column for the current position of `env`.

        Every candidate move is tried on a clone, so `env` itself is never
        modified.
        """
        candidates = env.get_valid_actions()
        # First pass: a column that wins for us. Second pass: the column the
        # other player would win with, which we take to block it.
        for mover in (self.player_id, -self.player_id):
            for column in candidates:
                trial = env.clone()
                trial.step(column, mover)
                if trial.check_win(mover):
                    return column

        return random.choice(candidates)

In [None]:
def train_with_heuristic_opponent(agent, env, heuristic_opponent, episodes=1000):
    """Train `agent` (moving first) against `heuristic_opponent`.

    The opponent's reply is treated as part of the environment: one agent
    transition runs from the board the agent moved on to the board it sees
    after the reply. That way a reply that ends the game is learned as a loss.

    Args:
        agent: learner exposing act(), update(), decay_epsilon() and epsilon.
        env: environment exposing reset(), step() and get_valid_actions().
        heuristic_opponent: object exposing choose_action(env).
        episodes: number of games to play.

    Returns:
        List with the agent's total reward for each episode.
    """
    rewards = []

    for episode in range(episodes):
        state = env.reset().flatten()
        total_reward = 0
        done = False

        while not done:
            action = agent.act(state)
            valid_actions = env.get_valid_actions()
            if action not in valid_actions:
                action = random.choice(valid_actions)
            next_board, reward, done = env.step(action)

            if not done:
                reply = heuristic_opponent.choose_action(env)
                next_board, opponent_reward, done = env.step(reply)
                # Whatever the opponent gained with its reply, the agent lost.
                reward = -opponent_reward

            next_state = next_board.flatten()
            agent.update(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward

        rewards.append(total_reward)

        agent.decay_epsilon()

        if (episode + 1) % 100 == 0:
            print(f"Episode {episode + 1}/{episodes}, Total Reward: {total_reward}, Epsilon: {agent.epsilon:.2f}")

    return rewards


In [None]:
# Model 3 setup: the learner plays as +1, the rule-based opponent as -1.
env = Connect4Env()
agent = RLAgent(player_id=1)
heuristic_opponent = HeuristicOpponent(player_id=-1)

# Train for 1000 games; the returned per-episode reward list is discarded here.
print("\nTraining the RL agent against the heuristic opponent...")
train_with_heuristic_opponent(agent, env, heuristic_opponent, episodes=1000)
print("done")


Training the RL agent against the heuristic opponent...
Episode 100/1000, Total Reward: 0, Epsilon: 0.06
Episode 200/1000, Total Reward: 0, Epsilon: 0.04
Episode 300/1000, Total Reward: 0, Epsilon: 0.02
Episode 400/1000, Total Reward: 0, Epsilon: 0.01
Episode 500/1000, Total Reward: 0, Epsilon: 0.01
Episode 600/1000, Total Reward: 0, Epsilon: 0.01
Episode 700/1000, Total Reward: 0, Epsilon: 0.01
Episode 800/1000, Total Reward: 0, Epsilon: 0.01
Episode 900/1000, Total Reward: 0, Epsilon: 0.01
Episode 1000/1000, Total Reward: 0, Epsilon: 0.01
done


In [None]:
def play_against_trained_bot(agent, env):
    """Play an interactive console game: the human is player 1, the bot player -1.

    Args:
        agent: trained agent exposing act(state) and an `epsilon` attribute.
        env: environment exposing reset(), step(), get_valid_actions(),
            board and current_player.
    """
    # NOTE(review): the training cell in this file creates the agent with
    # player_id=1 (moving first), while here it moves second. Confirm its
    # Q-table covers the positions it meets in this role.
    print("\nYou are Player 1 (1), and the RL Bot is Player -1 (-1).")
    state = env.reset().flatten()
    done = False

    # Play without exploration; the training value is restored afterwards.
    trained_epsilon = agent.epsilon
    agent.epsilon = 0.0
    try:
        while not done:
            print("\nCurrent Board:")
            print(env.board)

            # Capture whose turn it is now: env.step() flips current_player.
            human_turn = env.current_player == 1
            valid_actions = env.get_valid_actions()

            if human_turn:
                action = -1
                while action not in valid_actions:
                    try:
                        action = int(input("Choose a column (0-6): "))
                    except ValueError:
                        print("Invalid input. Please enter a number between 0 and 6.")
                        continue
                    if action not in valid_actions:
                        print("Column is full or out of range. Choose another column.")
            else:
                action = agent.act(state)
                # A stale or illegal choice would make env.step() raise, so
                # fall back to a random legal column as the training loops do.
                if action not in valid_actions:
                    action = random.choice(valid_actions)
                print(f"RL Bot chooses column: {action}")

            next_state, reward, done = env.step(action)
            state = next_state.flatten()

            if done:
                print("\nFinal Board:")
                print(env.board)
                if reward == 10:
                    print("You win!" if human_turn else "RL Bot wins!")
                elif reward == 0:
                    print("It's a draw!")
    finally:
        agent.epsilon = trained_epsilon


In [None]:
# Play an interactive game against the agent currently bound to `agent`.
play_against_trained_bot(agent, env)


You are Player 1 (1), and the RL Bot is Player -1 (-1).

Current Board:
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
Choose a column (0-6): 3

Current Board:
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0]]
RL Bot chooses column: 0

Current Board:
[[ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [-1  0  0  1  0  0  0]]
Choose a column (0-6): 3

Current Board:
[[ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  1  0  0  0]
 [-1  0  0  1  0  0  0]]
RL Bot chooses column: 5

Current Board:
[[ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  1  0  0  0]
 [-1  0  0  1  0 -1  0]]
Choose a column (0-6): 3

Current Board:
[[ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  

Model 4: Self Play

Instead of having the bot play against a heuristic model or random moves, it will play against itself.

In [None]:
# Training the RL agent via self-play
def train_with_self_play(agent, env, episodes=1000):
    """Train `agent` by letting it choose the moves of both players.

    Every move, whichever side makes it, is followed by an agent update.

    Args:
        agent: learner exposing act(), update(), decay_epsilon() and epsilon.
        env: environment exposing reset(), step() and get_valid_actions().
        episodes: number of games to play.

    Returns:
        List with the summed step rewards of each episode.
    """
    rewards = []

    for episode_number in range(1, episodes + 1):
        observation = env.reset().flatten()
        episode_return = 0
        finished = False

        while not finished:
            move = agent.act(observation)
            legal_moves = env.get_valid_actions()
            # Replace an unplayable choice with a random legal column.
            if move not in legal_moves:
                move = random.choice(legal_moves)

            board, reward, finished = env.step(move)
            successor = board.flatten()
            agent.update(observation, move, reward, successor, finished)
            episode_return += reward
            observation = successor

        rewards.append(episode_return)
        agent.decay_epsilon()

        if episode_number % 100 == 0:
            print(f"Episode {episode_number}/{episodes}, Total Reward: {episode_return}, Epsilon: {agent.epsilon:.2f}")

    return rewards


In [None]:
# Play against the trained bot
def play_against_trained_self_play_bot(agent, env):
    """Play an interactive console game against the self-play-trained bot.

    The human is player 1 and moves first; the bot is player -1.

    Args:
        agent: trained agent exposing act(state) and an `epsilon` attribute.
        env: environment exposing reset(), step(), get_valid_actions(),
            board and current_player.
    """
    print("\nYou are Player 1 (1), and the RL Bot is Player -1 (-1).")
    state = env.reset().flatten()
    done = False

    # Switch exploration off for the match and restore it when the game ends.
    saved_epsilon = agent.epsilon
    agent.epsilon = 0.0
    try:
        while not done:
            print("\nCurrent Board:")
            print(env.board)

            # Record the mover before env.step() flips current_player.
            human_turn = env.current_player == 1
            valid_actions = env.get_valid_actions()

            if human_turn:
                action = -1
                while action not in valid_actions:
                    try:
                        action = int(input("Choose a column (0-6): "))
                    except ValueError:
                        print("Invalid input. Please enter a number between 0 and 6.")
                        continue
                    if action not in valid_actions:
                        print("Column is full or out of range. Choose another column.")
            else:
                action = agent.act(state)
                # env.step() raises on a full column, so an illegal choice is
                # replaced by a random legal one, as in the training loop.
                if action not in valid_actions:
                    action = random.choice(valid_actions)
                print(f"RL Bot chooses column: {action}")

            next_state, reward, done = env.step(action)
            state = next_state.flatten()

            if done:
                print("\nFinal Board:")
                print(env.board)
                if reward == 10:
                    print("You win!" if human_turn else "RL Bot wins!")
                elif reward == 0:
                    print("It's a draw!")
    finally:
        agent.epsilon = saved_epsilon

In [None]:
# Model 4 setup: a fresh environment and a fresh agent (replacing the Model 3 objects).
env = Connect4Env()
agent = RLAgent(player_id=1)

# Train for 1000 self-play games; the returned reward list is discarded here.
print("\nTraining the RL agent via self-play...")
train_with_self_play(agent, env, episodes=1000)
print("done")


Training the RL agent via self-play...
Episode 100/1000, Total Reward: 10, Epsilon: 0.06
Episode 200/1000, Total Reward: 10, Epsilon: 0.04
Episode 300/1000, Total Reward: 10, Epsilon: 0.02
Episode 400/1000, Total Reward: 10, Epsilon: 0.01
Episode 500/1000, Total Reward: 10, Epsilon: 0.01
Episode 600/1000, Total Reward: 10, Epsilon: 0.01
Episode 700/1000, Total Reward: 10, Epsilon: 0.01
Episode 800/1000, Total Reward: 10, Epsilon: 0.01
Episode 900/1000, Total Reward: 10, Epsilon: 0.01
Episode 1000/1000, Total Reward: 10, Epsilon: 0.01
done


In [None]:
# Play an interactive game against the agent currently bound to `agent`.
play_against_trained_self_play_bot(agent, env)


You are Player 1 (1), and the RL Bot is Player -1 (-1).

Current Board:
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
Choose a column (0-6): 3

Current Board:
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0]]
RL Bot chooses column: 0

Current Board:
[[ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [-1  0  0  1  0  0  0]]
Choose a column (0-6): 0

Current Board:
[[ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0]
 [-1  0  0  1  0  0  0]]
RL Bot chooses column: 0

Current Board:
[[ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [-1  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0]
 [-1  0  0  1  0  0  0]]
Choose a column (0-6): 3

Current Board:
[[ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [-1  0  