In [2]:
import torch
import random
import torch.optim as optim
import torch.nn as nn
from collections import deque
import numpy as np

In [9]:
# --- Board geometry (standard Connect 4: 6 rows by 7 columns) ---
BOARD_ROWS = 6
BOARD_COLS = 7

# --- Experience-replay / target-network settings ---
BUFFER_SIZE = 10_000           # Capacity of the replay buffer
BATCH_SIZE = 64                # Number of experiences per sampled batch
UPDATE_TARGET_FREQUENCY = 100  # How often to update the target network

# --- Reward shaping ---
ALIGN_THREE_REWARD = 0.5       # Reward for aligning three in a row
BLOCK_OPPONENT_REWARD = 1.0    # Reward for blocking an opponent's winning move
WIN_REWARD = 10.0              # Returned by the environment for a winning move
LOSS_PENALTY = -10.0
DRAW_REWARD = 0.5
DRAW_PENALTY = -1              # Returned by the environment when the board fills up

In [10]:
class Connect4Env:
    """Two-player Connect 4 board with a small reset/step interface.

    Cells of ``self.board`` hold 0 (empty), 1 or 2 (the owning player).
    Row 0 is the top of the board, so a dropped piece settles at the largest
    row index that is still empty in its column.
    """

    # (row step, column step) for the four line orientations through a cell:
    # vertical, horizontal, and the two diagonals.
    _DIRECTIONS = ((1, 0), (0, 1), (1, 1), (1, -1))

    def __init__(self):
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS), dtype=int)
        self.current_player = 1  # Player 1 always moves first

    def reset(self):
        """Clear the board in place, hand the move to player 1, return the board."""
        self.board.fill(0)
        self.current_player = 1
        return self.board

    def step(self, action):
        """Drop the current player's piece into column ``action``.

        Returns:
            (board, reward, done). ``board`` is the live ``self.board`` array,
            not a copy. The turn passes to the other player only when the game
            is not over.

        Raises:
            ValueError: if the chosen column is already full.
        """
        if self.board[0, action] != 0:
            raise ValueError("Column is full! Invalid move.")

        row = self._landing_row(self.board, action)
        self.board[row, action] = self.current_player

        done, reward = self.check_winner(row, action)
        if not done:
            # Players are 1 and 2, so 3 - p swaps them.
            self.current_player = 3 - self.current_player

        return self.board, reward, done

    def available_actions(self):
        """Return the list of column indices whose top cell is still empty."""
        return [col for col in range(BOARD_COLS) if self.board[0, col] == 0]

    def check_winner(self, row, col):
        """Evaluate the game after a piece was placed at (row, col).

        Returns:
            (True, WIN_REWARD) if that piece completes four in a line,
            (True, DRAW_PENALTY) if the board is full without a win,
            (False, 0) otherwise.
        """
        if self._completes_four(self.board, row, col):
            return True, WIN_REWARD  # Game won by the player who just moved

        if np.all(self.board != 0):
            return True, DRAW_PENALTY  # Board full: draw

        return False, 0

    def has_opponent_winning_move(self):
        """Return True if the opponent could win immediately with one drop.

        The opponent is ``3 - self.current_player``. The real board is never
        modified; each candidate move is tried on a copy.
        """
        opponent = 3 - self.current_player
        return any(self._would_win(opponent, col) for col in self.available_actions())

    def is_blocking_move(self, action):
        """Return True if playing column ``action`` denies the opponent a win.

        That is the case exactly when the opponent's piece, dropped into the
        same column, would complete four in a line. A full column blocks
        nothing and yields False.
        """
        return self._would_win(3 - self.current_player, action)

    def _would_win(self, player, col):
        """Return True if ``player`` dropping into ``col`` would make four in a line."""
        row = self._landing_row(self.board, col)
        if row is None:
            return False
        trial = self.board.copy()
        trial[row, col] = player
        # Test the copy, not self.board: on the real board the cell is empty.
        return self._completes_four(trial, row, col)

    @staticmethod
    def _landing_row(board, col):
        """Return the row a piece dropped into ``col`` would occupy, or None if full."""
        empty_rows = np.flatnonzero(board[:, col] == 0)
        return int(empty_rows[-1]) if empty_rows.size else None

    @staticmethod
    def _completes_four(board, row, col):
        """Return True if the piece at (row, col) of ``board`` lies in a run of 4+."""
        player = board[row, col]
        if player == 0:
            # An empty cell cannot win; without this guard runs of zeros would count.
            return False

        n_rows, n_cols = board.shape
        for dr, dc in Connect4Env._DIRECTIONS:
            run = 1  # the piece itself
            for sign in (1, -1):
                r, c = row + sign * dr, col + sign * dc
                while 0 <= r < n_rows and 0 <= c < n_cols and board[r, c] == player:
                    run += 1
                    r += sign * dr
                    c += sign * dc
            if run >= 4:
                return True
        return False

In [11]:
class DQNAgent(nn.Module):
    """Fully connected Q-network: flattened board in, one Q-value per column out.

    Args:
        input_size: number of board cells after flattening (default 42 = 6 * 7).
        hidden_size: width of the two hidden layers (default 128).
        num_actions: number of output Q-values (default 7, one per column).
    """

    def __init__(self, input_size=42, hidden_size=128, num_actions=7):
        super().__init__()
        self.input_size = input_size
        # Attribute names fc1/fc2/fc3 are kept so existing state_dicts still load.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_actions)

    def forward(self, x):
        """Return Q-values of shape [batch_size, num_actions].

        ``x`` may be a single board or a batch of boards; it is flattened to
        [batch_size, input_size]. ``reshape`` is used instead of ``view`` so
        that non-contiguous inputs (e.g. a transposed tensor) are accepted.
        """
        x = x.reshape(-1, self.input_size)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

In [12]:
# Experience Replay Buffer
class ReplayBuffer:
    """Bounded FIFO store of experiences with uniform random sampling.

    Once ``size`` items are stored, each new push evicts the oldest item.
    """

    def __init__(self, size):
        self.buffer = deque(maxlen=size)

    def __len__(self):
        return len(self.buffer)

    def push(self, experience):
        """Append one experience, dropping the oldest if the buffer is full."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored experiences chosen at random.

        Raises:
            ValueError: (from NumPy) if ``batch_size`` exceeds the number of
                stored experiences, since sampling is without replacement.
        """
        stored = self.buffer
        chosen = np.random.choice(len(stored), batch_size, replace=False)
        batch = []
        for position in chosen:
            batch.append(stored[position])
        return batch

In [13]:
def train(num_episodes=1000, learning_rate=0.001, gamma=0.99,
          epsilon=1.0, epsilon_min=0.1, epsilon_decay=0.995):
    """Train a DQNAgent on Connect 4 by self-play with one-step Q-learning.

    The same network chooses the moves of both players. After every move a
    single gradient step pulls Q(state, action) towards the one-step target.

    Args:
        num_episodes: number of self-play games to run.
        learning_rate: Adam learning rate.
        gamma: discount factor applied to the bootstrapped next-state value.
        epsilon: initial probability of picking a random legal move.
        epsilon_min: epsilon stops decaying once it is at or below this value.
        epsilon_decay: multiplicative decay applied to epsilon after each move.

    Returns:
        The trained DQNAgent.
    """
    env = Connect4Env()
    agent = DQNAgent()
    optimizer = optim.Adam(agent.parameters(), lr=learning_rate)
    criterion = torch.nn.MSELoss()
    neg_inf = torch.tensor(float('-inf'))

    def mask_illegal(q_values, legal_actions):
        """Return q_values with every column not in legal_actions set to -inf."""
        legal = torch.zeros(q_values.shape[-1], dtype=torch.bool)
        legal[torch.tensor(legal_actions, dtype=torch.long)] = True
        return torch.where(legal, q_values, neg_inf)

    for episode in range(num_episodes):

        if episode % 100 == 0:
            print(episode)

        state = env.reset()
        done = False

        while not done:
            # Q-values for the position faced by the player about to move
            state_tensor = torch.tensor(state, dtype=torch.float32)
            q_values = agent(state_tensor)
            available_actions = env.available_actions()

            # Epsilon-greedy selection restricted to columns that are not full
            if random.random() < epsilon:
                action = random.choice(available_actions)
            else:
                action = torch.argmax(mask_illegal(q_values, available_actions)).item()

            next_state, reward, done = env.step(action)

            if done:
                target_q_value = reward  # Terminal move: the reward is final
            else:
                # No gradient is needed for the bootstrap value.
                with torch.no_grad():
                    next_q_values = agent(torch.tensor(next_state, dtype=torch.float32))
                    # Legal moves must be recomputed for the NEW position: the
                    # move just played may have filled its column.
                    best_reply = torch.max(
                        mask_illegal(next_q_values, env.available_actions())
                    ).item()
                # The next position is the opponent's turn, so their best value
                # counts against the player who just moved (negamax target).
                target_q_value = reward - gamma * best_reply

            # Single-sample TD update on the chosen action's Q-value
            predicted_q_value = q_values[0, action]
            loss = criterion(predicted_q_value, torch.tensor(target_q_value, dtype=torch.float32))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            state = next_state

            # Decay epsilon after each step to reduce exploration
            if epsilon > epsilon_min:
                epsilon *= epsilon_decay

    return agent


In [14]:
# Run the training loop with train()'s default settings and keep the resulting
# network in `agent`; train() prints the episode index every 100 episodes.
agent = train()
print("done")

0
100
200
300
400
500
600
700
800
900
done


In [15]:
def play_game(agent):
    """Play one interactive console game: the human is player 1, ``agent`` is player 2.

    The human is prompted for a column until a legal one is entered. The agent
    then plays the legal column with the highest predicted Q-value. The loop
    ends when either side wins or the board is full, and the final board is
    printed.

    Args:
        agent: callable mapping a float32 board tensor to Q-values, one per column.
    """
    env = Connect4Env()
    env.reset()
    done = False

    print("Welcome to Connect 4! You're Player 1, and the AI is Player 2.\n")

    while not done:
        # Display current board
        print("Current board:")
        print(env.board)

        # Human player (Player 1): re-prompt until a playable column is given
        valid_move = False
        while not valid_move:
            try:
                col = int(input("Enter the column (0-6) where you want to drop your piece: "))
                if col in env.available_actions():
                    valid_move = True
                else:
                    print("Column is full or invalid. Choose another column.")
            except ValueError:
                print("Invalid input. Please enter an integer between 0 and 6.")

        # Update the environment with human move
        state, reward, done = env.step(col)
        state = torch.tensor(state, dtype=torch.float32)  # Convert to tensor
        if done:
            # `done` is also set when the board fills up, so check the reward.
            if reward == WIN_REWARD:
                print("You won!")
            else:
                print("It's a draw!")
            break

        # AI player (Player 2)
        with torch.no_grad():
            # The network returns shape [1, n_columns]; flatten to one value per column.
            q_values = agent(state).reshape(-1)

        # Exclude full columns BEFORE the argmax so the chosen move is always legal.
        legal = torch.zeros(q_values.shape[-1], dtype=torch.bool)
        legal[torch.tensor(env.available_actions(), dtype=torch.long)] = True
        masked_q_values = torch.where(legal, q_values, torch.tensor(float('-inf')))
        action = torch.argmax(masked_q_values).item()

        # Update the environment with AI move
        _, reward, done = env.step(action)
        print(f"AI placed its piece in column {action}.")

        # Check for game end
        if done:
            if reward == WIN_REWARD:
                print("AI won!")
            elif reward == DRAW_PENALTY:
                print("It's a draw!")
            break

    # Show final board
    print("Final board:")
    print(env.board)


In [16]:
# Start an interactive game against the trained network. This requires the
# training cell above to have been run so that `agent` exists in the kernel.
play_game(agent)

Welcome to Connect 4! You're Player 1, and the AI is Player 2.

Current board:
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
Enter the column (0-6) where you want to drop your piece: 3
AI placed its piece in column 6.
Current board:
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 1 0 0 2]]
Enter the column (0-6) where you want to drop your piece: 3
AI placed its piece in column 1.
Current board:
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0]
 [0 2 0 1 0 0 2]]
Enter the column (0-6) where you want to drop your piece: 2
AI placed its piece in column 1.
Current board:
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 2 0 1 0 0 0]
 [0 2 1 1 0 0 2]]
Enter the column (0-6) where you want to drop your piece: 1
AI placed its piece in column 1.
Current board:
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 2 0 0 0 0 0]
 [0 1 0 0 0 0 0]
 [0 2 0 1