In [None]:
# Board geometry shared by the environment and the networks in this notebook.
BOARD_ROW, BOARD_COL = 6, 7  # number of rows, number of columns

In [None]:
import numpy as np
import random

class Connect4Env:
    """Minimal two-player Connect 4 environment.

    The board is a ``rows x cols`` ``int8`` array in which 0 marks an empty
    cell and 1 / 2 mark the discs of player 1 / player 2. Row 0 is the top of
    the board, so a disc settles in the highest-index empty row of its column.

    Args:
        rows: Number of board rows. Defaults to the module-level ``BOARD_ROW``.
        cols: Number of board columns. Defaults to the module-level ``BOARD_COL``.
    """

    # (d_row, d_col) steps scanned from every occupied cell. Because every cell
    # is tried as a starting point, these four cover all eight orientations.
    _DIRECTIONS = (
        (1, 0),   # vertical (downwards)
        (0, 1),   # horizontal (rightwards)
        (1, 1),   # diagonal "\" (down-right)
        (1, -1),  # diagonal "/" (down-left)
    )

    def __init__(self, rows=None, cols=None):
        self.rows = BOARD_ROW if rows is None else rows
        self.cols = BOARD_COL if cols is None else cols
        self._init_state()

    def _init_state(self):
        """Create an empty board and restore the per-game defaults."""
        self.board = np.zeros((self.rows, self.cols), dtype=np.int8)
        self.current_player = 1  # Player 1 = 1, Player 2 = 2
        self.reward = {'win': 10, 'draw': 0, 'lose': -10}

    def reset(self):
        """Start a new game on a board of the same size.

        Returns:
            The flattened (1-D) empty board.
        """
        self._init_state()
        return self.board.flatten()

    def play(self, action):
        """Drop the current player's disc into column ``action``.

        Args:
            action: Column index to play.

        Returns:
            A tuple ``(state, reward, done, info)`` where ``state`` is the
            flattened board after the move, ``reward`` is the value reported by
            ``isWinningMove`` for the player who just moved, ``done`` tells
            whether the game is over and ``info`` is an empty dict.
            If ``action`` is not a valid move the board is left untouched, the
            turn does not change and ``(state, reward['lose'], True, {})`` is
            returned, i.e. an illegal move ends the game with a penalty.
        """
        if action not in self.valid_moves():
            return self.board.flatten(), self.reward['lose'], True, {}

        # A valid column has an empty top cell, so at least one row matches;
        # the largest matching index is the lowest free slot.
        empty_rows = np.flatnonzero(self.board[:, action] == 0)
        self.board[empty_rows[-1], action] = self.current_player

        reward, done = self.isWinningMove()
        self.current_player = 3 - self.current_player  # switch turns
        return self.board.flatten(), reward, done, {}

    def _has_four(self, r, c, dr, dc, player):
        """Return True if the four cells from (r, c) along (dr, dc) all hold ``player``."""
        for step in range(4):
            rr, cc = r + step * dr, c + step * dc
            if not (0 <= rr < self.rows and 0 <= cc < self.cols):
                return False
            if self.board[rr, cc] != player:
                return False
        return True

    def isWinningMove(self):
        """Check whether the game has ended in the current board position.

        Returns:
            ``(reward['win'], True)`` if any player has four in a row,
            ``(reward['draw'], True)`` if the board is full without a winner,
            otherwise ``(0, False)``.
        """
        for r in range(self.rows):
            for c in range(self.cols):
                player = self.board[r, c]
                if player == 0:
                    continue
                if any(self._has_four(r, c, dr, dc, player)
                       for dr, dc in self._DIRECTIONS):
                    return (self.reward['win'], True)

        # No winner: a completely filled board is a draw.
        if np.all(self.board != 0):
            return (self.reward['draw'], True)

        return (0, False)  # game still in progress

    def valid_moves(self):
        """Return the list of columns whose top cell is still empty."""
        return [c for c in range(self.cols) if self.board[0, c] == 0]

    def printBoard(self):
        """Print the board row by row, followed by the column indices."""
        for row in self.board:
            print(" ".join(["⚫" if x == 0 else "🚗" if x == 1 else "🚕" for x in row]))
        print(" 0  1   2  3   4  5   6")

# Smoke test: build an environment and show the empty board, first flattened
# (the network's input layout) and then as the 2-D grid.
board = Connect4Env()
state = board.reset()
for snapshot in (state, board.board):
    print(snapshot)


In [None]:
# Evaluation history: one entry is appended to each list every time the
# training loop runs an evaluation.
all_episodes, all_win_rates, all_avg_rewards, all_avg_win_steps = [], [], [], []

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class DQN(nn.Module):
    """Fully connected Q-network with two hidden layers of 128 ReLU units.

    Args:
        input_dim: Size of the input vector.
        output_dim: Number of Q-values produced (one per action).
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 128
        self.layer1 = nn.Linear(input_dim, hidden_units)
        self.layer2 = nn.Linear(hidden_units, hidden_units)
        self.layer3 = nn.Linear(hidden_units, output_dim)

    def forward(self, x):
        """Return raw Q-values; no softmax, because DQN regresses values."""
        hidden = self.layer1(x).relu()
        hidden = self.layer2(hidden).relu()
        return self.layer3(hidden)


In [None]:
import matplotlib.pyplot as plt
# Epsilon decay schedule (exploration rate per episode) and its plot.
# select_action reads epsilon_start, epsilon_end and epsilon_decay as globals.
epsilon_start = 1.0
epsilon_end = 0.1

max_episode = 50
episode = np.arange(max_episode)
# Rate chosen so that exp(-max_episode * epsilon_decay) equals
# epsilon_end / (100 * epsilon_start), i.e. 1/1000 with the values above.
epsilon_decay = np.log(epsilon_start/epsilon_end*100) / max_episode

eps = epsilon_end + (epsilon_start - epsilon_end) * np.exp(-episode * epsilon_decay)
print(epsilon_decay)
print(eps)

plt.plot(episode, eps)
plt.xlabel('Episode no.')
plt.ylabel('Epsilon')
plt.title('Epsilon decay graph')
plt.show()

In [None]:
""" Use Epsilon-greedy to find out the best way to make a move

    @Params:
    @Return: best column to play"""
def select_action(model, state, valid_moves, episode=None, training=True, epsilon=None):
    """Choose a column with an epsilon-greedy policy restricted to legal moves.

    Args:
        model: Callable mapping a float32 state tensor to one Q-value per column.
        state: Flattened board; anything ``torch.tensor`` can convert.
        valid_moves: List of playable column indices.
        episode: Current training episode. When ``epsilon`` is not given it
            drives the exponential schedule built from the module-level
            ``epsilon_start``, ``epsilon_end`` and ``epsilon_decay``.
        training: If False the greedy action is always returned.
        epsilon: Optional explicit exploration probability. When given it
            overrides the episode-based schedule.

    Returns:
        The chosen column index, or None when ``valid_moves`` is empty.

    Raises:
        ValueError: If ``training`` is True and neither ``episode`` nor
            ``epsilon`` is provided.
    """
    if not valid_moves:  # no legal move available
        return None

    if training:
        if epsilon is None:
            if episode is None:
                raise ValueError(
                    "select_action needs `episode` (or an explicit `epsilon`) when training=True"
                )
            epsilon = epsilon_end + (epsilon_start - epsilon_end) * np.exp(-episode * epsilon_decay)
        if random.random() < epsilon:
            return random.choice(valid_moves)  # exploration

    # Exploitation: best Q-value among the legal columns only.
    with torch.no_grad():
        state_tensor = torch.tensor(state, dtype=torch.float32)
        q_values = model(state_tensor)[valid_moves]
        return valid_moves[torch.argmax(q_values).item()]

In [None]:
""" Let 2 models play against each other
    @Return: reward after play optimally"""
def play_game(dqn_player1, dqn_player2, env, epsilon):
    """Play one complete game between two models and return the final reward.

    Every move is epsilon-greedy: with probability ``epsilon`` a uniformly
    random legal column is played, otherwise the model whose turn it is picks
    its greedy action through ``select_action(..., training=False)``.

    Args:
        dqn_player1: Model that moves first.
        dqn_player2: Model that moves second.
        env: Environment exposing ``reset()``, ``valid_moves()`` and
            ``play(action)``; ``play`` must return ``(state, reward, done, info)``.
        epsilon: Exploration probability in ``[0, 1]``.

    Returns:
        The reward ``env.play`` returned for the move that ended the game, or
        0 if the environment reports no legal move before the game is done.
    """
    state = env.reset()
    turn = 1  # 1 = player 1 to move, 2 = player 2 to move

    while True:
        valid_moves = env.valid_moves()
        if not valid_moves:
            return 0  # nothing left to play: treat as a draw

        model = dqn_player1 if turn == 1 else dqn_player2
        if random.random() < epsilon:
            action = random.choice(valid_moves)
        else:
            action = select_action(model, state, valid_moves, training=False)

        state, reward, done, _ = env.play(action)
        if done:
            return reward  # result of the game

        turn = 3 - turn  # switch sides


In [None]:
from collections import deque

"""@Params: """
def train_model(model, target_model, optimizer, memory, batch_size, gamma=0.99):
    """Run one Double-DQN gradient step on a random minibatch from ``memory``.

    The online ``model`` selects the best next action and ``target_model``
    evaluates it; the online network is then regressed towards
    ``reward + gamma * Q_target(next_state, argmax_a Q_online(next_state, a))``
    (just ``reward`` for terminal transitions) with a mean-squared-error loss.

    Args:
        model: Online Q-network being optimised.
        target_model: Network used to evaluate next-state values.
        optimizer: Optimizer bound to ``model``'s parameters.
        memory: Sequence of ``(state, action, reward, next_state, done)`` tuples.
        batch_size: Number of transitions sampled per step.
        gamma: Discount factor applied to the next-state value.

    Returns:
        None. Nothing happens while ``memory`` holds fewer than
        ``batch_size`` transitions.
    """
    if len(memory) < batch_size:
        return

    batch = random.sample(memory, batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)

    # Convert the sampled columns to tensors.
    states = torch.tensor(np.array(states), dtype=torch.float32)
    actions = torch.tensor(actions, dtype=torch.long).unsqueeze(1)
    rewards = torch.tensor(rewards, dtype=torch.float32)
    next_states = torch.tensor(np.array(next_states), dtype=torch.float32)
    dones = torch.tensor(dones, dtype=torch.bool)

    # Q(s, a) for the actions actually taken. squeeze(1) drops only the gather
    # axis, so the result stays 1-D even for a batch of one.
    current_q = model(states).gather(1, actions).squeeze(1)

    with torch.no_grad():
        next_actions = model(next_states).argmax(dim=1, keepdim=True)
        next_q = target_model(next_states).gather(1, next_actions).squeeze(1)
        # Terminal transitions do not bootstrap from the next state.
        target_q = rewards + (1 - dones.float()) * gamma * next_q

    loss = nn.functional.mse_loss(current_q, target_q)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [None]:
def evaluate_model(model, env, num_games=100):
    """Evaluate ``model`` as player 1 against a uniformly random player 2.

    The model always moves first and plays greedily
    (``select_action(..., training=False)``); the opponent picks a random
    legal column.

    Args:
        model: Network under evaluation. It is switched to eval mode for the
            duration of the call and its previous mode is restored afterwards.
        env: Environment exposing ``reset()``, ``valid_moves()``,
            ``play(action)`` and a ``reward`` dict with ``'win'`` and
            ``'lose'`` entries.
        num_games: Number of games to play.

    Returns:
        ``(win_rate, avg_reward, avg_win_steps)`` where ``avg_reward`` is the
        model's mean final reward per game (a game won by the opponent counts
        as ``env.reward['lose']``) and ``avg_win_steps`` is the mean number of
        moves, counting both players, in the games the model won.
    """
    was_training = model.training
    model.eval()
    total_wins = 0
    total_rewards = 0
    total_win_steps = 0  # total moves over the games the model won

    try:
        with torch.no_grad():
            for _ in range(num_games):
                state = env.reset()
                done = False
                current_player = 1
                steps = 0  # moves played in this game (both players)

                while not done:
                    valid_moves = env.valid_moves()
                    if not valid_moves:
                        break

                    if current_player == 1:
                        action = select_action(model, state, valid_moves, training=False)
                    else:
                        action = random.choice(valid_moves)

                    state, reward, done, _ = env.play(action)
                    steps += 1

                    if done:
                        if current_player == 1:
                            total_rewards += reward
                            if reward == env.reward['win']:
                                total_wins += 1
                                total_win_steps += steps
                        elif reward == env.reward['win']:
                            # The opponent's winning move is a loss for the model.
                            total_rewards += env.reward['lose']
                        else:
                            # Game ended by the opponent without a win (e.g. a
                            # draw): the outcome value is shared.
                            total_rewards += reward

                    current_player = 3 - current_player
    finally:
        # Do not leave the caller's network stuck in eval mode.
        model.train(was_training)

    win_rate = total_wins / num_games if num_games else 0.0
    avg_reward = total_rewards / num_games if num_games else 0.0
    avg_win_steps = total_win_steps / total_wins if total_wins > 0 else 0

    print(f"✅ Win: {total_wins} ({win_rate:.1%})")
    print(f"🎯 Avg Win Steps: {avg_win_steps:.2f}")
    print(f"💰 Avg Total Reward: {avg_reward:.2f}")
    return win_rate, avg_reward, avg_win_steps

In [None]:
import time

# Self-play training: two independent DQN agents (player 1 moves first) learn
# by playing against each other, each with its own replay memory, optimizer
# and target network.
env = Connect4Env()

input_dim = BOARD_ROW * BOARD_COL  # one input per board cell
output_dim = BOARD_COL  # one Q-value per column (0 -> n-1)

dqn_player1 = DQN(input_dim, output_dim)
dqn_player2 = DQN(input_dim, output_dim)

# Target networks start as exact copies of their online networks.
target_player1 = DQN(input_dim, output_dim)
target_player1.load_state_dict(dqn_player1.state_dict())
target_player2 = DQN(input_dim, output_dim)
target_player2.load_state_dict(dqn_player2.state_dict())

optimizer1 = optim.Adam(dqn_player1.parameters(), lr=0.001)
optimizer2 = optim.Adam(dqn_player2.parameters(), lr=0.001)

batch_size = 32
num_episodes = 500
# NOTE(review): `epsilon` is only tracked here for inspection; select_action
# derives its own value from `episode` and the module-level epsilon_start /
# epsilon_end / epsilon_decay. epsilon_decay is computed in the plotting cell
# for max_episode = 50, not for num_episodes -- confirm that is intended.
epsilon = 0.1
startTime = time.time()

memory_p1 = deque(maxlen=10000)
memory_p2 = deque(maxlen=10000)

# Everything needed to act and learn for each side, indexed by turn.
players = {
    1: (dqn_player1, target_player1, optimizer1, memory_p1),
    2: (dqn_player2, target_player2, optimizer2, memory_p2),
}

for episode in range(num_episodes):
    state = env.reset()
    done = False
    turn = 1

    while not done:
        valid_moves = env.valid_moves()
        if not valid_moves:
            break

        # Pick the model, target network, optimizer and memory of the mover.
        model, target_model, optimizer, memory = players[turn]

        action = select_action(model, state, valid_moves, episode, training=True)
        next_state, reward, done, _ = env.play(action)

        # Store the mover's experience.
        memory.append((state, action, reward, next_state, done))

        # A winning move is also a loss for the opponent. Its most recent
        # transition was stored as a neutral, non-terminal step, so rewrite it
        # as a terminal one carrying the 'lose' reward; otherwise the losing
        # side never receives a penalty.
        if done and reward == env.reward['win']:
            opponent_memory = players[3 - turn][3]
            if opponent_memory:
                prev_state, prev_action, _, prev_next_state, _ = opponent_memory[-1]
                opponent_memory[-1] = (
                    prev_state, prev_action, env.reward['lose'], prev_next_state, True
                )

        # One optimisation step for the mover.
        train_model(model, target_model, optimizer, memory, batch_size)

        state = next_state
        turn = 3 - turn

    epsilon = epsilon_end + (epsilon_start - epsilon_end) * np.exp(-episode * epsilon_decay)

    # Periodically sync the target networks and record evaluation metrics.
    if episode % 20 == 0:
        target_player1.load_state_dict(dqn_player1.state_dict())
        target_player2.load_state_dict(dqn_player2.state_dict())

        win_rate, avg_reward, avg_win_steps = evaluate_model(dqn_player1, env, num_games=100)
        all_episodes.append(episode)
        all_win_rates.append(win_rate)
        all_avg_rewards.append(avg_reward)
        all_avg_win_steps.append(avg_win_steps)

        print(f"Episode {episode}:")
        print(f"  Win Rate: {win_rate:.2%}")
        print(f"  Avg Reward: {avg_reward:.2f}")
        print(f"  Avg Win Steps: {avg_win_steps:.2f}")

endTime = time.time()
print(f"Trained {num_episodes} episodes after {endTime-startTime:.4f} seconds")
print("Training complete!")

In [None]:
# Persist the weights of both online networks in the working directory.
for net, checkpoint_path in (
    (dqn_player1, "dqn_player1.pth"),
    (dqn_player2, "dqn_player2.pth"),
):
    torch.save(net.state_dict(), checkpoint_path)
print("✅ Model saved successfully!")


In [None]:
# To load the checkpoints on a fresh kernel (without re-running training),
# first recreate the objects they are loaded into:
#
#   import time
#   env = Connect4Env()
#   input_dim = BOARD_ROW * BOARD_COL  # size of the board
#   output_dim = BOARD_COL  # output action (column 0 -> n)
#   dqn_player1 = DQN(input_dim, output_dim)
#   dqn_player2 = DQN(input_dim, output_dim)

# NOTE(review): torch.load is pickle-based; only load checkpoint files you
# trust, and consider weights_only=True if the installed PyTorch supports it.
for net, checkpoint_path in (
    (dqn_player1, "dqn_player1.pth"),
    (dqn_player2, "dqn_player2.pth"),
):
    net.load_state_dict(torch.load(checkpoint_path))
print("✅ Model loaded successfully!")

In [None]:
# Time a 100-game evaluation of player 1's network. evaluate_model prints the
# metrics itself; its return value is discarded here.
start_time = time.time()
evaluate_model(dqn_player1, env, num_games=100)
end_time = time.time()

print(f"⏳ Time taken: {end_time - start_time:.2f} seconds")


In [None]:
import matplotlib.pyplot as plt

def plot_training_metrics(episodes, win_rates, avg_rewards, avg_win_steps):
    """Plot win rate, average reward and average win steps side by side.

    All three series are drawn against ``episodes`` in a single 1x3 figure,
    which is saved to ``training_metrics.png`` and then shown.
    """
    # (series, line format, panel title, y-axis label) for each panel.
    panels = (
        (win_rates, 'b-o', 'Win Rate', 'Win Rate'),
        (avg_rewards, 'r-o', 'Avg Reward', 'Reward'),
        (avg_win_steps, 'g-o', 'Avg Win Steps', 'Steps'),
    )

    plt.figure(figsize=(15, 5))
    for position, (series, line_format, title, y_label) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        plt.plot(episodes, series, line_format)
        plt.title(title)
        plt.xlabel('Episode')
        plt.ylabel(y_label)
        plt.grid(True)

    plt.tight_layout()
    plt.savefig('training_metrics.png')  # save the figure to disk
    plt.show()

plot_training_metrics(all_episodes, all_win_rates, all_avg_rewards, all_avg_win_steps)