In [13]:
import numpy as np
import random

class Connect4Env:
    """Minimal two-player Connect Four environment with a gym-like interface.

    Cells hold 0 (empty), 1 (player 1) or -1 (player 2). Row 0 is the top of
    the board, so a dropped piece settles in the highest-index empty row of
    its column.

    Args:
        rows: Number of board rows (default 6).
        cols: Number of board columns (default 7).
    """

    def __init__(self, rows=6, cols=7):
        self.rows = rows
        self.cols = cols
        self.board = np.zeros((self.rows, self.cols), dtype=int)
        self.current_player = 1  # Player 1 = 1, Player 2 = -1

    def reset(self):
        """Clear the board, hand the move to player 1, return the flat state."""
        self.board = np.zeros((self.rows, self.cols), dtype=int)
        self.current_player = 1
        return self.board.flatten()

    def step(self, action):
        """Drop the current player's piece into column ``action``.

        Args:
            action: Column index to play.

        Returns:
            Tuple ``(state, reward, done, info)`` where ``state`` is a
            flattened copy of the board, ``reward`` is 10 if the move
            completes four in a row, 0 otherwise, or -10 for an illegal
            move, ``done`` tells whether the episode ended, and ``info`` is
            an empty dict. An illegal move ends the episode without changing
            the board or the player to move.
        """
        if action not in self.valid_moves():
            return self.board.flatten(), -10, True, {}

        # valid_moves() only lists columns whose top cell is empty, so at
        # least one empty row exists here; the last one is the lowest.
        empty_rows = np.flatnonzero(self.board[:, action] == 0)
        self.board[int(empty_rows[-1]), action] = self.current_player

        reward, done = self.check_winner()
        self.current_player *= -1  # pass the turn
        return self.board.flatten(), reward, done, {}

    def _windows(self):
        """Yield every line of four cells: rows, columns and both diagonals."""
        board = self.board
        # Horizontal
        for row in range(self.rows):
            for col in range(self.cols - 3):
                yield board[row, col:col + 4]
        # Vertical
        for row in range(self.rows - 3):
            for col in range(self.cols):
                yield board[row:row + 4, col]
        # Main diagonal (\)
        for row in range(self.rows - 3):
            for col in range(self.cols - 3):
                yield [board[row + i, col + i] for i in range(4)]
        # Anti-diagonal (/)
        for row in range(self.rows - 3):
            for col in range(3, self.cols):
                yield [board[row + i, col - i] for i in range(4)]

    def check_winner(self):
        """Report whether the game is over.

        Returns:
            ``(10, True)`` if either player has four in a row (the sign of
            the winner is not reported), ``(0, True)`` if the board is full
            with no winner, and ``(0, False)`` while the game is still open.
        """
        # Four equal non-zero cells sum to +4 or -4.
        if any(abs(int(np.sum(window))) == 4 for window in self._windows()):
            return (10, True)

        # Draw: no empty cell left.
        if np.all(self.board != 0):
            return (0, True)

        return (0, False)

    def valid_moves(self):
        """Return the columns whose top cell is still empty."""
        return [c for c in range(self.cols) if self.board[0, c] == 0]


In [14]:
import torch
import torch.nn as nn
import torch.optim as optim

class DQN(nn.Module):
    """Fully connected Q-network: two ReLU hidden layers and a linear head.

    Args:
        input_dim: Size of the flattened state vector.
        output_dim: Number of actions (one Q-value is produced per action).
        hidden_dim: Width of both hidden layers. Defaults to 128, the value
            that was previously hard-coded, so existing checkpoints still
            match the default architecture.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=128):
        super().__init__()
        # Attribute names are part of the state_dict keys; keep them stable.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map a state tensor (last dimension ``input_dim``) to Q-values."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)  # raw Q-values: no softmax, DQN regresses values


In [26]:
def select_action(model, state, epsilon, valid_moves=None):
    """Choose a column with an epsilon-greedy policy restricted to legal moves.

    Args:
        model: Callable Q-network. It is called with a float32 tensor of shape
            ``(1, len(state))`` and must return one Q-value per column.
        state: Flattened board.
        epsilon: Probability of playing a uniformly random legal move instead
            of the greedy one.
        valid_moves: Optional iterable of legal column indices. When omitted,
            the legal moves are read from the module-level ``env`` (the
            previous behaviour), so ``state`` must then describe that
            environment's current board. Pass it explicitly to make the call
            independent of notebook globals.

    Returns:
        The chosen column as a plain ``int``.

    Raises:
        ValueError: If there is no legal move to choose from.
    """
    if valid_moves is None:
        valid_moves = env.valid_moves()
    valid_moves = list(valid_moves)
    if not valid_moves:
        raise ValueError("select_action: no valid moves available")

    # Explore: uniformly random legal column.
    if random.random() < epsilon:
        return int(np.random.choice(valid_moves))

    # Exploit: highest Q-value among the legal columns only.
    with torch.no_grad():
        batch = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        q_values = model(batch).numpy().flatten()

    return int(max(valid_moves, key=lambda a: q_values[a]))


In [27]:
def play_game(dqn_player1, dqn_player2, env, epsilon):
    """Let two models play one full game against each other.

    Player 1 moves first and the models then alternate. The function returns
    the reward reported by ``env.step`` for the move that ended the game.
    """
    contenders = (dqn_player1, dqn_player2)
    mover = 0  # index into contenders: 0 is Player 1, 1 is Player 2
    board_state = env.reset()

    while True:
        chosen = select_action(contenders[mover], board_state, epsilon)
        board_state, outcome, finished, _ = env.step(chosen)

        if finished:
            return outcome  # result of the match

        mover = 1 - mover  # hand the turn to the other model


In [28]:
from collections import deque

def train_model(model, optimizer, memory, batch_size, gamma=0.99):
    """Run one DQN minibatch update on transitions sampled from replay memory.

    The sampled transitions are stacked and processed in a single forward and
    backward pass, followed by one optimizer step on the mean squared TD
    error. (The previous implementation took ``batch_size`` separate
    single-sample optimizer steps per call.)

    Args:
        model: Q-network mapping a ``(batch, state_dim)`` float tensor to
            ``(batch, n_actions)`` Q-values.
        optimizer: Optimizer over ``model``'s parameters.
        memory: Sequence of ``(state, action, reward, next_state, done)``
            transitions.
        batch_size: Number of transitions to sample.
        gamma: Discount factor applied to the bootstrapped next-state value
            (default 0.99, the value that was previously hard-coded).

    Returns:
        The minibatch loss as a float, or ``None`` when no update was made
        because ``batch_size`` is not positive or ``memory`` holds fewer than
        ``batch_size`` transitions.
    """
    if batch_size <= 0 or len(memory) < batch_size:
        return None

    batch = random.sample(memory, batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)

    state_batch = torch.stack(
        [torch.as_tensor(s, dtype=torch.float32) for s in states]
    )
    next_state_batch = torch.stack(
        [torch.as_tensor(s, dtype=torch.float32) for s in next_states]
    )
    action_batch = torch.tensor([int(a) for a in actions], dtype=torch.int64)
    reward_batch = torch.tensor([float(r) for r in rewards], dtype=torch.float32)
    # 1.0 where the episode continues, 0.0 where it ended (no bootstrapping).
    continue_mask = torch.tensor(
        [0.0 if d else 1.0 for d in dones], dtype=torch.float32
    )

    # Q(s, a) for the action actually taken in each transition.
    q_taken = model(state_batch).gather(1, action_batch.unsqueeze(1)).squeeze(1)

    # NOTE(review): the target bootstraps from the same network with a plain
    # max over all columns, as before. In the training loop of this notebook
    # next_state is the position the opponent moves from; confirm that this
    # is the intended target for self-play.
    with torch.no_grad():
        next_best = model(next_state_batch).max(dim=1).values
        targets = reward_batch + gamma * next_best * continue_mask

    loss = ((q_taken - targets) ** 2).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()


In [29]:
# Seed every RNG this cell relies on (Python's random for the epsilon draw and
# replay sampling, NumPy for random moves, torch for weight initialisation) so
# that a fresh-kernel run is repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

env = Connect4Env()

# Network sizes are derived from the environment instead of repeating 6 * 7.
input_dim = env.rows * env.cols  # flattened board
output_dim = env.cols  # one Q-value per column

dqn_player1 = DQN(input_dim, output_dim)
dqn_player2 = DQN(input_dim, output_dim)

optimizer1 = optim.Adam(dqn_player1.parameters(), lr=0.001)
optimizer2 = optim.Adam(dqn_player2.parameters(), lr=0.001)

memory = deque(maxlen=10000)  # replay buffer shared by both players
batch_size = 64
num_episodes = 50
epsilon = 0.1

# NOTE: the loop variables below (state, model, reward, ...) are notebook
# globals that later cells read, so their names are kept as they are.
for episode in range(num_episodes):
    state = env.reset()
    done = False
    turn = 0  # 0 is Player 1, 1 is Player 2

    while not done:
        model = dqn_player1 if turn == 0 else dqn_player2
        optimizer = optimizer1 if turn == 0 else optimizer2

        action = select_action(model, state, epsilon)
        next_state, reward, done, _ = env.step(action)

        memory.append((state, action, reward, next_state, done))

        # Train the model that just moved on a sample from the shared buffer.
        train_model(model, optimizer, memory, batch_size)

        state = next_state
        turn = 1 - turn  # switch player

    # reward here is the one returned by the final move of the episode.
    if episode % 10 == 0:
        print(f"Episode {episode}: Reward {reward}")


print("Training complete!")


Episode 0: Reward 10
Episode 10: Reward 10
Episode 20: Reward 10
Episode 30: Reward 10
Episode 40: Reward 10
Training complete!


In [30]:
import time

# Time one greedy self-play game in which dqn_player1 moves for both sides.
env.reset()
# perf_counter() is a monotonic, high-resolution clock; time.time() can be too
# coarse (and can jump) for an interval of a few milliseconds.
start_time = time.perf_counter()

done = False
while not done:
    action = select_action(dqn_player1, env.board.flatten(), epsilon=0.0)
    _, _, done, _ = env.step(action)

end_time = time.perf_counter()
print(f"Thời gian hoàn thành 1 game: {end_time - start_time:.2f} giây")


Thời gian hoàn thành 1 game: 0.02 giây


In [31]:
def evaluate_model(model, env, num_games=10):
    """Play ``num_games`` greedy self-play games and report the mean reward.

    ``model`` chooses the move for both sides with ``epsilon=0.0``. The reward
    recorded for a game is the one returned by its last ``env.step`` call.

    NOTE(review): ``select_action`` in this notebook reads legal moves from
    the module-level ``env`` unless told otherwise, so pass that same
    environment here. With ``epsilon=0.0`` nothing in the loop is random, so
    every game is expected to be identical — confirm that this is what the
    evaluation is meant to measure.

    Args:
        model: Q-network used to pick every move.
        env: Environment exposing ``reset()``, ``step(action)``, ``rows``,
            ``cols`` and ``board``.
        num_games: Number of games to play; must be at least 1.

    Returns:
        The average final reward over the games played.

    Raises:
        ValueError: If ``num_games`` is smaller than 1.
    """
    if num_games < 1:
        raise ValueError("num_games must be at least 1")

    # A legal game cannot have more moves than the board has cells.
    max_moves = env.rows * env.cols

    total_rewards = 0
    for _ in range(num_games):
        state = env.reset()
        done = False
        move_count = 0
        while not done:
            action = select_action(model, state, epsilon=0.0)  # always greedy
            state, reward, done, _ = env.step(action)
            move_count += 1
            # Safety net against an environment that never reports done.
            if move_count > max_moves:
                print(f"qua {max_moves} buoc")
                print("Board state:\n", env.board)
                break

        total_rewards += reward  # accumulate the final reward of this game

    avg_reward = total_rewards / num_games
    print(f"📊 Model's Average Reward over {num_games} games: {avg_reward}")
    return avg_reward

# Evaluate player 1's model after training; the returned average is the
# cell's displayed value.
evaluate_model(model=dqn_player1, env=env, num_games=10)


📊 Model's Average Reward over 10 games: 0.0


0.0

In [32]:
# Write each player's weights (state_dict only) to its own checkpoint file.
for network, checkpoint_path in (
    (dqn_player1, "dqn_player1.pth"),
    (dqn_player2, "dqn_player2.pth"),
):
    torch.save(network.state_dict(), checkpoint_path)
print("✅ Model saved successfully!")


✅ Model saved successfully!


In [None]:
# Restore each player's weights from the checkpoint written by the save cell.
for network, checkpoint_path in (
    (dqn_player1, "dqn_player1.pth"),
    (dqn_player2, "dqn_player2.pth"),
):
    network.load_state_dict(torch.load(checkpoint_path))
print("✅ Model loaded successfully!")

In [21]:
import time

# Time 100 evaluation games. perf_counter() is the monotonic, high-resolution
# clock intended for measuring short intervals.
start_time = time.perf_counter()
evaluate_model(dqn_player1, env, num_games=100)
end_time = time.perf_counter()

print(f"⏳ Time taken: {end_time - start_time:.2f} seconds")


📊 Model's Average Reward over 100 games: -10.0
⏳ Time taken: 0.34 seconds


In [22]:
import time

# Time a single greedy action selection.
# The model and state are named explicitly instead of relying on the `model`
# and `state` variables left behind by the training loop, so the cell also
# works on a fresh kernel. A reset board guarantees that legal moves exist.
state = env.reset()

start = time.perf_counter()  # monotonic, high-resolution clock
action = select_action(dqn_player1, state, epsilon=0.0)
end = time.perf_counter()

print(f"⏳ Thời gian chọn action: {end - start:.4f} giây")


⏳ Thời gian chọn action: 0.0014 giây
