In [11]:
import numpy as np
import random

class Connect4Env:
    """Minimal Connect Four environment with a gym-like ``reset``/``step`` API.

    The board is a ``rows x cols`` integer array where 0 is an empty cell,
    1 is a piece of player 1 and -1 is a piece of player 2. Row 0 is the top
    of the board; a dropped piece lands in the empty cell with the largest
    row index of the chosen column.

    Args:
        rows: Number of board rows (default 6).
        cols: Number of board columns (default 7).
    """

    def __init__(self, rows=6, cols=7):
        self.rows = rows
        self.cols = cols
        self.board = np.zeros((self.rows, self.cols), dtype=int)
        self.current_player = 1  # Player 1 = 1, Player 2 = -1

    def reset(self):
        """Clear the board, give the move to player 1 and return the flat board."""
        self.board = np.zeros((self.rows, self.cols), dtype=int)
        self.current_player = 1
        return self.board.flatten()

    def step(self, action):
        """Drop the current player's piece into column ``action``.

        Returns:
            ``(observation, reward, done, info)`` where ``observation`` is a
            flattened copy of the board and ``info`` is an empty dict.
            An illegal column (out of range or full) leaves the board and the
            player to move untouched and returns reward -10 with ``done=True``.
            Otherwise reward/done come from :meth:`check_winner` and the turn
            passes to the other player.
        """
        if action not in self.valid_moves():
            return self.board.flatten(), -10, True, {}  # illegal move ends the game

        # valid_moves() guarantees the top cell of this column is empty, so at
        # least one empty row exists; the piece settles in the lowest one.
        empty_rows = [r for r in range(self.rows) if self.board[r, action] == 0]
        self.board[empty_rows[-1], action] = self.current_player

        reward, done = self.check_winner()
        self.current_player *= -1  # switch turns
        return self.board.flatten(), reward, done, {}

    def check_winner(self):
        """Scan the whole board for a finished game.

        Returns:
            ``(10, True)`` if any four aligned cells belong to the same player
            (the winner is not identified), ``(0, True)`` if the board is full
            without such a line, and ``(0, False)`` if the game continues.
        """
        span = 4
        # (row step, col step): horizontal, vertical, "\" diagonal, "/" diagonal
        directions = ((0, 1), (1, 0), (1, 1), (1, -1))
        for d_row, d_col in directions:
            for row in range(self.rows):
                for col in range(self.cols):
                    last_row = row + (span - 1) * d_row
                    last_col = col + (span - 1) * d_col
                    if not (0 <= last_row < self.rows and 0 <= last_col < self.cols):
                        continue  # this line would leave the board
                    line_sum = sum(
                        int(self.board[row + i * d_row, col + i * d_col])
                        for i in range(span)
                    )
                    # Cells are in {-1, 0, 1}: |sum| == 4 only if all four match.
                    if abs(line_sum) == span:
                        return (10, True)

        # Draw: no empty cell left
        if np.all(self.board != 0):
            return (0, True)

        return (0, False)  # game still in progress

    def valid_moves(self):
        """Return the list of columns whose top cell is still empty."""
        return [c for c in range(self.cols) if self.board[0, c] == 0]


In [13]:
import torch
import torch.nn as nn
import torch.optim as optim

class DQN(nn.Module):
    """Fully connected Q-network: two hidden ReLU layers of 128 units each.

    Args:
        input_dim: Size of the input feature vector.
        output_dim: Number of outputs (one Q-value per action).
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 128
        self.fc1 = nn.Linear(input_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, output_dim)

    def forward(self, x):
        """Return raw Q-value estimates for ``x``."""
        hidden = nn.functional.relu(self.fc1(x))
        hidden = nn.functional.relu(self.fc2(hidden))
        # No softmax on the output: a DQN regresses Q-values, not probabilities.
        return self.fc3(hidden)


In [15]:
import matplotlib.pyplot as plt
# Epsilon decay graph: exploration rate used for each training episode.
epsilon_start = 1.0
epsilon_end = 0.1
max_episode = 50

# Decay rate is ln(1000) / max_episode, so the exponential term has shrunk
# to 1/1000 of its initial size once `max_episode` episodes have passed.
epsilon_decay = np.log(epsilon_start / epsilon_end * 100) / max_episode

episode = np.arange(max_episode)
eps = epsilon_end + (epsilon_start - epsilon_end) * np.exp(-episode * epsilon_decay)

fig, ax = plt.subplots()
ax.plot(episode, eps)
ax.set(title='Epsilon decay graph', xlabel='Episode no.', ylabel='Epsilon')
plt.show()

  plt.show()


In [26]:
def select_action(model, state, episode=None, training=True, epsilon=None,
                  valid_actions=None):
    """Pick an action with an epsilon-greedy policy.

    Args:
        model: Callable mapping a ``(1, n_features)`` float tensor to Q-values.
        state: Flat board observation (anything ``torch.tensor`` accepts).
        episode: Current training episode. Used to compute the decayed
            exploration rate from the module-level ``epsilon_start``,
            ``epsilon_end`` and ``epsilon_decay`` when ``epsilon`` is not given.
        training: When False (and ``epsilon`` is not given) the policy is
            purely greedy.
        epsilon: Optional explicit exploration rate in [0, 1]. Takes precedence
            over ``episode``/``training``.
        valid_actions: Optional iterable of allowed action indices. When given,
            both the random and the greedy choice are restricted to it. When
            omitted, the random choice is over columns 0-6 and the greedy
            choice is the plain argmax (which may be an illegal column).

    Returns:
        The chosen action index as an ``int``.

    Raises:
        TypeError: If ``training`` is True and neither ``episode`` nor
            ``epsilon`` is supplied.
        ValueError: If ``valid_actions`` is given but empty.
    """
    if epsilon is None:
        if not training:
            epsilon = 0.0
        elif episode is None:
            raise TypeError(
                "select_action() needs `episode` (or an explicit `epsilon`) "
                "when training=True"
            )
        else:
            epsilon = epsilon_end + (epsilon_start - epsilon_end) * np.exp(-episode * epsilon_decay)

    candidates = list(range(7)) if valid_actions is None else list(valid_actions)
    if not candidates:
        raise ValueError("valid_actions is empty: there is no action to select")

    # Explore: uniform random pick among the candidate columns.
    if random.random() < epsilon:
        return int(np.random.choice(candidates))

    # Exploit: highest Q-value (restricted to the candidates when provided).
    with torch.no_grad():
        q_values = model(torch.tensor(state, dtype=torch.float32).unsqueeze(0))
    if valid_actions is None:
        return torch.argmax(q_values).item()
    q_row = q_values.reshape(-1)
    return int(max(candidates, key=lambda a: q_row[a].item()))


In [27]:
def play_game(dqn_player1, dqn_player2, env, epsilon):
    """Let two models play one complete game against each other.

    Args:
        dqn_player1: Model that moves first.
        dqn_player2: Model that moves second.
        env: Environment exposing ``reset()``, ``step(action)`` and
            ``valid_moves()``.
        epsilon: Probability in [0, 1] that a move is chosen at random
            (among the currently legal columns) instead of greedily.

    Returns:
        The reward reported by ``env.step`` for the move that ended the game.
    """
    state = env.reset()
    turn = 0  # 0 = Player 1, 1 = Player 2

    while True:
        model = dqn_player1 if turn == 0 else dqn_player2

        # Apply the requested exploration rate here: select_action's third
        # positional argument is an episode index, not an epsilon value.
        if random.random() < epsilon:
            action = random.choice(env.valid_moves())
        else:
            action = select_action(model, state, training=False)

        state, reward, done, _ = env.step(action)
        if done:
            return reward  # outcome of the game

        turn = 1 - turn  # hand the move to the other player


In [28]:
from collections import deque

def train_model(model, optimizer, memory, batch_size, gamma=0.99):
    """Run one round of Q-learning updates on a random replay sample.

    Does nothing until ``memory`` holds at least ``batch_size`` transitions.
    Otherwise draws ``batch_size`` transitions without replacement and
    performs one optimizer step per transition on the squared TD error.

    Args:
        model: Q-network mapping a flat state tensor to per-action Q-values.
        optimizer: Optimizer bound to ``model``'s parameters.
        memory: Sequence of ``(state, action, reward, next_state, done)``.
        batch_size: Number of transitions to sample.
        gamma: Discount factor applied to the bootstrapped next-state value.

    Returns:
        None.
    """
    if len(memory) < batch_size:
        return

    batch = random.sample(memory, batch_size)

    for state, action, reward, next_state, done in batch:
        q_values = model(torch.tensor(state, dtype=torch.float32))
        q_value = q_values[action]

        if done:
            # Terminal transition: nothing to bootstrap from, so the extra
            # forward pass on next_state is skipped.
            target_q_value = reward
        else:
            with torch.no_grad():  # the target is a constant w.r.t. the gradient
                next_q_values = model(torch.tensor(next_state, dtype=torch.float32))
            target_q_value = reward + gamma * torch.max(next_q_values)

        loss = (q_value - target_q_value) ** 2

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


In [29]:
# Self-play training: two DQN agents take turns on the same board and learn
# from one shared replay memory.
env = Connect4Env()

input_dim = 6 * 7  # flattened 6x7 board
output_dim = 7  # one Q-value per column

dqn_player1 = DQN(input_dim, output_dim)
dqn_player2 = DQN(input_dim, output_dim)

optimizer1 = optim.Adam(dqn_player1.parameters(), lr=0.001)
optimizer2 = optim.Adam(dqn_player2.parameters(), lr=0.001)

# (network, optimizer) per seat: index 0 is Player 1, index 1 is Player 2
players = ((dqn_player1, optimizer1), (dqn_player2, optimizer2))

memory = deque(maxlen=10000)
batch_size = 64
num_episodes = 50
epsilon = 0.1

for episode in range(num_episodes):
    state, done, turn = env.reset(), False, 0

    while not done:
        model, optimizer = players[turn]

        action = select_action(model, state, episode)
        next_state, reward, done, _ = env.step(action)
        memory.append((state, action, reward, next_state, done))

        # Train the agent that just moved
        train_model(model, optimizer, memory, batch_size)

        # Advance to the next position and hand the move to the other player
        state, turn = next_state, 1 - turn

    # Exploration rate for this episode; only used for the log line below
    epsilon = epsilon_end + (epsilon_start - epsilon_end) * np.exp(-episode * epsilon_decay)

    # `reward` is the reward of the move that ended the episode
    if not episode % 10:
        print(f"Episode {episode}: Reward {reward}, Epsilon {epsilon:.4f}")

print("Training complete!")


Episode 0: Reward 10
Episode 10: Reward 10
Episode 20: Reward 10
Episode 30: Reward 10
Episode 40: Reward 10
Training complete!


In [30]:
import time

# Time one full game in which dqn_player1 greedily picks every move.
env.reset()
done = False

start_time = time.time()
while not done:
    board_vector = env.board.flatten()
    action = select_action(dqn_player1, board_vector, training=False)
    done = env.step(action)[2]  # step() returns (obs, reward, done, info)
end_time = time.time()

print(f"Th·ªùi gian ho√†n th√†nh 1 game: {end_time - start_time:.2f} gi√¢y")


Th·ªùi gian ho√†n th√†nh 1 game: 0.02 gi√¢y


In [31]:
def evaluate_model(model, env, num_games=10):
    """Play ``num_games`` greedy games and report the mean final reward.

    The same ``model`` selects every move (for both sides) with exploration
    disabled. A game is cut off, with a diagnostic print of the board, once
    more than 42 moves have been played.

    Args:
        model: Network passed to ``select_action``.
        env: Environment providing ``reset()``, ``step()`` and ``board``.
        num_games: Number of games to play.

    Returns:
        Average of the last reward seen in each game.
    """
    reward_sum = 0
    for _ in range(num_games):
        state = env.reset()
        moves_played = 0

        while True:
            # Always take the highest-valued action (no exploration)
            chosen = select_action(model, state, training=False)
            state, reward, finished, _ = env.step(chosen)
            moves_played += 1

            if moves_played > 42:
                print("qua 42 buoc")
                print("Board state:\n", env.board)
                break
            if finished:
                break

        reward_sum += reward  # accumulate the reward that closed this game

    avg_reward = reward_sum / num_games
    print(f"üìä Model's Average Reward over {num_games} games: {avg_reward}")
    return avg_reward

# Evaluate the model after training (10 greedy games with dqn_player1)
evaluate_model(dqn_player1, env, num_games=10)


üìä Model's Average Reward over 10 games: 0.0


0.0

In [32]:
# Persist the learned weights (state_dict only) of both agents.
for net, path in ((dqn_player1, "dqn_player1.pth"), (dqn_player2, "dqn_player2.pth")):
    torch.save(net.state_dict(), path)
print("‚úÖ Model saved successfully!")


‚úÖ Model saved successfully!


In [None]:
# Restore both agents from the checkpoints written by the save cell.
# NOTE(review): depending on the installed torch version, torch.load may
# unpickle arbitrary objects - only load checkpoint files you trust.
for net, path in ((dqn_player1, "dqn_player1.pth"), (dqn_player2, "dqn_player2.pth")):
    net.load_state_dict(torch.load(path))
print("‚úÖ Model loaded successfully!")

In [21]:
import time

# Wall-clock time for 100 greedy evaluation games played by dqn_player1.
start_time = time.time()
evaluate_model(dqn_player1, env, num_games=100)
end_time = time.time()

# Report the elapsed time in seconds
print(f"‚è≥ Time taken: {end_time - start_time:.2f} seconds")


üìä Model's Average Reward over 100 games: -10.0
‚è≥ Time taken: 0.34 seconds


In [22]:
import time

# Time a single greedy action selection by dqn_player1 on the current board.
# select_action has no `epsilon` keyword; `training=False` is the documented
# way to disable exploration. The model and board are named explicitly so the
# cell does not depend on variables left over from the training loop.
start = time.perf_counter()  # high-resolution clock for a sub-millisecond span
action = select_action(dqn_player1, env.board.flatten(), training=False)
end = time.perf_counter()

print(f"‚è≥ Th·ªùi gian ch·ªçn action: {end - start:.4f} gi√¢y")


‚è≥ Th·ªùi gian ch·ªçn action: 0.0014 gi√¢y
