In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
import random
import os
import math
import copy

In [2]:
## 6x6 board othello game
n=6
class Board:
    def __init__(self, player1, player2):
        self.n = n
        self.board = [[0] * n for _ in range(n)]
        self.current_play = 1
        self.reset()
        self.player1 = player1
        self.player2 = player2
        self.done = False
        self.directions = [(-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0), (1, 1)]

    def reset(self):
        n = self.n
        self.board = [[0] * n for _ in range(n)]
        self.current_play = 1
        if n % 2 == 0:
            mid = n // 2
            self.board[mid - 1][mid - 1] = 1
            self.board[mid][mid] = 1
            self.board[mid - 1][mid] = -1
            self.board[mid][mid - 1] = -1

    def __getitem__(self, index):
        return self.board[index]

    def countDiff(self, color):
        count = 0
        for j in range(self.n):
            for i in range(self.n):
                if self.board[i][j] == color:
                    count += 1
                if self.board[i][j] == -color:
                    count -= 1
        return count

    def get_legal_moves(self, color):
        moves = set()  # stores the legal moves.
        for j in range(self.n):
            for i in range(self.n):
                if self.board[i][j] == color:
                    newmoves = self.get_moves_for_square((i, j))
                    moves.update(newmoves)
        return list(moves)

    def has_legal_moves(self, color):#check legal movel
        for j in range(self.n):
            for i in range(self.n):
                if self.board[i][j] == color:
                    newmoves = self.get_moves_for_square((i, j))
                    if len(newmoves) > 0:
                        return True
        return False

    def get_moves_for_square(self, square):
        (i, j) = square
        color = self.board[i][j]
        if color == 0:
            return None
        # search all possible directions.
        moves = []
        for direction in self.directions:
            move = self._discover_move(square, direction)
            if move:
                moves.append(move)
        # return the generated move list
        return moves

    def execute_move(self, move, color):
        flips = []
        for direction in self.directions:
            flip = self._get_flips(move, direction, color)
            flips.extend(flip)

        if flips:
            for i, j in flips:
                self.board[i][j] = color
        else:
            raise Exception("Invalid move")
        row, col = move
        self.board[row][col] = color
        
    #valid move in a specified direction starting from the origin position on the game board
    def _discover_move(self, origin, direction):
        i, j = origin
        color = self.board[i][j]
        flips = []
        for new_i, new_j in Board._increment_move(origin, direction, self.n):
            if self.board[new_i][new_j] == 0:
                if flips:
                    return (new_i, new_j)
                else:
                    return None
            elif self.board[new_i][new_j] == color:
                return None
            elif self.board[new_i][new_j]== -color:
                flips.append((new_i, new_j))
        return None

    def _get_flips(self, origin, direction, color):
        flips = []
        for i, j in Board._increment_move(origin, direction, self.n):
            if self.board[i][j] == 0:
                return []
            if self.board[i][j] == -color:
                flips.append((i, j))
            elif self.board[i][j] == color and len(flips) > 0:
                return flips
        return []
    
    #increment current position of the move inthe specified position
    def _increment_move(move, direction, n):
        i, j = move
        move = (i + direction[0], j + direction[1])#adding the next position
        while 0 <= move[0] < n and 0 <= move[1] < n:
            yield move
            move = (move[0] + direction[0], move[1] + direction[1])
            
    def step(self, action):
        i, j = action
        reward = 0
        done = False

        if i == -1 and j == -1:
            self.current_play *= -1
            self.done = True
            reward = self.get_reward()
        elif not self.has_legal_moves(self.current_play):
            print("Invalid Move!")
            self.done = True
            reward = -1 * self.current_play  
        else:
            self.execute_move((i, j), self.current_play)
            self.current_play *= -1

        if not self.has_legal_moves(self.current_play):
            print("No legal moves for the current player")
            self.current_play *= -1

        return copy.deepcopy(self.board), reward, self.done, {}


    def get_reward(self):
        player1_score = self.countDiff(1)
        player2_score = self.countDiff(-1)
        if player1_score > player2_score:
            return 1  # Player 1 (Black) wins
        elif player1_score < player2_score:
            return -1  # Player 2 (White) wins
        else:
            return 0  # It's a draw

    def game_over(self):
        return not self.has_legal_moves(1) and not self.has_legal_moves(-1)
       

In [3]:
#Agent experience
class ReplayBuffer:
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.buffer = []

    def add(self, experience):
        if len(self.buffer) > self.buffer_size:
            self.buffer.pop(0)
        self.buffer.append(experience)
        
    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)


In [4]:
## DQN agent class
class DQNAgent:
    #Initialization the hyperparameter
    def __init__(self, color, n=6):
        self.n = n
        self.replay_buffer_size = 200000
        self.current_play = 1
        self.batch_size = 32
        self.gamma = 0.95
        self.learning_rate = 0.001
        self.epsilon = 1
        self.min_epsilon = 0.1
        self.max_epsilon = 1
        self.update_rate = 1000
        self.color = color
        self.steps = 0
        self.lambd = 0.0005
        self.train_frequency = 10
        self.weight_backup = "nn_model.h5" #weight
        self.memory = ReplayBuffer(self.replay_buffer_size) #store memory
        self.nn_model = self.neural_network() #Q network
        self.target_model = self.neural_network() #target network
        
    ## Qnetwork architecture:The Q network is the agent that is trained to produce the Optimal State-Action value.   
    def neural_network(self):
        model = Sequential([
            Flatten(input_shape=(self.n, self.n)),
            Dense(18, activation='relu'),
            Dense(9, activation='relu'),
            Dense(1, activation='linear')
        ])
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))

        if os.path.isfile(self.weight_backup):
            model.load_weights(self.weight_backup)
        return model

    def update_target_model(self):
        self.target_model.set_weights(self.nn_model.get_weights())

    def get_hash(self, board):
        return np.array(board)

    def choose_action(self, board):
        positions = board.get_legal_moves(self.color)
        if np.random.rand() < self.epsilon:
            return random.choice(positions)
        else:
            state_old = self.get_hash(board)
            pred = self.predict(state_old.reshape((1, self.n, self.n)))[0]
#             print(pred)
            return positions[np.argmax(pred)]

    def remember(self, board, action, reward, next_board, done):
        state = self.get_hash(board)
        next_state, updated_reward, _, _ = board.step(action)
        experience = (state, action, updated_reward, next_state, done)
        self.memory.add(experience)
        if self.steps % self.train_frequency == 0:
            self.replay()
            
        ##updated target network
        if self.steps % self.update_rate == 0:
            self.update_target_model()

        self.steps += 1
        self.epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon) * math.exp(-self.lambd * self.steps)
    
    def predict(self, board):
        return self.nn_model.predict(board)

    def replay(self):
        if len(self.memory.buffer) < self.batch_size:
            return
        minibatch = self.memory.sample(self.batch_size) #qnetworrk takes the current state and action
        curr_state = np.array([transition[0] for transition in minibatch])
        curr_qvalue = self.nn_model.predict(curr_state)
        new_curr_state = np.array([transition[3] for transition in minibatch])
        future_qvalue = self.target_model.predict(new_curr_state)  #target network takes takes the next state from each data sample and predict the best q value     
        x, y = [], []

        for idx, (board, action, reward, next_board, done) in enumerate(minibatch):
            if not done:
                #Bellman equation
                target = reward + self.gamma * np.max(future_qvalue[idx])
            else:
                target = reward

            curr_qs = curr_qvalue[idx]
            curr_qs[action] = (1 - self.learning_rate) * curr_qs[action] + self.learning_rate * target
            x.append(board)
            y.append(curr_qs)

        self.nn_model.fit(np.array(x), np.array(y), batch_size=self.batch_size, verbose=0, shuffle=True)

    def save_model(self, filename):
        self.nn_model.save(self.weight_backup)


In [5]:
def evaluate(player1, player2, num_games):
    wins_player1 = 0
    wins_player2 = 0
    draws = 0
    game_durations = []

    for _ in range(num_games):
        game = Board(player1, player2)
        done = False
        duration = 0

        while not done:
            # Player 1's turn
            player1_position = game.get_legal_moves(game.current_play)

            if not player1_position:
                break

            if len(player1_position) > 0:
                player1_action = player1.choose_action(game)
                if isinstance(player1_action, tuple):
                    player1_action = player1_action[0]
                if 0 <= player1_action < len(player1_position):
                    next_state_player1, _, done, _ = game.step(player1_position[player1_action])

                    if done:
                        break

                    duration += 1
                else:
                    break
            else:
                break

            # Player 2's turn
            player2_position = game.get_legal_moves(game.current_play)

            if not player2_position:
                break

            if len(player2_position) > 0:
                player2_action = player2.choose_action(game)
                if isinstance(player2_action, tuple):
                    player2_action = player2_action[0]
                if 0 <= player2_action < len(player2_position):
                    _, _, done, _ = game.step(player2_position[player2_action])

                    if done:
                        break
                else:
                    break
            else:
                break

        winner = game.get_reward()

        if winner == 1:
            wins_player1 += 1
        elif winner == -1:
            wins_player2 += 1
        else:
            draws += 1

        game_durations.append(duration)

    win_rate_player1 = wins_player1 / num_games
    win_rate_player2 = wins_player2 / num_games
    draw_rate = draws / num_games
    avg_duration = np.mean(game_durations)

    return win_rate_player1, win_rate_player2, draw_rate, avg_duration


In [6]:
player1=DQNAgent(1,6)
player2=DQNAgent(-1,6)

In [7]:
num_evaluation_games = 10
win_rate_player1, win_rate_player2, draw_rate, avg_duration = evaluate(player1, player2, num_evaluation_games)

print(f"Player 1 Win Rate: {win_rate_player1}")
print(f"Player 2 Win Rate: {win_rate_player2}")
print(f"Draw Rate: {draw_rate}")
print(f"Average Game Duration: {avg_duration}")

Player 1 Win Rate: 0.6
Player 2 Win Rate: 0.1
Draw Rate: 0.3
Average Game Duration: 5.8
