In [1]:
# import the necessary libraries
import os
import random
import torch
import pygame
import numpy as np
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

pygame 2.5.0 (SDL 2.28.0, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# intialize the variables for the color codes
BLACK = (0, 0, 0)
WHITE = (255, 255, 255)
GREEN = (0, 255, 0)
RED = (255, 0, 0)

In [10]:
# create an coninconquest environment
class CoinConquestEnv:
  def __init__(self, board_size = 4, max_moves = 32):
    self.board_size = board_size
    self.board = np.zeros((board_size, board_size), dtype=int)
    self.player_positions = [[0, 0], [board_size-1, board_size-1]]  # AI player and Opponent player
    self.place_bombs()  # Place bombs on the board
    #self.place_coins()  # Place coins on the board
    self.bombs_placed = 0  # Counter to track the number of bombs placed
    self.max_moves = max_moves
    self.num_actions = 5
    self.current_move = 0
    self.cell_size = 100
    self.window_size = self.board_size * self.cell_size

  def place_bombs(self):
    num_bombs = 1  # Place one bomb in a random cell
    placed_bombs = 0
    while placed_bombs < num_bombs:
        row, col = random.choice(range(self.board_size)), random.choice(range(self.board_size))
        if self.board[row][col] == 0:  # Check if the cell is empty
            self.board[row][col] = -1  # Place a bomb
            placed_bombs += 1

  def collect(self):
    state = self.board.reshape((self.board_size, self.board_size))
    row, col = self.player_positions[0]
    cell_value = state[row, col]
    print("Cell Value", cell_value)
    if cell_value > 0:
        # If the AI player is on a coin cell, collect the coin and update the cell value
        state[row, col] = 0

        # Find the next highest value on the board
        highest_value = np.max(state)
        next_highest_value = np.partition(state.flatten(), -2)[-2]

        # Update the cell with the next highest value
        state[row, col] = next_highest_value

        return cell_value  # Return the reward for collecting a coin
    elif cell_value == -1:
        # If the AI player is on a bomb cell, return the penalty for stepping on a bomb
        return -10  # You can set any negative value as a penalty
    else:
        return 0  # No reward or penalty for other cells


  def step(self, action):
    reward = 0
    done = False

    if action == 0:  # Move North
      self.move(-1, 0)
    elif action == 1:  # Move South
      self.move(1, 0)
    elif action == 2:  # Move East
      self.move(0, 1)
    elif action == 3:  # Move West
      self.move(0, -1)
    elif action == 4:  # Place Bomb
      self.place_bombs()

    reward = self.collect()
    self.current_move += 1

    print("Board", self.board.flatten())

    if self.current_move >= self.max_moves or np.sum(self.board) == 0:
      done = True

    return self.board.flatten(), reward, done

  def place_coins(self):
    num_coins = 2 * self.board_size  # Place a few coins randomly
    for _ in range(num_coins):
        row, col = random.choice(range(self.board_size)), random.choice(range(self.board_size))
        while self.board[row][col] != 0:
            row, col = random.choice(range(self.board_size)), random.choice(range(self.board_size))

        coin_value = random.randint(1, 9)  # Generate a random coin value (1 to 9)
        self.board[row][col] = coin_value

  def move(self, dx=0, dy=0):
    new_row, new_col = self.player_positions[0][0] + dx, self.player_positions[0][1] + dy
    if 0 <= new_row < self.board_size and 0 <= new_col < self.board_size:
      # If the new position is valid, update the AI player position
      self.player_positions[0] = [new_row, new_col]

  def draw_board(self, screen):
    cell_size = 100
    for row in range(self.board_size):
      for col in range(self.board_size):
        x = col * cell_size
        y = row * cell_size
        cell_value = self.board[row][col]
        color = GREEN if cell_value > 0 else RED if cell_value == -1 else WHITE
        if cell_value == -1:  # Change color for bombs
           color = BLACK
        pygame.draw.rect(screen, color, (x, y, cell_size, cell_size))
        pygame.draw.rect(screen, BLACK, (x, y, cell_size, cell_size), 1)
        if [row, col] in self.player_positions:
          pygame.draw.rect(screen, RED, (x, y, cell_size, cell_size))

  def get_state(self):
    # Get the current state of the board as a 1D array
    state = self.board.flatten()
    return torch.tensor(state, dtype=torch.float32)

  def get_valid_actions(self):
    # Return a list of valid actions
    valid_actions = [0, 1, 2, 3]  # Move North, South, East, West
    if self.bombs_placed < self.board_size // 3:
      valid_actions.append(4)  # Place a bomb
      return valid_actions

  def reset(self):
    self.board = np.zeros((self.board_size, self.board_size), dtype=int)
    self.player_position = [0, 0]
    self.place_bombs()  # Place bombs on the board
    self.place_coins()  # Place coins on the board
    self.bombs_placed = 0  # Reset the counter for bombs placed
    return self.board.flatten()

In [11]:
# build an AI agent (Q-Learning)
class QLearningModel(nn.Module):
  def __init__(self, input_size, output_size, hidden_size = 32):
    super(QLearningModel, self).__init__()
    self.fc1 = nn.Linear(input_size, hidden_size)
    self.fc2 = nn.Linear(hidden_size, hidden_size)
    self.fc3 = nn.Linear(hidden_size, output_size)

  def forward(self, x):
    x = torch.relu(self.fc1(x))
    x = torch.relu(self.fc2(x))
    x = self.fc3(x)
    return x

In [12]:
# create an agent training loop
def train_dqn(model, env, learning_rate = 0.001, gamma = 0.9, epsilon = 0.9, epsilon_min = 0.01, epsilon_decay = 0.995, batch_size = 32, episodes = 1000):
  # initialize the optimizers
  print("---------------------------------")
  print("Agent Learning")
  optimizer = optim.Adam(model.parameters(), lr = learning_rate)
  criterion = nn.MSELoss()
  replay_memory = []

  for episode in tqdm(range(episodes), desc="Training Progress", ncols=80):
    state = env.reset()
    state = torch.tensor(state, dtype=torch.float32).view(1, -1)
    done = False
    total_reward = 0

    while not done:
      if np.random.rand() <= epsilon:
        action = np.random.randint(0, env.num_actions)
      else:
        q_values = model(state)
        action = torch.argmax(q_values, dim=1).item()

      next_state, reward, done = env.step(action)
      next_state = torch.tensor(next_state, dtype=torch.float32).view(1, -1)

      replay_memory.append((state, action, reward, next_state, done))

      state = next_state
      total_reward += reward

      if len(replay_memory) >= batch_size:
        model.train()  # training mode
        minibatch = random.sample(replay_memory, batch_size)
        batch_states, batch_actions, batch_rewards, batch_next_states, batch_dones = zip(*minibatch)
        batch_states = torch.cat(batch_states)
        batch_actions = torch.tensor(batch_actions, dtype=torch.long).view(-1, 1)
        batch_rewards = torch.tensor(batch_rewards, dtype=torch.float32).view(-1, 1)
        batch_next_states = torch.cat(batch_next_states)
        batch_dones = torch.tensor(batch_dones, dtype=torch.float32).view(-1, 1)

        q_values = model(batch_states)
        next_q_values = model(batch_next_states)
        target_q_values = batch_rewards + gamma * (1 - batch_dones) * torch.max(next_q_values, dim=1, keepdim=True)[0]

        q_values = q_values.gather(1, batch_actions)
        loss = criterion(q_values, target_q_values.detach())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epsilon = max(epsilon * epsilon_decay, epsilon_min)

  print("Training completed!")
  print("---------------------------------")

In [13]:
# method to get the actions for the agent
def get_action_str(action):
  actions_str = ["Move North", "Move South", "Move East", "Move West", "Place Bomb"]
  return actions_str[action]

In [14]:
# train the QLearningModel
env = CoinConquestEnv()
dqnModel = QLearningModel(env.board_size ** 2, 5) # 4 => valid actions
train_dqn(dqnModel, env)

# save the model for later use
torch.save(dqnModel.state_dict(), "coin_conquest_dqn_model.pt")

---------------------------------
Agent Learning


Training Progress:   1%|▏                    | 11/1000 [00:00<00:09, 106.82it/s]

Cell Value 8
Board [ 9  3  1  0  0  9 -1  9  0  0  0  7 -1  0  1  1]
Cell Value 9
Board [ 9  3  1  0  0  9 -1  9  0  0  0  7 -1  0  1  1]
Cell Value 0
Board [ 9  3  1  0  0  9 -1  9  0  0  0  7 -1  0  1  1]
Cell Value 9
Board [ 9  3  1  0  0  9 -1  9  0  0  0  7 -1  0  1  1]
Cell Value 9
Board [ 9  3  1  0  0  9 -1  9  0  0  0  7 -1  0  1  1]
Cell Value 3
Board [ 9  9  1  0  0  9 -1  9  0  0  0  7 -1  0  1  1]
Cell Value 9
Board [ 9  9  1  0  0  9 -1  9  0  0  0  7 -1  0  1  1]
Cell Value 9
Board [ 9  9  1  0  0  9 -1  9  0  0  0  7 -1 -1  1  1]
Cell Value 0
Board [ 9  9  1  0  0  9 -1  9  0  0  0  7 -1 -1  1  1]
Cell Value -1
Board [ 9  9  1  0 -1  9 -1  9  0  0  0  7 -1 -1  1  1]
Cell Value 9
Board [ 9  9  1  0 -1  9 -1  9  0  0  0  7 -1 -1  1  1]
Cell Value -1
Board [ 9  9  1  0 -1  9 -1  9  0  0  0  7 -1 -1  1  1]
Cell Value -1
Board [ 9  9  1 -1 -1  9 -1  9  0  0  0  7 -1 -1  1  1]
Cell Value 9
Board [ 9  9  1 -1 -1  9 -1  9  0  0  0  7 -1 -1  1  1]
Cell Value 9
Board [ 9  9  1 -1

Training Progress:   6%|█▎                   | 65/1000 [00:00<00:04, 229.62it/s]

Cell Value 5
Board [ 5  3  0  7  0  4 -1  3  7  0  0  0  1  8  0  0]
Cell Value 0
Board [ 6  0  6  0  0  4  4  0  0 -1  1  5  0  0  8  7]
Cell Value 0
Board [ 0  0  0  7  3  4  8 -1  0  0  9  6  0  9  0  3]
Cell Value 2
Board [ 0  0  0  0 -1  0  7  0  2  3  0  6  9  7  5  3]
Cell Value 0
Board [ 5  0 -1 -1  0  4  0  0  3  4  1  5  6  1  0  0]
Cell Value 0
Board [ 6  1  0  1 -1  2  8  0  0  2  9  0  0  0  1  0]
Cell Value 0
Board [ 4  0  0  6 -1  4  2  0  0  3  0  0  7  1 -1  3]
Cell Value 0
Board [ 6 -1  0  0  6  0  9  0 -1  5  3  0  2  4  0  4]
Cell Value 5
Board [ 0  5  0  2  4  4  6  6  0  0  1  8  0 -1  0  0]
Cell Value 0
Board [ 0  4  0  4  8  0  7  0  0  0  6  2 -1  9  0  2]
Cell Value 0
Board [ 0  3  3 -1  0  7  0  0  0  0  8  2  4  1  8  0]
Cell Value 0
Board [ 9  0  9  4  8  7  4  8  0  0  0 -1  0  6  0  0]
Cell Value 6
Board [ 0  0  9  4 -1  0  0  4  2  0  0  5  9  1  9  0]
Cell Value 9
Board [ 4  0  0  1  0  1  4  0  8 -1  0  8  2  0  0  8]
Cell Value 0
Board [-1  7  0  9  0

Training Progress:  12%|██▍                 | 122/1000 [00:00<00:03, 255.23it/s]

Cell Value 0
Board [ 5  0  0  7  6  2  0  0  0  4  8  0  0  1  5 -1]
Cell Value 8
Board [-1  7  7  3  0  5  2  0  0  1  0  0  0  0  6  8]
Cell Value 4
Board [ 0  0  6  2  0  9  4  5  0  0  6 -1  3  2  0  0]
Cell Value 4
Board [ 0  0  5  3 -1  1  0  9  0  4  0  2  0  0  4  5]
Cell Value 5
Board [ 4  0  0  6  0  0  4  2  0  0  6 -1  0  2  9  4]
Cell Value 0
Board [ 0  5  9  0  0  0  0  5  3  0  0  4  2  4 -1  2]
Cell Value 6
Board [ 4  3  0  7  3  0  0  5  0  6  0  7 -1  0  0  7]
Cell Value 0
Board [ 0  0  0  0  7  0  7  1  2  0  2  6  0 -1  9  9]
Cell Value 8
Board [ 3  3  0  6 -1  0  0  0  0  6  0  0  5  4  7  1]
Cell Value 0
Board [ 4  4  7  0  5  0  0  0  2  1  4 -1  8  0  0  0]
Cell Value 0
Board [ 8  7  5  0  0  0  0 -1  7  0  1  3  2  0  6  0]
Cell Value 0
Board [ 0  1  3  0  8  0  5 -1  0  1  5  3  0  6  0  0]
Cell Value 0
Board [ 4  8  7  0  0  0  0  7  0  0  3  5  2 -1  0  7]
Cell Value 0
Board [ 0  2  6  0  4  0  3  0  6  0 -1  2  9 -1  1  0]
Cell Value 8
Board [ 8  6  9  9  0

Training Progress:  17%|███▍                | 173/1000 [00:00<00:03, 246.41it/s]


Board [ 8  0  0  0  0  8  0  0  1  5  7  7  0  4 -1  9]
Cell Value 0
Board [ 1  0 -1  0  0  1  9  9  0 -1  8  0  0  1  3  5]
Cell Value 6
Board [-1  5  7  8  8  0  0  9  0  0  0  5  0  0  1  7]
Cell Value -1
Board [ 4  8  7 -1  3  0  0  1  6  0  0 -1  9  0  2  0]
Cell Value 8
Board [-1  5  0  8  7  8  0  0  6  0  0  9  7  0  0  7]
Cell Value 0
Board [ 3 -1  1  0  5  0  0  0  0  3  9  6  0  5  8 -1]
Cell Value 0
Board [ 4  4  9  0  1  1  0  0  0  0  9  8  5  0  0 -1]
Cell Value 6
Board [ 0  3  0  0  0  3  0  6  4 -1  8  1 -1  3  6  0]
Cell Value 0
Board [ 0  0  2  0  9  4  9  0  2 -1  8  6  0  7  0  0]
Cell Value 0
Board [ 1  0  0  3  5  0 -1  0  6  1  3  3  3  0  0 -1]
Cell Value 0
Board [ 5  9  0  0  4  2  0  7  2  3  0  0  0  0  2 -1]
Cell Value 5
Board [ 0  0  0 -1  5  2  9  7  0  6  7  1  0  0  4  0]
Cell Value 0
Board [ 0  2  0  0  0  1  7  0 -1  0  4  6  3  0  3  7]
Cell Value 2
Board [ 0  9  0 -1  0  0  6  9  7  0  9  0  6  0  3  8]
Cell Value 5
Board [ 0  0 -1  7  1  0  0  7  

Training Progress:  23%|████▌               | 226/1000 [00:00<00:03, 251.35it/s]

Cell Value 0
Board [ 9  5  8  0  5  0  2  6  6 -1  0  6  0  0  0  0]
Cell Value -1
Board [ 2  0  3  5  0 -1  6  5  0  0  3  0  7  4  0 -1]
Cell Value 3
Board [ 0  5  0  0  9  0  6  8  8 -1  1  0  1  0 -1  8]
Cell Value 0
Board [ 8  0  2  0 -1  7  0  7  2  3  1  4  0  0  0  0]
Cell Value 0
Board [-1 -1  3  6  7  0  4  0  3  0  0  5  2  6  0  0]
Cell Value 1
Board [-1  2  6  4  9  0  0  2  0  0  0  0  2  1 -1  6]
Cell Value 9
Board [ 0 -1  1  4  0  0  8  6  6  6 -1  0  0  0  4  6]
Cell Value 2
Board [-1  0  0  9  6  0  0  6  5  1  0 -1  2  7  0  7]
Cell Value 7
Board [ 0  0 -1  3 -1  8  0  0  1  2  6  0  2  0  4  6]
Cell Value 0
Board [ 2  6 -1  0  0  3  2  4  0  7  8  0  0  5  0  0]
Cell Value 0
Board [ 0  0  1  0  1 -1  3  0  6  7  0  4  1  3  0  0]
Cell Value 9
Board [ 0  1  3  7 -1 -1  0  5  0  2  0  6  2  0  0  6]
Cell Value 0
Board [ 0 -1  9  7  0  4  0  0  0  6  5  0  3  3  2  0]
Cell Value 8
Board [ 5  6  0  6  8  0  0 -1  8  4  1  0  0  0  0  8]
Cell Value 0
Board [ 1  0  0  1  

Training Progress:  28%|█████▌              | 279/1000 [00:01<00:02, 251.03it/s]

0
Board [ 0  9  0 -1  4  0  2  0  9  9  3  5  5  0  0  0]
Cell Value 5
Board [ 0  1  3  0 -1  0  0  0  0  9  3  2  7  0  9  9]
Cell Value 0
Board [ 0  0  0  8  6  1  0  2  5  0  6 -1  0  1  6  0]
Cell Value 7
Board [ 0  0  0  7  0 -1  6  6  9  4  0  3  0  2  0  7]
Cell Value 9
Board [ 0  5  0  0  0  1  0  0  5  0  9 -1  4  5  6  6]
Cell Value 1
Board [-1  5  8  9  4  0  0  0  6  0 -1  4  0  0  8  8]
Cell Value -1
Board [ 0  4  7  8  5  0 -1  7  0  0  0  0  3  2  4 -1]
Cell Value 2
Board [ 0  0  0  0  2  0  7  0 -1  4  0  3  8  5  3  7]
Cell Value 0
Board [ 0  8  6  1  2  6  0  1  0  7  0  0 -1  0  1  0]
Cell Value 0
Board [ 2  0 -1  1  4  1  0  5  0  3  0  0  4  1  0  0]
Cell Value -1
Board [ 0  1  0  0  2  0  9  4  5  4  0  0  8  7 -1  0]
Cell Value 0
Board [ 0  6 -1  7  0  7 -1  0  0  3  4  7  9  0  0  6]
Cell Value 7
Board [ 0  8  6  0  1  0  5  0 -1  0  9  0  0  3  1  8]
Cell Value 0
Board [ 8  0  4  0  0  5 -1  0  2  2  6  0  0  4  4  0]
Cell Value 3
Board [-1  9  0  1  0  7  0  2

Training Progress:  33%|██████▌             | 330/1000 [00:01<00:02, 243.45it/s]

Cell Value 0
Board [ 9  8  0  7  4  0  0  1  0  0  7  8  0 -1  5  0]
Cell Value 8
Board [ 2  9  0  9  6  2  9  0  0  0  0 -1 -1  0  3  9]
Cell Value 0
Board [ 1  2  0  2  1  0  0  0  1  9  2 -1  0  0  8  0]
Cell Value 0
Board [ 0  7  0  0  2 -1  0  7  0  3  4  6  8  1  0  0]
Cell Value 0
Board [ 0  4  0  0  0 -1  2  0  7  1  1  9  9  0  7  0]
Cell Value -1
Board [ 4  9  0  7  0  7  0  2  7  0  0  0  0  3  3 -1]
Cell Value 4
Board [ 2  0  8  0  0  5 -1  4  6  7  0  0  0  0  4  7]
Cell Value 0
Board [ 0  0  8  6  2  7 -1  4  3 -1  1  0  0  5  0  0]
Cell Value 0
Board [ 4 -1  2  0  0  0  4  0  2  1  7  0  1  9  0  0]
Cell Value 6
Board [-1  0  5  0  2  0 -1  3  2  1  0  2  0  6  0  5]
Cell Value -1
Board [ 6  4  1  0  1  0  4  0  6  0  0  8  1  0  0 -1]
Cell Value 3
Board [ 1  0  0  0  2  4  3 -1  0  3  0  0  0  8  5  5]
Cell Value 0
Board [ 7  0  2  9  6  0  2  0  6  3  0  0 -1  4  0  0]
Cell Value 8
Board [ 0  0  0  4  8  5  9  0  9  4  7  0  0 -1  0  9]
Cell Value -1
Board [ 9  0  0  8

Training Progress:  38%|███████▌            | 381/1000 [00:01<00:02, 246.97it/s]

Cell Value 2
Board [ 1  9  0  9  0  0  0 -1  0  0  0  1  9  1  8  1]
Cell Value 8
Board [ 0  3  0  9  9 -1  0  0  0  8  0  4  7  9  9  0]
Cell Value 0
Board [ 0  0  0  0  0  2  4  0  5  5  2  3  3 -1  0  2]
Cell Value 2
Board [ 9  5  0  0  8  0  6 -1  9  0  0  9  3  3  0  0]
Cell Value 1
Board [ 0  0  0  0 -1  9  3  2  0  0  2  7  7  3  0  7]
Cell Value 3
Board [ 0  5  9  9  2  0  7 -1  0  0  0  9  0  0  1  1]
Cell Value 0
Board [ 7  0  7  0  8  0  5  1  9  0 -1  0  0  2  8  0]
Cell Value 9
Board [ 9  1  1  7  0 -1  9  0  0  0  0  9  2  2  0  0]
Cell Value 5
Board [ 5  0  6  0  2 -1  0  0  0  5  0  7  7  0  8  1]
Cell Value 0
Board [ 5  2  0  0 -1  0  5  0  0  0  8  0  4  7  7  9]
Cell Value 4
Board [ 0  8  0  8  3  0  5  1  3  0  7  0  0 -1  8  0]
Cell Value 3
Board [ 7  0  7  0  3  9  3  0  5  0 -1  0  1  4  0  0]
Cell Value 0
Board [-1  0  5  0  0  9  7  0  0  5  9  1  0  0  7  9]
Cell Value 0
Board [ 7  8  8  0  3  0  0  0  3  4 -1  0  4  0  4  0]
Cell Value 0
Board [ 3  0  0  0  0

Training Progress:  43%|████████▌           | 431/1000 [00:01<00:02, 245.98it/s]

Cell Value 2
Board [ 0  0  6  8  0  0  6  0 -1  4  9  0  8  0  8  5]
Cell Value 3
Board [ 2  0  7  7  0  7  0  0 -1  0  3  5 -1  0  2  6]
Cell Value 0
Board [-1  5  8  0  6  0  0  3  0  5  0  8  0  8  4  0]
Cell Value 0
Board [ 7  9  0  0  7  8  0  0  6  0  7 -1  0  2  9  0]
Cell Value 0
Board [ 9  0 -1  0  5  0  4  3  5  0  5  6  9  0  0  0]
Cell Value 5
Board [ 8  8 -1  8  0  0  0  0  1  0  4  7  6  0  9  0]
Cell Value 0
Board [ 5  0  7  0  0 -1  0  9  4  4  0  0  4  8  2  0]
Cell Value 0
Board [ 1  2  0  0 -1  0  7  4  8  0  0  3  8  8  0  0]
Cell Value 0
Board [ 0  0 -1  0  0  0  7  5  0  2  0  8  9  7  9  9]
Cell Value 5
Board [ 0  2  0  0  7  9  0  7  2  0  3  0  1  5  0 -1]
Cell Value 0
Board [ 0  2  4  0  0  0  4  0  3  0  2  5  8  0  4 -1]
Cell Value 5
Board [ 6  7  8  7  0  5  1  0  7  0  0  4  0  0  0 -1]
Cell Value 9
Board [ 0  0  8  8  9 -1  2  3  0  0  0  0  4  0  3  3]
Cell Value -1
Board [ 0  7  0  0  0  0  2 -1  0  0  7  2  6  5  9  9]
Cell Value 7
Board [ 1  0  0  2  

Training Progress:  48%|█████████▋          | 484/1000 [00:02<00:02, 242.09it/s]

Cell Value 3
Board [ 0  0 -1  0  0  0  1  6  0  5  3  0  3  7  3  6]
Cell Value 0
Board [ 8  0  1 -1  0  0  2  7  0  6  4  0  6  0  0  2]
Cell Value 0
Board [ 0  0  1  8  4  0 -1  1  0 -1  7  0  1  8  6  0]
Cell Value 6
Board [ 0  0  0  2  2  0  0 -1  9  8  9  1  0  4  0  9]
Cell Value 6
Board [ 0  2 -1  9  6  0  5  0  0  8  0  4  1 -1  0  8]
Cell Value 0
Board [ 6  4  0  6  0  0  8  4  0  5 -1  0  5  0  0  1]
Cell Value 6
Board [ 0  6  4 -1  0  9  1  6  0  0  6  0  4  0  1  0]
Cell Value 2
Board [ 4  0  0  0 -1  0  4  2  0  4  0  4  2  5  0  4]
Cell Value 9
Board [ 1  0  9  4  0  0  0  1  0  2  4 -1  5  0  0  5]
Cell Value 4
Board [-1  4  0  5  8  6  0  0  3  0  8  8  0  0  0  8]
Cell Value 0
Board [ 0  0  0  0  9  4  4  5  0  6  4  8 -1  4  0  0]
Cell Value 5
Board [ 9  0  0  0  0  1  4  0  1 -1  6  0  7  0  7  3]
Cell Value 0
Board [ 6  0  6  8  0  5  0  0  0  7  0  4  2  0 -1  7]
Cell Value 4
Board [ 2  4  1  7  0 -1  7  1  0  0  0  9  7  0  0  0]
Cell Value 0
Board [ 9  0  0  9  0

Training Progress:  53%|██████████▋         | 534/1000 [00:02<00:01, 242.72it/s]

Cell Value 2
Board [ 6  0  4  9  2  4  6  0 -1  5  6  0  0  0  0  0]
Cell Value 0
Board [ 7  1  0  2  0  0  0  2  9  2  0  3  2  0  0 -1]
Cell Value -1
Board [-1  1  5  0  4  0  0  0  1  0  0  2  2  5  2  0]
Cell Value 2
Board [ 0  2  0  2  7  0  0  8  7  7  4 -1  3  0  0  0]
Cell Value 9
Board [ 0  0  0  0  2  2  1  7  7  0  0 -1  2  5  0  8]
Cell Value 6
Board [-1  0  0  0  9  1  0  5  1  9  0  2  0  0  9  1]
Cell Value 0
Board [-1  0  0  7  9  0  8  2  8  4  0  0  9  3  0  0]
Cell Value 0
Board [ 2  0  0  6  0  0  3  9  2  0  0  8 -1  5 -1  5]
Cell Value 4
Board [-1  4  0  7  0  0  0  0  6  7  5  8  0  5  0  7]
Cell Value 4
Board [ 8  2  5  1  0  0  3  0  0 -1  0  8  0  8  0  5]
Cell Value 0
Board [ 0  0  8  9  4  0  9 -1  0  0  0  3  3  4  4  0]
Cell Value -1
Board [ 4  0  8  0  2  0  2  5 -1  1  4  0  0  0  0  1]
Cell Value 4
Board [ 0  7  8  3  8  5 -1  0  0  0  0  1  1  0  8  0]
Cell Value 0
Board [ 0  9  6  0  0  9  4  0  9  9  4  0 -1  2  0  0]
Cell Value 5
Board [ 2  0  0  0 

Training Progress:  59%|███████████▊        | 588/1000 [00:02<00:01, 254.45it/s]

Cell Value 5
Board [ 6  0  1  8  0  3  5  0  0  8  8  0  0  0  3 -1]
Cell Value 6
Board [ 1  2  5  0  5  3 -1  5  0  3  0  0  0  4  0  0]
Cell Value 3
Board [ 0  0  8  6 -1  0  5  8  0  3  0  0  6  0  8  7]
Cell Value 4
Board [ 8  0  4  8  0  6  0 -1  0  0  0  0  8  5  8  8]
Cell Value 0
Board [ 3  0  0  0  0 -1  5  0  7  0  8  3  3  0  2  8]
Cell Value 5
Board [ 0  0  0  7  7  9  2  0  0  0 -1  1  1  1  6  0]
Cell Value 6
Board [ 1  0  8  8  9  0  4  2  0  2  0  0 -1  2  0  0]
Cell Value 3
Board [ 9  6  3  6  0  0  0  5  0  0  0  1  0 -1  3  1]
Cell Value 0
Board [-1  8  6  0  0  8  0  9  7  4  0  0  3 -1  0  5]
Cell Value 9
Board [ 6  0  0  7  1  0 -1  0  3  0  0  4  4  8  7  0]
Cell Value 8
Board [ 0  0  0  0  2  0  2  5  5 -1  1  0  4  0  3  6]
Cell Value 0
Board [ 8  0  0  0  0  3  0  0  4  5  4  4  5  0  5 -1]
Cell Value 8
Board [ 8  0 -1  9  2  0  2  9  9  0  5  0  9  0  0  0]
Cell Value 0
Board [ 7  0  2  0  7  0  9  9 -1  0  2  0  0  6  1  0]
Cell Value 0
Board [ 5  1  0 -1  0

Training Progress:  64%|████████████▊       | 640/1000 [00:02<00:01, 244.64it/s]

Cell Value -1
Board [ 0  0  0  0  0  6  4  0  5  8 -1  4  5  5  0  3]
Cell Value 6
Board [ 0 -1  1  0  9  0  2  0  0  8  0  0  9  1  9  8]
Cell Value 0
Board [ 4  8 -1  8  0  6  0  0  9  4  0  9  0 -1  0  8]
Cell Value 3
Board [ 2  0  0  1  1  0 -1  0  1  0  7  0  0  1  3  3]
Cell Value 9
Board [ 0  0  4  4  6  0  0  0  3  3  4  0 -1  3  1  0]
Cell Value 0
Board [ 5  5  9  0  0  0  0  0  6 -1  1  1  4  0  6  0]
Cell Value 3
Board [ 8  0  7  3  0 -1  0  8  0  0  5  9  0  2  1  0]
Cell Value 9
Board [ 2  0  0  8  1  0  8  0  7  0  0  8 -1  0  5  5]
Cell Value 1
Board [ 7  0  4  8  0  0  0  0  3  8  8  0  6  4 -1  0]
Cell Value 0
Board [ 2  0  2 -1  7  8  0  0  6  2  0  5  0  0  5  0]
Cell Value 6
Board [ 0  9  0  4  2  1  0  0  1  0  3  4  0  3  0 -1]
Cell Value 0
Board [ 7  0  1 -1  1  0  0  0  1  5  4  0  4  8  0  0]
Cell Value 7
Board [ 0  2  1  6 -1  0  0  1  0  5  4  0  6  0  7  0]
Cell Value 3
Board [ 2  7  0  7  0  0  4  7  0  3  0  0  0  5 -1  3]
Cell Value 2
Board [ 7  0  5  6  

Training Progress:  70%|█████████████▉      | 696/1000 [00:02<00:01, 258.09it/s]

Cell Value 9
Board [ 2  0  0  0  0 -1  5  0  0  9  9  0  9  6  5  4]
Cell Value 0
Board [ 0  0  1  0  0  0  6 -1  5  8  0  8  4  0  3  2]
Cell Value 3
Board [ 0  8  8  2  0  0  0  2  1  4  0  0  8 -1  0  4]
Cell Value 9
Board [ 5  0  0  0  0  5 -1  1  3  5  0  1  5  0  0  7]
Cell Value 7
Board [-1  7  0  8  0  0  4  0  6  0  0  2  3  0  1  7]
Cell Value 0
Board [ 0  1  4 -1  0  0  7  8  0  7  9  9  0  0  0  6]
Cell Value 0
Board [ 0  0  0  1  6  0  9  0  0  6  0  1  9 -1  5  1]
Cell Value 5
Board [ 4  7  9  6  0  2  0  0  7  0  6  0 -1  0  3  0]
Cell Value 5
Board [ 2  7  0  3  0  5  4  0  0  0  7  7  5 -1  0  0]
Cell Value 1
Board [ 1  7  7  4  4  0  5  0  0  0  0  3  0  0  7 -1]
Cell Value 0
Board [ 3  0  0  4  0  2  0  0  7 -1  2  0  3  0  7  6]
Cell Value 3
Board [ 5  0  0  8  8  8  6  0  0  0  2  0  0 -1  6  4]
Cell Value 9
Board [ 0 -1  5  0  6  1  0  6  1  3  9  0  0  0  1  0]
Cell Value 3
Board [ 0  8  0  8  0  0  0  7  4  9 -1  5  5  0  1  0]
Cell Value 8
Board [ 8  0  5  7  0

Training Progress:  75%|███████████████     | 752/1000 [00:03<00:00, 259.84it/s]

Cell Value 9
Board [ 3  4  5  2  0 -1  0  0  0  2  7  0  0  0  5  3]
Cell Value 0
Board [ 3  6  0  0  0  0  7  0  0  8  3  3  2 -1  0  7]
Cell Value 0
Board [ 6  0  0  0  7  0 -1  0  8  3  3  0  0  3  7  2]
Cell Value 7
Board [ 0  1  7  1  0  0  0  0 -1  4  7  0  6  0  9  1]
Cell Value 0
Board [ 0  0  0  0  7 -1  9  8  7  0  9  6  4  0  4  0]
Cell Value 1
Board [ 0  0  8  8  0  0  0  7  6  3 -1  4  9  0  1  0]
Cell Value 0
Board [ 7 -1  0  6  4  6  0  5  6  0  0  6  0  2  0  0]
Cell Value 0
Board [ 0  0  0  8  7  8  0  4  0  9  3  0  0  8  9 -1]
Cell Value 0
Board [ 6  0  0  0 -1  0  6  0  4  1  0  5  0  7  7  6]
Cell Value 2
Board [ 0  5  6  6  9  0  6  0  3  5  0  0  6 -1  0  0]
Cell Value 1
Board [ 2  0  0  5  4  0  7  3  0 -1  0  2  5  4  0  0]
Cell Value 0
Board [ 3  0  2  0  0  0  8  4  4 -1  0  8  4  0  1  0]
Cell Value 8
Board [ 0  0  0  8  0  2  7  8  3  5  0 -1  0  0  9  4]
Cell Value 0
Board [-1  0  1  0  1  9  9  0  0  0  8  0  4  3  9  0]
Cell Value 0
Board [ 1  6  7  0 -1

Training Progress:  81%|████████████████▏   | 807/1000 [00:03<00:00, 258.92it/s]

Cell Value 0
Board [ 2  7  0  0  0  0  5  0  6  0  7 -1  7  0  5  9]
Cell Value 5
Board [ 7  0  0  0 -1  1  5  0  0  5  7  7  0  4  4  0]
Cell Value 7
Board [ 8  9  0  0  8  8  0  6  7  7  0  0 -1  0  0  6]
Cell Value 3
Board [ 0  0  0  1  2  4  0  0  8  8  4  0  2  0 -1  9]
Cell Value 0
Board [ 0  0  9  0  0  5  9  6  0  5  0  3  0 -1  6  3]
Cell Value 0
Board [ 8  9  5  0  9  0  0  0  0  0  9  9  0  6 -1  7]
Cell Value 2
Board [ 6  0  0  0  6  6  0 -1  8  8  8  0  9  0  8  0]
Cell Value 0
Board [ 4 -1  7  0  2  0  0  0  1  8  2  0  0  8  0  7]
Cell Value 5
Board [ 0  2  0  9  5  0  1  0  7  0  5 -1  7  0  0  6]
Cell Value 4
Board [ 0  1  1  0 -1  0  8  0  4  2  0  8  8  0  0  6]
Cell Value 0
Board [ 2  0  5  0  4  1  0  2  0  0  5 -1  0  7  4  0]
Cell Value 0
Board [ 9  0  0  0 -1  5  0  1  0  7  3  0  5  5  5  0]
Cell Value 5
Board [ 2  4  8  7  7  5  0  0 -1  0  0  0  0  1  1  0]
Cell Value 5
Board [ 5  0  9  0  8 -1  4  0  7  4  2  0  0  8  0  0]
Cell Value 0
Board [ 0  0  0  5  1

Training Progress:  86%|█████████████████▏  | 860/1000 [00:03<00:00, 260.19it/s]

Board [ 0  0  7 -1  0  1  0  9  0  0  5  6  0  7  3  7]
Cell Value 8
Board [ 0  0 -1  6  5  2  0  3  5  0  2  0  8  0  0  6]
Cell Value 7
Board [ 0  7  7  7  0  6  0  2 -1 -1  0  7  0  0  4  7]
Cell Value 0
Board [ 0  7  2  7  1  0  7  0  0  5  0  0  1  4  0 -1]
Cell Value 7
Board [ 8  0  0  8  8  0  3 -1  2  4  0  8  0  0  5  0]
Cell Value 5
Board [ 8  4  0  8  9  0  0  6  5  0  2  8  0 -1  0  0]
Cell Value 5
Board [ 8  0  0 -1  0  0  7  8  0  9  8  0  3  0  6  5]
Cell Value 0
Board [ 4  1  0  0  0  0  0  1  1  3  0  5  9 -1  0  7]
Cell Value 6
Board [ 0  7  1  6  5  2  0 -1  0  0  7  0  0  5  0  7]
Cell Value 4
Board [ 0  0  8  5  3  0  0  0  4  0  1  6  4 -1  0  6]
Cell Value 7
Board [ 7  3  4 -1  6  0  0  0  7  0  4  9  0  0  7  0]
Cell Value 4
Board [ 8 -1  0  2  0  8  9  3  7  0  8  0  0  8  0  0]
Cell Value 0
Board [ 8  6  8  8  0  8  0  0  0  1  6 -1  2  0  0  0]
Cell Value 8
Board [ 2  6  0  0  5 -1  0  8  7  9  8  0  0  0  0  6]
Cell Value 0
Board [ 5  0  1  0  7 -1  7  5  0 

Training Progress:  91%|██████████████████▎ | 913/1000 [00:03<00:00, 259.31it/s]

 [ 0  8  0  0  3  0  8  0  2 -1  0  4  1  0  3  2]
Cell Value 0
Board [ 0  9  0  0 -1  8  0  2  1  2  0  1  6  1  0  0]
Cell Value 0
Board [ 0  0  2  0  0  5 -1  3  4  0  7  1  9  7  0  0]
Cell Value 0
Board [ 3  0  4  3  0  2 -1  0  0  6  7  7  0  8  0  0]
Cell Value 9
Board [ 7  0  0  0  3  7  3  5  0  0 -1  4  0  7  3  0]
Cell Value 8
Board [ 0 -1  0  7  0  8  8  4  0 -1  3  1  0  0  2  8]
Cell Value 5
Board [ 0  9  0  9  1 -1  0  0  9  1  6  9  0  0  1  0]
Cell Value 5
Board [ 7  7  8  0  0  0 -1  0  7  4  0  0  7  5  5  0]
Cell Value 0
Board [ 4  0  2  1  0  0  2  0  5  4  0  0  5  2  0 -1]
Cell Value 8
Board [ 0  7  0  5  0 -1  6  7  0  0  2 -1  7  0  2  9]
Cell Value 2
Board [ 7  7  0  0  8  0  1  5  2  0  5  0  6  0 -1  0]
Cell Value 0
Board [ 0  4  8  0  0  0  4  6  1  0  5  4  0  7 -1  0]
Cell Value 5
Board [ 8  2  0  0  9  0  8  3 -1  4  0  8  0  7  0  0]
Cell Value 0
Board [ 1  5  9  0  0  0  1  2  3  0  4  0 -1  0  0  8]
Cell Value 1
Board [ 4  2  3  2  0  0 -1  5  4  2  0

Training Progress:  96%|███████████████████▎| 965/1000 [00:03<00:00, 249.62it/s]

Cell Value 9
Board [ 6  0  5  0 -1  6  0  2  2  0  2  0  9  0  6  0]
Cell Value 0
Board [ 5  0  2  0 -1  0  8  4  0  0  0  7  8  0  2  6]
Cell Value -1
Board [-1  2  4  0  5  6  0  6  0  1  6  0  0  1  0  0]
Cell Value 5
Board [ 9  8  4  6  0  0  0  0  0 -1  7  0  8  9  9  0]
Cell Value 5
Board [ 8  0  0  5  8  7  0  0  9  0  1  8  0  1 -1  0]
Cell Value 0
Board [ 0  1  6  3 -1  0  2  0  0  7  0  8  0  0  1  2]
Cell Value 4
Board [ 9  0  9  0  9  0  0 -1  4  0  4  9  9  0  7  0]
Cell Value 3
Board [ 7  7  0 -1  0  0  1  0  4  5  3  2  7  0  0  0]
Cell Value 0
Board [ 0  0 -1  8  0  0  6  6  2  0  5  8  0  9  6  0]
Cell Value 0
Board [ 0  1  6  8  5  5  0  8 -1  0  0 -1  0  0  3  8]
Cell Value 0
Board [ 0  9  3  2 -1  0  9  0  0  1  5  0  6  0  0  4]
Cell Value 0
Board [ 1  0  0 -1  0  7  1  0  0  0  2  3  8  9  0  9]
Cell Value 0
Board [ 7  3  2  0  0  0  0  2  3  0  0  5 -1  1  0  2]
Cell Value 7
Board [ 0  0  0  4  0  5  0  0  5  7  5  1  0  9 -1  7]
Cell Value 2
Board [ 0  4  4  0  

Training Progress: 100%|███████████████████| 1000/1000 [00:04<00:00, 248.31it/s]

Cell Value 7
Board [ 0  0  0  0  1  4  6  9  0  6  4 -1  1  0  0  2]
Cell Value 0
Board [ 4 -1  2  0  0  7  3  7  1  0  0  8  0  0  5  0]
Cell Value 0
Board [ 0  6  2  0  0  0  1  0  0  3  3  9  9 -1  0  5]
Cell Value 6
Board [ 4  0  4  0  6  0  0 -1  0  0  6  0  9  9  9  2]
Cell Value 0
Board [ 7  0  0  8  7  0  8  1  4  0  0  0  0  3  1 -1]
Cell Value -1
Board [ 4  9  0  3  2  0  0  0  1  6 -1  9  0  0  0  2]
Cell Value 0
Board [ 1  0  5  3  0  8  0  0  0 -1  5  3  9  0  0  8]
Cell Value 3
Board [ 7  8  6  4  0 -1  5  0  4  0  7  0  0  0  3  0]
Cell Value 0
Board [ 9  0  7  0  9  2  2  7  2  0  0  0 -1  0  0  9]
Cell Value 0
Board [ 7  0  0  2  3  0  0  0  1  6  8  1  3  0  0 -1]
Cell Value 9
Board [ 0  6  2  1  5  0  0 -1  9  5  0  0  8  8  0  0]
Cell Value 3
Board [ 0  0  3  1  0  0  2  0  2 -1  9  0  8  8  0  4]
Cell Value 0
Board [ 8  2  9  2  8  0  0  0  0  1  0 -1  0  0  3  8]
Cell Value -1
Board [ 7  6  5  5  0  0  0  0 -1  0  4  8  0  2  7  0]
Cell Value 0
Board [ 5  9  9  0 




In [15]:
# build an game play module
def GameModule(env, agent):
  # validate the agent and its working
  player = "Agent"

  # intialize the state with reset the environment
  # state = env.reset()

  print("----------------------------")
  print("Agent starts playing the coin conquest game")

  while True:
    print(env.board)
    state = env.board.flatten() # pass the board values as an 1-D array
    state = torch.tensor(state, dtype = torch.float32).view(1, -1)
    # get the q values
    qValues = agent(state)
    # get the action and perform it on the environment
    action = torch.argmax(qValues, dim = 1).item()
    print(action)
    move = get_action_str(action)  # it would be the data type of string

    print(f"{player} moves in {move} direction to get the maximum reward")

    # chosen action and moves outcome
    next_state, reward, done = env.step(action)

    print("Reward", reward)

    # change the next state
    # env.board = next_state

    if done:
      # Game over, calculate the winner and score
      winner = "AI" if np.sum(env.board) > 0 else "Random AI"
      score = np.sum(env.board[env.board > 0])
      print(f"Game Over! Winner: {winner}, Coin: {reward}")
      break

  print("Agent ends the game")
  print("----------------------------")

In [16]:
# call the main method
if __name__ == "__main__":
  # load the trained agent
  agent = QLearningModel(env.board_size * env.board_size, 5)
  agent.load_state_dict(torch.load("coin_conquest_dqn_model.pt"))

  # evaluate the model
  agent.eval()

  # test the evaluation
  # print("Evaluation : ", agent.eval())
  GameModule(env, agent)

----------------------------
Agent starts playing the coin conquest game
[[ 0  0  0  4]
 [ 2  0 -1  4]
 [ 7  1  0  0]
 [ 4  6  0  6]]
1
Agent moves in Move South direction to get the maximum reward
Cell Value 6
Board [ 0  0  0  4  2  0 -1  4  7  1  0  0  4  6  0  6]
Reward 6
Game Over! Winner: AI, Coin: 6
Agent ends the game
----------------------------
