In [None]:
!pip install gym
!pip install tensorflow
!pip install stable-baselines

import numpy as np
import gym
from gym import spaces
from stable_baselines.common.env_checker import check_env
import matplotlib.pyplot as plt
from google.colab import files

# Battleship Environment




In [57]:
class BatteshipEnv(gym.Env):
  """Battleship board exposed through a gym-style ``step``/``reset`` interface.

  ``ships_board`` holds, for every cell, the health of the ship placed there
  (0 means no ship).  ``board`` is the playing board and stores, per cell:

  * ``0``                      -- the cell has not been shot yet,
  * ``1 .. ships_board[x, y]`` -- number of hits landed on the ship so far,
  * ``self.destroyed``         -- a fully hit ship cell that was shot again,
  * ``self.empty``             -- a shot that landed on water.

  The state handed to the agents is the tuple ``(x, y, value)`` describing
  the last cell shot and its value on ``board`` after the shot.

  NOTE(review): ``observation_space`` is declared as ``Discrete(n*n)`` while
  the states produced by ``step`` are 3-tuples; the declaration is kept as-is
  because the agents in this notebook never read it.
  """

  def __init__(self, grid_size, ships_board):
    """
    :param grid_size: side length n of the n*n board
    :param ships_board: n*n integer array with the health of each ship cell
    """
    super(BatteshipEnv, self).__init__()
    self.grid_size = grid_size  # With grid_size = n we have a board of n*n
    self.max_health = int(np.max(ships_board))  # The maximum health h among the ships
    self.observation_space = spaces.Discrete(self.grid_size * self.grid_size)  # Observation space
    self.action_space = spaces.Tuple((spaces.Discrete(self.grid_size), spaces.Discrete(self.grid_size)))  # Action space
    # A cell value ranges over 0..h plus the two markers below, i.e. h+3 values.
    self.nS = self.grid_size * self.grid_size * (self.max_health + 3)  # Size of state space n*n*(h+3)
    self.nA = self.grid_size * self.grid_size  # Size of action space n*n
    self.ships_board = ships_board  # Given board with placed ships
    self.board = np.zeros((self.grid_size, self.grid_size), dtype='int')  # Currently playing board
    self.s = (0, 0, 0)  # Current state: a tuple representing (x,y,health)
    self.number_of_hit_to_win = int(np.sum(self.ships_board))  # Sum of health of all ships
    self.number_of_hit = 0  # Current number of hits
    # The two markers must differ, otherwise a miss and a destroyed ship
    # collapse into the same state value.
    self.destroyed = int(self.max_health + 1)
    self.empty = int(self.max_health + 2)

  def calculate_reward(self, action):
    """ Return the reward for an action.

        The board is inspected as it currently is; ``step`` calls this after
        ``update_board``, so a shot whose cell now holds the empty or
        destroyed marker yields -1 and every other shot yields 0.
    """
    x = action[0]
    y = action[1]
    cell = self.board[x, y]
    if cell == self.empty or cell == self.destroyed:
      return -1
    return 0

  def update_board(self, action):
    """ Apply a shot at ``action`` = (x, y) to the board and return the new state.
    """
    x = action[0]
    y = action[1]
    cell = self.board[x, y]

    # Cells already marked empty/destroyed are final and stay untouched.
    if cell != self.empty and cell != self.destroyed:
      if self.ships_board[x, y] == 0:
        self.board[x, y] = self.empty
      elif cell < self.ships_board[x, y]:
        self.board[x, y] += 1  # One more hit
        self.number_of_hit += 1
      else:
        # The ship cell had already absorbed all its hits.
        self.board[x, y] = self.destroyed

    self.s = (x, y, int(self.board[x, y]))
    return self.s

  def step(self, action):
    """ input: action
        return: next_state, reward, done, info
    """
    next_state = self.update_board(action)
    reward = self.calculate_reward(action)
    done = bool(self.number_of_hit == self.number_of_hit_to_win)
    info = {}
    return next_state, reward, done, info

  def render(self, mode='human'):
    """ Print the playing board: 'O' for a miss, blank for an unshot cell,
        otherwise the numeric cell value.
    """
    # Each cell prints as " | " plus one character, followed by a closing " | ".
    separator = "-" * (4 * self.grid_size + 3)
    for i in range(self.grid_size):
      print(separator)
      line = ""

      for j in range(self.grid_size):
        line += " | "
        cell = self.board[i, j]
        if cell == self.empty:
          line += "O"
        elif cell == 0:
          line += " "
        else:
          line += str(int(cell))

      line += " | "
      print(line)

  def observe(self):
    """ Return the current state tuple (x, y, value). """
    return self.s

  def reset(self):
    """ Clear the playing board, the hit counter and the current state.

        return: the cleared playing board
    """
    # Same integer dtype as in __init__ so cell values stay integral.
    self.board = np.zeros((self.grid_size, self.grid_size), dtype='int')
    self.number_of_hit = 0
    self.s = (0, 0, 0)  # Do not leak the last state of the previous episode
    return self.board

def _plot_series(values, ylabel, algo_name, filename, download_plots):
  """Draw one per-episode curve and either show it or save and download it."""
  plt.plot(values, label=algo_name)
  plt.xlabel('Episode')
  plt.ylabel(ylabel)
  plt.legend()
  if download_plots:
    plt.savefig(filename, format='eps', dpi=1200)
    files.download(filename)
    plt.clf()  # Start the next curve on a clean figure
  else:
    plt.show()


def plot_results(list_cumulative_rewards, list_cumulative_steps, algo_name, download_plots):
  """Plot the cumulative reward and the number of actions per episode.

  :param list_cumulative_rewards: cumulative reward of each episode
  :param list_cumulative_steps: number of actions taken in each episode
  :param algo_name: algorithm name, used as legend label and file name prefix
  :param download_plots: if True, save each figure as EPS and download it
                         through ``files.download``; otherwise show it inline
  """
  # Each figure gets its own file name so the second one does not overwrite
  # the first one on disk.
  _plot_series(list_cumulative_rewards, 'Cumulative rewards',
               algo_name, algo_name + "_rewards.eps", download_plots)
  _plot_series(list_cumulative_steps, 'Number of actions to finish the game',
               algo_name, algo_name + "_steps.eps", download_plots)


# Q-Learning

In [75]:
class QLearning():
  """Tabular Q-learning agent with an epsilon-greedy policy.

  The Q-table has one row per flattened state (x, y, value) and one column
  per flattened action (x, y).  All environment dimensions are read from
  ``self.env`` so the agent does not depend on any module-level variable.
  """

  def __init__(self, env):
    """
    :param env: environment exposing ``nS``, ``nA``, ``grid_size``,
                ``max_health`` and ``observe()``
    """
    self.env = env
    self.learning_rate = 0.05
    self.epsilon = 0.9
    self.epsilon_decay_rate = 0.99
    self.discount = 0.5

    self.Q_values = np.zeros([self.env.nS, self.env.nA])

  def _state_index(self, state):
    """Flatten a state tuple (x, y, value) into its Q-table row index."""
    dims = (self.env.grid_size, self.env.grid_size, self.env.max_health + 3)
    return np.ravel_multi_index([state[0], state[1], state[2]], dims)

  def _action_index(self, action):
    """Flatten an action tuple (x, y) into its Q-table column index."""
    dims = (self.env.grid_size, self.env.grid_size)
    return np.ravel_multi_index([action[0], action[1]], dims)

  def update(self, state, action, reward):
    """ Update Q-table
        :param state: previous state before taking the action (x,y,h)
        :param action: action taken (x,y)
        :param reward: reward for taking action a from state s
    """
    # The state reached by the action is read back from the environment.
    idx_new_state = self._state_index(self.env.observe())
    idx_state = self._state_index(state)
    idx_action = self._action_index(action)

    # Update Q values
    current = self.Q_values[idx_state][idx_action]
    best_next = self.Q_values[idx_new_state].max()
    self.Q_values[idx_state][idx_action] = current + self.learning_rate * (reward + self.discount * best_next - current)

  def get_action(self, state):
    """ Return an action to take based on epsilon (greedy or random action)
        :param state: current state
        :return action: action to take in the next time step
    """
    # Choose random action or best action
    idx_state = self._state_index(state)
    random_number = np.random.uniform()
    if random_number < self.epsilon:
      return (np.random.randint(0, self.env.grid_size), np.random.randint(0, self.env.grid_size)) # Random action
    idx_action = np.argmax(self.Q_values[idx_state]) # Greedy action
    return np.unravel_index(idx_action, (self.env.grid_size, self.env.grid_size))

def test_Q_Learning(number_of_episodes, download_plots):
  """Train a Q-learning agent on the Battleship board and plot its progress.

  The board is built from the module-level ``grid_size`` and ``ships_board``.

  :param number_of_episodes: number of games to play
  :param download_plots: forwarded to ``plot_results``
  """
  env = BatteshipEnv(grid_size, ships_board)
  agent = QLearning(env)
  list_cumulative_rewards = []
  list_cumulative_steps = []
  for ep in range(number_of_episodes):
    sum_reward = 0
    done = False
    number_of_steps = 0
    is_last_episode = ep == number_of_episodes - 1
    while not done:
      current_state = env.observe()
      action = agent.get_action(current_state)
      _, reward, done, _ = env.step(action)
      agent.update(current_state, action, reward)
      if is_last_episode:
        # The action already is an (x, y) pair, so it is printed as such.
        print("Action: (" + str(int(action[0])) + ", " + str(int(action[1])) + ") Reward: ", reward)
      sum_reward += reward
      number_of_steps += 1
    list_cumulative_rewards.append(sum_reward)
    list_cumulative_steps.append(number_of_steps)
    print("Episode ", ep+1, " Cumulative reward: ", sum_reward, " Number of actions: ", number_of_steps)
    env.reset()
    # epsilon is decayed since the agent is having more and more knowledge
    agent.epsilon = agent.epsilon * agent.epsilon_decay_rate
  plot_results(list_cumulative_rewards, list_cumulative_steps, "Q-Learning", download_plots)


# SARSA

In [78]:
class Sarsa():
  """Tabular SARSA agent with an epsilon-greedy policy.

  The Q-table has one row per flattened state (x, y, value) and one column
  per flattened action (x, y).  All environment dimensions are read from
  ``self.env`` so the agent does not depend on any module-level variable.
  """

  def __init__(self, env):
    """
    :param env: environment exposing ``nS``, ``nA``, ``grid_size`` and
                ``max_health``
    """
    self.env = env
    self.learning_rate = 0.05
    self.epsilon = 0.9
    self.epsilon_decay_rate = 0.99
    self.discount = 0.5

    self.Q_values = np.zeros([self.env.nS, self.env.nA])

  def _state_index(self, state):
    """Flatten a state tuple (x, y, value) into its Q-table row index."""
    dims = (self.env.grid_size, self.env.grid_size, self.env.max_health + 3)
    return np.ravel_multi_index([state[0], state[1], state[2]], dims)

  def _action_index(self, action):
    """Flatten an action tuple (x, y) into its Q-table column index."""
    dims = (self.env.grid_size, self.env.grid_size)
    return np.ravel_multi_index([action[0], action[1]], dims)

  def update(self, state, state2, reward, action, action2):
    """ Update Q-table
        :param state: previous state before taking the action
        :param state2: state reached after taking the action
        :param reward: reward for taking action a from state s
        :param action: action taken
        :param action2: action selected in state2
    """
    idx_state = self._state_index(state)
    idx_state2 = self._state_index(state2)
    idx_action = self._action_index(action)
    idx_action2 = self._action_index(action2)

    # Update Q values
    current = self.Q_values[idx_state][idx_action]
    next_value = self.Q_values[idx_state2][idx_action2]
    self.Q_values[idx_state][idx_action] = current \
                                          + self.learning_rate * (reward + self.discount * next_value - current)

  def get_action(self, state):
    """ Return an action to take based on epsilon (greedy or random action)
        :param state: current state
        :return action: action to take in the next time step
    """
    # Choose random action or best action
    idx_state = self._state_index(state)
    random_number = np.random.uniform()
    if random_number < self.epsilon:
      return (np.random.randint(0, self.env.grid_size), np.random.randint(0, self.env.grid_size)) # Random action
    idx_action = np.argmax(self.Q_values[idx_state]) # Greedy action
    return np.unravel_index(idx_action, (self.env.grid_size, self.env.grid_size))

def test_SARSA(number_of_episodes, download_plots):
  """Train a SARSA agent on the Battleship board and plot its progress.

  The board is built from the module-level ``grid_size`` and ``ships_board``.

  :param number_of_episodes: number of games to play
  :param download_plots: forwarded to ``plot_results``
  """
  env = BatteshipEnv(grid_size, ships_board)
  agent = Sarsa(env)
  list_cumulative_rewards = []
  list_cumulative_steps = []

  for ep in range(number_of_episodes):
    sum_reward = 0
    done = False
    number_of_steps = 0
    is_last_episode = ep == number_of_episodes - 1
    state = env.observe()
    action = agent.get_action(state)
    while not done:
      state2, reward, done, _ = env.step(action)
      action2 = agent.get_action(state2)
      agent.update(state, state2, reward, action, action2)
      if is_last_episode:
        # Report the action that was actually executed in this step.
        print("Action: (" + str(int(action[0])) + ", " + str(int(action[1])) + ") Reward: ", reward)
      sum_reward += reward
      number_of_steps += 1
      # SARSA is on-policy: the action used in the update is the one
      # executed in the next step.
      state, action = state2, action2
    list_cumulative_rewards.append(sum_reward)
    list_cumulative_steps.append(number_of_steps)
    print("Episode ", ep+1, " Cumulative reward: ", sum_reward, " Number of actions: ", number_of_steps)
    env.reset()
    # epsilon is decayed since the agent is having more and more knowledge
    agent.epsilon = agent.epsilon * agent.epsilon_decay_rate

  plot_results(list_cumulative_rewards, list_cumulative_steps, "SARSA", download_plots)



# Main Function

In [81]:
if __name__ == "__main__":
  # Both agents explore through NumPy's global random generator; seeding it
  # makes a Restart & Run All reproduce the same episodes and plots.
  np.random.seed(42)

  # Board definition: each non-zero entry is the health of a ship cell.
  # grid_size and ships_board are read as globals by the test_* functions.
  grid_size = 10
  ships_board = np.zeros((grid_size, grid_size), dtype='int')
  ships_board[0,9] = 1 
  ships_board[1,3] = 2 
  ships_board[7,2] = 2 
  ships_board[3,1] = 2
  ships_board[6,7] = 2
  ships_board[3,5] = 3 
  ships_board[9,9] = 3 
  ships_board[4,5] = 3 
  ships_board[9,4] = 3
  ships_board[5,8] = 3
  print(ships_board)

  ### Test Q-Learning + SARSA ###
  number_of_episodes = 2000
  download_plots = False
  test_Q_Learning(number_of_episodes, download_plots)
  test_SARSA(number_of_episodes, download_plots)

[[0 0 0 0 0 0 0 0 0 1]
 [0 0 0 2 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 2 0 0 0 3 0 0 0 0]
 [0 0 0 0 0 3 0 0 0 0]
 [0 0 0 0 0 0 0 0 3 0]
 [0 0 0 0 0 0 0 2 0 0]
 [0 0 2 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 3 0 0 0 0 3]]
