<a href="https://colab.research.google.com/github/inforeqd512/QLearning/blob/main/RL_TicTacToe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [181]:
import numpy as np

# Game

On the Board

Player playing $X$ will be denoted by 1

Player playing $O$ will be denoted by -1

The Board is the environment. The state is the position of the board after a move from either player. The Board also provides the set of possible actions.

The action to perform is chosen by the agent based on the current state of the board.

# Board

In [182]:
class Board:
  """ Class that represents the game board of Tic Tac Toe.

  Each cell of `state` holds 0 (empty), Board.playerX (1) or Board.playerO (-1).
  """

  playerX = 1
  playerO = -1

  def __init__(self, rows=3, cols=3):
    """ Create an empty board of rows x cols cells (3x3 by default) """
    self.rows = rows
    self.cols = cols
    self.resetGame()

  def resetGame(self):
    """ Replace the state with a fresh all-zero (empty) grid """
    self.state = np.zeros((self.rows, self.cols), dtype=np.int8)

  def checkWinner(self):
    """ Return the symbol that completely fills a row, a column or a diagonal.

    Returns 0 when no symbol does. The diagonals are read from the actual
    array rather than from fixed 3x3 coordinates, so boards of other sizes
    are handled too (on a non-square board each diagonal spans
    min(rows, cols) cells starting from the top corners).
    """
    symbols = np.unique(self.state) #unique values , 0, 1, -1
    symbols = symbols[np.nonzero(symbols)] #remove 0's
    winning_symbol = 0 #no winner yet

    # Top-left -> bottom-right, and top-right -> bottom-left (via a horizontal flip)
    main_diagonal = np.diagonal(self.state)
    anti_diagonal = np.diagonal(np.fliplr(self.state))

    for symbol in symbols:
      occupied = self.state == symbol

      #check rows
      row = np.any(np.all(occupied, axis=1))

      #check cols
      col = np.any(np.all(occupied, axis=0))

      #check diagonals
      diag1 = np.all(main_diagonal == symbol)
      diag2 = np.all(anti_diagonal == symbol)

      # Stop at the first symbol that owns a complete line
      if row or col or diag1 or diag2:
        winning_symbol = symbol
        break

    return winning_symbol

  def getAvailablePos(self):
    """ Get the (row, col) indices of the empty cells, i.e. cells equal to 0 """
    return np.argwhere(self.state == 0)

  def checkGameEnded(self):
    """ Check if the board is full, i.e. there are no possible moves left.

    A win is not detected here; use checkWinner for that.
    """
    return len(self.getAvailablePos()) == 0

  def setPosition(self, x, y, symbol):
    """ Set state at position (x,y) with symbol """
    self.state[x,y] = symbol

  def getStateHash(self):
    """ Get the string form of the state, used as a hash key """
    return np.array2string(self.state)

  def getActionsHash(self):
    """ Get the hash keys of all available positions, in getAvailablePos order """
    return [self.getActionHash(action) for action in self.getAvailablePos()]

  def getActionHash(self, action):
    """ Get the string form of a single action (a position array), used as a hash key """
    return np.array2string(action)

  def performAction(self, action_to_perform, symbol):
    """ Place symbol at action_to_perform (row, col) and return this same board.

    The board is modified in place; the returned object is `self`, not a copy.
    """
    self.setPosition(action_to_perform[0], action_to_perform[1], symbol)
    return self
      

In [183]:
class Agent:
  """ A player: owns a board symbol and delegates choosing and learning to a PolicyTrainer """

  def __init__(self, symbol, exploration_probability):
    """ Store the symbol this agent plays and build its policy trainer """
    self.symbol = symbol
    trainer = PolicyTrainer(exploration_probability=exploration_probability)
    self.policy_trainer = trainer

  def performActionPerPolicy(self, state_hash, actions_hash, possible_actions, current_state):
    """ Ask the trainer for an action, then have the trainer apply it to current_state with this agent's symbol """
    trainer = self.policy_trainer
    chosen_action = trainer.chooseAction(state_hash, actions_hash, possible_actions)
    trainer.performAction(current_state, chosen_action, self.symbol)

In [184]:
class PolicyTrainer:
  """ Tabular Q-learning with epsilon-greedy action selection.

      exploration_probability (float): chance of choosing a random action instead of the best known one
      learning_rate (float): step size of each Q update
      discount_factor (float): weight of the best Q value of the next state
      Q (dict): maps state-action hash keys to Q values; a new empty table is created when omitted
  """
  def __init__(self, exploration_probability, learning_rate = 0.1, discount_factor = 0.9, Q = None):
    # A `{}` default would be created once and silently shared by every trainer
    # built without an explicit table, so each instance gets its own dict instead.
    self.Q = {} if Q is None else Q
    self.learning_rate = learning_rate
    self.discount_factor = discount_factor
    self.exploration_probability = exploration_probability
    self.currentStateActionKey = None


  def getStateActionPairKey(self, current_state_hash, current_action_hash):
    """ Returns state-pair hash key, requires separate state and action hash keys first """
    return current_state_hash + current_action_hash

  def getValueQ(self, current_state_hash, current_action_hash):
    """ Get expected reward given an action in a given state.
            Returns 0 if the state-action pair has not been seen before,
            and records that 0 in Q so the pair is known from then on.
            Input is state and action hash key                          """
    stateActionPairKey = self.getStateActionPairKey(current_state_hash, current_action_hash)
    return self.Q.setdefault(stateActionPairKey, 0)

  def setValueQ(self, current_state_hash, current_action_hash, value):
    """ Set value in Q """
    stateActionPairKey = self.getStateActionPairKey(current_state_hash, current_action_hash)
    self.Q[stateActionPairKey] = value

  def rewardFunction(self, next_state, symbol):
    """ Returns 1 if symbol is the winner in next_state, else 0 (a loss or draw is not penalised) """
    winner = next_state.checkWinner()
    return 1 if winner == symbol else 0


  def chooseAction(self, state_hash, actions_hash, possible_actions):
    """ Epsilon-greedy choice: random action with probability exploration_probability, else the best known one """
    # np.random is used for every draw in this class, so one seed controls them all
    if np.random.random() < self.exploration_probability:
      #Explore
      action = self.chooseRandomAction(possible_actions)
    else:
      #Exploit
      action = self.chooseBestAction(state_hash, actions_hash, possible_actions)
    return action

  def chooseRandomAction(self, possible_actions):
    """ Pick one row of possible_actions uniformly at random """
    random_idx = np.random.choice(possible_actions.shape[0])
    return possible_actions[random_idx]

  def chooseBestAction(self, current_state_hash, actions_hash, possible_actions):
    """ Get best action given a set of possible actions in a given state.

    actions_hash and possible_actions are walked in parallel. If no action has
    a Q value above 0, the randomly drawn fallback action is returned.
    """
    # Pick a random action at first
    random_idx = np.random.choice(possible_actions.shape[0])
    best_action = possible_actions[random_idx]

    # Find the action with the largest Q in the given state (first one wins ties)
    maxQ = 0
    for action_hash, action in zip(actions_hash, possible_actions):
      tmpQ = self.getValueQ(current_state_hash, action_hash)
      if maxQ < tmpQ:
        maxQ = tmpQ
        best_action = action

    return best_action

  def getMaxQ(self, next_state):
    """ Largest Q value over the actions available in next_state; 0 when there are none or none is positive """
    actions_hash = next_state.getActionsHash()
    state_hash = next_state.getStateHash()

    maxQ = 0
    for action_hash in actions_hash:
      tmpQ = self.getValueQ(state_hash, action_hash)
      if maxQ < tmpQ:
        maxQ = tmpQ

    return maxQ

  def performAction(self, current_state, current_action, symbol):
    """ Apply current_action to current_state and update Q with the Q-learning rule:

        Q(s,a) <- Q(s,a) + learning_rate * (reward + discount_factor * max_a' Q(s',a') - Q(s,a))
    """
    # Hash the state BEFORE the move: performAction below changes the board in place
    current_state_hash = current_state.getStateHash()
    current_action_hash = current_state.getActionHash(current_action)

    # Get current Q Value
    currentQ = self.getValueQ(current_state_hash, current_action_hash)

    next_state = current_state.performAction(current_action, symbol)

    # Temporal-difference target; currentQ must be subtracted exactly once
    target = self.rewardFunction(next_state, symbol) + self.discount_factor * self.getMaxQ(next_state)
    newQ = currentQ + self.learning_rate * (target - currentQ)

    self.setValueQ(current_state_hash, current_action_hash, newQ)


# Q-Learning : Training

In [185]:
from tqdm import tqdm
import random

In [186]:
def simulate(iterations, verbose=True):
  """ Train two Q-learning agents by self-play and print the win statistics.

  iterations (int): number of games to play
  verbose (bool): when True (default) print every game's result and final board

  The exploration probability starts at 1.0 and is lowered by 1/iterations
  after every move until it reaches the floor, and the lowered value is pushed
  to both agents so they actually start exploiting what they have learned.
  The player to move is not reset between games: it carries over from the
  previous game.
  """

  # Construct game board
  game = Board()

  # Epsilon-greedy schedule
  exploration_probability = 1.0
  min_exploration_probability = 0.2  # never stop exploring completely

  # Initialise players
  playerX = Agent(Board.playerX, exploration_probability)
  playerO = Agent(Board.playerO, exploration_probability)
  players = (playerX, playerO)

  # Counters for wins of each agent and total number of games
  nbr_wins_playerX = 0
  nbr_wins_playerO = 0
  nbr_games = 0

  # Pick current player
  current_player = playerX

  for i in tqdm(range(iterations)):

    if verbose:
      print("\nGame :", i)

    # No winner until a move produces one; also keeps `winner` defined
    # even if the move loop below never runs
    winner = 0

    # Play until the board is full or somebody wins
    while not game.checkGameEnded():
      possible_actions = game.getAvailablePos()
      state_hash = game.getStateHash()
      actions_hash = game.getActionsHash()
      current_player.performActionPerPolicy(state_hash, actions_hash, possible_actions, game)

      # Reduce probability to explore during training
      # Do not remove completely
      if exploration_probability > min_exploration_probability:
        exploration_probability -= 1/iterations
        # The trainers hold their own copy of epsilon, so hand them the new value
        for player in players:
          player.policy_trainer.exploration_probability = exploration_probability

      # Check if there is a winner
      winner = game.checkWinner() # Returns 0 if there is no winner
      if winner != 0: #if a winner
        # Add to count for corresponding winner
        if winner == playerX.symbol:
          nbr_wins_playerX += 1
        else:
          nbr_wins_playerO += 1
        break

      # Swap player
      current_player = playerO if current_player is playerX else playerX

    nbr_games += 1
    if verbose:
      if winner == 0:
        print("\nDraw")
      else:
        print(winner, "wins")
      print("board :\n", game.state)
    game.resetGame()

  # Print outcome (rates are reported as 0 when no game was played)
  rate_playerX = nbr_wins_playerX/nbr_games if nbr_games else 0.0
  rate_playerO = nbr_wins_playerO/nbr_games if nbr_games else 0.0
  print(nbr_wins_playerX, nbr_wins_playerO, nbr_games)
  print("Win percentage: Agent X {:.2%}, Agent O {:.2%}.".format(rate_playerX, rate_playerO))



# Testing

In [194]:
# Play 50 self-play games; simulate prints each game's result and the final win percentages
simulate(50)

 14%|█▍        | 7/50 [00:00<00:00, 64.43it/s]


Game : 0

Draw
board :
 [[-1  1  1]
 [ 1  1 -1]
 [-1 -1  1]]

Game : 1

Draw
board :
 [[ 1 -1  1]
 [-1 -1  1]
 [-1  1 -1]]

Game : 2
-1 wins
board :
 [[ 1  1  0]
 [ 0  0  1]
 [-1 -1 -1]]

Game : 3
1 wins
board :
 [[ 1  1  1]
 [-1  0 -1]
 [ 0  0 -1]]

Game : 4
1 wins
board :
 [[ 0 -1  1]
 [-1  1  1]
 [ 1  0 -1]]

Game : 5
-1 wins
board :
 [[ 1 -1  1]
 [-1 -1 -1]
 [ 0  1  1]]

Game : 6
-1 wins
board :
 [[-1  1  1]
 [ 1 -1 -1]
 [ 1 -1 -1]]

Game : 7
-1 wins
board :
 [[-1  1  0]
 [-1  1  1]
 [-1 -1  0]]

Game : 8
-1 wins
board :
 [[-1 -1  1]
 [-1  0  1]
 [-1  1  0]]

Game : 9
-1 wins
board :
 [[ 1  0 -1]
 [ 0  0 -1]
 [ 1  0 -1]]

Game : 10

Draw
board :
 [[-1 -1  1]
 [ 1  1 -1]
 [-1 -1  1]]

Game : 11
1 wins
board :
 [[ 0  1 -1]
 [-1  1  0]
 [ 0  1  0]]

Game : 12
-1 wins
board :
 [[ 1 -1  1]
 [ 0 -1  1]
 [ 1 -1 -1]]

Game : 13


 42%|████▏     | 21/50 [00:00<00:00, 63.82it/s]

1 wins
board :
 [[ 1  1  1]
 [-1  0 -1]
 [-1  1 -1]]

Game : 14

Draw
board :
 [[ 1  1 -1]
 [-1 -1  1]
 [ 1 -1  1]]

Game : 15
-1 wins
board :
 [[-1 -1 -1]
 [ 0 -1  1]
 [ 1  0  1]]

Game : 16
-1 wins
board :
 [[-1  0  0]
 [ 0 -1  1]
 [ 1  0 -1]]

Game : 17
1 wins
board :
 [[ 1 -1  1]
 [-1  1 -1]
 [-1  0  1]]

Game : 18
1 wins
board :
 [[-1  1  1]
 [ 0  0  1]
 [-1 -1  1]]

Game : 19

Draw
board :
 [[-1  1 -1]
 [-1  1  1]
 [ 1 -1  1]]

Game : 20
-1 wins
board :
 [[ 1  0 -1]
 [ 0  1 -1]
 [ 0  0 -1]]

Game : 21
-1 wins
board :
 [[ 1 -1 -1]
 [-1 -1  1]
 [ 1 -1  1]]

Game : 22
1 wins
board :
 [[ 1 -1 -1]
 [ 1  1 -1]
 [ 0 -1  1]]

Game : 23
1 wins
board :
 [[ 1 -1  1]
 [-1 -1  1]
 [-1  1  1]]

Game : 24
1 wins
board :
 [[-1  0  1]
 [-1  1 -1]
 [ 1  1  0]]

Game : 25
-1 wins
board :
 [[ 1  1 -1]
 [ 0  0 -1]
 [ 0  1 -1]]

Game : 26
1 wins
board :
 [[ 1 -1  0]
 [ 1  0  0]
 [ 1 -1 -1]]

Game : 27
-1 wins
board :
 [[-1 -1 -1]
 [ 1  1 -1]
 [ 0  1  1]]

Game : 28


 72%|███████▏  | 36/50 [00:00<00:00, 60.20it/s]

1 wins
board :
 [[-1  0  1]
 [ 0  1  0]
 [ 1 -1 -1]]

Game : 29
-1 wins
board :
 [[ 1  0  0]
 [ 1  0  1]
 [-1 -1 -1]]

Game : 30
1 wins
board :
 [[-1  1  1]
 [ 0  1 -1]
 [ 1 -1 -1]]

Game : 31
1 wins
board :
 [[-1  1 -1]
 [-1 -1  1]
 [ 1  1  1]]

Game : 32
-1 wins
board :
 [[-1 -1 -1]
 [ 1  1 -1]
 [ 0  1  1]]

Game : 33
-1 wins
board :
 [[-1  1 -1]
 [-1 -1  1]
 [-1  1  1]]

Game : 34
-1 wins
board :
 [[ 1 -1  1]
 [ 1  0  0]
 [-1 -1 -1]]

Game : 35
-1 wins
board :
 [[-1 -1  0]
 [ 1 -1  0]
 [ 1 -1  1]]

Game : 36
1 wins
board :
 [[ 1  1  0]
 [-1  1 -1]
 [-1  1 -1]]

Game : 37
1 wins
board :
 [[ 1 -1 -1]
 [-1  1  1]
 [ 0  0  1]]

Game : 38
1 wins
board :
 [[ 1  0 -1]
 [ 1  1  1]
 [-1 -1  0]]

Game : 39
1 wins
board :
 [[-1 -1  1]
 [ 1  1  1]
 [-1  1 -1]]

Game : 40


100%|██████████| 50/50 [00:00<00:00, 61.12it/s]

1 wins
board :
 [[-1 -1  1]
 [ 1 -1 -1]
 [ 1  1  1]]

Game : 41
1 wins
board :
 [[-1 -1  1]
 [ 1 -1 -1]
 [ 1  1  1]]

Game : 42

Draw
board :
 [[ 1 -1 -1]
 [-1  1  1]
 [ 1  1 -1]]

Game : 43
1 wins
board :
 [[ 1 -1 -1]
 [-1  1  1]
 [ 0 -1  1]]

Game : 44
1 wins
board :
 [[-1 -1  1]
 [-1  1  1]
 [ 1  1 -1]]

Game : 45
-1 wins
board :
 [[-1  1 -1]
 [ 1  0 -1]
 [ 1  1 -1]]

Game : 46
1 wins
board :
 [[ 1 -1  1]
 [-1  1 -1]
 [-1  0  1]]

Game : 47

Draw
board :
 [[ 1 -1  1]
 [ 1 -1 -1]
 [-1  1  1]]

Game : 48
-1 wins
board :
 [[ 0  0  0]
 [-1 -1 -1]
 [ 1  0  1]]

Game : 49
-1 wins
board :
 [[-1 -1 -1]
 [ 1  0  1]
 [ 0  1 -1]]
22 21 50
Win percentage: Agent X 44.00%, Agent O 42.00%.





In [188]:
# Ad-hoc checks of the Board helpers; results are printed for visual inspection

# An empty board has no winner
board = Board()
print(board.checkWinner())

# Top row filled by O (-1): O is reported as the winner
top_row_O = [[-1, -1, -1], [0, 0, 0], [0, 0, 0]]
board.state = np.array(top_row_O)
print(board.checkWinner())
print(board.getStateHash())
free_cells = board.getAvailablePos()
print("getAvailablePos\n", free_cells)
print("checkGameEnded\n", board.checkGameEnded())


# Draw one of the free cells at random, the same way the trainer does
n_free = free_cells.shape[0]
print("shape\n", n_free)
picked_idx = np.random.choice(n_free)
print("random.choice\n", picked_idx)
print("random action\n", free_cells[picked_idx])

# Hash of a partially filled board
partial_board = [[-1, 0, 0], [0, 1, 0], [0, 0, -1]]
board.state = np.array(partial_board)
print(board.getStateHash())

# A completely filled board has no moves left
full_board = [[-1, 1, 1], [1, 1, 1], [1, -1, -1]]
board.state = np.array(full_board)
print(board.checkGameEnded())





0
-1
[[-1 -1 -1]
 [ 0  0  0]
 [ 0  0  0]]
getAvailablePos
 [[1 0]
 [1 1]
 [1 2]
 [2 0]
 [2 1]
 [2 2]]
checkGameEnded
 False
shape
 6
random.choice
 0
random action
 [1 0]
[[-1  0  0]
 [ 0  1  0]
 [ 0  0 -1]]
True
