<a href="https://colab.research.google.com/github/inforeqd512/QLearning/blob/main/RL_TicTacToe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import numpy as np

# Game

On the Board

Player playing $X$ will be denoted by 1

Player playing $O$ will be denoted by -1

The board is the environment. The state is the position on the board after a move by either player. The board also provides the set of possible actions.

The action to perform is chosen by the agent based on the current state of the board.

# Board

In [14]:
class Board:
  """ Tic Tac Toe game board, i.e. the environment the agents act on.

      The position is kept in self.state, a (rows x cols) integer array in
      which 0 is an empty cell, playerX (1) is an 'X' and playerO (-1) an 'O'.
  """

  playerX = 1
  playerO = -1

  def __init__(self, rows=3, cols=3):
    """ Create an empty board with the given number of rows and columns """
    self.rows = rows
    self.cols = cols
    self.resetGame()

  def resetGame(self):
    """ Clear the board: every cell is set back to 0 (empty) """
    self.state = np.zeros((self.rows, self.cols), dtype=np.int8)

  def checkWinner(self):
    """ Return the winning symbol if one exists, 0 if there is no winner.

        A symbol wins when it fills a complete row, a complete column, the
        main diagonal or the anti-diagonal. The diagonals are taken from the
        actual board shape instead of fixed 3x3 coordinates, so boards of
        other sizes are handled as well (on a non-square board a diagonal
        has min(rows, cols) cells).
    """
    symbols = np.unique(self.state)  # sorted unique values, e.g. -1, 0, 1
    symbols = symbols[symbols != 0]  # 0 is an empty cell, not a player

    for symbol in symbols:
      owned = self.state == symbol  # boolean mask of this symbol's cells

      full_row = np.any(np.all(owned, axis=1))
      full_col = np.any(np.all(owned, axis=0))
      main_diag = np.all(np.diagonal(owned))
      # Mirroring the columns turns the anti-diagonal into the main diagonal
      anti_diag = np.all(np.diagonal(np.fliplr(owned)))

      if full_row or full_col or main_diag or anti_diag:
        return symbol

    return 0

  def getAvailablePos(self):
    """ Get the (row, col) index pairs of all empty cells as an (n, 2) array """
    return np.argwhere(self.state == 0)

  def checkGameEnded(self):
    """ Return True when no empty cell is left.

        Only the absence of possible moves is checked here; whether somebody
        has won is reported separately by checkWinner.
    """
    return len(self.getAvailablePos()) == 0

  def setPosition(self, x, y, symbol):
    """ Set state at position (x,y) with symbol """
    self.state[x, y] = symbol

  def getStateHash(self):
    """ Get hash key of state (string rendering of the board array) """
    return np.array2string(self.state)

  def getActionsHash(self):
    """ Get list of hash keys for all positions where no symbol is put yet,
        in the same order as getAvailablePos """
    return [self.getActionHash(action) for action in self.getAvailablePos()]

  def getActionHash(self, action):
    """ Get hash key of single action (string rendering of its position array) """
    return np.array2string(action)

  def performAction(self, action_to_perform, symbol):
    """ Put symbol on the cell given by action_to_perform (row, col).

        The board is modified in place and returned, so the returned object
        is the very same board, now in its next state.
    """
    self.setPosition(action_to_perform[0], action_to_perform[1], symbol)
    return self
      

In [15]:
class Agent:
  """ A player of the game.

      symbol is 1 for 'X' or -1 for 'O'.
      Every agent owns a PolicyTrainer whose epsilon (for epsilon-greedy
      action selection) starts at exploration_probability and is lowered
      over time through epsilonDecayPerIterations.
  """

  def __init__(self, symbol, exploration_probability):
    self.symbol = symbol
    self.policy_trainer = PolicyTrainer(exploration_probability=exploration_probability)

  def performActionPerPolicy(self, state_hash, actions_hash, possible_actions, current_state):
    """ Let the policy trainer pick an action (explore or exploit, depending
        on the current epsilon) and then apply it to current_state with this
        agent's symbol """
    trainer = self.policy_trainer
    chosen_action = trainer.chooseAction(state_hash, actions_hash, possible_actions)
    trainer.performAction(current_state, chosen_action, self.symbol)

  def epsilonDecayPerIterations(self, num_iterations):
    """ Lower the exploration probability as training progresses, without
        ever removing exploration completely.

        Decaying epsilon-greedy: epsilon starts out near 1 and is set to
        gamma ** x, where x is the iteration the agent is in. gamma = 0.99 per
        https://theses.ubn.ru.nl/bitstream/handle/123456789/5216/Nieuwdorp,%20T._BSc_Thesis_2017.pdf?sequence=1
    """
    trainer = self.policy_trainer

    # Once epsilon is no longer above 0.2 it is left untouched, so some
    # exploration always remains.
    if not trainer.exploration_probability > 0.2:
      return

    trainer.exploration_probability = 0.99 ** num_iterations


In [16]:
class PolicyTrainer:
  """ Tabular Q-learning with epsilon-greedy action selection.

      exploration_probability (float) epsilon greedy value
      learning_rate (float)
      discount_factor (float)
      Q (dict) maps a state-action key (see getStateActionPairKey) to its
          quality value. When omitted, every trainer gets its own empty
          table; pass the same dict to several trainers to share one table
          on purpose.
  """
  def __init__(self, exploration_probability=1.0, learning_rate=0.1, discount_factor=0.9, Q=None):
    # A literal {} default would be created once and silently shared by all
    # trainers constructed without an explicit table.
    self.Q = {} if Q is None else Q
    self.learning_rate = learning_rate
    self.discount_factor = discount_factor
    self.exploration_probability = exploration_probability

  def getStateActionPairKey(self, current_state_hash, current_action_hash):
    """ Returns state-pair hash key, requires separate state and action hash keys first """
    return current_state_hash + current_action_hash

  def getValueQ(self, current_state_hash, current_action_hash):
    """ Get the quality value of a given action in a given state.
        Returns 0 if the state-action pair has not been seen before; in that
        case the pair is also stored in Q with value 0.
        Input is state and action hash key """
    state_action_pair_key = self.getStateActionPairKey(current_state_hash, current_action_hash)
    return self.Q.setdefault(state_action_pair_key, 0)

  def setValueQ(self, current_state_hash, current_action_hash, value):
    """ Set value in Q """
    state_action_pair_key = self.getStateActionPairKey(current_state_hash, current_action_hash)
    self.Q[state_action_pair_key] = value

  def rewardFunction(self, next_state, symbol):
    """ Reward of the transition into next_state for the player using symbol:
        1 if that player is the winner in next_state, else 0 """
    return 1 if next_state.checkWinner() == symbol else 0

  def chooseAction(self, state_hash, actions_hash, possible_actions):
    """ Choose action per epsilon greedy explore/exploit policy """
    if random.random() < self.exploration_probability:
      # Explore
      return self.chooseRandomAction(possible_actions)
    # Exploit
    return self.chooseBestAction(state_hash, actions_hash, possible_actions)

  def chooseRandomAction(self, possible_actions):
    """ Choose random action from the array of possible actions in a state """
    random_idx = np.random.choice(possible_actions.shape[0])
    return possible_actions[random_idx]

  def chooseBestAction(self, current_state_hash, actions_hash, possible_actions):
    """ Get best action given a set of possible actions in a given state.

        actions_hash and possible_actions must be in the same order. The
        first action with the largest Q value above 0 is returned; if no
        action has a Q value above 0, a random action is returned instead.
    """
    # Random fallback for the case that nothing beats the initial maxQ of 0
    random_idx = np.random.choice(possible_actions.shape[0])
    best_action = possible_actions[random_idx]

    maxQ = 0
    for action_hash, action in zip(actions_hash, possible_actions):
      tmpQ = self.getValueQ(current_state_hash, action_hash)
      if tmpQ > maxQ:
        maxQ = tmpQ
        best_action = action

    return best_action

  def getMaxQ(self, next_state):
    """ Go through all possible actions in next_state and return the highest
        quality value among them; never less than 0, and 0 when there is no
        possible action """
    state_hash = next_state.getStateHash()

    maxQ = 0
    for action_hash in next_state.getActionsHash():
      maxQ = max(maxQ, self.getValueQ(state_hash, action_hash))

    return maxQ

  def performAction(self, current_state, current_action, symbol):
    """ Apply current_action to current_state and update Q for that
        state-action pair with the Q-learning rule

            Q(s, a) <- Q(s, a) + learning_rate * (r + discount_factor * max_a' Q(s', a') - Q(s, a))
    """
    # The hashes must be taken before the action is performed, because
    # performAction on the state returns the next state.
    current_state_hash = current_state.getStateHash()
    current_action_hash = current_state.getActionHash(current_action)

    currentQ = self.getValueQ(current_state_hash, current_action_hash)

    next_state = current_state.performAction(current_action, symbol)

    reward = self.rewardFunction(next_state, symbol)
    target = reward + self.discount_factor * self.getMaxQ(next_state)

    # currentQ is subtracted exactly once: this form equals
    # (1 - learning_rate) * currentQ + learning_rate * target.
    newQ = currentQ + self.learning_rate * (target - currentQ)

    self.setValueQ(current_state_hash, current_action_hash, newQ)


# Q-Learning : Training

In [17]:
from tqdm import tqdm
import random

In [18]:
def simulate(iterations):
  """ Let two Q-learning agents play `iterations` games of Tic Tac Toe
      against each other, printing the result of every game and the overall
      win statistics at the end.

      iterations (int) number of games to play; 0 plays nothing and only
      prints the (all zero) summary.

      Nothing is returned.
  """

  # Construct game board
  game = Board()

  # Epsilon-greedy
  exploration_probability = 1.0

  # Initialise players
  playerX = Agent(Board.playerX, exploration_probability)
  playerO = Agent(Board.playerO, exploration_probability)

  # Counters for wins of each agent and total number of games
  nbr_wins_playerX = 0
  nbr_wins_playerO = 0
  nbr_games = 0

  # Pick current player.
  # NOTE: this is not reset between games. After a win the winner makes the
  # first move of the next game; after a draw the player who did not make
  # the last move does.
  current_player = playerX

  for i in tqdm(range(iterations)):

    print("\nGame :", i)

    # 0 means "no winner"; kept if the game ends in a draw
    winner = 0

    # play full games in each iteration
    while not game.checkGameEnded():
      possible_actions = game.getAvailablePos()
      state_hash = game.getStateHash()
      actions_hash = game.getActionsHash()
      current_player.performActionPerPolicy(state_hash, actions_hash, possible_actions, game)

      # Check if there is a winner
      winner = game.checkWinner()  # Returns 0 if there is no winner
      if winner != 0:  # winner is when it's 1 or -1
        # Add to count for corresponding winner
        if winner == playerX.symbol:
          nbr_wins_playerX += 1
        else:
          nbr_wins_playerO += 1
        break

      # Swap player
      current_player = playerO if current_player is playerX else playerX

    # when the full game is finished, then increment and print metrics
    nbr_games += 1
    if winner == 0:
      print("\nDraw")
    else:
      print(winner, "wins")
    print("board :\n", game.state)
    print("\nnumber of games :", nbr_games)
    print("\nplayerX epsilon :", playerX.policy_trainer.exploration_probability)
    print("\nplayerO epsilon :", playerO.policy_trainer.exploration_probability)
    game.resetGame()
    playerX.epsilonDecayPerIterations(nbr_games)
    playerO.epsilonDecayPerIterations(nbr_games)

  # Print outcome
  print(nbr_wins_playerX, nbr_wins_playerO, nbr_games)
  # With zero games played the win counters are 0 as well; dividing by 1
  # then reports 0.00% instead of raising ZeroDivisionError.
  games_divisor = max(nbr_games, 1)
  print("Win percentage: Agent X {:.2%}, Agent O {:.2%}.".format(nbr_wins_playerX/games_divisor, nbr_wins_playerO/games_divisor))



# Testing

In [19]:
# Let the two agents play 50 games against each other; per-game results and
# the final win statistics are printed by simulate itself.
simulate(50)

 12%|█▏        | 6/50 [00:00<00:00, 51.43it/s]


Game : 0
-1 wins
board :
 [[-1  0  1]
 [-1  1  0]
 [-1  1  0]]

playerX epsilon : 1.0

playerO epsilon : 1.0

Game : 1
-1 wins
board :
 [[-1  1 -1]
 [ 1 -1  1]
 [-1 -1  1]]

playerX epsilon : 0.99

playerO epsilon : 0.99

Game : 2
-1 wins
board :
 [[ 1  1  0]
 [-1 -1 -1]
 [ 1  0 -1]]

playerX epsilon : 0.9801

playerO epsilon : 0.9801

Game : 3
-1 wins
board :
 [[ 1  0  0]
 [-1 -1 -1]
 [ 0  1  0]]

playerX epsilon : 0.970299

playerO epsilon : 0.970299

Game : 4
-1 wins
board :
 [[-1  1  1]
 [-1 -1  1]
 [ 1 -1 -1]]

playerX epsilon : 0.96059601

playerO epsilon : 0.96059601

Game : 5
1 wins
board :
 [[ 0  0  1]
 [-1 -1  1]
 [ 0 -1  1]]

playerX epsilon : 0.9509900498999999

playerO epsilon : 0.9509900498999999

Game : 6
1 wins
board :
 [[-1  1 -1]
 [ 0 -1  0]
 [ 1  1  1]]

playerX epsilon : 0.941480149401

playerO epsilon : 0.941480149401

Game : 7
1 wins
board :
 [[ 1  1 -1]
 [ 1 -1  0]
 [ 1 -1  0]]

playerX epsilon : 0.9320653479069899

playerO epsilon : 0.9320653479069899

Game : 8

 36%|███▌      | 18/50 [00:00<00:00, 51.20it/s]

1 wins
board :
 [[ 0 -1  1]
 [-1  1  0]
 [ 1  0 -1]]

playerX epsilon : 0.9135172474836408

playerO epsilon : 0.9135172474836408

Game : 10
-1 wins
board :
 [[ 1  1  0]
 [-1 -1 -1]
 [ 0  0  1]]

playerX epsilon : 0.9043820750088044

playerO epsilon : 0.9043820750088044

Game : 11
-1 wins
board :
 [[-1  0  1]
 [-1  0  0]
 [-1  1  0]]

playerX epsilon : 0.8953382542587164

playerO epsilon : 0.8953382542587164

Game : 12
-1 wins
board :
 [[-1  1  1]
 [ 1 -1 -1]
 [-1  1 -1]]

playerX epsilon : 0.8863848717161292

playerO epsilon : 0.8863848717161292

Game : 13

Draw
board :
 [[-1  1  1]
 [ 1 -1 -1]
 [-1 -1  1]]

playerX epsilon : 0.8775210229989678

playerO epsilon : 0.8775210229989678

Game : 14
-1 wins
board :
 [[-1  1  1]
 [ 1 -1  0]
 [ 1 -1 -1]]

playerX epsilon : 0.8687458127689782

playerO epsilon : 0.8687458127689782

Game : 15

Draw
board :
 [[-1 -1  1]
 [ 1 -1 -1]
 [-1  1  1]]

playerX epsilon : 0.8600583546412884

playerO epsilon : 0.8600583546412884

Game : 16
-1 wins
board :
 [

 60%|██████    | 30/50 [00:00<00:00, 52.04it/s]


Draw
board :
 [[-1  1  1]
 [ 1 -1 -1]
 [ 1 -1  1]]

playerX epsilon : 0.8179069375972308

playerO epsilon : 0.8179069375972308

Game : 21
-1 wins
board :
 [[ 1 -1  1]
 [ 0 -1  0]
 [ 0 -1  0]]

playerX epsilon : 0.8097278682212584

playerO epsilon : 0.8097278682212584

Game : 22
1 wins
board :
 [[ 1  1  1]
 [ 0 -1  0]
 [-1  0 -1]]

playerX epsilon : 0.8016305895390459

playerO epsilon : 0.8016305895390459

Game : 23
1 wins
board :
 [[ 1  0  0]
 [ 1 -1 -1]
 [ 1  0  0]]

playerX epsilon : 0.7936142836436554

playerO epsilon : 0.7936142836436554

Game : 24

Draw
board :
 [[-1  1  1]
 [ 1  1 -1]
 [-1 -1  1]]

playerX epsilon : 0.7856781408072188

playerO epsilon : 0.7856781408072188

Game : 25

Draw
board :
 [[ 1 -1 -1]
 [-1  1  1]
 [ 1 -1 -1]]

playerX epsilon : 0.7778213593991467

playerO epsilon : 0.7778213593991467

Game : 26
1 wins
board :
 [[ 0  0 -1]
 [ 1  1  1]
 [-1 -1  1]]

playerX epsilon : 0.7700431458051551

playerO epsilon : 0.7700431458051551

Game : 27

Draw
board :
 [[ 1 -1

 84%|████████▍ | 42/50 [00:00<00:00, 52.96it/s]

1 wins
board :
 [[-1  1  1]
 [-1  1  1]
 [ 1 -1 -1]]

playerX epsilon : 0.7249803359578534

playerO epsilon : 0.7249803359578534

Game : 33
1 wins
board :
 [[ 1  1  1]
 [-1  1 -1]
 [ 1 -1 -1]]

playerX epsilon : 0.7177305325982749

playerO epsilon : 0.7177305325982749

Game : 34
1 wins
board :
 [[-1 -1  0]
 [ 1  1  1]
 [-1  1  0]]

playerX epsilon : 0.7105532272722921

playerO epsilon : 0.7105532272722921

Game : 35
1 wins
board :
 [[-1  1 -1]
 [ 1  1  1]
 [-1 -1  1]]

playerX epsilon : 0.7034476949995692

playerO epsilon : 0.7034476949995692

Game : 36
1 wins
board :
 [[ 0  0  1]
 [ 0  1 -1]
 [ 1  0 -1]]

playerX epsilon : 0.6964132180495735

playerO epsilon : 0.6964132180495735

Game : 37
1 wins
board :
 [[ 1 -1  1]
 [-1  1 -1]
 [-1  1  1]]

playerX epsilon : 0.6894490858690777

playerO epsilon : 0.6894490858690777

Game : 38
1 wins
board :
 [[ 1  1  1]
 [ 0  0 -1]
 [ 0  0 -1]]

playerX epsilon : 0.682554595010387

playerO epsilon : 0.682554595010387

Game : 39
1 wins
board :
 [[ 1 -

100%|██████████| 50/50 [00:00<00:00, 50.60it/s]

1 wins
board :
 [[ 0 -1  0]
 [ 1 -1 -1]
 [ 1  1  1]]

playerX epsilon : 0.6491026283684022

playerO epsilon : 0.6491026283684022

Game : 44
1 wins
board :
 [[-1  0  0]
 [ 0  0 -1]
 [ 1  1  1]]

playerX epsilon : 0.6426116020847181

playerO epsilon : 0.6426116020847181

Game : 45
-1 wins
board :
 [[ 0 -1  1]
 [ 1 -1  0]
 [ 1 -1  0]]

playerX epsilon : 0.6361854860638709

playerO epsilon : 0.6361854860638709

Game : 46
1 wins
board :
 [[-1  0 -1]
 [ 1  1  1]
 [-1 -1  1]]

playerX epsilon : 0.6298236312032323

playerO epsilon : 0.6298236312032323

Game : 47
1 wins
board :
 [[ 1  1 -1]
 [-1  1 -1]
 [ 1 -1  1]]

playerX epsilon : 0.6235253948912

playerO epsilon : 0.6235253948912

Game : 48

Draw
board :
 [[ 1 -1  1]
 [ 1  1 -1]
 [-1  1 -1]]

playerX epsilon : 0.617290140942288

playerO epsilon : 0.617290140942288

Game : 49
-1 wins
board :
 [[-1  1 -1]
 [ 1  1 -1]
 [ 1 -1 -1]]

playerX epsilon : 0.611117239532865

playerO epsilon : 0.611117239532865
23 19 50
Win percentage: Agent X 46.00%,




In [20]:
# Ad-hoc checks of the Board helpers; results are printed, not asserted.

# A freshly created board has no winner
board=Board()
print(board.checkWinner())

# Top row completely filled by player O (-1)
board.state = np.array(((-1,-1,-1), (0,0,0), (0,0,0)))
print(board.checkWinner())
print(board.getStateHash())
pos = board.getAvailablePos()
print("getAvailablePos\n", pos)
print("checkGameEnded\n", board.checkGameEnded())


# Draw a random index into the available positions, the same way
# PolicyTrainer.chooseRandomAction does
list1 = board.getAvailablePos()
print("shape\n",list1.shape[0])
ch1= np.random.choice(list1.shape[0])
print("random.choice\n",ch1)
print("random action\n", list1[ch1])

# State hash of a partially filled board
board.state = np.array(((-1,0,0), (0,1,0), (0,0,-1)))
print(board.getStateHash())

# Board without any empty cell: no moves are left
board.state = np.array(((-1,1,1), (1,1,1), (1,-1,-1)))
print(board.checkGameEnded())





0
-1
[[-1 -1 -1]
 [ 0  0  0]
 [ 0  0  0]]
getAvailablePos
 [[1 0]
 [1 1]
 [1 2]
 [2 0]
 [2 1]
 [2 2]]
checkGameEnded
 False
shape
 6
random.choice
 0
random action
 [1 0]
[[-1  0  0]
 [ 0  1  0]
 [ 0  0 -1]]
True
