In [1]:
import numpy as np
import random
import pickle
import json

1. Get the possible moves.
2. Get the max Q-value for each move.
3. Check for a winner or a draw.
4. Play a random move, then check for a winner or a draw.


In [2]:
class Game():
  """Tic-tac-toe board plus turn tracking.

  The board is a 9-character string in row-major order (indices 0-8); each
  character is 'X', 'O' or ' ' (empty). `CP` holds the mark of the player
  whose turn it is.
  """

  # Index triples of the eight winning lines: rows, columns, then diagonals.
  # Kept at class level so it is not rebuilt on every check_winner call.
  _WIN_LINES = ((0,1,2),(3,4,5),(6,7,8),(0,3,6),(1,4,7),(2,5,8),(0,4,8),(2,4,6))

  def __init__(self,CP):
    """Create an empty board with player `CP` to move first."""
    self.board = ' ' * 9
    self.CP = CP

  @staticmethod
  def get_opponent(CP):
    """Return 'X' when `CP` is 'O', otherwise 'O'."""
    # Declared static so both Game.get_opponent(p) and game.get_opponent(p) work.
    return 'X' if CP == 'O' else 'O'

  def get_state(self):
    """Return the board as its 9-character string."""
    return self.board

  def get_moves(self):
    """Return the indices of all empty squares, in ascending order."""
    return [index for index, char in enumerate(self.board) if char == ' ']

  def make_move(self,pos,computer):
    """Place the current player's mark at `pos` and hand the turn over.

    The square is overwritten without validation; use `is_valid` or
    `get_moves` beforehand to make sure `pos` is a legal move.

    Args:
      pos: board index (0-8) to mark.
      computer: mark of the player from whose point of view the reward is
        computed.

    Returns:
      A tuple `(state, winner, reward, done)` where `state` is the new board
      string, `winner` is 'X', 'O' or None, `reward` is 1.0 if `computer`
      won, -1.0 if the other player won and 0.0 otherwise, and `done` is True
      once there is a winner or no empty square is left.
    """
    self.board = self.board[:pos]+self.CP+self.board[pos+1:]
    winner = Game.check_winner(self.board)
    done = winner is not None or ' ' not in self.board

    if winner is None:
      reward = 0.0
    elif winner == computer:
      reward = 1.0
    else:
      reward = -1.0

    self.CP = 'O' if self.CP == 'X' else 'X'
    return self.get_state(),winner,reward,done

  def is_valid(self,pos,player):
    """Return True if `pos` is on the board and the square is empty.

    `player` is accepted for interface compatibility but is not consulted.
    """
    return 0<=pos<=8 and self.board[pos] == ' '

  @staticmethod
  def check_winner(gameState):
    """Return 'X' or 'O' if that mark fills a winning line, else None.

    Args:
      gameState: 9-character board string.
    """
    for a, b, c in Game._WIN_LINES:
      st = gameState[a] + gameState[b] + gameState[c]
      if st == 'XXX':
        return 'X'
      elif st == 'OOO':
        return 'O'
    return None

  def print(self):
    """Print the board as a 3x3 grid, showing empty squares as '-'."""
    for i in range(9):
      if i%3==0:
        # Start a new row.
        print("\n",end=' ')
      symbol = '-' if self.board[i] == ' ' else self.board[i]
      print(symbol+' ',end=' ')
    print('')

In [3]:

class QLearningAgent():
  """Tabular Q-learning agent for tic-tac-toe with win/block shortcuts.

  Q-values live in `q_table`, keyed by `(state, str(action) + player)`;
  pairs that were never written read as 0.0. `q_table_json` mirrors the
  entries written through `update_q_value` under the string key
  `state + "|" + str(action) + player`.
  """

  def __init__(self, learning_rate=0.1, discount_factor=0.95, exploration_rate=0.5):
    self.learning_rate = learning_rate  # Alpha: Learning rate
    self.discount_factor = discount_factor  # Gamma: Discount factor
    self.exploration_rate = exploration_rate  # Epsilon: Exploration rate

    # Q-table starts empty; get_q_value supplies the 0.0 default for misses.
    self.q_table = {}
    self.q_table_json = {}

  def choose_action(self, state, actions, CP):
    """Pick a move epsilon-greedily.

    With probability `exploration_rate` a uniformly random entry of `actions`
    is returned; otherwise the result of `get_best_action`.
    """
    if random.uniform(0, 1) < self.exploration_rate:
      return random.choice(actions)
    return self.get_best_action(state, actions, CP)

  def get_danger(self,state,actions,CP):
    """Return the moves in `actions` with which CP's opponent would win at once."""
    OP = Game.get_opponent(CP)
    return [
      move for move in actions
      if Game.check_winner(state[:move]+OP+state[move+1:]) == OP
    ]

  def get_best_action(self, state, actions, CP):
    """Return the preferred move for player `CP` in `state`.

    Priority order:
      1. the first move in `actions` that wins immediately;
      2. a random move among those that block an immediate opponent win;
      3. a random move among those sharing the highest Q-value.

    Raises:
      IndexError: if `actions` is empty (from `random.choice`).
    """
    # 1. Take an immediate win when one exists.
    for action in actions:
      if Game.check_winner(state[:action]+CP+state[action+1:]) == CP:
        return action

    # 2. Otherwise block a square where the opponent would win next turn.
    dangers = self.get_danger(state,actions,CP)
    if len(dangers)>0:
      return random.choice(dangers)

    # 3. Otherwise go greedy on the Q-table; look each value up only once.
    scored = [(action, self.get_q_value(state, action, CP)) for action in actions]
    candidates = [q_value for _, q_value in scored if q_value > float('-inf')]
    if not candidates:
      # No usable Q-value (e.g. no actions at all): fall back to a random pick.
      return random.choice(actions)

    best_q_value = max(candidates)
    best_actions = [action for action, q_value in scored if q_value == best_q_value]
    return random.choice(best_actions)

  def get_q_value(self, state, action, CP):
    """Return the stored Q-value for (state, action, player), or 0.0 if absent."""
    return self.q_table.get((state, str(action)+CP), 0.0)

  def update_q_value(self, state, action, new_q_value, CP):
    """Store `new_q_value` for (state, action, player) in both tables."""
    self.q_table[(state, str(action)+CP)] = new_q_value
    self.q_table_json[state+"|"+str(action)+CP] = new_q_value

  def learn(self, CP, OP, computer, state, action, reward, next_state, actions, winner):
    """Apply one Q-learning update for the move `action` made by `CP` in `state`.

    Args:
      CP: mark of the player who made the move.
      OP: mark used to look up Q-values of the follow-up moves in `next_state`.
      computer: unused here; kept for interface compatibility.
      state: board string before the move.
      action: board index that was played.
      reward: immediate reward for the move.
      next_state: board string after the move.
      actions: moves available in `next_state` (empty when the game is over).
      winner: unused here; kept for interface compatibility.
    """
    current_q_value = self.get_q_value(state, action, CP)

    # Terminal positions have no follow-up moves, so their future value is 0.
    # NOTE(review): the maximum is taken over OP's Q-values even though OP is
    # the opposing side — confirm this is the intended bootstrap target.
    max_q_value = 0.0
    if len(actions)>0:
      max_q_value = max([self.get_q_value(next_state, a, OP) for a in actions])

    new_q_value = (1 - self.learning_rate) * current_q_value + self.learning_rate * ( reward + self.discount_factor * max_q_value )
    self.update_q_value(state, action, new_q_value, CP)

  def save_agent(self, filename):
    """Write the Q-table to `filename` (pickle) and `filename + '.json'` (JSON).

    The JSON view is derived from `q_table` at save time, so both files agree
    even when `q_table` was assigned directly (for example from a loaded
    pickle) instead of being filled through `update_q_value`.
    """
    json_table = {
      state+"|"+move_key: q_value
      for (state, move_key), q_value in self.q_table.items()
    }
    with open(filename+'.json', 'w') as json_file:
      json.dump(json_table, json_file)
    with open(filename, 'wb') as f:
      pickle.dump(self.q_table, f)

In [11]:
class Train():
  """Trains a fresh Q-learning agent (playing 'X') against a stored agent.

  `agent` is the learner; `agentTrained` is an opponent whose Q-table is
  loaded from disk when the trainer is constructed.
  """

  def __init__(self):
    self.agent = QLearningAgent()
    self.agentTrained = QLearningAgent()
    self.init()

  def init(self, filename="RLModelTrained"):
    """Load the opponent's Q-table from the pickle file `filename`."""
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(filename, 'rb') as model_file:
      self.agentTrained.q_table = pickle.load(model_file)

  def train(self,ite):
    """Play `ite` training games, then save the learner under "RLModel".

    Progress is printed every 50000 games. After each game the learner's
    exploration rate is set from the step schedule below, based on the
    index of the game just played.
    """
    # (game index that must be exceeded, exploration rate), highest step first
    # so the first match is the value the original chain of ifs ends up with.
    schedule = ((400000, 0.1), (300000, 0.2), (200000, 0.4), (100000, 0.6))

    for i in range(ite):
      self.playGame(i)
      if i%50000 == 0:
        print('learnt ' + str(i))
      for threshold, rate in schedule:
        if i > threshold:
          self.agent.exploration_rate = rate
          break

    self.agent.save_agent("RLModel")

  def trainMinMax(self):
    # NOTE(review): no `minmax` method is defined on this class in the code
    # shown here, so this call raises AttributeError unless it is supplied
    # elsewhere — confirm before relying on this method.
    game = Game('X')

    for _ in range(8):
      self.minmax(game,1,True)

  def playGame(self,i):
    """Play one full game and feed every move to the learner.

    'X' moves first when `i` is even, 'O' otherwise. The learner always plays
    'X' via `choose_action`; 'O' moves come from the stored agent's
    `get_best_action`. `learn` is called after every move of either side.
    """
    computer = 'X'
    game = Game('X' if i%2==0 else 'O')
    done = False

    while not done:
      mover = game.CP
      other = Game.get_opponent(mover)
      before = game.get_state()

      if mover == computer:
        action = self.agent.choose_action(before, game.get_moves(), mover)
      else:
        action = self.agentTrained.get_best_action(before, game.get_moves(), mover)

      after,winner,reward,done = game.make_move(action,computer)
      # A finished game has no follow-up moves to bootstrap from.
      followups = [] if done else game.get_moves()
      self.agent.learn(mover, other, computer, before,action,reward,after,followups,winner)

In [12]:
# Build the trainer (its constructor loads the "RLModelTrained" opponent table)
# and play 500000 training games; train() saves the learner as "RLModel".
train = Train()
train.train(500000)

learnt 0
learnt 50000
learnt 100000
learnt 150000
learnt 200000
learnt 250000
learnt 300000
learnt 350000
learnt 400000
learnt 450000


In [8]:
class GamePlayer():
  """Plays a saved Q-learning agent against a random mover and keeps score.

  `CNT` maps a winning mark ('X' or 'O') to its number of wins; `D` counts
  drawn games.
  """

  def __init__(self):
    self.agent = None
    self.CNT = {}
    self.D = 0
    self.init()

  def init(self, filename="RLModel"):
    """Create the agent and load its Q-table from the pickle file `filename`."""
    self.agent = QLearningAgent()
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(filename, 'rb') as f:
      self.agent.q_table = pickle.load(f)

  def play(self,i,computer,log=True):
    """Play one game and record its result in `CNT` or `D`.

    Args:
      i: game index; 'X' moves first when it is even, 'O' otherwise.
      computer: mark played by the agent; the other side moves at random.
      log: when True, print the board after every move and the final result.
    """
    game = Game('X' if i%2==0 else 'O')
    done = False
    while not done:
      moves = game.get_moves()
      pState = game.get_state()
      CP = game.CP
      if CP != computer:
        move = random.choice(moves)
      else:
        move = self.agent.get_best_action(pState, moves, CP)
      _,winner,_,done = game.make_move(move,computer)
      if log:
        game.print()
      if winner is not None:
        if log:
          print("Player "+winner+" Won")
        self.CNT[winner] = self.CNT.get(winner,0)+1
        break
      elif done:
        if log:
          print("Game drawn")
        self.D = self.D + 1
        break
      # Blank line between moves; skipped after the final move by the breaks.
      if log:
        print()

  def printQ(self,state,CP,moves):
    """Print the agent's Q-value for each move in `moves`, one per line."""
    for i in moves:
      print(self.agent.get_q_value(state,i,CP))

  def printStat(self):
    """Print the win counts for 'X' and 'O' and the number of draws."""
    # Default to 0 so a side without any win is reported as 0, not "None".
    print("Player X won " + str(self.CNT.get('X', 0)))
    print("Player O won " + str(self.CNT.get('O', 0)))
    print("Draw " + str(self.D))

In [None]:
# Inspect the Q-values stored for X's five open squares in the position
#   X X O
#   - O -
#   - - -
# get_q_value falls back to 0.0 for state/action pairs missing from the table.
player = GamePlayer()
player.printQ('XXO O    ','X',[3,5,6,7,8])

0.0
0.0
0.0
0.0
0.0


In [None]:
# Play one logged game with the agent as 'X'. The index 1 is odd, so 'O'
# (the random mover) makes the first move.
player = GamePlayer()
player.play(1,'X')

In [15]:
# Evaluate the saved agent as 'X' over 1000 unlogged games against a random
# mover (the first move alternates with the game index), then print the tally.
cnt = 0.0  # NOTE(review): not read anywhere in this cell — looks like a leftover
player = GamePlayer()
for i in range(1000):
  player.play(i,'X',log = False)
player.printStat()

Player X won 827
Player O won 38
Draw 135


O
