In [1]:
import numpy as np #create array
import random

# Create Q-Table

## Represent 0 - empty, 1 - O, 2 - X

In [None]:
qTable = {}#creat table to store state and action

In [None]:
representStates = [0, 1, 2]# empty = 0, O = 1, x = 2

In [None]:
def getHashValue(hash):#creat new state
  if not hash in qTable:
    qTable[hash] = [0, 0, 0, 0, 0, 0, 0, 0, 0]

  return qTable[hash]

In [None]:
def updateHash(hash, newValue):
  qTable[hash] = newValue #input value of reward to table

In [None]:
def getPossibilityActions(hash):#find empty block and return that
  possibilityActions = []
  for stringValue in hash:
    value = int(stringValue) 
    if value != 0:
      possibilityActions.append(0)
    else:
      possibilityActions.append(1)
  return np.array(possibilityActions)

In [None]:
def stateToHash(state):#change state to hash
  hash = ""
  for s in state:
    hash += str(int(s))
  return hash

# Create Agent

In [5]:
class Agent:
  def __init__(self, epsilon=0.3, lr=0.3, gamma=0.99, isPlay=False):
    """
    epsilon = if random number < epsilon then do random action else use qtable
    lr = ?
    gamma = ?
    isPlay = ?
    """
    self.epsilon = epsilon
    self.lr = lr
    self.gamma = gamma
    self.isPlay = isPlay

  def act(self, state):
    rand = random.uniform(0, 1)#random number to use with epsilon
    # convert state to hash
    hash = stateToHash(state)

    # get possibility actions
    possibilityActions = getPossibilityActions(hash)#get list of empty block

    # get Q value
    qValues = getHashValue(hash)

    # random Q value
    if rand < self.epsilon and not self.isPlay:
      qValues = np.random.rand(9)
    
    # avoid choice same action when qValue is negative
    qValues = np.array(qValues)
    if qValues.min() < 0:
      base = abs(qValues.min())
      qValues += base * 2

    # dot product
    qValues = np.multiply(qValues, possibilityActions)

    # avoid use first action when nothing to choose
    if qValues.sum() == 0:
      qValues = possibilityActions

    # random if have multiple best action
    if np.count_nonzero(qValues == qValues.max()) > 1:
      bestActions = [i for i in range(len(qValues)) if qValues[i] == qValues.max()]
      return random.choice(bestActions)

    # print(qValues)
    # choose best action
    return np.argmax(qValues)

  def learn(self, state, nextState, action, reward, isDone):
    hashState = stateToHash(state)
    hashNextState = stateToHash(nextState)

    qState = getHashValue(hashState)
    qNextState = getHashValue(hashNextState)

    possibilityActions = getPossibilityActions(hashNextState)
    qNextState = np.multiply(qNextState, possibilityActions)

    tmpQNextState = np.array(qNextState, copy=True)
    if qNextState.min() < 0:
      base = abs(qNextState.min())
      tmpQNextState += base * 2

    qState[action] += self.lr * (reward + self.gamma * qNextState[np.argmax(tmpQNextState)] - qState[action])
    if isDone:
      qState[action] = reward

    updateHash(hashState, qState)
  

SyntaxError: invalid syntax (4266044864.py, line 20)

# Create Env

In [None]:
class Env:
  def __init__(self):
    self.reset()

  def reset(self):
    self.board = np.zeros((9,))
    self.isXTurn = True
    return self.getState()

  def checkRows(self, board):
    for row in board:
        if len(set(row)) == 1:
            return row[0]
    return 0

  def checkDiagonals(self, board):
    if len(set([board[i][i] for i in range(len(board))])) == 1:
        return board[0][0]
    if len(set([board[i][len(board)-i-1] for i in range(len(board))])) == 1:
        return board[0][len(board)-1]
    return 0

  def checkWin(self):
    board = self.board.reshape((3,3))
    for newBoard in [board, np.transpose(board)]:
        result = self.checkRows(newBoard)
        if result:
            return result
    return self.checkDiagonals(board)

  def checkDraw(self):
    return self.checkWin() == 0

  def checkDone(self):
    return self.board.min() != 0 or self.checkWin() != 0

  def getState(self):
    return np.array(self.board, copy=True)

  def showBoard(self):
    prettyBoard = self.board.reshape((3, 3))
    for row in prettyBoard:
      print("|", end='')
      for col in row:
        symbol = "*"
        if col == 1:
          symbol = "X"
        elif col == 2:
          symbol = "O"
        print(symbol, end='')
        print("|", end='')
      print("")

  def act(self, action):
    reward = 0
    player = 2
    if self.isXTurn:
      player = 1

    self.board[action] = player
    self.isXTurn = not self.isXTurn

    winner = self.checkWin()
    isDraw = self.checkDraw()
    isDone = self.checkDone()

    if winner:
      reward = 1
    
    if isDraw:
      reward = 0.5

    nextState = np.array(self.board, copy=True)
    return nextState, reward, isDone, {}


# Train

In [None]:
env = Env()
agent = Agent()

In [None]:
env.getState()

array([0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
episodes = 50000
winner_history = []

In [None]:
def swapSide(state):
  newState = np.array(state, copy=True)

  for i in range(len(newState)):
    if newState[i] == 1:
      newState[i] = 2
    elif newState[i] == 2:
      newState[i] = 1

  return newState

In [None]:
def rotage(state, n = 1):
  return np.rot90(state.reshape((3,3)), n).reshape((9,))

In [None]:
def rotageAction(action, n = 1):
  board = np.zeros((9,))
  board[action] = 1
  board = rotage(board, n)
  return np.argmax(board)

In [None]:
for episode in range(episodes):
  isDone = False
  state = env.reset()
  prevState = state
  prevAction = -1
  isShouldLearn = False
  
  if episode % 1000 == 0:
    print("episode:", episode)

  while not isDone:
    state = env.getState()

    if not env.isXTurn:
      state = swapSide(state)
    
    action = agent.act(state)
    nextState, reward, isDone, _ = env.act(action)
    # env.showBoard()

    # if X turn mean before act is not X turn
    if env.isXTurn:
      nextState = swapSide(nextState)

    if isShouldLearn:
      if isDone and not env.checkDraw():
        prevReward = -1
      elif isDone and env.checkDraw():
        prevReward = 0.5
      agent.learn(prevState, swapSide(nextState), prevAction, prevReward, isDone)
      agent.learn(rotage(prevState, 1), rotage(swapSide(nextState), 1), rotageAction(prevAction, 1), prevReward, isDone)
      agent.learn(rotage(prevState, 2), rotage(swapSide(nextState), 2), rotageAction(prevAction, 2), prevReward, isDone)
      agent.learn(rotage(prevState, 3), rotage(swapSide(nextState), 3), rotageAction(prevAction, 3), prevReward, isDone)
      

    if isDone:
      agent.learn(state, nextState, action, reward, isDone)
      agent.learn(rotage(state, 1), rotage(nextState, 1), rotageAction(action, 1), reward, isDone)
      agent.learn(rotage(state, 2), rotage(nextState, 2), rotageAction(action, 2), reward, isDone)
      agent.learn(rotage(state, 3), rotage(nextState, 3), rotageAction(action, 3), reward, isDone)

    prevState = state
    prevAction = action
    prevReward = reward
    isShouldLearn = True

  winner_history.append(env.checkWin())

episode: 0
episode: 1000
episode: 2000
episode: 3000
episode: 4000
episode: 5000
episode: 6000
episode: 7000
episode: 8000
episode: 9000
episode: 10000
episode: 11000
episode: 12000
episode: 13000
episode: 14000
episode: 15000
episode: 16000
episode: 17000
episode: 18000
episode: 19000
episode: 20000
episode: 21000
episode: 22000
episode: 23000
episode: 24000
episode: 25000
episode: 26000
episode: 27000
episode: 28000
episode: 29000
episode: 30000
episode: 31000
episode: 32000
episode: 33000
episode: 34000
episode: 35000
episode: 36000
episode: 37000
episode: 38000
episode: 39000
episode: 40000
episode: 41000
episode: 42000
episode: 43000
episode: 44000
episode: 45000
episode: 46000
episode: 47000
episode: 48000
episode: 49000


In [None]:
# qTable['000000000']

In [None]:
len(qTable)

6477

In [None]:
class TigTagToeGame:
  def __init__(self):
    self.reset()

  def reset(self):
    self.board = np.zeros((9,))
    self.isXTurn = True
    return self.getState()

  def checkRows(self, board):
    for row in board:
        if len(set(row)) == 1:
            return row[0]
    return 0

  def checkDiagonals(self, board):
    if len(set([board[i][i] for i in range(len(board))])) == 1:
        return board[0][0]
    if len(set([board[i][len(board)-i-1] for i in range(len(board))])) == 1:
        return board[0][len(board)-1]
    return 0

  def checkWin(self):
    board = self.board.reshape((3,3))
    for newBoard in [board, np.transpose(board)]:
        result = self.checkRows(newBoard)
        if result:
            return result
    return self.checkDiagonals(board)

  def checkDraw(self):
    return self.checkWin() == 0

  def checkDone(self):
    return self.board.min() != 0 or self.checkWin() != 0

  def getState(self):
    return np.array(self.board, copy=True)

  def showBoard(self):
    prettyBoard = self.board.reshape((3, 3))
    for row in prettyBoard:
      print("|", end='')
      for col in row:
        symbol = "*"
        if col == 1:
          symbol = "X"
        elif col == 2:
          symbol = "O"
        print(symbol, end='')
        print("|", end='')
      print("")


  def play(self, action):
    player = 2
    if self.isXTurn:
      player = 1

    self.board[action] = player
    self.isXTurn = not self.isXTurn

    winner = self.checkWin()
    isDone = self.checkDone()

    nextState = np.array(self.board, copy=True)
    return nextState, isDone

In [None]:
game = TigTagToeGame()
agent = Agent(isPlay=True)

In [None]:
game.showBoard()

|*|*|*|
|*|*|*|
|*|*|*|


In [None]:
isDone = False
game.reset()

while not isDone:
  state = game.getState()
  print("--- AI vs Human ---")
  game.showBoard()

  action = 0
  if game.isXTurn:
    action = agent.act(state)
    # isInputValidate = False
    # while not isInputValidate:
    #   action = int(input("player turn (X):"))
    #   if len(state) > action and state[action] == 0:
    #     isInputValidate = True
    print("thinking x", getHashValue(stateToHash(state)))
    if state[4] == 0:
      action = 4
  else:
    sstate = swapSide(state)
    print("thinking", getHashValue(stateToHash(sstate)))
    action = agent.act(swapSide(state))
  print(action)
  state, isDone = game.play(action)

print("game end")
game.showBoard()
winner = game.checkWin()
if winner == 1:
  print("Congratulation the player win.")
elif winner == 2:
  print("AI is the winner, We'll conquer the world")
else:
  print("Draw!!")

--- AI vs Human ---
|*|*|*|
|*|*|*|
|*|*|*|
thinking x [2.5486516652777658, 2.5613031642264725, 2.5486516652777658, 2.5613031642264725, 2.5233456822791234, 2.5613031642264725, 2.5486516652777658, 2.5613031642264725, 2.5486516652777658]
4
--- AI vs Human ---
|*|*|*|
|*|X|*|
|*|*|*|
thinking [1.9814297492040436, 1.3680741084176464, 1.9814297492040436, 1.3680741084176464, 0, 1.3680741084176464, 1.9814297492040436, 1.3680741084176464, 1.9814297492040436]
0
--- AI vs Human ---
|O|*|*|
|*|X|*|
|*|*|*|
thinking x [0, 2.011396460074203, 1.9347372561319678, 2.034971170476954, 0, 1.9774592720400521, 1.8578920915367867, 1.9718494659689485, 2.008680969715967]
3
--- AI vs Human ---
|O|*|*|
|X|X|*|
|*|*|*|
thinking [0, 1.0160304153785504, 1.1831117339411237, 0, 0, 1.4886514125203743, 1.3841095458629282, -0.23500000000000004, -1]
5
--- AI vs Human ---
|O|*|*|
|X|X|O|
|*|*|*|
thinking x [0, 1.4906572543472842, 1.5245092848051347, 0, 0, 0, 1.0527658171269818, 1.523418385516253, 1.5083810336863364]
2
--

# Validate
## Expected that Q-learning agent will never lose!!

In [6]:
for i in range(1000):
  isDone = False
  game.reset()

  # print("game No.", i)
  while not isDone:
    state = game.getState()
    action = 0
    if game.isXTurn:
      hash = stateToHash(state)
      possibleActions = getPossibilityActions(hash)
      idx = [i for i in range(len(possibleActions)) if possibleActions[i] == 1]
      action = random.choice(idx)
    else:
      action = agent.act(swapSide(state))
      if state[4] == 0:
        action = 4
    # game.showBoard()

    state, isDone = game.play(action)
  winner = game.checkWin()
  if winner == 1:
    print("What!! AI Lose a randomness ?")
    game.showBoard()
    break
  elif winner == 2:
    # pass
    print("AI is the winner, We'll conquer the world")
  else:
    print("Draw!!")

NameError: name 'game' is not defined