In [443]:
from random import choice as random_choice

import numpy as np
import pandas as pd

In [546]:
class Board:
    
    def __init__(self):
        self.squares = np.zeros((3,3), dtype=int)
        self.state = ''.join([str(i) for i in self.squares]).replace('0', '-').replace(' ', '')
        
    def get_valid_moves(self):
        return list(zip(*np.where(self.squares == 0)))
         
    def update_square(self, loc, value):
        self.squares[loc] = value
        self.state = (''.join([str(i) for i in self.squares])
                      .replace('-1', 'O')
                      .replace('1', 'X')
                      .replace('0', '-')
                      .replace(' ', ''))
        
    def get_winner(self):
        for i in range(3):
            if (self.squares[i,:] == self.squares[i,0]).all() and self.squares[i,0] != 0:
                return self.squares[i,0]
            if (self.squares[:,i] == self.squares[0,i]).all() and self.squares[0,i] != 0:
                return self.squares[0,i]
        if (self.squares.diagonal() == self.squares[0,0]).all() and self.squares[0,0] != 0:
            return self.squares[0,0]
        if (self.squares[::-1].diagonal() == self.squares[-1,0]).all() and self.squares[-1,0] != 0:
            return self.squares[-1,0]
        else:
            return False
        
    def is_game_over(self):
        return bool(self.get_winner() or not self.get_valid_moves())

In [547]:
class Player:
    
    def __init__(self, name, policy=None):
        self.name = name
        self.policy = policy
        
    def make_move(self, board):
        if self.policy:
            return self.policy.make_move(board)
        else:
            valid_moves = board.get_valid_moves()
            return random_choice(valid_moves)

In [548]:
class QLearner:
    
    def __init__(self, gamma=1, alpha=0.5):
        self.q_dict = {}
        self.gamma = gamma # discount rate for future rewards
        self.alpha = alpha # learning rate
        
    def update(self, state, action, reward, new_state):
        self.q_dict[state] = self.q_dict.get(state, {})
        self.q_dict[new_state] = self.q_dict.get(new_state, {})
        self.q_dict[state][action] = (
            self.q_dict[state].get(action, 0)
            + self.alpha * (
                reward
                + (self.gamma * max(self.q_dict[new_state].values(), default=0))
                - self.q_dict[state].get(action, 0)
            )
        )

In [592]:
class Game:
    
    def __init__(self, player1, player2, q_learner=None):
        self.players = {1: player1, -1: player2}
        self.board = Board()
#         self.mover = random_choice([1,2])
        self.mover = 1
        self.q_learner = q_learner if q_learner else QLearner()
        self.prev_sap = {1: None, -1: None}
    
    def play_turn(self, player):
        start_state = self.board.state
        
        if self.prev_sap[player]:
            self.q_learner.update(self.prev_sap[player]['state'],
                                  self.prev_sap[player]['action'],
                                  0, start_state)
        
        action = self.players[player].make_move(self.board)
        self.board.update_square(action, player)
        reward = self.board.get_winner() == player
        
        if self.board.is_game_over():
            self.q_learner.update(start_state, action, reward, 'END')
            self.q_learner.update(self.prev_sap[-player]['state'],
                                  self.prev_sap[-player]['action'],
                                  -1 * reward, 'END')
        else:
            self.prev_sap[player] = {'state': start_state, 'action': action}
    
    def play_game(self):
        self.play_turn(self.mover)
        while not self.board.is_game_over():
            self.mover *= -1
            self.play_turn(self.mover)
        return getattr(self.players.get(self.board.get_winner()), 'name', 'Tie'), self.board.squares

In [593]:
class Policy:
    
    def __init__(self, q_dict, epsilon=0.2):
        self.q_dict = q_dict
        self.epsilon = epsilon
        
    def make_move(self, board):
        valid_moves = board.get_valid_moves()
        if np.random.rand() < self.epsilon:
            return random_choice(valid_moves)
        else:
            argmax = np.argmax([self.q_dict.get(board.state, {}).get(move, 0) for move in valid_moves])
            return valid_moves[argmax]

In [612]:
jack = Player(name='jack')
kate = Player(name='kate')
# jack = Player(name='Jack', policy=Policy(q_dict=q_learner.q_dict, epsilon=0.1))
# kate = Player(name='Kate', policy=Policy(q_dict=q_learner.q_dict, epsilon=0.1))

In [617]:
N = 1000
winners = []
# q_learner = QLearner()

for g in range(N):
    result = Game(jack, kate, q_learner).play_game()
    winners.append(result[0])
#     display(result)

In [614]:
pd.Series(winners).value_counts()

jack    590
kate    279
Tie     131
dtype: int64

In [619]:
len(q_learner.q_dict)

4632

In [618]:
q_learner.q_dict['[XO-][X-X][-O-]']

{(1, 1): 0.8125, (2, 2): -0.30712890625, (2, 0): 0.46875}