In [296]:
from copy import deepcopy
import time

In [303]:
# X is the bot, O is the opponent
class Board():
    def __init__(self, board=None, first_move=True):
        self.board = board
        if not board:
            self.board = [[''] * 3 for _ in range(3)]
        self.first_move = first_move
    
    def __str__(self):
        return f'X{str(self.board)}' if self.first_move else f'O{str(self.board)}'
    def __hash__(self):
        return hash(self.__str__())
    def __eq__(self, other):
        return self.__str__() == other.__str__()
    def __ne__(self, other):
        return not(self == other)
    
    # Returns 'X' if X wins, 'O' if O wins, None if no one wins
    def game_over(self):
        xsum = 0
        osum = 0
        
        def three_in_row(xsum, osum):
            if xsum == 3:
                return 'X'
            elif osum == 3:
                return 'O'
            else:
                return None
        
        # Check row-wise
        for i in range(3):
            for j in range(3):
                xsum += (self.board[i][j] == 'X')
                osum += (self.board[i][j] == 'O')
            if three_in_row(xsum, osum):
                return three_in_row(xsum, osum)
            else:
                xsum, osum = (0, 0)
        
        # Check col-wise
        for j in range(3):
            for i in range(3):
                xsum += (self.board[i][j] == 'X')
                osum += (self.board[i][j] == 'O')
            if three_in_row(xsum, osum):
                return three_in_row(xsum, osum)
            else:
                xsum, osum = (0, 0)
        
        # Check diag left-to-right
        for i in range(3):
            xsum += (self.board[i][i] == 'X')
            osum += (self.board[i][i] == 'O')
        if three_in_row(xsum, osum):
            return three_in_row(xsum, osum)
        else:
            xsum, osum = (0, 0)
        
        # Check diag right-to-left
        for i in range(3):
            xsum += (self.board[i][2 - i] == 'X')
            osum += (self.board[i][2 - i] == 'O')
        if three_in_row(xsum, osum):
            return three_in_row(xsum, osum)
        else:
            xsum, osum = (0, 0)
        
        return None
    
    # Gets count of pieces on the board
    def p_count(self, player):
        psum = 0
        for i in range(3):
            for j in range(3):
                psum += (self.board[i][j] == player)
        return psum       
    def x_count(self):
        return self.p_count('X')
    def o_count(self):
        return self.p_count('O')
    
    def p_moves(self, player):
        moves = {}
        for i in range(3):
            for j in range(3):
                if self.board[i][j] == '':
                    new_state = deepcopy(self)
                    new_state.board[i][j] = player
                    moves[f'{i},{j}'] = new_state
        return moves
    
    def moves(self, opponent=False):
        x_cnt = self.x_count()
        o_cnt = self.o_count()
        #print(f'Opponent Move: {opponent} ; xcnt = {x_cnt} ; o_cnt = {o_cnt}')
        #print(self.board)
        if self.game_over():
            return {}
        if not opponent:
            if x_cnt > o_cnt:
                return {}
            elif x_cnt == o_cnt and not self.first_move:
                return {}
        else:
            if o_cnt > x_cnt:
                return {}
            elif x_cnt == o_cnt and self.first_move:
                return {}
        
        player = 'O' if opponent else 'X'
        return self.p_moves(player)
    
    def actions(self):
        actions = list(self.moves().keys())
        if not actions and not self.game_over():
            actions = ['wait']
        return actions

class TicTacToe:
    def __init__(self):
        self.reward_win = 1
        self.reward_lose = -1
        self.reward_tie = 0
        self.states = []
        self.rewards = {}

    def reward(self, board):
        if board in self.rewards:
            return self.rewards[board]
        else:
            return 0
    
    # For a given state and action, return pairs of (result-state, probability)
    def state_action_prob(self, board, action):
        # If action is wait, equal chance opponent makes any available move
        if action == 'wait':
            moves = board.moves(opponent=True)
            return [(state, 1/len(moves)) for state in moves.values()]
        # Otherwise, specified action will always occur
        else:
            moves = board.moves()
            return [(moves[action], 1.0)]
    
    def _generate_action_rewards(self, board, depth):
        #time.sleep(1)
        self.states.append(board)
        
        moves = board.moves()
        if not moves:
            game_over_output = board.game_over()
            if game_over_output == 'X':
                self.rewards[board] = 1
            elif game_over_output == 'O':
                self.rewards[board] = -1
            else:
                moves = board.moves(opponent=True)

        for board_t in moves.values():
            self._generate_action_rewards(board_t, depth+1)

    def generate_action_rewards(self, player='X'):
        board_1 = Board(first_move=True)
        board_2 = Board(first_move=False)
        self._generate_action_rewards(board_1, depth=0)
        self._generate_action_rewards(board_2, depth=0)
    
    def state_values(self, gamma=0.9, epsilon=0.001):
        V = {s: 0 for s in self.states}
        epoch = 0
        
        while True:
            epoch += 1
            V1 = deepcopy(V)
            delta = 0
            
            for s in self.states:
                V[s] = self.reward(s) + gamma * max( [sum( [p * V[s_] for (s_, p) in self.state_action_prob(s, a)] ) for a in s.actions()], default=0)
                delta = max(delta, V1[s] - V[s])
            
            print(f'Epoch {epoch}; delta = {delta}')
            if delta < epsilon:
                return V1
    
    def policy(self):
        V = self.state_values()
        P = {}
        
        def expected_state_value(s, a):
            return sum([ p * V[s_] for (s_, p) in self.state_action_prob(s, a) ])
        
        for s in self.states:
            P[s] = max(s.actions(), key=lambda a: expected_state_value(s, a), default=None)
        
        return P

In [304]:
t = TicTacToe()
t.generate_action_rewards()

In [305]:
P = t.policy()

Epoch 1; delta = 1.0
Epoch 2; delta = 0.0005623714285714065


In [354]:
b = Board(board=[['', '', ''],
                 ['', '', ''],
                 ['', '', '']], first_move=True)
P[b]

'0,0'

In [356]:
b = Board(board=[['X', '', ''],
                 ['',  '', ''],
                 ['',  '', '']], first_move=True)
P[b]

'wait'

In [357]:
b = Board(board=[['X', '',  ''],
                 ['',  'O', ''],
                 ['',  '',  '']], first_move=True)
P[b]

'0,1'

In [358]:
b = Board(board=[['X', 'X',  ''],
                 ['',  'O', ''],
                 ['',  '',  '']], first_move=True)
P[b]

'wait'

In [361]:
b = Board(board=[['X', 'X', 'O'],
                 ['',  'O', ''],
                 ['',  '',  '']], first_move=True)
P[b]

'2,0'

In [362]:
b = Board(board=[['X', 'X', 'O'],
                 ['',  'O', ''],
                 ['X', '',  '']], first_move=True)
P[b]

'wait'

In [363]:
b = Board(board=[['X', 'X', 'O'],
                 ['O', 'O', ''],
                 ['X', '',  '']], first_move=True)
P[b]

'1,2'

In [364]:
b = Board(board=[['X', 'X', 'O'],
                 ['O', 'O', 'X'],
                 ['X', '',  '']], first_move=True)
P[b]

'wait'

In [365]:
b = Board(board=[['X', 'X', 'O'],
                 ['O', 'O', 'X'],
                 ['X', 'O', '']], first_move=True)
P[b]

'2,2'

In [366]:
b = Board(board=[['X', 'X', 'O'],
                 ['O', 'O', 'X'],
                 ['X', 'O', 'X']], first_move=True)
P[b]

'wait'