#### Tic-tac-toe game mechanics

In [1]:
%reset -f
#=====================================================================================
class game:
    """Tic-tac-toe board and rules.

    The board is a flat NumPy vector of nine ints in row-major order
    (0 = empty, 1 = 'X', 2 = 'O').  'X' always moves first; whose turn it
    is gets derived from the number of occupied fields, so the class keeps
    no separate turn counter.
    """

    # Index triples of the eight winning lines: rows, columns, diagonals.
    _WIN_LINES = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),
        (0, 3, 6), (1, 4, 7), (2, 5, 8),
        (0, 4, 8), (2, 4, 6),
    )

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the board so a new game can start."""
        # Imported locally so the class never depends on a global `np`
        # having been defined by some other notebook cell.
        import numpy as np
        self.board = np.zeros([9], dtype='int')

    def isEnd(self):
        """Report whether the game is over.

        Returns:
            A tuple ``(ended, winner)``: ``winner`` is 1 or 2 when that
            player owns a complete line, and 0 for a draw (board full) or
            for a game that is still running.
        """
        cells = self.board.tolist()
        for plyr in (1, 2):
            for line in self._WIN_LINES:
                if all(cells[i] == plyr for i in line):
                    return True, plyr
        if all(cells):            # no empty field left -> draw
            return True, 0
        return False, 0

    def move(self, field):
        """Place the mark of the player to move on ``field``.

        Args:
            field: board index in 0..8 (row-major).

        Returns:
            1 if this move won the game, otherwise 0.  A call on an already
            finished game leaves the board untouched and returns 0.

        Raises:
            IndexError: ``field`` is outside 0..8.  Negative values are
                rejected explicitly because NumPy would otherwise wrap
                them around to the end of the board.
            Exception: the field is already occupied ('Forbidden move!').
        """
        if self.isEnd()[0]:
            return 0
        if not 0 <= field < self.board.shape[0]:
            raise IndexError('Field index out of range: ' + str(field))
        if self.board[field] != 0:
            raise Exception('Forbidden move!')
        #-- an even number of marks on the board means it is X's turn
        occupied = int((self.board != 0).sum())
        self.board[field] = 1 if occupied % 2 == 0 else 2
        #-- feedback for the player who just moved
        if self.isEnd()[1] != 0:
            return +1   # reward for the winning move
        return 0

    def show_board(self):
        """Print the board as a 3x3 grid framed by dashed lines."""
        print('–––––')
        for position, field in enumerate(self.board):
            #=== convert ints to characters
            sign = '•' if field == 0 else ('X' if field == 1 else 'O')
            #=== every third field closes the row
            print(sign, end='\n' if position % 3 == 2 else ' ')
        print('–––––')

    ### PRIVATE ###
    def match_pattern(self, pattern):
        """Check whether ``pattern`` occurs on the board up to symmetry.

        Args:
            pattern: nine values; zeros are wildcards, every non-zero entry
                must equal the board value at the same position.

        Returns:
            True if the pattern fits the board under any combination of
            vertical flip, horizontal flip and transposition.
        """
        grid = self.board.reshape([3, 3])
        # <!> only the board is transformed, never the pattern
        for flipud in (True, False):
            for fliplr in (True, False):
                for transpose in (True, False):
                    view = grid
                    if flipud:
                        view = view[::-1]        # same as np.flipud
                    if fliplr:
                        view = view[:, ::-1]     # same as np.fliplr
                    if transpose:
                        view = view.T
                    flat = view.reshape(9)
                    if all(flat[i] == value
                           for i, value in enumerate(pattern) if value != 0):
                        return True
        return False
#=====================================================================================
class player:
    """Tabular epsilon-greedy agent for tic-tac-toe.

    Attributes:
        states:    2-D int array, one stored board (length 9) per row.
                   Boards that differ only by a flip/transposition share
                   one row (matching is delegated to ``isSameState``).
        actions:   list parallel to ``states``; each entry is a pair
                   ``(fields, values)`` holding the empty fields of the
                   stored board and the learned value of playing each.
        id:        1 for 'X', 2 for 'O'.
        nofstates: number of stored boards.
        nofgames:  game counter maintained by the caller.
    """

    def __init__(self, player_id):
        import numpy as np
        #--- book of states and actions
        self.states = np.zeros([1, 9], dtype='int')
        if player_id == 2:
            # 'O' starts with the position "X in the top-left corner"
            self.states[0, 0] = 1
            self.actions = [(np.array([1, 2, 3, 4, 5, 6, 7, 8]), np.zeros(8))]
        else:
            # 'X' starts with the empty board
            self.actions = [(np.array([0, 1, 2, 3, 4, 5, 6, 7, 8]), np.zeros(9))]
        self.id = player_id
        self.nofstates = 1
        self.nofgames = 0

    ### PRIVATE ###
    def _locate(self, board):
        """Return ``(row_index, rotation)`` of the stored board equivalent
        to ``board``, or None when no stored board matches."""
        for index, row in enumerate(self.states):
            found = isSameState(row, board)
            if found[0]:
                return index, found[1]
        return None

    def _remember(self, board, terminal):
        """Append ``board`` to the table and return its row index."""
        import numpy as np
        board = np.asarray(board)
        self.states = np.vstack((self.states, board))  # add state to 'memory'
        if terminal:
            # nothing can be played from a finished game
            entry = (np.empty(0, dtype='int'), np.empty(0, dtype='float'))
        else:
            free = np.nonzero(board == 0)[0]           # possible actions
            entry = (free, np.zeros(free.shape[0]))
        self.actions.append(entry)
        self.nofstates += 1
        return self.nofstates - 1

    ### PUBLIC ###
    def addstate(self, game):
        """Store the current position of ``game`` unless an equivalent one
        is already known.  Finished games get an empty action list."""
        if self._locate(game.board) is None:
            self._remember(game.board, game.isEnd()[0])

    def state_info(self, index):
        """Print fields and learned values of stored state ``index``,
        followed by the stored board itself."""
        fields, values = self.actions[index]
        for move, value in zip(fields, values):
            print('field: ' + str(move) + ' (' + str(value) + ')')
        show_state(self.states[index])

    #=== policy ===
    def eps_greedy_move(self, board, eps):
        """Pick a move for ``board`` with an epsilon-greedy policy.

        With probability ``eps`` a uniformly random stored action is taken,
        otherwise the one with the highest learned value (first one wins
        on ties).  A position that is not in the table yet is stored on the
        fly with all-zero values instead of crashing the lookup.

        Args:
            board: current board vector (length 9).
            eps:   exploration probability in [0, 1].

        Returns:
            ``(move, state_index, action_index)``: the field to play in the
            coordinates of ``board``, the row of the matching stored state
            and the position of the chosen action within that state.

        Raises:
            ValueError: the position offers no action (no empty field, or
                it was stored as a finished game).
        """
        import random
        import numpy as np

        #--- find state
        located = self._locate(board)
        if located is None:
            index = self._remember(board, terminal=False)
            rotation = [False, False, False]
        else:
            index, rotation = located
        fields, values = self.actions[index]
        if fields.shape[0] == 0:
            raise ValueError('No legal moves available in this state.')

        #--- choose action
        if random.uniform(0, 1) > eps:
            action_index = np.argmax(values)                       # greedy move
        else:
            action_index = random.randint(0, fields.shape[0] - 1)  # random move

        #--- determine move
        # The action is expressed in the coordinates of the stored board.
        # Mark it on an empty grid and apply the same transformation that
        # maps the stored board onto `board` to get the real field.
        marker = np.zeros(9)
        marker[fields[action_index]] = 1
        marker = marker.reshape([3, 3])
        if rotation[0]: marker = np.flipud(marker)
        if rotation[1]: marker = np.fliplr(marker)
        if rotation[2]: marker = marker.T
        move = np.nonzero(marker.reshape(9))[0][0]
        # <!> return ( action_number, state_index, action_index )
        return move, index, action_index

    #=== TD update
    def sarsa_improve(self, reward, index_t, action_index_t, board_t1, alpha, isEnd):
        """Move the value of a past (state, action) pair towards its target.

        The target is ``reward`` for a finished game, otherwise ``reward``
        plus the highest value stored for the successor position.  Note
        that despite the method name the bootstrap uses the maximum over
        the successor's actions, not the action actually played next, and
        that no discount factor is applied.

        Args:
            reward:         reward received after the action.
            index_t:        row of the state the action was taken in.
            action_index_t: position of the action within that state.
            board_t1:       board the player faces afterwards.
            alpha:          learning rate.
            isEnd:          True if ``board_t1`` is a finished game.

        Raises:
            IndexError: ``board_t1`` has not been stored (call ``addstate``
                first).
        """
        #--- find current state
        located = self._locate(board_t1)
        if located is None:
            raise IndexError('Successor state is not stored; call addstate() first.')
        index_t1 = located[0]
        #=== TD update
        if isEnd:
            targetTD = reward
        else:
            targetTD = reward + self.actions[index_t1][1].max()
        values = self.actions[index_t][1]
        values[action_index_t] += alpha * (targetTD - values[action_index_t])
#=====================================================================================        
def show_state(state):
    """Print a board vector as a grid with three fields per row.

    0 is drawn as '•', 1 as 'X' and anything else as 'O'; the grid is
    framed by a dashed line above and below.
    """
    print('–––––')
    for position, field in enumerate(state):
        #=== convert ints to characters
        sign = '•' if field == 0 else ('X' if field == 1 else 'O')
        #=== every third field ends the row, the others are space-separated
        closes_row = position % 3 == 2
        print(sign, end='\n' if closes_row else ' ')
    print('–––––')

def show_dial():
    """Print the field numbers 0..8 laid out like the board, as a key for
    entering moves."""
    print('–––––')
    for row in np.arange(9).reshape(3, 3):
        #=== one board row per line, numbers separated by single spaces
        print(' '.join(str(field) for field in row))
    print('–––––')
    
def isSameState(state1, state2):
    """Check whether two boards are equal up to a flip/transposition.

    Args:
        state1: board vector of length 9; this is the one being transformed.
        state2: board vector of length 9 to compare against.

    Returns:
        ``(True, [flipud, fliplr, transpose])`` with the first combination
        of transformations (applied to ``state1`` in that order) that makes
        it equal to ``state2``; the identity ``[False, False, False]`` is
        preferred when the boards are already equal.
        ``(False, None)`` when no combination matches, so the result can
        always be unpacked into two values.
    """
    import numpy as np

    target = np.asarray(state2)
    grid = np.asarray(state1).reshape([3, 3])   # reshape to 2-D array
    #-- identity transformation first
    if np.array_equal(grid.reshape(9), target):
        return True, [False, False, False]
    #-- for all possible transformations
    for flipud in [True, False]:
        for fliplr in [True, False]:
            for transpose in [True, False]:
                candidate = grid
                #-- apply transformations
                if flipud: candidate = np.flipud(candidate)
                if fliplr: candidate = np.fliplr(candidate)
                if transpose: candidate = candidate.T
                if np.array_equal(candidate.reshape(9), target):
                    return True, [flipud, fliplr, transpose]
    return False, None

In [2]:
#=== LEARNING ===
from IPython.display import clear_output
import numpy as np
#--- parameters
# N_sim: number of self-play games.
# eps*:   exploration probability handed to eps_greedy_move.
# alpha*: learning rate handed to sarsa_improve.
N_sim = 20000
g = game()
p1 = player(player_id=1); eps1 = 0.1; alpha1 = .5
p2 = player(player_id=2); eps2 = 0.1; alpha2 = .5

# Self-play: p1 ('X') and p2 ('O') play N_sim games against each other and
# update their value tables after every move of the opponent.
for sim in range(N_sim):
    g = game()          # fresh board for every game
    clear_output()      # keep only the latest progress line in the cell output
    print('Game number: '+str(sim))
    #--- firstmove
    p1.nofgames += 1
    p2.nofgames += 1
    #--- 'X' player
    # The empty board is already in p1's table (see player.__init__), so no
    # addstate call is needed before the opening move.
    m1, s_index1, act_index1 = p1.eps_greedy_move(g.board, eps1)
    r1 = g.move(m1)
    #--- 'O' player
    # Register the position first: eps_greedy_move looks the board up in
    # the player's table.
    p2.addstate(g)
    m2, s_index2, act_index2 = p2.eps_greedy_move(g.board, eps2)
    r2 = g.move(m2)

    while not g.isEnd()[0]:
        #--- 'X' player
        # Update X's previous (state, action) pair with the reward of its
        # last move, using the position X faces now, then move again.
        p1.addstate(g)
        p1.sarsa_improve(r1, s_index1, act_index1, g.board, alpha1, g.isEnd()[0])
        m1, s_index1, act_index1 = p1.eps_greedy_move(g.board, eps1)
        r1 = g.move(m1)
        if g.isEnd()[0]:
            # X's move finished the game: final update for X ...
            p1.addstate(g)
            p1.sarsa_improve(r1, s_index1, act_index1, g.board, alpha1, g.isEnd()[0])
            # ... and a win for X (reward 1) counts as -1 for O.
            if r1 == 1: r2 = -1
        #--- 'O' player
        # Update O's previous pair; this is also O's final update when X
        # has just ended the game.
        p2.addstate(g)
        p2.sarsa_improve(r2, s_index2, act_index2, g.board, alpha2, g.isEnd()[0])
        if not g.isEnd()[0]:
            m2, s_index2, act_index2 = p2.eps_greedy_move(g.board, eps2)
            r2 = g.move(m2)
            if g.isEnd()[0]:
                # O's move finished the game: final update for O ...
                p2.addstate(g)
                p2.sarsa_improve(r2, s_index2, act_index2, g.board, alpha2, g.isEnd()[0])
                # ... a win for O counts as -1 for X, who also gets its
                # final update here because the while loop ends now.
                if r2 == 1: r1 = -1
                p1.addstate(g)
                p1.sarsa_improve(r1, s_index1, act_index1, g.board, alpha1, g.isEnd()[0])

Game number: 19999


In [6]:
#=== PLAY AGAINST SARSA AGENT ===
# Interactive game between a human and one of the trained agents.  The
# human picks a side; 'X' always moves first.  Invalid entries are asked
# for again instead of aborting the game with an exception.
from IPython.display import clear_output

#=== choose side (case-insensitive, repeated until the answer is valid)
player_id = ''
while player_id not in ('X', 'O'):
    player_id = input('Choose X or O...\n').strip().upper()

g = game()  # create empty game
human_mark = 1 if player_id == 'X' else 2   # board value of the human's mark
agent = p2 if player_id == 'X' else p1      # the agent plays the other side
human_to_move = (player_id == 'X')

while not g.isEnd()[0]:
    if human_to_move:
        clear_output()
        show_dial()
        g.show_board()
        #--- keep asking until the entry is a free field in 0..8
        while True:
            m = input("Next move?\n")
            try:
                field = int(m)
            except ValueError:
                print('Please enter a number between 0 and 8.')
                continue
            if not 0 <= field <= 8:
                print('Please enter a number between 0 and 8.')
            elif g.board[field] != 0:
                print('That field is already taken.')
            else:
                break
        g.move(field)
    else:
        # The agent can only choose among positions stored in its table;
        # register the current one first so that a position it never met
        # during training does not break the lookup.
        agent.addstate(g)
        # eps = 0 -> always play the move with the highest learned value
        g.move(agent.eps_greedy_move(g.board, 0)[0])
    human_to_move = not human_to_move

#=== final position and result
clear_output()
g.show_board()
winner = g.isEnd()[1]
if winner == 0:
    print('Tie!')
elif winner == human_mark:
    print('You won!')
else:
    print('You lost... :(')

–––––
O • X
O X X
X • O
–––––
You lost... :(
