In [1]:
import numpy as np
import pickle

BOARD_ROWS = 3
BOARD_COLS = 3

class State:
    """Tic-tac-toe board plus the turn logic that drives two players.

    The board is a BOARD_ROWS x BOARD_COLS numpy array: 1 marks the first
    player's cells, -1 the second player's cells and 0 an empty cell.
    ``p1`` always moves first and plays symbol 1.

    ``p1`` and ``p2`` are player objects. This class calls ``chooseAction``,
    ``getHash``, ``addState``, ``feedReward`` and ``reset`` on them and reads
    their ``name`` attribute.
    """

    def __init__(self,p1,p2):
        self.board = np.zeros((BOARD_ROWS,BOARD_COLS))
        self.p1 = p1 # moves first, plays symbol 1
        self.p2 = p2 # moves second, plays symbol -1
        self.isEnd = False
        self.boardHash = None # same "no hash yet" value that reset() uses
        self.playerSymbol = 1 # symbol of the player whose turn it is

    def availablePositions(self):
        """Return the (row, col) tuples of all empty cells, in row-major order."""
        return [(i, j)
                for i in range(BOARD_ROWS)
                for j in range(BOARD_COLS)
                if self.board[i,j] == 0]

    def updateState(self,position):
        """Place the current player's symbol at ``position`` and pass the turn."""
        self.board[position] = self.playerSymbol
        self.playerSymbol = -1 if self.playerSymbol == 1 else 1

    def _lineSums(self):
        """Yield (sum, length) for each row, each column, then both diagonals."""
        for i in range(BOARD_ROWS):
            yield sum(self.board[i,:]), BOARD_COLS
        for j in range(BOARD_COLS):
            yield sum(self.board[:,j]), BOARD_ROWS
        # The diagonals are only meaningful on a square board.
        yield sum([self.board[i,i] for i in range(BOARD_COLS)]), BOARD_COLS
        yield sum([self.board[i,BOARD_COLS - i - 1] for i in range(BOARD_COLS)]), BOARD_COLS

    def winner(self):
        """Evaluate the board and update ``isEnd``.

        Returns 1 if the first player owns a complete line, -1 if the second
        player does, 0 if the board is full without a winner (tie) and None
        while the game is still running.
        """
        for total, length in self._lineSums():
            # A line is won when every one of its cells holds the same symbol,
            # i.e. the sum equals +/- the line length (3 on the default board).
            if total == length:
                self.isEnd = True
                return 1
            if total == -length:
                self.isEnd = True
                return -1
        #ties
        if len(self.availablePositions()) == 0:
            self.isEnd = True
            return 0
        self.isEnd = False
        return None

    def giveReward(self):
        """Feed the end-of-game reward to both players.

        The winner gets 1 and the loser 0. Any other result (a tie, or a board
        that is not finished yet) pays 0.1 to p1 and 0.5 to p2.
        NOTE(review): the asymmetric tie rewards look deliberate (p1 has the
        first move) - confirm before changing them.
        """
        result = self.winner()
        if result == 1:
            self.p1.feedReward(1)
            self.p2.feedReward(0)
        elif result == -1:
            self.p1.feedReward(0)
            self.p2.feedReward(1)
        else:
            self.p1.feedReward(0.1)
            self.p2.feedReward(0.5)

    def _selfPlayGame(self):
        """Play one training game, reward both players and reset all state."""
        while not self.isEnd:
            for player in (self.p1, self.p2):
                positions = self.availablePositions()
                action = player.chooseAction(positions,self.board,self.playerSymbol)
                self.updateState(action)
                # remember the board this move produced in the mover's history
                player.addState(player.getHash(self.board))

                if self.winner() is not None:
                    self.giveReward()
                    self.p1.reset()
                    self.p2.reset()
                    self.reset()
                    return

    def play(self,rounds = 100): #play with self
        """Let p1 and p2 play ``rounds`` training games against each other.

        Progress is printed every 1000 rounds.
        """
        for i in range(rounds):
            if i % 1000 == 0:
                print("Rounds {}".format(i))
            self._selfPlayGame()

    def _shownGame(self, humanIsP1):
        """Play one game with the board printed after every move.

        The human side is asked with ``chooseAction(positions)``; the other
        side with ``chooseAction(positions, board, symbol)``.
        """
        turns = ((self.p1, 1, humanIsP1), (self.p2, -1, not humanIsP1))
        while not self.isEnd:
            for player, symbol, isHuman in turns:
                positions = self.availablePositions()
                if isHuman:
                    action = player.chooseAction(positions)
                else:
                    action = player.chooseAction(positions,self.board,self.playerSymbol)
                self.updateState(action)
                self.showBoard()
                win = self.winner()
                if win is not None:
                    # Only the player who just moved can have completed a line.
                    if win == symbol:
                        print(player.name, " wins!")
                    else:
                        print("tie!")
                    self.reset()
                    return

    def play2(self,rounds = 100):
        """Play one displayed game: p1 is the computer, p2 the human.

        ``rounds`` is unused and kept only for backward compatibility.
        """
        self._shownGame(humanIsP1=False)

    def play3(self,rounds = 100):
        """Play one displayed game: p1 is the human, p2 the computer.

        ``rounds`` is unused and kept only for backward compatibility.
        """
        self._shownGame(humanIsP1=True)

    def showBoard(self):
        """Print the board; p1 cells show as 'x', p2 cells as 'o'."""
        # p1: x  p2: o
        tokens = {1: 'x', -1: 'o', 0: ' '}
        separator = '-' * (4 * BOARD_COLS + 1) # 13 dashes on the default board
        for i in range(0, BOARD_ROWS):
            print(separator)
            out = '| '
            for j in range(0, BOARD_COLS):
                # '?' makes an unexpected cell value visible instead of
                # reusing the previous token or raising NameError.
                out += tokens.get(int(self.board[i, j]), '?') + ' | '
            print(out)
        print(separator)

    def reset(self):
        """Clear the board and hand the first move back to p1."""
        self.board = np.zeros((BOARD_ROWS,BOARD_COLS))
        self.boardHash = None
        self.isEnd = False
        self.playerSymbol = 1
        
        
class Player:
    """Learning agent that keeps a value estimate for every board hash it has seen."""

    def __init__(self, name, exp_rate = 0.3):
        self.name = name
        self.states = []          # states recorded via addState() in the current game
        self.lr = 0.2             # learning rate used by feedReward()
        self.exp_rate = exp_rate  # probability of choosing a random move
        self.decay_gamma = 0.9    # discount applied to the propagated reward
        self.states_value = {}    # board hash -> estimated value

    def getHash(self,board):
        """Return the string form of the flattened board, used as a dict key."""
        boardHash = str(board.reshape(BOARD_ROWS*BOARD_COLS))
        return boardHash

    def chooseAction(self,positions,current_board,symbol):
        """Pick one of ``positions`` for ``symbol`` on ``current_board``.

        With probability ``exp_rate`` a random position is returned (explore).
        Otherwise the position whose resulting board has the highest stored
        value is returned; unseen boards count as 0 and ties go to the last
        candidate.

        Raises:
            ValueError: if ``positions`` is empty.
        """
        if len(positions) == 0:
            raise ValueError("chooseAction needs at least one available position")

        if np.random.uniform(0,1) <= self.exp_rate: #random action (explore)
            idx = np.random.choice(len(positions))
            return positions[idx]

        value_max = float("-inf")
        action = positions[0]
        for p in positions: #iterate across potential positions
            next_board = current_board.copy()
            next_board[p] = symbol
            value = self.states_value.get(self.getHash(next_board), 0)
            if value >= value_max: #make move based on maximizing next state's value
                value_max = value
                action = p
        return action

    def feedReward(self, reward): #backpropagate to update state values
        """Propagate ``reward`` backwards through the recorded states.

        Each state's value moves towards ``decay_gamma * reward`` by a factor
        of ``lr``; the updated value then becomes the reward for the state
        recorded before it.
        """
        for st in reversed(self.states):
            if self.states_value.get(st) is None: #no value means 0 value
                self.states_value[st] = 0
            self.states_value[st] += self.lr * (self.decay_gamma * reward - self.states_value[st])
            reward = self.states_value[st]

    def addState(self,state):
        """Append ``state`` to the history of the current game."""
        self.states.append(state)

    def reset(self):
        """Forget the history of the current game (learned values are kept)."""
        self.states = []

    def savePolicy(self):
        """Pickle ``states_value`` to the file ``policy_<name>`` in the working directory."""
        # ``with`` closes the file even if pickling raises.
        with open("policy_" + str(self.name), "wb") as fw:
            pickle.dump(self.states_value, fw)

    def loadPolicy(self, file):
        """Replace ``states_value`` with the pickled dictionary stored in ``file``."""
        # SECURITY: pickle.load can execute arbitrary code; only load policy
        # files that you created yourself.
        with open(file, "rb") as fr:
            self.states_value = pickle.load(fr)
        
        
class HumanPlayer:
    """Player whose moves are typed in on the console.

    Exposes the same ``addState``/``feedReward``/``reset`` methods as the
    learning player, but they do nothing because a human keeps no value table.
    """

    def __init__(self, name):
        self.name = name

    def chooseAction(self,positions):
        """Keep asking for a row and a column until they name a free cell.

        Non-numeric input is reported and the prompt is repeated instead of
        aborting the game with a ValueError.
        """
        while True:
            try:
                row = int(input("Input your action row:"))
                col = int(input("Input your action column:"))
            except ValueError:
                print("Please enter whole numbers for row and column.")
                continue
            action = (row,col)
            if action in positions:
                return action

    def addState(self, state):
        """No-op: a human player records no states."""
        pass

    # at the end of game, backpropagate and update states value
    def feedReward(self, reward):
        """No-op: a human player learns nothing from the reward."""
        pass

    def reset(self):
        """No-op: there is no per-game state to clear."""
        pass
    
if __name__ == "__main__":
    #training
    p1 = Player("p1") # learns the first-move side
    p2 = Player("p2") # learns the second-move side

    st = State(p1, p2)
    print("training...")
    st.play(50000)

    # Save both agents: the "Human starts" cell loads "policy_p2" and the
    # "Computer starts" cell loads "policy_p1". Saving only p2 made the latter
    # fail on a fresh kernel.
    p1.savePolicy()
    p2.savePolicy()

training...
Rounds 0
Rounds 1000
Rounds 2000
Rounds 3000
Rounds 4000
Rounds 5000
Rounds 6000
Rounds 7000
Rounds 8000
Rounds 9000
Rounds 10000
Rounds 11000
Rounds 12000
Rounds 13000
Rounds 14000
Rounds 15000
Rounds 16000
Rounds 17000
Rounds 18000
Rounds 19000
Rounds 20000
Rounds 21000
Rounds 22000
Rounds 23000
Rounds 24000
Rounds 25000
Rounds 26000
Rounds 27000
Rounds 28000
Rounds 29000
Rounds 30000
Rounds 31000
Rounds 32000
Rounds 33000
Rounds 34000
Rounds 35000
Rounds 36000
Rounds 37000
Rounds 38000
Rounds 39000
Rounds 40000
Rounds 41000
Rounds 42000
Rounds 43000
Rounds 44000
Rounds 45000
Rounds 46000
Rounds 47000
Rounds 48000
Rounds 49000


In [2]:
# Human starts: the human opens as p1, the trained agent answers as p2.
p1 = HumanPlayer(name="p1")

# exp_rate=0 means the random-move branch is (practically) never taken,
# so the agent follows its loaded value table.
p2 = Player(name="p2", exp_rate=0)
p2.loadPolicy(file="policy_p2")

st = State(p1=p1, p2=p2)
st.play3()

Input your action row:1
Input your action column:1
-------------
|   |   |   | 
-------------
|   | x |   | 
-------------
|   |   |   | 
-------------
-------------
| o |   |   | 
-------------
|   | x |   | 
-------------
|   |   |   | 
-------------
Input your action row:0
Input your action column:2
-------------
| o |   | x | 
-------------
|   | x |   | 
-------------
|   |   |   | 
-------------
-------------
| o |   | x | 
-------------
|   | x |   | 
-------------
| o |   |   | 
-------------
Input your action row:1
Input your action column:0
-------------
| o |   | x | 
-------------
| x | x |   | 
-------------
| o |   |   | 
-------------
-------------
| o |   | x | 
-------------
| x | x | o | 
-------------
| o |   |   | 
-------------
Input your action row:2
Input your action column:1
-------------
| o |   | x | 
-------------
| x | x | o | 
-------------
| o | x |   | 
-------------
-------------
| o | o | x | 
-------------
| x | x | o | 
-------------
| o | x |   | 
--

In [4]:
# Computer starts: the trained agent opens as p1, the human answers as p2.
p2 = HumanPlayer(name="p2")

# exp_rate=0 means the random-move branch is (practically) never taken,
# so the agent follows its loaded value table.
p1 = Player(name="p1", exp_rate=0)
p1.loadPolicy(file="policy_p1")

st = State(p1=p1, p2=p2)
st.play2()

-------------
|   |   |   | 
-------------
|   | x |   | 
-------------
|   |   |   | 
-------------
Input your action row:0
Input your action column:0
-------------
| o |   |   | 
-------------
|   | x |   | 
-------------
|   |   |   | 
-------------
-------------
| o |   |   | 
-------------
|   | x |   | 
-------------
|   |   | x | 
-------------
Input your action row:2
Input your action column:0
-------------
| o |   |   | 
-------------
|   | x |   | 
-------------
| o |   | x | 
-------------
-------------
| o |   |   | 
-------------
|   | x | x | 
-------------
| o |   | x | 
-------------
Input your action row:1
Input your action column:0
-------------
| o |   |   | 
-------------
| o | x | x | 
-------------
| o |   | x | 
-------------
p2  wins!


In [None]:
# Display p1's learned value table (a dict mapping board hash -> value).
# NOTE(review): assumes p1 is still the Player from the "Computer starts"
# cell; if the "Human starts" cell ran last, p1 is a HumanPlayer and has no
# states_value attribute.
p1.states_value