# Backgammon Data Structure And Agent Definitions

## State Definition

The board has 4 sets of 6 locations (24 points), plus 2 BAR locations, one for each player, for a total of 26 locations.

Each of these locations holds an integer between -15 and +15: a positive value is the number of one player's checkers on that location, a negative value is the number of the other player's checkers, and 0 means the location is empty.

Index 0 is the - player's BAR and index 25 is the + player's BAR.

The integers at indices 26 and 27 are the rolled dice to be played; each of them takes a value between 1 and 6.

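An upper bound on the number of states is $(15+15+1)^{26} \cdot 6^{2}$: 31 possible values for each of the 26 locations, times the 36 dice outcomes.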

This data structure is inspired from: https://scholarworks.rit.edu/cgi/viewcontent.cgi?referer=https://www.google.com/&httpsredir=1&article=7617&context=theses

In [None]:
# Same layout as the initial board used by environment._initGame, with dice 4 and 6.
example_state = [
    0,                      # index 0: BAR of the - player
    -2, 0, 0, 0, 0, +5,     # points 1-6
    0, +3, 0, 0, 0, -5,     # points 7-12
    +5, 0, 0, 0, -3, 0,     # points 13-18
    -5, 0, 0, 0, 0, +2,     # points 19-24
    0,                      # index 25: BAR of the + player
    4, 6,                   # indices 26-27: the rolled dice
]

## Action Definition

An action is a tuple with one element per die; each element is between 0 and 25 and is the index of the location whose checker advances by that die's value.

The following example shows the best known action from the initial state (dice 4 and 6), which makes a point (takes a gate) in the player's own home area, once for the + player and once for the - player.

The action space has 26^2 = 676 possible actions.

In [None]:
#TODO: when both dice show the same number the action has to be a 4 element tuple, not a 2 element one
# Best known actions for the example state: first the + player's, then the - player's.
actionw, actionb = (6, 8), (19, 17)

## Agent Class Definition

In [5]:
import numpy as np
import random
from os import path
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, MaxPooling2D, Conv2DTranspose, BatchNormalization, UpSampling2D, Reshape
from keras.optimizers import Adam
from keras.models import load_model
from keras.activations import softmax
import wandb

class agent:
    """Q-learning backgammon player that moves the checkers of one colour.

    A state is a 28 element sequence: index 0 is the BAR of the - player,
    indices 1-24 are the board points, index 25 is the BAR of the + player
    and indices 26-27 hold the rolled dice.  Positive counts belong to the
    + (white) player and negative counts to the - (black) player.

    An action is a pair of location indices (0-25), one per die, naming the
    location whose checker is advanced by that die.
    """

    def __init__(self,color, discount=0.95,exploration_rate=0.9,decay_factor=0.9999):
        """Create the agent and load or build its Keras model.

        color            -- -1 for black, +1 for white.
        discount         -- how much future reward is valued over current reward.
        exploration_rate -- initial probability of choosing a random action.
        decay_factor     -- multiplier applied to exploration_rate on every update.
        """
        self.color = color
        self.discount = discount
        self.exploration_rate = exploration_rate
        self.decay_factor = decay_factor

        #wandb.config.update({'model_name':'Dense,208-104-52 with relu, adam, mean_squared_error'})
        model_file = self._getModelFilename()
        if path.exists(model_file):
            # Reuse a previously saved model for this colour.
            self.model = load_model(model_file)
        else:
            # 28 state values in, 2 x 26 action values out (one row per die).
            self.model = Sequential()
            self.model.add(Dense(208,activation="relu", input_shape=(28,)))
            self.model.add(Dense(104,activation="relu"))
            self.model.add(Dense(52))
            self.model.add(Reshape((2,26)))
            self.model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

    def _getModelFilename(self):
        """Return the per-colour model file name, e.g. "agent_model1.h5"."""
        return "agent_model%d.h5" % self.color

    def _getBARi(self):
        """Return the index of this agent's own BAR (25 for white, 0 for black)."""
        return 25 if self.color == 1 else 0

    def _getLOpponentBARi(self):
        """Return the index of the opponent's BAR (0 for white, 25 for black)."""
        return 0 if self.color == 1 else 25

    def _getHomeRange(self):
        """Return the indices of this agent's home points."""
        return range(1,7) if self.color == 1 else range(19,25)

    def _getNotHomeRange(self):
        """Return every location outside the home points, own BAR included."""
        return range(7,26) if self.color == 1 else range(0,19)

    def _countOwnCheckers(self,state,locations):
        """Sum this agent's checkers over the given location indices."""
        # Multiplying by the colour makes the agent's own counts positive.
        return sum(
            state[i] * self.color
            for i in locations
            if state[i] * self.color > 0
        )

    def _getTotalCheckersAtHome(self,state):
        """Return how many of this agent's checkers are on its home points."""
        return self._countOwnCheckers(state, self._getHomeRange())

    def _getTotalCheckersAtNotHome(self,state):
        """Return how many of this agent's checkers are outside its home points."""
        return self._countOwnCheckers(state, self._getNotHomeRange())

    def play(self,old_state,action):
        """Play both dice of old_state and return (new_state, total_reward).

        action gives the source location for the first and the second die.
        When both dice are equal, a further action is requested from
        get_next_action and both dice are played a second time.
        """
        di1,di2 = action
        dice1 = old_state[26]
        dice2 = old_state[27]

        new_state,reward = self.playSingle(old_state,dice1,di1)
        new_state,r = self.playSingle(new_state,dice2,di2)
        reward += r
        if dice1 == dice2:
            di3,di4 = self.get_next_action(new_state)
            for die,source in ((dice1,di3),(dice2,di4)):
                new_state,r = self.playSingle(new_state,die,source)
                reward += r

        return new_state,reward

    def playSingle(self,old_state,dice,action):
        """Move one checker from location `action` by `dice` points.

        Returns (state, reward) where state is always a copy of old_state:
        unchanged for a rejected move, updated for an accepted one.
        Rewards: -20 for an invalid move, 0 when entering from the BAR is
        blocked, otherwise 20 (+5 for hitting a single opposing checker,
        -3 for each of source/target left with exactly one checker), or 50
        once all 15 checkers are on the home points.
        """
        state = old_state.copy()
        # White (+1) moves towards lower indices, black (-1) towards higher ones.
        step = dice * self.color
        bari = self._getBARi()

        #These are invalid plays and return negative points.
        #can not play outside from 26 locations (including bars).
        #Checked before any indexing so an out-of-range source is penalised
        #instead of raising IndexError.
        if action not in range(26):
            return state,-20
        #can not play other players checker or empty location
        if state[action] * self.color <= 0:
            return state,-20
        #can not play outside to 24 locations. TODO: this is not valid when collecting checkers at the end of the game.
        target = action - step
        if target not in range(1,25):
            return state,-20
        #can not play from other than BAR if there is a checker in BAR
        if state[bari] != 0 and action != bari:
            return state,-20
        #can not play to location where opponent's has more than 1 checkers.
        if state[target] * self.color < -1:
            #if BAR has checker and can not play to destination it is not invalid. do not punish
            return state,(0 if state[bari] != 0 else -20)

        #advance state
        reward = 20
        state[action] -= self.color
        if state[target] * self.color == -1:
            # Hit: the single opposing checker goes to the opponent's BAR.
            state[target] = 0
            state[self._getLOpponentBARi()] -= self.color
            reward += 5
        state[target] += self.color
        #TODO: check if checkers must be collected or not according to if checkers are at the home or not.

        #punish if target or source location has only one checker
        if abs(state[target]) == 1:
            reward -= 3
        if abs(state[action]) == 1:
            reward -= 3

        #declaring win when all chekers at home.TODO: check and give max point all checkers are collected. win game.
        if self._getTotalCheckersAtHome(state) == 15:
            reward = 50

        return state,reward

    def playAll(self,state):
        """Return a 26x26 array whose [i][j] entry is the reward of play(state,(i,j))."""
        y = np.empty((26,26))
        for i in range(26):
            for j in range(26):
                _,reward = self.play(state,(i,j))
                y[i][j] = reward
        return y

    def get_next_action(self, state):
        """Choose an action: random with probability exploration_rate, greedy otherwise."""
        if random.random() > self.exploration_rate: # Explore (gamble) or exploit (greedy)
            return self.greedy_action(state)
        return self.random_action()

    def greedy_action(self, state):
        """Return, per die, the location with the highest predicted value."""
        return tuple(np.argmax(self.getQ(state),axis=1))

    def random_action(self):
        """Return two uniformly random location indices in 0..25."""
        return (random.randint(0,25),random.randint(0,25))

    def best_action(self,state):
        """Return the (i, j) action with the highest reward according to playAll."""
        # argmax works on the flattened 26x26 array; divmod recovers row and column.
        row,col = divmod(int(np.argmax(self.playAll(state))),26)
        return (row,col)

    def getQ(self,state):
        """Return the model's prediction for a single state (batch axis removed)."""
        state_to_predict = np.expand_dims(state,0)
        action_prediction = self.model.predict(state_to_predict)
        return action_prediction[0]

    def train(self, old_state, new_state, action, reward):
        """Fit the model on one Q-learning target built from a played transition."""
        old_state_prediction = self.getQ(old_state)
        new_state_prediction = self.getQ(new_state)
        a1,a2 = action

        # Only the entries of the two chosen locations are moved towards
        # reward + discount * best predicted value of the next state.
        old_state_prediction[0][a1] = reward + self.discount * np.amax(new_state_prediction[0])
        old_state_prediction[1][a2] = reward + self.discount * np.amax(new_state_prediction[1])

        x = np.expand_dims(old_state,0)
        y = np.expand_dims(old_state_prediction,0)
        self.model.fit(x,y,verbose=0)

    def update(self, old_state, new_state, action, reward):
        """Train on the transition and decay the exploration rate."""
        self.train(old_state, new_state, action, reward)
        self.exploration_rate *= self.decay_factor
        #self.saveModel()

    def saveModel(self):
        """Placeholder: saving is currently disabled, always returns True."""
        #self.model.save(self._getModelFilename())
        #wandb.save(self._getModelFilename())
        return True


## Environment Board Definition Class

In [6]:
from IPython.display import clear_output
import random
import time
import matplotlib
import matplotlib.pyplot as plt
import wandb
import argparse

class environment:
    """Self-play training loop and console renderer for a white and a black agent."""

    def __init__(self,args=''):
        """Parse the options, create both agents and set up the first game.

        args -- sequence of command-line style option strings (see _parseArgs).
        """
        self.args = self._parseArgs(args)
        #wandb.init(project="tavla2",name=self.args.run_name)
        #wandb.config.update(self.args)
        self.white_agent = agent(color=1,discount=self.args.discount,exploration_rate=self.args.exploration_rate,decay_factor=self.args.decay_factor)
        self.black_agent = agent(color=-1,discount=self.args.discount,exploration_rate=self.args.exploration_rate,decay_factor=self.args.decay_factor)
        # Statistics accumulated over all games.
        self.white_max_reward = 0
        self.black_max_reward = 0
        self.white_tot_reward = 0
        self.black_tot_reward = 0
        self.r_avg_list = [[],[]]
        self.total_penalty = 0
        self.total_valid = 0
        self._initGame()

    def _initGame(self):
        """Reset the per-game rewards, the board and the side to move."""
        self.white_reward = 0
        self.black_reward = 0
        self.state = [0,  -2, 0, 0, 0, 0,+5,   0,+3, 0, 0, 0,-5,   +5, 0, 0, 0, -3, 0,  -5, 0, 0, 0, 0,+2,  0,   4,6]  #initial sate
        #self.state = [0,   4, 0, 6, 0, 0, 0,   1, 1, 0, 0, 0, 0,    0, 0, 0, 1, -1, 1,   1, 0, 0, 0, 0,-14, 0,   2,3]  #test state
        self.turn = random.randrange(-1,2,2) #The first round is decided number (-1 or 1)

    def roll(self):
        """Store two fresh dice values (1-6) in state[26] and state[27]."""
        self.state[26] = random.randint(1,6)
        self.state[27] = random.randint(1,6)

    def _playTurn(self):
        """Let the side to move play once and book its reward.

        Returns (action, reward), or None when playAll reports no action
        with a non-negative reward for the side to move.
        """
        is_white = self.turn == 1
        player = self.white_agent if is_white else self.black_agent
        if np.amax(player.playAll(self.state)) < 0:
            return None
        chosen = player.get_next_action(self.state)
        new_state, reward = player.play(self.state, chosen)
        player.update(old_state=self.state,new_state=new_state,action=chosen,reward=reward)
        self.state = new_state
        if is_white:
            self.white_reward += reward
            self.white_tot_reward += reward
            self.white_max_reward = max(self.white_max_reward,self.white_reward)
        else:
            self.black_reward += reward
            self.black_tot_reward += reward
            self.black_max_reward = max(self.black_max_reward,self.black_reward)
        return chosen, reward

    def _printMetrics(self,game_no,metrics):
        """Print a console summary of the metrics gathered after a game."""
        clear_output(wait=True)
        print("Game           : %d"%game_no)
        print("Full Game      : %d - %f "%(metrics['full-game'],metrics['full-game-rate']))
        print("Valid          : %d - %f "%(metrics['valid-total'],metrics['valid-avarage']))
        print("Exp Rate       : \x1b[6;30;47m%f\x1b[0m \x1b[0;37;40m%f\x1b[0m" % (metrics['exploration-rate-white'],metrics['exploration-rate-black']))
        print("Max Rewards    : \x1b[6;30;47m%f\x1b[0m \x1b[0;37;40m%f\x1b[0m" % (metrics['max-reward-white'],metrics['max-reward-black']))
        print("Tot Rewards    : \x1b[6;30;47m%f\x1b[0m \x1b[0;37;40m%f\x1b[0m" % (metrics['tot-reward-white'],metrics['tot-reward-black']))
        print("Avg Rewards    : \x1b[6;30;47m%f\x1b[0m \x1b[0;37;40m%f\x1b[0m" % (metrics['avg-reward-white'],metrics['avg-reward-black']))

    def start(self):
        """Play args.episode games, then plot the per-game rewards and save the models."""
        for game_no in range(1,self.args.episode+1):
            reward = 0
            self._initGame()
            self.roll()
            # NOTE(review): `reward` is the sum over the moves of one turn, so it
            # equals 50 only by coincidence - confirm this is the intended
            # end-of-game test.
            while reward != 50:
                clear_output(wait=True)
                self.render()
                #self._plot()
                outcome = self._playTurn()
                if outcome is None:
                    break
                action_to_play, reward = outcome
                print(action_to_play)
                self.render()
                print(reward)
                print(self.state)
                if reward >= 0:
                    # Accepted turn: hand over to the other side with new dice.
                    self.turn *= -1
                    self.roll()
                    self.total_valid += 1
                    input("continue...")
                else:
                    # Penalised turn: the same side tries again with the same dice.
                    self.total_penalty += 1
                    #break

            metrics = {
                'full-game' : (game_no-self.total_penalty),
                'full-game-rate' : ((game_no-self.total_penalty)/game_no),
                'valid-total' : self.total_valid,
                'valid-avarage' : self.total_valid/game_no,
                'exploration-rate-white' : self.white_agent.exploration_rate,
                'exploration-rate-black' : self.black_agent.exploration_rate,
                'max-reward-white' : self.white_max_reward,
                'max-reward-black' : self.black_max_reward,
                'tot-reward-white' : self.white_tot_reward,
                'tot-reward-black' : self.black_tot_reward,
                'avg-reward-white' : self.white_tot_reward/game_no,
                'avg-reward-black' : self.black_tot_reward/game_no,
                'reward-white':self.white_reward,
                'reward-black':self.black_reward
            }
            # The console summary is switched off.
            if False:#game_no % 1000 == 0:
                self._printMetrics(game_no,metrics)
            #wandb.log(metrics,step=game_no)
            self.r_avg_list[0].append(self.white_reward)
            self.r_avg_list[1].append(self.black_reward)

        self._plotAvgReward()
        self.white_agent.saveModel()
        self.black_agent.saveModel()

    def _getChecker(self,number):
        """Return the coloured text for a checker count, or "+" for zero.

        Positive counts are printed as they are, negative counts as their
        absolute value with a different ANSI colour code.
        """
        if number > 0:
            return "\x1b[6;30;47m%d\x1b[0m" % number
        if number < 0:
            return "\x1b[0;37;40m%d\x1b[0m" % abs(number)
        return "+"

    def _renderRow(self,bar_index,first_points,second_points):
        """Build one board row: a BAR cell followed by two groups of six points."""
        pieces = [self._getChecker(self.state[bar_index]), "+"]
        for group in (first_points, second_points):
            for i in group:
                # Empty points are drawn as a blank, unlike an empty BAR.
                pieces.append(self._getChecker(self.state[i]) if self.state[i] != 0 else " ")
            pieces.append("++")
        return "".join(pieces)

    def render(self):
        """Print the dice, the per-game rewards and the board."""
        print("Dice: [%d] [%d]" % (self.state[26],self.state[27]))
        print("Reward: \x1b[6;30;47m%d\x1b[0m \x1b[0;37;40m%d\x1b[0m" % (self.white_reward,self.black_reward))
        print("")

        print("++432109++876543++")
        print("++------++------++")
        # Upper row: index 25 BAR, then points 24 down to 13.
        print(self._renderRow(25, reversed(range(19,25)), reversed(range(13,19))))
        print("++      ++      ++")
        # Lower row: index 0 BAR, then points 1 up to 12.
        print(self._renderRow(0, range(1,7), range(7,13)))
        print("++------++------++")
        print("++123456++789012++")

    def _plot(self):
        """Plot the predicted values of the side to move, one line per die."""
        fig, ax = plt.subplots()
        if self.turn == 1:
            data = self.white_agent.getQ(self.state)
            fig.suptitle("White")
        if self.turn == -1:
            data = self.black_agent.getQ(self.state)
            fig.suptitle("Black")
        for series in data:
            ax.plot(range(len(series)),series)
        plt.show()

    def _plotAvgReward(self):
        """Plot the recorded per-game rewards of both sides."""
        fig, ax = plt.subplots()
        for series in self.r_avg_list:
            ax.plot(range(len(series)),series)
        plt.show()

    def _parseArgs(self,args=''):
        """Parse the trainer options and return the argparse namespace."""
        parser = argparse.ArgumentParser(description='Backgammon trainer program')
        parser.add_argument(
            '-d','--discount',
            type=float,default=0.95,metavar='D',
            help='Discount ratio for feature considiration (Default:0.95)'
        )
        parser.add_argument(
            '-er','--exploration_rate',
            type=float,default=0.9,metavar='ER',
            help='Initial exploration rate (Default:0.9) 1 is all actions random(always explore), 0 is no random (always exploit)'
        )
        parser.add_argument(
            '-df','--decay_factor',
            type=float,default=0.9999,metavar='DF',
            help='This number is multiplied with exploration rate in each action (Default:0.9999) If smaller that 1 exploration rate will decrease and every step system will use more exploit then explore.'
        )
        parser.add_argument(
            '-n','--run_name',
            type=str,metavar='N',
            help='Give an explenatory run name'
        )
        parser.add_argument(
            '-e','--episode',
            type=int,default=100000,metavar='E',
            help='Number of episodes (games) to try'
        )
        return parser.parse_args(args)



## Main Test

In [7]:
# Build the trainer with discount 0.4, exploration rate 0 (no random actions,
# per the option help) and decay factor 1.
board = environment('-d 0.4 -er 0 -df 1'.split())

# Run the self-play loop.
board.start()

Dice: [6] [2]
Reward: [6;30;47m333[0m [0;37;40m293[0m

++432109++876543++
++------++------++
++[6;30;47m2[0m[0;37;40m2[0m[0;37;40m2[0m[0;37;40m4[0m[0;37;40m1[0m ++[0;37;40m3[0m[0;37;40m1[0m   [6;30;47m1[0m++
++      ++      ++
++[0;37;40m2[0m[6;30;47m3[0m[6;30;47m3[0m[6;30;47m3[0m[6;30;47m2[0m ++ [6;30;47m1[0m    ++
++------++------++
++123456++789012++
(8, 4)
Dice: [6] [2]
Reward: [6;30;47m373[0m [0;37;40m293[0m

++432109++876543++
++------++------++
++[6;30;47m2[0m[0;37;40m2[0m[0;37;40m2[0m[0;37;40m4[0m[0;37;40m1[0m ++[0;37;40m3[0m[0;37;40m1[0m   [6;30;47m1[0m++
++      ++      ++
++[0;37;40m2[0m[6;30;47m5[0m[6;30;47m3[0m[6;30;47m2[0m[6;30;47m2[0m ++      ++
++------++------++
++123456++789012++
40
[0, -2, 5, 3, 2, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, -1, -3, 0, -1, -4, -2, -2, 2, 0, 6, 2]


KeyboardInterrupt: Interrupted by user

In [None]:
import math 
import matplotlib.pyplot as plt

# Sample a sine wave of period 500*2*pi that is scaled down by 1599.9,
# mirrored and shifted to oscillate around 1, then plot it.
x = range(10000)
y = [(math.sin(i/500)/(1.5999*1000)*-1)+1 for i in x]

plt.plot(x, y, color = 'red', marker = "o")
plt.title("math.sin()")
plt.xlabel("X")
plt.ylabel("Y")
plt.show()


In [None]:
import wandb
# Download the file "agent_model1.h5" from the W&B run hakanonal/tavla2/zcj977l6.
# NOTE(review): the name matches agent._getModelFilename() for color 1, so this
# is presumably the white agent's saved model - confirm.
api = wandb.Api()
run = api.run("hakanonal/tavla2/zcj977l6")
run.file("agent_model1.h5").download()