In [1]:
import numpy as np
import pickle



BOARD_ROWS = 4 #Define amount of rounds, 4 in this case.
BOARD_COLS = 2 #Define amount of players, 2 in this case


observation_space_high = np.array(10*(1.1**8)),(10*(1.1**8)) #The obervations space is defined to have a discretized states space. 
observation_space_low = np.array([(10*(0.9**8)),(10*0.9**8)]).astype(float)

#6.727499-1   --> 0.2863 - 0.05 ==> 0.2363
DISCRETE_OS_SIZE = [BOARD_ROWS,BOARD_ROWS] 

DISCOUNT = 0.8 #The Discount applied within the Q-learning Formular.
LEARNING_RATE = 0.3 #The Learning rate defined within the Q-learning Formular. 

# Exploration settings


In [3]:
#Take a look of the obvervation_space_high if it is the maximum reachable within the game. If not, if will give an error. 
observation_space_high

(array(21.4358881), 21.435888100000014)

In [4]:
#Discretize the data

discrete_os_win_size = (observation_space_high-observation_space_low)/DISCRETE_OS_SIZE
print(discrete_os_win_size)
     
discrete_state = ([30.8832,30.8832] - observation_space_low)/discrete_os_win_size

print(tuple(discrete_state.astype(np.int)))

[4.282804 4.282804]
(6, 6)


In [14]:
class State:
    def __init__(self, p1, p2):
        
        self.LEARNING_RATE = 0.3 #Set the learning rate to 0.3 
        self.DISCRETE_OS_SIZE = [BOARD_ROWS,BOARD_ROWS]
        self.EPISODES = 10000 #Number of games need to be played to train the model
        self.q_table = np.random.uniform(low=-2, high=0, size=([4,4] + [3])) #Define the Q-table in the beginning and fill with random values
        self.epsilon = 0.9  
        self.START_EPSILON_DECAYING = 1
        self.END_EPSILON_DECAYING = self.EPISODES//2
        self.epsilon_decay_value = self.epsilon/(self.END_EPSILON_DECAYING - self.START_EPSILON_DECAYING)
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.p1 = p1
        self.p2 = p2
        self.isEnd = False #Boolean needed to define that the game has finished. 
        self.round = 0 #Number of rounds. 
        # init p1 plays first
        self.playerSymbol = 1
        self.discrete_state = (1,1)
        
        # Define the initial values of the rewards, and the number of wins for each player 
        self.reward = 0
        self.wins_human=1
        self.wins_comp=1
        self.berabere = 1
        
        self.strategy = "cooperate" #The initial strategy will be cooperating.
        self.one_time_defect=0
        self.p2_old_Price  = 10 #Set the initial price to 10 
        self.p1_old_Price   = 10 #Set the initial price to 10 



    def winner(self):
        #This function checks if the game has finished or not and if the game has finished, it returns the winner. 
        winner = 0
        # no available positions
        if self.board[-1, -1] != 0:
            self.isEnd = True
            Sum_of_board = np.sum(self.board,axis=0)
            
            if Sum_of_board[0] < Sum_of_board[1]:
                self.reward = 10 #Give reward of 10 for each round if he wins that round. 
                winner = 1
            if Sum_of_board[0] > Sum_of_board[1]:
                self.reward = -10 #Give reward of -10 for each round if he looses that round. 
                winner = -1
                
            if Sum_of_board[0] == Sum_of_board[1]:
                self.reward = 5 #Give reward of 5 for each round if it is a draw. 
                winner = 0

            return winner
                
        # not end   
        self.isEnd = False
        return None

    def availablePositions(self):
        #Check if there are still available rounds to play within the game.
        positions = []
        for i in range(BOARD_COLS):
            for j in range(BOARD_ROWS):
                if self.board[j, i] == 0:
                    positions.append((j,i))  # 
                    break
                    
        return positions

    def updateBoard(self, position, Price):
        #Update the board of the current game with the prices of each player for each round. 
        self.board[position] = Price
        # switch to another player
        self.playerSymbol = -1 if self.playerSymbol == 1 else 1

    # only when game ends
    def giveReward(self,position_1,position_2):
        #Sum_of_game = np.sum(self.board,axis=1)
        
        if self.board[position_1] < self.board[position_2]:
            self.reward = 0.2 #Give reward of 0.2 for each round if he wins that round. 
        if self.board[position_1] > self.board[position_2]:
            self.reward = -0.2  #Give reward of -0.2 for each round if he looses that round. 
        if self.board[position_1] == self.board[position_2]:
            self.reward = 0.1  #Give reward of 0.1 for each round if it is a draw. 
        
        # backpropagate reward
    

    def pricing_to_action(self,Price,action):
        #Define the actions: increase, decrease or match the price. 
        if action==0:
            Price = Price* 0.9
        if action==1:
            Price = Price
        if action==2:
            Price = Price* 1.1
        return Price
    
    def reset(self):
        #Reset all the values after each game has finished. 
        self.discrete_state = (1,1)
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.boardHash = None
        self.isEnd = False
        self.playerSymbol = 1
        self.states = []
        self.round_no = 0
        self.p2.Price = 10
        self.round = 0
        self.strategy = "cooperate"
        self.one_time_defect= 0 
        self.p2_old_Price   = 10
        self.p1_old_Price   = 10
        self.strategy_table = []
        
        

    # play with human

        
    def play2(self):   
        self.wins_human=1
        self.wins_comp=1
        #fr = open("/Users/ege/Downloads/policy_2_computer",'rb')
        #self.q_table = pickle.load(fr)
        print("----QTABLE----")
        print(self.q_table)
        #fr.close()
        
        for episode in range(self.EPISODES):
            self.reset()
            
            for i in range(0,21):
                self.strategy_table.append(self.strategy)
                win = self.winner()
                # If there is a winner after the game has finished, add 1 to the winner so that we have can calculate the percentage later.
                if win is not None:
                    if win == 1:
                        self.wins_comp=self.wins_comp+1
                    if win == -1:
                        self.wins_human=self.wins_human+1
                    if win == 0:
                        self.berabere = self.berabere+1
                        
                    max_future_q = np.max(self.q_table[new_discrete_state])
                    # Current Q value (for current state and performed action)
                    current_q = self.q_table[self.discrete_state + (self.p1.action,)]
                    # And here's our equation for a new Q value for current state and action
                    new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (self.reward + DISCOUNT * max_future_q)
                    # Update Q table with new Q value
                    self.q_table[self.discrete_state + (self.p1.action,)] = new_q
                    self.reset()
                else:
                    #We check if we have rounds left to play. 
                    positions = self.availablePositions()
                    
                
                    #save the old price which is needed to check for the strategy if the player has defected or not in the previous round. 
                    self.p2_old_Price   = self.p2.Price 
                    self.p1_old_Price   = self.p1.Price 
                    
                    
                    self.p1.action, self.p1.Price = self.p1.chooseAction(self.p2_old_Price,self.discrete_state,self.q_table) #choose action for player 1.
                    self.updateBoard(positions[0],self.p1.Price) #Update the board of the game, 4x2 matrix for player 1 
                
                    self.p2.action, self.p2.Price = self.p2.chooseAction(self.p1.Price,self.strategy) #choose action for player 2.
                    self.updateBoard(positions[1], self.p2.Price) #Update the board of the game, 4x2 matrix for player 2. 
                    
                    
                    #This is the strategy of player 2, cooperate if the opponent cooperates, and defect if he has defected 2 times. 
                    if  self.p2_old_Price == self.p1.Price:
                        if self.one_time_defect>1:
                            self.strategy = "defect"
                        else:
                            self.strategy = "cooperate"
                    else: 
                        if self.one_time_defect>1:
                            self.strategy = "defect"
                        else:
                            self.strategy = "cooperate"
                            self.one_time_defect = self.one_time_defect + 1


                    #self.giveReward(positions[0],positions[1])
                    
                    self.states = [self.p1.Price, self.p2.Price]
                    
                    self.discrete_state = (self.states - observation_space_low)/discrete_os_win_size
                    new_discrete_state = tuple(self.discrete_state.astype(np.int))
                    
                    self.discrete_state = new_discrete_state
                    

                    if episode % 1000==0:
                        print("Episode:")
                        print(episode)
                        print("Match:")
                        print((self.berabere)*100/(self.wins_comp+self.wins_human+self.berabere))
                        print("Comp")
                        print((self.wins_comp)*100/(self.wins_comp+self.wins_human+self.berabere))
                        print("Human")
                        print((self.wins_human)*100/(self.wins_comp+self.wins_human+self.berabere))
                        print(self.berabere)
                        print(self.wins_comp)
                        print(self.wins_human)
                        print("------------")
                    
                        #print((self.wins_comp)*100/(self.wins_comp+self.wins_human))
                
                    # Apply the Q-Learning Formula
                    max_future_q = np.max(self.q_table[new_discrete_state])
                    # Current Q value (for current state and performed action)
                    current_q = self.q_table[self.discrete_state + (self.p1.action,)]
                    # And here's our equation for a new Q value for current state and action
                    new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (self.reward + DISCOUNT * max_future_q)
                    # Update Q table with new Q value
                    self.q_table[self.discrete_state + (self.p1.action,)] = new_q
                    
                    
                    
                
                if self.END_EPSILON_DECAYING >= episode >= self.START_EPSILON_DECAYING:
                    self.epsilon -= self.epsilon_decay_value



In [15]:
class Player:
    def __init__(self, name, exp_rate=0.3):
        self.name = name
        self.states = []  # record all positions taken
        self.lr = 0.2
        self.exp_rate = exp_rate
        self.decay_gamma = 0.9
        self.states_value = {}  # state -> value
        self.Price = 10
        self.q_table = []


    def pricing_to_action(self,Price,action): #The possible actions that the player can do 
        if action==0:
            Price = Price*1.1 #Increase the price by 10 percent(Defect)
        if action==1:
            Price = Price #Match the price(Cooperate)
        if action==2:
            Price = Price*0.9#Decrease the price by 10 percent(Defect)
        return Price
    
    
    def chooseAction(self, Price, discrete_state,q_table):
        #ChooseAction 0,1 or 2. If the exploration rate is high then it can explore the environment and the state space randomly. 
        if np.random.random() > self.exp_rate:
            action = np.argmax(q_table[discrete_state]) #Get the maximum state within the q-table that give the best response. 
        else:
            action = np.random.randint(0, 3)
            
        Price = self.pricing_to_action(Price,action) #Take action within the price function. 
        
        self.q_table = q_table
        return action, Price
    
    def feedReward(self, reward):
        self.reward = reward
            
    def reset(self):
        self.states = []
        self.Price = 1
        
    def savePolicy(self): #Save the policy with the new Q-table to somewhere 
        fw = open('/Users/ege/Downloads/policy_2_' + str(self.name), 'wb')
        pickle.dump(self.q_table, fw)
        fw.close()

    def loadPolicy(self, file): #load the policy with the new Q-table
        fr = open(file,'rb')
        self.q_table = pickle.load(fr)
        fr.close()

In [16]:
from random import choice

class HumanPlayer:
    def __init__(self, name):
        self.name = name
        self.defect = [self.price_lower]
        self.cooperate = [self.price_match]
        self.actions_player = []
        self.action = 0 
        
    def price_lower(self,Price):
            Price_New = Price*0.9
            return Price_New
    def price_match(self,Price):
            Price_New = Price*1
            return Price_New



        
    def pricing_to_action(self,Price,action):     
        if action==0:
            Price = Price* 1.1
        if action==1:
            Price = Price
        if action==2:
            Price = Price* 0.9            
        return Price
    
    def chooseAction(self, Price,strategy):
        #action = np.random.randint(0, 1)
        strategy_function,action = self.choose_strategy(strategy)
        Price = strategy_function(Price)
        return action, Price
    


    # append a hash state


    # at the end of game, backpropagate and update states value
    def feedReward(self, reward):
        pass

    def reset(self):
        pass
    

    #Chose between defect and cooperation. 
    def choose_strategy(self,strategy):
        if strategy=="defect":
            strategy_function = choice(self.defect)
            self.action = 2
        else: 
            strategy_function = choice(self.cooperate)
            self.action = 1
        return strategy_function,self.action


    #Define a random player for test purposes. 
    def Random_player(self,price):
        strategy_random_player = choice(["defect","cooperate","cooperate","cooperate","cooperate","cooperate"])
        price = self.choose_strategy(strategy_random_player)(price)
        actions_player.append(strategy_random_player)
        return price



    #Grim Trigger: Cooperate in the first round and continue cooperation as long as the other player cooperate. Otherwise, defect forever.
    def grim_trigger(self,price):
        if  Price_1 == Price_2:
            price = choose_strategy("cooperate")(price)
        else:
            price = choose_strategy("defect")(price)
        return price


    #Tit-for-Tat: Cooperate in the first round and then play the action of the other player in the previous round.
    def Tit_for_Tat(self,price):
        if not actions_player:
            price = choose_strategy("cooperate")(price)
        else:
            price = choose_strategy(actions_player[-1])(price)
        return price

    #Forgiving Trigger: Cooperate in the first round and continue cooperation as long as the other player cooperate. Otherwise, defect for some specific number of rounds, followed by reverting to cooperation. The length of punishment phase is crucial, since it must ensure that deviation remains unworthy for players.
    def Forgiving_Trigger(self,price,punishment):
        if not actions_player:
            price = choose_strategy("cooperate")(price)
            actions_forgiving_trigger.append("cooperate")
        else:
            if (actions_player[-1]=="cooperate" and punishment==0):
                price = choose_strategy("cooperate")(price)
                actions_forgiving_trigger.append("cooperate")
            else: 
                price = choose_strategy("defect")(price) 
                actions_forgiving_trigger.append("defect")
                if punishment == 3:
                    punishment = 0
                else:
                    punishment = punishment + 1
        return price,punishment

#Grim Devil: Play Grim Trigger for some specific number of rounds, and then defect thereafter.



In [17]:
p1 = Player("computer", exp_rate=0.4)
p2 = HumanPlayer("human")

st = State(p1, p2)
st.play2()


----QTABLE----
[[[-1.52951842 -1.30284392 -1.83688515]
  [-1.0708437  -1.42271941 -0.88171456]
  [-0.39649307 -1.67473866 -1.67283498]
  [-1.06425841 -1.88106613 -0.51116271]]

 [[-1.08790753 -0.46874282 -0.71981151]
  [-0.3702865  -0.32984655 -1.31265949]
  [-1.71538569 -1.98889403 -1.38363637]
  [-1.45322917 -0.72100544 -1.25094113]]

 [[-0.85282514 -0.96803124 -1.5312383 ]
  [-1.36387522 -1.14382741 -1.85268545]
  [-1.27743334 -1.89706407 -0.77132087]
  [-1.9873933  -0.46091109 -0.92833155]]

 [[-1.3936566  -1.29431922 -0.87036079]
  [-1.31393614 -0.75647087 -0.84367787]
  [-1.01120087 -0.7262845  -1.38829951]
  [-1.75843304 -1.1604053  -0.76901561]]]
Episode:
0
Match:
33.333333333333336
Comp
33.333333333333336
Human
33.333333333333336
1
1
1
------------
Episode:
0
Match:
33.333333333333336
Comp
33.333333333333336
Human
33.333333333333336
1
1
1
------------
Episode:
0
Match:
33.333333333333336
Comp
33.333333333333336
Human
33.333333333333336
1
1
1
------------
Episode:
0
Match:
33.3

In [19]:
# Take a look at the Q-table.
st.q_table


array([[[ -8.91053968, -12.55433518,   9.22857661],
        [ -0.79806804,  -0.85461472,  -0.65235335],
        [ -1.52278078,  -1.33051527,  -1.00555035],
        [ -1.47177348,  -1.46717441,  -0.40108038]],

       [[ -0.46819278,  -8.79881387, -10.19296299],
        [  8.556172  ,   6.32107046,   5.45175798],
        [ -1.77700542,  -0.70413883,  -1.1681213 ],
        [ -0.39892129,  -1.39219068,  -1.3638184 ]],

       [[ -0.57941631,  -0.50574804,  -0.95781963],
        [ -5.68686869,  -4.54356054,  -1.82279276],
        [ -0.40915426,  -0.67918936,  -0.08295389],
        [ -1.23189152,  -0.62539974,  -1.41225304]],

       [[ -1.22723762,  -1.21067074,  -1.05968185],
        [ -1.48958641,  -1.3401334 ,  -1.0868015 ],
        [ -1.38246849,  -0.68790855,  -0.04808901],
        [ -0.87825305,  -1.30506936,  -1.20055104]]])

In [None]:
#Save the policity that has been derived from the training. 
p1.savePolicy()


In [18]:
# Decrease the exp_rate, since we already have figured out the environment space during the training. 
p1 = Player("computer", exp_rate=0.05)
p2 = HumanPlayer("human")

st = State(p1, p2)
st.play2()

----QTABLE----
[[[-0.22821526 -0.78459258 -0.86007965]
  [-0.79806804 -0.85461472 -0.65235335]
  [-1.52278078 -1.33051527 -1.00555035]
  [-1.47177348 -1.46717441 -0.40108038]]

 [[-0.46819278 -0.67322853 -1.007642  ]
  [-0.16392415 -1.70735877 -0.44500378]
  [-1.77700542 -0.70413883 -1.1681213 ]
  [-0.39892129 -1.39219068 -1.3638184 ]]

 [[-0.57941631 -0.50574804 -0.95781963]
  [-0.37980276 -1.40399364 -1.82279276]
  [-0.72349521 -0.67918936 -0.08295389]
  [-1.23189152 -0.62539974 -1.41225304]]

 [[-1.22723762 -1.21067074 -1.05968185]
  [-1.48958641 -1.3401334  -1.0868015 ]
  [-1.38246849 -0.68790855 -0.04808901]
  [-0.87825305 -1.30506936 -1.20055104]]]
Episode:
0
Match:
33.333333333333336
Comp
33.333333333333336
Human
33.333333333333336
1
1
1
------------
Episode:
0
Match:
33.333333333333336
Comp
33.333333333333336
Human
33.333333333333336
1
1
1
------------
Episode:
0
Match:
33.333333333333336
Comp
33.333333333333336
Human
33.333333333333336
1
1
1
------------
Episode:
0
Match:
33.3