In [27]:
"""
This part of the code is the Q-learning brain of the agent.
All decisions are made here.

View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [31]:
class QLearningTable:
    """Tabular Q-learning agent over the two actions ``"c"`` and ``"d"``.

    Q-values are stored in ``self.q_table``, a DataFrame with one row per
    state label seen so far and one column per action. Rows are created
    lazily by ``check_state_exist``.
    """

    def __init__(self, learning_rate=0.05, reward_decay=0.9, e_greedy=0.3):
        """Create an agent with an empty Q-table.

        Args:
            learning_rate: step size applied to the temporal-difference error
                in ``learn``.
            reward_decay: discount factor (gamma). New rows are initialised
                to ``2*g/(1-gamma)``, so a value of exactly 1 raises
                ``ZeroDivisionError`` when the first state is added.
            e_greedy: probability of picking a uniformly random action in
                ``choose_action``; otherwise the greedy action is taken.
        """
        self.actions = ["c", "d"]
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def choose_action(self, observation, g):
        """Pick an action for ``observation`` using an epsilon-greedy rule.

        Args:
            observation: state label (used as a row label of the Q-table).
            g: payoff parameter, only used to initialise the row if the
                state has not been seen before.

        Returns:
            One of the labels in ``self.actions``.
        """
        self.check_state_exist(observation, g)
        if np.random.uniform(0, 1) > self.epsilon:
            # Exploit: take the best action, breaking ties uniformly at random.
            state_action = self.q_table.loc[observation, :]
            best_actions = state_action[state_action == np.max(state_action)].index
            action = np.random.choice(best_actions)
        else:
            # Explore: uniformly random action.
            action = np.random.choice(self.actions)
        return action

    def learn(self, current_state, action, g, next_state, reward):
        """Apply one Q-learning update for the observed transition.

        Updates ``Q(current_state, action)`` in place to
        ``Q + lr * (reward + gamma * max_a Q(next_state, a) - Q)``.

        Args:
            current_state: state label the action was taken in.
            action: the action taken (a column label of the Q-table).
            g: payoff parameter used to initialise rows for unseen states.
            next_state: state label reached after the action.
            reward: reward received for the transition.
        """
        # Make sure both rows exist so a direct call on an unseen state
        # initialises it instead of raising KeyError.
        self.check_state_exist(current_state, g)
        self.check_state_exist(next_state, g)

        q_predict = self.q_table.loc[current_state, action]
        best_next = np.max(self.q_table.loc[next_state, :])
        td_error = reward + self.gamma * best_next - q_predict
        self.q_table.loc[current_state, action] = q_predict + self.lr * td_error

    def check_state_exist(self, state, g):
        """Add a row for ``state`` to the Q-table if it is not there yet.

        Every action value of a new row starts at ``2*g/(1-gamma)``.

        Args:
            state: state label to look up / insert.
            g: payoff parameter used in the initial value.
        """
        if state not in self.q_table.index:
            initial_value = 2 * g / (1 - self.gamma)
            # DataFrame.append was removed in pandas 2.0; setting a new label
            # through .loc enlarges the frame in place on old and new pandas.
            self.q_table.loc[state] = pd.Series(
                initial_value,
                index=self.q_table.columns,
                dtype=np.float64,
            )

In [34]:
# Create the environment for IPD, run the experiment, and plot the graphs.
import pdb
# Payoff parameter: handed to reward_joint and to the agents, which use it
# to initialise the Q-values of newly seen states.
g = 1.8
# Visit counter: maps an observation label to how many times collect() saw it.
state_data = dict()


def collect(observation):
    """Increase the visit count of ``observation`` in ``state_data`` by one.

    An observation that has not been recorded before starts at 1.
    """
    state_data[observation] = state_data.get(observation, 0) + 1

def reward_joint(action1,action2,g):
    """Return the payoffs ``[player1, player2]`` for a joint action.

    The joint action is the concatenation ``action1 + action2``:

    * ``"cc"`` -> ``[2*g, 2*g]``
    * ``"cd"`` -> ``[g, 2+g]``
    * ``"dc"`` -> ``[2+g, g]``
    * ``"dd"`` -> ``[2, 2]``

    Any other joint action raises ``KeyError``.
    """
    # Evaluate the g-dependent payoffs up front, whichever branch is taken.
    doubled = 2*g
    shifted = 2+g

    joint = action1 + action2
    if joint == "cc":
        return [doubled, doubled]
    if joint == "cd":
        return [g, shifted]
    if joint == "dc":
        return [shifted, g]
    if joint == "dd":
        return [2, 2]
    raise KeyError(joint)
    

# Play 10 independent games. Each game starts with two fresh agents (empty
# Q-tables) and runs for a fixed 100000 iterations; there is no early exit.
# NOTE(review): no random seed is set in this cell, so results differ from
# run to run unless one is set elsewhere.
for episode in range(10):
    # Fresh learners and the initial observation for this game.

    agent1 = QLearningTable()
    agent2 = QLearningTable()
    observation = "cc"
    for iterations in range(100000):

        # Both agents pick an action from the same shared observation.
        action1 = agent1.choose_action(str(observation),g)
        action2 = agent2.choose_action(str(observation),g)

        # The joint action is the next observation; reward is
        # [payoff for player 1, payoff for player 2].
        observation_ = action1 + action2
        reward = reward_joint(action1,action2,g)
        # Each agent learns from the transition using its own action and payoff.
        agent1.learn(str(observation), action1,g,str(observation_),reward[0])
        agent2.learn(str(observation), action2,g,str(observation_),reward[1])

        # Advance to the new observation and count the visit.
        observation = observation_
        collect(observation)

    print(f"Game {episode} ends in state {observation} for player 1 and player 2")

# state_data is never reset inside the loop, so these counts are totals
# over all games.
print(state_data)
# end of game
print('game over')


Game 0 ends in state cc for player 1 and player 2
Game 1 ends in state cc for player 1 and player 2
Game 2 ends in state cd for player 1 and player 2
Game 3 ends in state dd for player 1 and player 2
Game 4 ends in state cc for player 1 and player 2
Game 5 ends in state dc for player 1 and player 2
Game 6 ends in state dd for player 1 and player 2
Game 7 ends in state dc for player 1 and player 2
Game 8 ends in state cd for player 1 and player 2
Game 9 ends in state cd for player 1 and player 2
{'dc': 182046, 'cd': 190651, 'dd': 250810, 'cc': 376493}
game over
