# In [4]:
"""
This module implements the Q-learning brain of the agent.
All of the agent's decisions are made here.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
import import_ipynb
import numpy as np
import pandas as pd


class QLearningTable:
    """Tabular Q-learning agent whose selectable actions depend on a node index.

    Q-values live in ``self.q_table``, a DataFrame with one row per visited
    state and one column per entry of ``actions``.  Rows are created lazily
    (filled with zeros) the first time a state is seen.

    Args:
        actions: List of all action labels; used as the Q-table columns.
        learning_rate: Step size applied to the TD error in ``learn``.
        reward_decay: Discount factor for the bootstrapped next-state value.
            It is also stored as ``self.decay`` and used as the multiplicative
            decay of the softmax temperature.
        e_greedy: Probability of taking the greedy branch in
            ``epsilon_greedy``.
        tow: Initial temperature for ``boltzmann_softmax`` (presumably "tau").
        min_tow: Temperature is no longer decayed once it is at or below
            this value.
    """

    def __init__(self, actions, learning_rate=0.5, reward_decay=0.9, e_greedy=0.6, tow=3, min_tow=0.01):
        self.actions = actions  # a list
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        # NOTE(review): the temperature decay factor reuses reward_decay;
        # confirm this coupling is intended.
        self.decay = reward_decay
        self.t = tow
        self.t_min = min_tow
        self.update_times = 0  # number of calls to learn()
        self.change_threshold = 30  # temperature decays when now_time is a multiple of this
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)
        # Node index -> list of actions selectable at that node.
        # TODO: remember to add action 2 back for nodes 8 and 16.
        self.action_space = {0:[1],1:[1],2:[2],3:[0,2],4:[2],5:[0,2],6:[2],7:[1],8:[0],9:[3],10:[1,3],11:[3],
                            12:[0,2],13:[0,2],14:[2],15:[1],16:[0],17:[3],18:[1,3],19:[3],20:[1,3],21:[3],22:[0],23:[0]}

    def epsilon_greedy(self, node_ind, observation):
        """Pick an action for ``observation`` among those allowed at ``node_ind``.

        With probability ``self.epsilon`` the action with the smallest
        Q-value is chosen (ties broken uniformly at random); otherwise an
        allowed action is drawn uniformly at random.

        Args:
            node_ind: Key into ``self.action_space``.
            observation: State label (row of the Q-table).

        Returns:
            The selected action label.
        """
        self.check_state_exist(observation)
        candidates = self.action_space[node_ind]
        if np.random.uniform() < self.epsilon:
            # Greedy branch.
            # NOTE(review): this selects the MINIMUM Q-value, whereas learn()
            # bootstraps with max() and boltzmann_softmax() favours larger
            # Q-values. Left unchanged; confirm which convention is intended.
            state_action = self.q_table.loc[observation, candidates]
            # Several actions may share the extreme value; pick one at random.
            action = np.random.choice(state_action[state_action == np.min(state_action)].index)
        else:
            # Exploration branch: uniform over the allowed actions.
            action = np.random.choice(candidates)

        return action

    def boltzmann_softmax(self, node_ind, observation):
        """Sample an action from a Boltzmann (softmax) distribution.

        The Q-values of the actions allowed at ``node_ind`` are divided by
        the absolute value of their mean (when non-zero), then by the
        temperature ``self.t``, and turned into probabilities with a softmax.
        Works for any number of allowed actions.

        Args:
            node_ind: Key into ``self.action_space``.
            observation: State label (row of the Q-table).

        Returns:
            The selected action label, taken from ``self.action_space[node_ind]``.
        """
        self.check_state_exist(observation)
        candidates = self.action_space[node_ind]
        q_value = np.asarray(self.q_table.loc[observation, candidates], dtype=np.float64)
        q_avg = abs(np.average(q_value))
        if q_avg != 0:
            q_value = q_value / q_avg
        logits = q_value / self.t
        # Shift by the maximum before exponentiating: the probabilities are
        # unchanged mathematically, but exp() can no longer overflow to inf
        # (which previously produced NaN probabilities at low temperature).
        weights = np.exp(logits - np.max(logits))
        proba = weights / weights.sum()
        chosen = np.random.choice(len(candidates), p=proba)

        return candidates[chosen]

    def learn(self, s, a, r, s_, trans_fini, node_ind, now_time):
        """Apply one Q-learning update for the transition ``(s, a, r, s_)``.

        Args:
            s: State in which the action was taken; must already have a row
                in the Q-table.
            a: Action taken (a Q-table column label).
            r: Reward received.
            s_: Resulting state; a zero row is created for it if missing.
            trans_fini: When true, ``s_`` is treated as terminal and the
                target is just ``r``.
            node_ind: Key into ``self.action_space`` selecting which actions
                of ``s_`` are considered for the bootstrapped maximum.
            now_time: Current time step; the temperature is decayed whenever
                it is a multiple of ``self.change_threshold``.
        """
        self.check_state_exist(s_)
        q_predict = self.q_table.loc[s, a]
        if trans_fini:
            q_target = r  # next state is terminal
        else:
            # next state is not terminal: bootstrap from its best allowed action
            q_target = r + self.gamma * self.q_table.loc[s_, self.action_space[node_ind]].max()
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)  # update

        self.update_times += 1
        # Anneal the softmax temperature until it reaches the floor.
        if (now_time % self.change_threshold) == 0 and self.t > self.t_min:
            self.t = self.t * self.decay
            print('tow decay')

    def check_state_exist(self, state):
        """Ensure ``state`` has a row in the Q-table, adding a zero row if not.

        Args:
            state: State label to look up in the Q-table index.
        """
        if state not in self.q_table.index:
            # DataFrame.append was removed in pandas 2.0; build the new row
            # as a one-row frame and concatenate it instead.
            new_row = pd.DataFrame(
                [[0.0] * len(self.actions)],
                index=[state],
                columns=self.q_table.columns,
            )
            self.q_table = pd.concat([self.q_table, new_row])
            