In [1]:
import numpy as np
import pandas as pd
import time

In [2]:
N_STATES = 6 # 一维世界的宽度
ACTIONS= ['left', 'right'] # 探索者的可用动作
EPSILON = 0.9 # 贪婪度 greedy
ALPHA = 0.1 # 学习率
GAMMA = 0.9 # 奖励递减值
MAX_EPISODES = 13 # 最大回合数
FRESH_TIME = 0.3 # 移动间隔时间

In [8]:
def build_q_table(n_states, actions):
    table = pd.DataFrame(
        np.zeros((n_states, len(actions))), # 全0初始化
        columns=actions, # columns对应的是行为名称
    )
    return table

def choose_action(state, q_table):
    state_actions = q_table.iloc[state, :]
    if(np.random.uniform() > EPSILON or (
        state_actions.all() == 0)): # 非贪婪 or 该state未探索
        action_name = np.random.choice(ACTIONS) # 随机选择
    else:
        action_name = state_actions.argmax() # 贪婪，选择最大
    return action_name
       
def get_env_feedback(S, A):
    if A == 'right':
       if S == N_STATES - 2:
           S_ = 'terminal'
           R = 1
       else:
           S_ = S + 1
           R = 0
    else:
       R = 0
       if S == 0:
           S_ = S
       else:
           S_ = S - 1
    return S_, R
       
def update_env(S, episode, step_counter):
    env_list = ['-']*(N_STATES-1) + ['T'] # '------T' 环境
    if S == 'terminal':
       interaction = 'Episode %s: total_steps = %s' % (
           episode + 1, step_counter)
       print('\r{}'.format(interaction), end='')
       time.sleep(2)
       print('\r                      ', end='')
    else:
       env_list[S] = 'o'
       interaction = ''.join(env_list)
       print('\r{}'.format(interaction), end='')
       time.sleep(FRESH_TIME)  

In [9]:
def rl():
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES): # 初始化 q table
        step_counter = 0
        S = 0 # 回合初始位置
        is_terminated = False # 是否结束回合
        update_env(S, episode, step_counter) # 环境更新
        while not is_terminated:
            A = choose_action(S, q_table) # 选择行为
            S_, R = get_env_feedback(S, A) # 实施行为，得到反馈
            q_predict = q_table.loc[S, A] # 估算的（状态-行为）值
            if S_ != 'terminal':
                # 实际的（状态-行为）值
                q_target = R + GAMMA * q_table.iloc[S_, :].max()
            else:
                q_target = R # s实际的（状态=行为）值
                is_terminated = True # terminate this episode
                
            # q_table 更新
            q_table.loc[S,A] += ALPHA * (q_target - q_predict)
            S = S_ # 状态更新
            
            update_env(S, episode, step_counter+1) # 环境更新
            
            step_counter += 1
    
    return q_table

In [11]:
if __name__ == '__main__':
    q_table = rl()
    print('\r\nQ-table: \n')
    print(q_table)

----oT                 = 25

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  


                      s = 5
Q-table: 

       left     right
0  0.000023  0.005042
1  0.000005  0.027061
2  0.000007  0.111953
3  0.000204  0.343331
4  0.000810  0.745813
5  0.000000  0.000000
