In [1]:
import numpy as np
import pandas as pd
import time

In [2]:
N_STATES = 6
ACTIONS = ['left', 'right']

EPSILON = 0.9 #greedy policy
ALPHA = 0.1 #learning rate
LAMBDA = 0.9 #discount

MAX_EPISODES = 13
FRESH_TIME = 0.3

np.random.seed(3)

In [3]:
def make_Qtable(n_states, actions):
    qtable = pd.DataFrame(
        np.zeros((n_states, len(actions))),
        columns=actions
    )
    return qtable

# make_Qtable(N_STATES, ACTIONS)

In [4]:
def choose_action(q_table, states):
    '''epsilon-贪心选择策略，选择动作
    '''
    action_series = q_table.loc[states]
    if np.random.uniform() > 0.9 or action_series.all() == 0:
        action = np.random.choice(ACTIONS)
    else:
        action = action_series.argmax()
    return action
    

In [5]:
def get_env_feedback(states, action):
    '''智能体与环境进行交互
    '''
    if action == 'right':
        if states == N_STATES - 2:
            states_ = 'Terminal'
            reward = 1
        else:
            states_ = states + 1
            reward = 0
    else:
        if states == 0:
            states_ = states
            reward = 0
        else:
            states_ = states - 1
            reward = 0
    return states_, reward
    

In [6]:
def update_env(S, episode, step_counter):
    # This is how environment be updated
    env_list = ['-']*(N_STATES-1) + ['T']   # '---------T' our environment
    if S == 'Terminal':
        interaction = 'Episode %s: total_steps = %s' % (episode+1, step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
        print('\r                                ', end='')
    else:
        env_list[S] = 'o'
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)
        

In [7]:
def rl():
    qtable = make_Qtable(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        S = 0 #初始状态
        step_counter = 0
        is_terminated = False
        A = choose_action(qtable, S)

        update_env(S, episode, step_counter) #更新环境

        while not is_terminated:
            S_, R = get_env_feedback(S, A)
            # A_ = choose_action(qtable, S_)
            q_predict = qtable.loc[S, A]
            if S_ == 'Terminal':
                q_target = R
                is_terminated = True
            else:
                A_ = choose_action(qtable, S_)
                q_target = R + LAMBDA * qtable.loc[S_, A_]
            qtable.loc[S, A] += ALPHA * (q_target - q_predict)
            S, A = S_, A_
            print('\n')
            print(qtable)

            update_env(S, episode, step_counter+1)
            step_counter += 1

rl()

9
4   0.0  0.190
5   0.0  0.000
o----T

   left  right
0   0.0  0.000
1   0.0  0.000
2   0.0  0.000
3   0.0  0.009
4   0.0  0.190
5   0.0  0.000
o----T

   left  right
0   0.0  0.000
1   0.0  0.000
2   0.0  0.000
3   0.0  0.009
4   0.0  0.190
5   0.0  0.000
o----T

   left  right
0   0.0  0.000
1   0.0  0.000
2   0.0  0.000
3   0.0  0.009
4   0.0  0.190
5   0.0  0.000
o----T

   left  right
0   0.0  0.000
1   0.0  0.000
2   0.0  0.000
3   0.0  0.009
4   0.0  0.190
5   0.0  0.000
o----T

   left  right
0   0.0  0.000
1   0.0  0.000
2   0.0  0.000
3   0.0  0.009
4   0.0  0.190
5   0.0  0.000
o----T

   left  right
0   0.0  0.000
1   0.0  0.000
2   0.0  0.000
3   0.0  0.009
4   0.0  0.190
5   0.0  0.000
o----T

   left  right
0   0.0  0.000
1   0.0  0.000
2   0.0  0.000
3   0.0  0.009
4   0.0  0.190
5   0.0  0.000
o----T

   left  right
0   0.0  0.000
1   0.0  0.000
2   0.0  0.000
3   0.0  0.009
4   0.0  0.190
5   0.0  0.000
-o---T

   left  right
0   0.0  0.000
1   0.0  0.000
2   0.0  0.