## 4.7 强化学习Q-Learning求解最佳路径

强化学习中的状态、决策、状态转移、奖励等可以用马尔可夫决策过程（Markov decision processes，简称MDP）来刻画。MDP可以用一个五元组（S,A,P_s a,γ,R）表示。

```
S是所有状态的集合。例如机器人寻宝中机器人的位置。
	A是所有动作的集合。例如机器人的（向东、向南、向西、向北）的动作。
	P_sa (s')是状态转移概率。给出了对于任何一个状态s和该状态下的动作a，将转移到的下一个状态s'的概率。
	γ∈[0,1] 是“折扣因子”。表示未来奖励对于当前动作的作用有多大。
	R是一个S×A→R的奖励函数，即在状态s下采取动作a所得到的直接奖励。
```
![](imgs/rl.png)

### 4.7.2 Q-Learning

Q-Learning算法的过程如下：
```
初始化Q(s,a)=0
多次(如200次)episode:
对每个episode，选择一个出发状态s，执行下面的循环: 
                     ϵ贪婪法选择一个动作a
得到环境反馈的(r,s')
如果s'不是结束状态，则更新Q(s,a)，s = s'。 否则，这次episode结束
```


### 4.7.3 Q-Learning的Python实现

In [8]:
# q_Table
def build_Q_table(state_actions = None):
    Q = {}
    if state_actions == None:
        Q =  [ {'s':0,'e':0},{'w':0,'e':0},{'w':0,'s':0,'e':0},{'w':0,'e':0},{'w':0,'s':0}]
    else:
        for actions in state_actions:
            for action in actions:
                action.append(0)
    return Q


#初始化游戏环境：状态转移及奖励、终止状态
def build_T_table(transit_table = None,terminal_states = None):
    if transit_table == None:
        transit_table =  {(0,'s'):(5,-1),(0,'e'):(1,0),(1,'w'):(0,0),(1,'e'):(2,0),(2,'s'):(6,1),(2,'w'):(1,0),(2,'e'):(3,0),
              (3,'w'):(2,0),(3,'e'):(4,0),(4,'w'):(3,0),(4,'s'):(7,-1)}
    if terminal_states==None:
        terminal_states = {5,6,7}
    return transit_table,terminal_states


def choose_action(state, Q,EPSILON=0.1):
    action_values = Q[state]
    if random.random() < EPSILON :        
        action_name = random.choice(list(action_values))
    else:
        max_elem = max(action_values,key=action_values.get) 
        action_name  = max_elem[0]
    return action_name

def get_env_feedback(state, action,transit_table,terminal_states):
    next_state,reward = transit_table[(state,action)]
    is_terminal = next_state in terminal_states
    return next_state,reward,is_terminal

import random
def random_start_state(size,terminal_states):
    while True:
        s = random.randint(0,size-1)
        if s not in terminal_states:
            return s

def Q_Learning(Q,transit_table ,terminal_states,
               MAX_EPISODES = 15,EPSILON = 0.2,ALPHA = 0.1,GAMMA = 0.9): 
        
    for episode in range(MAX_EPISODES):  # 循环的回合数 
        step_counter = 0
        s = random_start_state(len(Q),terminal_states)
        s  = 4
        is_terminated = False
        while not is_terminated:  # 循环直到一局游戏结束
            action = choose_action(s, Q,EPSILON)  # 根据状态选择动作
# 获取环境的反馈       
            s_next, R, is_terminated = get_env_feedback(s, action,transit_table,terminal_states)                 
            q_predict = Q[s][action]
            if not is_terminated:     # 如果没有结束就更新q_target值
                action_values = Q[s_next]
                #max_action = max(action_values,key=action_values.get)                
                q_target = R + GAMMA * action_values[max(action_values,key=action_values.get)] 
            else:                    # S_是结束状态，
                q_target = R    
          
            Q[s][action] += ALPHA * (q_target - q_predict)  #更新Q值
            s = s_next  # 进入下一状态            
    return Q
 
Q = build_Q_table()
transit_table,terminal_states = build_T_table()
Q = Q_Learning(Q,transit_table,terminal_states,100)
print('\r\nQ-table:\n')
print(Q)


Q-table:

[{'s': -0.814697981114816, 'e': 0.0}, {'w': 0.0, 'e': 0.44818549935293456}, {'w': 0.11081761095206341, 's': 0.9994360791266039, 'e': 0.21807779531419424}, {'w': 0.8965111343404737, 'e': 0.4516150503836326}, {'w': 0.7967613112390031, 's': -0.7458134171671}]


In [9]:
def shortest_path(state,Q,transit_table,terminals):
    path = []
    count=0
    while state not in terminals:
        path.append(state)      
        action_values = Q[state]
        action = max(action_values,key=action_values.get)
        s_next,reward = transit_table[(state,action)]
        state = s_next      
    path.append(state)
    return path
s = 0
path = shortest_path(s,Q,transit_table,terminal_states)
print("path:\n",path)

path:
 [0, 1, 2, 6]


这个Q-Learning实现不同于网上针对特定问题的特定实现，除2个辅助函数buildTtable()和buildTtable是针对寻宝问题的，其他的函数都可用于其他的类似问题。如网上的“无痛Q-Learning”的房间问题、迷宫问题、QL玩FlappyBird游戏等。即这是一个通用性的Q-Learning程序实现。篇幅有限，这里仅仅以迷宫问题为例，说明程序的通用性。
针对迷宫问题，只需要初始化Q表和状态转移表（包括终止状态集合）就可以了，这里用一个辅助函数initgamemaze()从一个迷宫的二维数组初始化程序的Q表和状态转移表（包括终止状态集合）。


In [10]:
def init_game_maze(maze):
    m = len(maze)
    n = len(maze[0])
    s = 0
    Q = []
    T = dict()
    terminals = set()
    for i in range(m):
        for j in range(n):
            Q.append(dict())
            if i>=1:
                s_ = s-n
                Q[s]['U'] = 0
                T[(s,'U')] = (s_,maze[i-1][j])   
            if i<m-1:
                s_ = s+n
                Q[s]['D'] = 0
                T[(s,'D')] = (s_,maze[i+1][j]) 
            if j>=1:
                s_ = s-1
                Q[s]['L'] = 0
                T[(s,'L')] = (s_,maze[i][j-1])   
            if j<m-1:
                s_ = s+1
                Q[s]['R'] = 0
                T[(s,'R')] = (s_,maze[i][j+1]) 
            if maze[i][j]!=0:
                terminals.add(s)
            s+=1
    return Q,T,terminals

maze = [[0, 0, 0, 0],
        [0, -1, 0, 0],
        [0, -1, -1, 0],
        [0, -0, 0, 1]]

import pprint
if __name__ == "__main__":
    Q,T,terminals = init_game_maze(maze)
    Q = Q_Learning(Q,T,terminals,500)
    print('\r\nQ-table:\n')
    pprint.pprint(Q)  #print(Q)
    s = 0
    path = shortest_path(s,Q,T,terminals)
    print("path: ",path)


Q-table:

[{'D': 0.4282659404761937, 'R': 0.5904891698542925},
 {'D': -0.9866972053527088, 'L': 0.4480106309112527, 'R': 0.6560997016470077},
 {'D': 0.7289999080757529, 'L': 0.46085007742034534, 'R': 0.5929119297418985},
 {'D': 0.7936945368133071, 'L': 0.3069859007842442},
 {'D': 0.23817333884122693, 'R': -0.9999999965733934, 'U': 0.5314392197210498},
 {'D': 0, 'L': 0, 'R': 0, 'U': 0},
 {'D': -0.7712320754503901,
  'L': -0.717570463519,
  'R': 0.8099999814532228,
  'U': 0.5540872987310341},
 {'D': 0.8999999980985273, 'L': 0.5630843970503903, 'U': 0.5451212362124451},
 {'D': 0.0, 'R': -0.7712320754503901, 'U': 0.40621377698596123},
 {'D': 0, 'L': 0, 'R': 0, 'U': 0},
 {'D': 0, 'L': 0, 'R': 0, 'U': 0},
 {'D': 0.9999999996963022, 'L': -0.9282102012308148, 'U': 0.5271814388255629},
 {'R': 0.0, 'U': 0.0},
 {'L': 0.0, 'R': 0, 'U': -0.1},
 {'L': 0, 'R': 0, 'U': 0},
 {'L': 0, 'U': 0}]
path:  [0, 1, 2, 6, 7, 11, 15]
