## 使用DQN

使用神经网络来预测 Q 值，以替代用 Q table 记录并查询每个 state-action 对的价值。有两种思路：
1. 输入state和action，通过神经网络，计算对应的Q值
2. 输入state，通过神经网络，计算各个action的值。

DQN有两个重要的技术：
1. <span class="mark">Experience replay</span>：能够抽取之前的经历进行学习
2. <span class="mark">Fixed Q-targets</span>：同样是一种降低相关性的机制。使用两个结构相同但参数不同的神经网络：预测 Q 估计 的神经网络具备最新的参数，而预测 Q 现实 的神经网络使用的参数则是很久以前的（每隔若干步才从前者复制一次）。

### DQN with experience replay
初始化大小为 $N$ 的记忆库 $D$  
使用随机权重 $\theta$ 初始化行为-价值神经网络 $Q$  
使用权重 $\theta^{-}=\theta$ 初始化目标的行为-价值函数 $\hat{Q}$  
循环从episode=1到$M$ 
> 初始化序列 $s_1=\{x_1\}$，以及预处理过的序列 $\phi_1=\phi(s_1)$  
> 从t=1到$T$循环  
>> 根据概率 $\epsilon$，随机选择行为 $a_t$，或者选择 $a_t=\arg\max_a Q(\phi(s_t),a;\theta)$  
>> 在模拟器中执行行为 $a_t$，并且观察奖励 $r_t$ 以及图像 $x_{t+1}$  
>> 设置 $s_{t+1}=s_t,a_t,x_{t+1}$，并且预处理 $\phi_{t+1}=\phi(s_{t+1})$  
>> 将序列 $(\phi_t,a_t,r_t,\phi_{t+1})$存入$D$  
>> 从记忆库$D$中选择minibatch个序列 $(\phi_j,a_j,r_j,\phi_{j+1})$  
>> 设置 $y_j$:  
>>> 如果 episode 在 j+1 步终止，$y_j=r_j$  
>>> 否则，$y_j = r_j+\gamma\max_{a'}\hat{Q}(\phi_{j+1},a';\theta^{-})$

>> 在 $(y_j - Q(\phi_j,a_j;\theta))^2$ 上执行梯度下降法，更新网络参数 $\theta$  
>> 每隔C步，更新 $\hat{Q}=Q$

In [None]:
from maze_env import Maze
import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
"""
确保可重复性
"""
# Fix the seeds of both random number generators (TensorFlow graph-level
# seed and NumPy global seed) so that repeated runs are reproducible.
tf.set_random_seed(1)
np.random.seed(1)

In [None]:
class DeepQNetwork:
    """
    Off-policy DQN agent (TensorFlow 1.x graph API).

    Two networks with the same architecture are built: an evaluation
    network (``eval_net``), which is trained on every call to ``learn``,
    and a target network (``target_net``), whose parameters are overwritten
    with the evaluation network's parameters every ``replace_target_iter``
    learning steps. Transitions are kept in a fixed-size ring buffer and
    sampled uniformly at random for training (experience replay).

    Note on ``epsilon``: in this class it is the probability of taking the
    *greedy* action (see ``choose_action``); a random action is taken with
    probability ``1 - epsilon``.

    Parameters:
        n_actions: number of discrete actions (width of the network output).
        n_features: length of one observation vector.
        learning_rate: learning rate passed to the RMSProp optimizer.
        reward_decay: discount factor gamma used in the Q target.
        e_greedy: upper bound for ``epsilon``.
        replace_target_iter: number of learning steps between two copies of
            the evaluation parameters into the target network.
        memory_size: capacity of the replay memory (number of transitions).
        batch_size: number of transitions sampled per learning step.
        e_greedy_increment: amount added to ``epsilon`` after each learning
            step. When ``None``, ``epsilon`` starts at ``e_greedy`` and is
            not changed by ``learn``; otherwise it starts at 0.
        output_graph: when true, the graph is written to ``logs/`` for
            TensorBoard (``tensorboard --logdir=logs``).
    """
    def __init__(
        self,
        n_actions,
        n_features,
        learning_rate = 0.01,
        reward_decay = 0.9,
        e_greedy = 0.9,
        replace_target_iter = 300,
        memory_size = 500,
        batch_size = 32,
        e_greedy_increment = None,
        output_graph= False,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        self.learn_step_counter = 0  # number of completed learning steps
        # Number of transitions stored so far. Initialised here (not lazily
        # in store_transition) so that learn() can rely on it existing.
        self.memory_counter = 0
        # Replay memory, one row per transition laid out as [s, a, r, s_].
        self.memory = np.zeros((self.memory_size, n_features*2+2))
        # Terminal flag of each stored transition, kept in a parallel array
        # so that the row layout of `memory` stays unchanged.
        self.memory_done = np.zeros(self.memory_size, dtype=bool)

        # Build both networks, then the ops that copy eval -> target.
        self._build_net()
        t_params = tf.get_collection('target_net_params')
        e_params = tf.get_collection('eval_net_params')
        self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

        self.sess = tf.Session()

        if output_graph:  # view with: $ tensorboard --logdir=logs
            writer = tf.summary.FileWriter('logs/', self.sess.graph)
            # Close explicitly so the graph event is flushed to disk instead
            # of depending on the writer's background flush.
            writer.close()

        self.sess.run(tf.global_variables_initializer())
        self.cost_his = []  # loss recorded at every learning step

    def _build_layers(self, inputs, collection, n_l1, w_initializer, b_initializer):
        """
        Build the two fully connected layers shared by both networks.

        Must be called inside the variable scope of the network being built.
        Variables are created as ``l1/w1``, ``l1/b1``, ``l2/w2``, ``l2/b2``
        and registered in ``collection`` (used later to pair up eval and
        target parameters) as well as in the global variables collection.

        Returns the output tensor holding one Q value per action.
        """
        c_names = [collection, tf.GraphKeys.GLOBAL_VARIABLES]

        # First layer: n_features -> n_l1, ReLU activation.
        with tf.variable_scope('l1'):
            w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
            b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
            l1 = tf.nn.relu(tf.matmul(inputs, w1) + b1)

        # Second layer: n_l1 -> n_actions, linear output.
        with tf.variable_scope('l2'):
            w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
            b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
            return tf.matmul(l1, w2) + b2

    def _build_net(self):
        """
        Build the evaluation network, its loss and train op, and the target
        network.

        Creates the placeholders ``self.s`` (current states), ``self.q_target``
        (training targets) and ``self.s_`` (next states), and the tensors
        ``self.q_eval``, ``self.loss``, ``self._train_op`` and ``self.q_next``.
        """
        n_l1 = 10  # number of hidden units
        w_initializer = tf.random_normal_initializer(0., 0.3)
        b_initializer = tf.constant_initializer(0.1)

        # ---- evaluation network ----
        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # current observations
        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')  # targets computed in learn()
        with tf.variable_scope('eval_net'):
            self.q_eval = self._build_layers(self.s, 'eval_net_params', n_l1, w_initializer, b_initializer)

        with tf.variable_scope('loss'):
            self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
        with tf.variable_scope('train'):  # gradient descent step
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

        # ---- target network, provides the Q values of the next states ----
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')  # next observations
        with tf.variable_scope('target_net'):
            self.q_next = self._build_layers(self.s_, 'target_net_params', n_l1, w_initializer, b_initializer)

    def store_transition(self, s, a, r, s_, done=False):
        """
        Store one transition in the replay memory.

        The memory is a ring buffer: once ``memory_size`` transitions have
        been stored, the oldest entries are overwritten.

        Parameters:
            s: observation before the action.
            a: action taken.
            r: reward received.
            s_: observation after the action.
            done: whether the episode terminated with this transition.
                Terminal transitions are trained towards ``r`` alone, with
                no bootstrapped value of ``s_``. Defaults to ``False``,
                which treats every transition as non-terminal.
        """
        transition = np.hstack((s, [a, r], s_))

        # Overwrite the oldest slot once the buffer is full.
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition
        self.memory_done[index] = bool(done)

        self.memory_counter += 1

    def choose_action(self, observation):
        """
        Pick an action for a single observation.

        With probability ``epsilon`` the action with the highest predicted
        Q value is returned; otherwise an action is drawn uniformly at
        random from ``range(n_actions)``.
        """
        # Add a batch dimension: the network expects [batch, n_features].
        observation = np.asarray(observation)[np.newaxis, :]

        if np.random.uniform() < self.epsilon:
            # Forward pass: one Q value per action, take the best one.
            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
            action = np.argmax(actions_value)
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    def _sample_indices(self):
        """
        Draw ``batch_size`` row indices (with replacement) from the filled
        part of the replay memory.

        Raises:
            ValueError: if no transition has been stored yet.
        """
        filled = min(self.memory_counter, self.memory_size)
        if filled == 0:
            raise ValueError(
                'replay memory is empty; call store_transition() before learn()')
        return np.random.choice(filled, size=self.batch_size)

    def _build_q_target(self, q_eval, q_next, actions, rewards, dones):
        """
        Compute the training targets for one batch.

        The target equals ``q_eval`` everywhere except at the action that
        was actually taken, so only that action contributes to the loss.
        For the taken action the target is ``r + gamma * max_a' q_next``,
        or just ``r`` when the transition was terminal.

        ``q_eval`` is not modified; a new array is returned.
        """
        q_target = q_eval.copy()
        batch_index = np.arange(q_target.shape[0], dtype=np.int32)

        bootstrap = self.gamma * np.max(q_next, axis=1)
        # No future value after a terminal transition.
        bootstrap = np.where(dones, 0.0, bootstrap)

        q_target[batch_index, actions] = rewards + bootstrap
        return q_target

    def _anneal_epsilon(self):
        """
        Raise ``epsilon`` by ``epsilon_increment``, capped at ``epsilon_max``.

        Does nothing when no increment was configured.
        """
        if self.epsilon_increment is not None:
            self.epsilon = min(self.epsilon + self.epsilon_increment, self.epsilon_max)

    def learn(self):
        """
        Run one training step on a random batch from the replay memory.

        Every ``replace_target_iter`` calls (including the first) the target
        network is first synchronised with the evaluation network. The loss
        of the step is appended to ``cost_his`` and ``epsilon`` is updated.

        Raises:
            ValueError: if the replay memory is still empty.
        """
        # Periodically copy the evaluation parameters into the target network.
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.replace_target_op)
            print('\n target params replaced \n')

        # Sample a batch of stored transitions.
        sample_index = self._sample_indices()
        batch_memory = self.memory[sample_index, :]
        states = batch_memory[:, :self.n_features]
        next_states = batch_memory[:, -self.n_features:]

        q_next, q_eval = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={
                self.s_: next_states,
                self.s: states,
            }
        )

        q_target = self._build_q_target(
            q_eval,
            q_next,
            batch_memory[:, self.n_features].astype(int),  # action column
            batch_memory[:, self.n_features + 1],          # reward column
            self.memory_done[sample_index],
        )

        # Train the evaluation network towards the targets.
        _, self.cost = self.sess.run([self._train_op, self.loss],
                                     feed_dict={self.s: states,
                                                self.q_target: q_target})
        self.cost_his.append(self.cost)

        self._anneal_epsilon()
        self.learn_step_counter += 1

    def plot_cost(self):
        """Plot the loss recorded at every learning step."""
        import matplotlib.pyplot as plt
        plt.plot(np.arange(len(self.cost_his)), self.cost_his)
        plt.ylabel('Cost')
        plt.xlabel('training steps')
        plt.show()

In [None]:
def run_maze():
    """
    Train the global agent ``RL`` on the global environment ``env`` for
    300 episodes, then destroy the environment window.

    Learning starts once the step counter exceeds 200 (so that some
    experience is collected first) and then happens on every step whose
    counter value is a multiple of 5.
    """
    total_steps = 0  # gates when learning starts and how often it runs
    for _ in range(300):
        state = env.reset()  # start a new episode
        finished = False

        while not finished:
            env.render()  # refresh the display

            # Let the agent act, then record what happened.
            chosen = RL.choose_action(state)
            next_state, reward, finished = env.step(chosen)
            RL.store_transition(state, chosen, reward, next_state)

            if total_steps > 200 and total_steps % 5 == 0:
                RL.learn()

            state = next_state  # the next state becomes the current one

            # The counter is not advanced on the terminating transition.
            if not finished:
                total_steps += 1

    # All episodes are done.
    print('game over')
    env.destroy()

In [None]:
if __name__ == "__main__":
    env = Maze()

    # Hyper-parameters of the agent.
    dqn_settings = dict(
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=200,  # copy eval params into target_net every 200 learning steps
        memory_size=2000,         # capacity of the replay memory
        output_graph=True,        # write the graph for TensorBoard
    )
    RL = DeepQNetwork(env.n_actions, env.n_features, **dqn_settings)

    # Schedule training to start 100 ms after the GUI loop begins.
    env.after(100, run_maze)
    env.mainloop()

    # After the GUI loop exits, show the loss curve of the network.
    RL.plot_cost()