# Deep Q Learning Algorithm

## Introduction

This notebook reproduce the DeepQLearning network and then use this algorithm to control "CartPole" game.

Also, this code refers [MorvanZhou](https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/blob/master/contents/5_Deep_Q_Network/DQN_modified.py) and [ljp](https://github.com/ljpzzz/machinelearning/blob/master/reinforcement-learning/dqn.py)

## Import Library

In [13]:
import numpy as np
import tensorflow.compat.v1 as tf

In [14]:
tf.disable_eager_execution()
tf.disable_v2_behavior()

In [15]:
tf.__version__

'2.6.0'

## Code Project

In [352]:
np.random.seed(1)
tf.set_random_seed(1)

In [376]:
class DeepQLearning:
    def __init__(self, n_features, n_actions, learning_rate = 0.001 ,reward_decay = 0.8, e_greedy = 0.9, e_greedy_delta = 0.001, memory_size = 500, batch_size = 64):
        self.n_features = n_features
        self.n_actions = n_actions
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.epsilon_delta = e_greedy_delta
        self.memory_size = memory_size
        self.batch_size = batch_size
        
        self.sess = tf.Session()
        
        self._build_network()
        
        self._train_network()
        
        self.sess.run(tf.global_variables_initializer())
        
        self.memory = np.zeros((memory_size, 2 * n_features + 2))
        self.memory_counter = 0 
        
        
    def _build_network(self):
        self.observations = tf.placeholder(tf.float32, shape = (None, self.n_features))
        
        # DNN network
        fc1 = tf.layers.dense(self.observations, 16, tf.nn.relu, 
                              kernel_initializer=tf.random_normal_initializer(0., 0.3), 
                              bias_initializer=tf.constant_initializer(0.1))
        fc2 = tf.layers.dense(fc1, 8, tf.nn.relu, 
                              kernel_initializer=tf.random_normal_initializer(0., 0.3), 
                              bias_initializer=tf.constant_initializer(0.1))
        fc3 = tf.layers.dense(fc2, 4, tf.nn.relu, 
                              kernel_initializer=tf.random_normal_initializer(0., 0.3), 
                              bias_initializer=tf.constant_initializer(0.1))
        self.q_eval = tf.layers.dense(fc3, self.n_actions, tf.nn.relu,
                              kernel_initializer=tf.random_normal_initializer(0., 0.3), 
                              bias_initializer=tf.constant_initializer(0.1))
      
    def _train_network(self):
        
        self.actions = tf.placeholder(tf.int32, shape = (None,))
        
        self.rewards = tf.placeholder(tf.int32, shape = (None,))
        
        # Q(S',argmax_A(S',A))
        self.q_next = tf.placeholder(tf.float32, shape = (None,))
        
        hot_code_actions = tf.one_hot(self.actions, self.n_actions) # shape: [None, n_actions]
        
        # Q(S,A)
#        with tf.control_dependencies([tf.print(hot_code_actions)]):
        q_eval = tf.reduce_sum(tf.multiply(self.q_eval, hot_code_actions), axis = 1)
    
        loss = tf.losses.mean_squared_error(labels = tf.stop_gradient(self.gamma * self.q_next + tf.cast(self.rewards, tf.float32)), predictions = q_eval)
        
        self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss)
    
    def store_transition(self, s, a, r, s_):
        now = np.hstack([s, [a, r], s_])
        
        if self.memory_counter < self.memory_size:
            self.memory[self.memory_counter] = now
        else:
            index = self.memory_counter % self.memory_size
            self.memory[index] = now
        
        self.memory_counter += 1
    
    def train(self):
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size, size = self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter, size = self.batch_size)
        
        batch_memory = self.memory[sample_index, :]
        
        s = batch_memory[:, :self.n_features]
    #    print(s.shape)
        
        a = batch_memory[:, self.n_features]
        
        r = batch_memory[:, self.n_features+1]
        
        s_ = batch_memory[:, self.n_features+2:]
    #    print(s_.shape)        
            
        q_next_eval = np.zeros_like(r)
        
        eval_sample = []
        
        # 是否是终止状态
        for i in range(q_next_eval.shape[0]):
            if not True in np.isnan(s_[i]):
                eval_sample.append(i)
        
        # eval the actions
        eval_Q = self.sess.run(self.q_eval, feed_dict = {
            self.observations: s_[eval_sample]
        })

        # choose best actions
        
        q_next_eval[eval_sample] = np.max(eval_Q, axis = 1)
        
     #   print(q_next_eval)
       
        # learning from data
        self.sess.run(self.train_op, feed_dict = {
            self.observations: s,
            self.actions: a,
            self.rewards: r,
            self.q_next: q_next_eval,
        })
        
    def predict(self, s):
        
        # eplison - greedy
        if np.random.uniform() < self.epsilon:
            return np.random.randint(self.n_actions)
            self.epsilon -= self.epsilon_delta
        else:
            Q = self.sess.run(self.q_eval, feed_dict = {
            self.observations : s[np.newaxis, :]
        })
      #      print(Q)
            return np.argmax(Q)
        
        

In [377]:
tf.get_collection(tf.GraphKeys.VARIABLES)

[<tf.Variable 'dense/kernel:0' shape=(4, 16) dtype=float32_ref>,
 <tf.Variable 'dense/bias:0' shape=(16,) dtype=float32_ref>,
 <tf.Variable 'dense_1/kernel:0' shape=(16, 8) dtype=float32_ref>,
 <tf.Variable 'dense_1/bias:0' shape=(8,) dtype=float32_ref>,
 <tf.Variable 'dense_2/kernel:0' shape=(8, 4) dtype=float32_ref>,
 <tf.Variable 'dense_2/bias:0' shape=(4,) dtype=float32_ref>,
 <tf.Variable 'dense_3/kernel:0' shape=(4, 2) dtype=float32_ref>,
 <tf.Variable 'dense_3/bias:0' shape=(2,) dtype=float32_ref>,
 <tf.Variable 'beta1_power:0' shape=() dtype=float32_ref>,
 <tf.Variable 'beta2_power:0' shape=() dtype=float32_ref>,
 <tf.Variable 'dense/kernel/Adam:0' shape=(4, 16) dtype=float32_ref>,
 <tf.Variable 'dense/kernel/Adam_1:0' shape=(4, 16) dtype=float32_ref>,
 <tf.Variable 'dense/bias/Adam:0' shape=(16,) dtype=float32_ref>,
 <tf.Variable 'dense/bias/Adam_1:0' shape=(16,) dtype=float32_ref>,
 <tf.Variable 'dense_1/kernel/Adam:0' shape=(16, 8) dtype=float32_ref>,
 <tf.Variable 'dense_1/

## Experiment

In [378]:
import gym

In [389]:
EPSILON_NUM = 3000
THRESHOLD = 10000

In [390]:
env = gym.make("CartPole-v1")

In [391]:
tf.reset_default_graph()

In [392]:
model = DeepQLearning(n_actions = env.action_space.n, n_features = env.observation_space.shape[0])

In [393]:
# env.render()
for ep in range(EPSILON_NUM):
    last_observation = env.reset()
    total_rewards = 0
    index = 0
    while True:
        action = model.predict(last_observation)
     #   print(action)
     #   print("*"*20)
        observation, reward, done, info = env.step(action)
    #    print(observation)
        total_rewards += reward
        index += 1
        reward = -1 if done else 0.1 
        model.store_transition(last_observation, action, reward, observation)
      #  print(reward)
        last_observation = observation
     #   model.train()
        if done:
  #          print(observation)
            print(f"epsilon:{ep}, reward:{total_rewards}")
            model.train()
            total_rewards = 0
            break
        if index > THRESHOLD:
            break

epsilon:0, reward:16.0
epsilon:1, reward:18.0
epsilon:2, reward:14.0
epsilon:3, reward:77.0
epsilon:4, reward:16.0
epsilon:5, reward:20.0
epsilon:6, reward:40.0
epsilon:7, reward:39.0
epsilon:8, reward:12.0
epsilon:9, reward:57.0
epsilon:10, reward:47.0
epsilon:11, reward:19.0
epsilon:12, reward:12.0
epsilon:13, reward:76.0
epsilon:14, reward:33.0
epsilon:15, reward:15.0
epsilon:16, reward:71.0
epsilon:17, reward:10.0
epsilon:18, reward:12.0
epsilon:19, reward:16.0
epsilon:20, reward:21.0
epsilon:21, reward:18.0
epsilon:22, reward:43.0
epsilon:23, reward:10.0
epsilon:24, reward:16.0
epsilon:25, reward:25.0
epsilon:26, reward:24.0
epsilon:27, reward:25.0
epsilon:28, reward:30.0
epsilon:29, reward:14.0
epsilon:30, reward:20.0
epsilon:31, reward:13.0
epsilon:32, reward:26.0
epsilon:33, reward:32.0
epsilon:34, reward:75.0
epsilon:35, reward:16.0
epsilon:36, reward:27.0
epsilon:37, reward:17.0
epsilon:38, reward:13.0
epsilon:39, reward:14.0
epsilon:40, reward:13.0
epsilon:41, reward:13.0
ep

epsilon:336, reward:25.0
epsilon:337, reward:42.0
epsilon:338, reward:15.0
epsilon:339, reward:45.0
epsilon:340, reward:19.0
epsilon:341, reward:35.0
epsilon:342, reward:44.0
epsilon:343, reward:16.0
epsilon:344, reward:17.0
epsilon:345, reward:11.0
epsilon:346, reward:13.0
epsilon:347, reward:13.0
epsilon:348, reward:23.0
epsilon:349, reward:39.0
epsilon:350, reward:39.0
epsilon:351, reward:15.0
epsilon:352, reward:46.0
epsilon:353, reward:12.0
epsilon:354, reward:21.0
epsilon:355, reward:22.0
epsilon:356, reward:16.0
epsilon:357, reward:57.0
epsilon:358, reward:15.0
epsilon:359, reward:22.0
epsilon:360, reward:10.0
epsilon:361, reward:19.0
epsilon:362, reward:23.0
epsilon:363, reward:22.0
epsilon:364, reward:31.0
epsilon:365, reward:19.0
epsilon:366, reward:43.0
epsilon:367, reward:47.0
epsilon:368, reward:16.0
epsilon:369, reward:40.0
epsilon:370, reward:14.0
epsilon:371, reward:17.0
epsilon:372, reward:39.0
epsilon:373, reward:11.0
epsilon:374, reward:16.0
epsilon:375, reward:21.0


epsilon:674, reward:50.0
epsilon:675, reward:39.0
epsilon:676, reward:38.0
epsilon:677, reward:32.0
epsilon:678, reward:12.0
epsilon:679, reward:19.0
epsilon:680, reward:12.0
epsilon:681, reward:20.0
epsilon:682, reward:16.0
epsilon:683, reward:11.0
epsilon:684, reward:11.0
epsilon:685, reward:23.0
epsilon:686, reward:15.0
epsilon:687, reward:11.0
epsilon:688, reward:16.0
epsilon:689, reward:46.0
epsilon:690, reward:12.0
epsilon:691, reward:24.0
epsilon:692, reward:13.0
epsilon:693, reward:25.0
epsilon:694, reward:19.0
epsilon:695, reward:12.0
epsilon:696, reward:24.0
epsilon:697, reward:17.0
epsilon:698, reward:16.0
epsilon:699, reward:11.0
epsilon:700, reward:27.0
epsilon:701, reward:18.0
epsilon:702, reward:67.0
epsilon:703, reward:18.0
epsilon:704, reward:50.0
epsilon:705, reward:12.0
epsilon:706, reward:43.0
epsilon:707, reward:30.0
epsilon:708, reward:30.0
epsilon:709, reward:11.0
epsilon:710, reward:24.0
epsilon:711, reward:11.0
epsilon:712, reward:18.0
epsilon:713, reward:27.0


epsilon:1017, reward:25.0
epsilon:1018, reward:33.0
epsilon:1019, reward:20.0
epsilon:1020, reward:11.0
epsilon:1021, reward:20.0
epsilon:1022, reward:16.0
epsilon:1023, reward:34.0
epsilon:1024, reward:20.0
epsilon:1025, reward:10.0
epsilon:1026, reward:23.0
epsilon:1027, reward:22.0
epsilon:1028, reward:10.0
epsilon:1029, reward:19.0
epsilon:1030, reward:29.0
epsilon:1031, reward:29.0
epsilon:1032, reward:9.0
epsilon:1033, reward:9.0
epsilon:1034, reward:11.0
epsilon:1035, reward:24.0
epsilon:1036, reward:42.0
epsilon:1037, reward:29.0
epsilon:1038, reward:22.0
epsilon:1039, reward:82.0
epsilon:1040, reward:34.0
epsilon:1041, reward:33.0
epsilon:1042, reward:22.0
epsilon:1043, reward:22.0
epsilon:1044, reward:9.0
epsilon:1045, reward:13.0
epsilon:1046, reward:67.0
epsilon:1047, reward:15.0
epsilon:1048, reward:23.0
epsilon:1049, reward:10.0
epsilon:1050, reward:18.0
epsilon:1051, reward:33.0
epsilon:1052, reward:41.0
epsilon:1053, reward:30.0
epsilon:1054, reward:10.0
epsilon:1055, r

epsilon:1349, reward:16.0
epsilon:1350, reward:16.0
epsilon:1351, reward:14.0
epsilon:1352, reward:24.0
epsilon:1353, reward:30.0
epsilon:1354, reward:21.0
epsilon:1355, reward:21.0
epsilon:1356, reward:23.0
epsilon:1357, reward:14.0
epsilon:1358, reward:18.0
epsilon:1359, reward:21.0
epsilon:1360, reward:12.0
epsilon:1361, reward:16.0
epsilon:1362, reward:26.0
epsilon:1363, reward:38.0
epsilon:1364, reward:17.0
epsilon:1365, reward:28.0
epsilon:1366, reward:23.0
epsilon:1367, reward:13.0
epsilon:1368, reward:30.0
epsilon:1369, reward:36.0
epsilon:1370, reward:95.0
epsilon:1371, reward:28.0
epsilon:1372, reward:22.0
epsilon:1373, reward:11.0
epsilon:1374, reward:13.0
epsilon:1375, reward:16.0
epsilon:1376, reward:14.0
epsilon:1377, reward:23.0
epsilon:1378, reward:12.0
epsilon:1379, reward:42.0
epsilon:1380, reward:16.0
epsilon:1381, reward:13.0
epsilon:1382, reward:12.0
epsilon:1383, reward:11.0
epsilon:1384, reward:12.0
epsilon:1385, reward:36.0
epsilon:1386, reward:31.0
epsilon:1387

epsilon:1692, reward:16.0
epsilon:1693, reward:28.0
epsilon:1694, reward:50.0
epsilon:1695, reward:27.0
epsilon:1696, reward:16.0
epsilon:1697, reward:20.0
epsilon:1698, reward:17.0
epsilon:1699, reward:13.0
epsilon:1700, reward:12.0
epsilon:1701, reward:16.0
epsilon:1702, reward:32.0
epsilon:1703, reward:43.0
epsilon:1704, reward:10.0
epsilon:1705, reward:22.0
epsilon:1706, reward:15.0
epsilon:1707, reward:25.0
epsilon:1708, reward:28.0
epsilon:1709, reward:13.0
epsilon:1710, reward:20.0
epsilon:1711, reward:16.0
epsilon:1712, reward:20.0
epsilon:1713, reward:17.0
epsilon:1714, reward:11.0
epsilon:1715, reward:13.0
epsilon:1716, reward:48.0
epsilon:1717, reward:13.0
epsilon:1718, reward:21.0
epsilon:1719, reward:15.0
epsilon:1720, reward:25.0
epsilon:1721, reward:31.0
epsilon:1722, reward:26.0
epsilon:1723, reward:13.0
epsilon:1724, reward:14.0
epsilon:1725, reward:21.0
epsilon:1726, reward:11.0
epsilon:1727, reward:18.0
epsilon:1728, reward:14.0
epsilon:1729, reward:15.0
epsilon:1730

epsilon:2044, reward:59.0
epsilon:2045, reward:11.0
epsilon:2046, reward:21.0
epsilon:2047, reward:10.0
epsilon:2048, reward:33.0
epsilon:2049, reward:16.0
epsilon:2050, reward:20.0
epsilon:2051, reward:15.0
epsilon:2052, reward:32.0
epsilon:2053, reward:33.0
epsilon:2054, reward:34.0
epsilon:2055, reward:13.0
epsilon:2056, reward:17.0
epsilon:2057, reward:18.0
epsilon:2058, reward:39.0
epsilon:2059, reward:15.0
epsilon:2060, reward:21.0
epsilon:2061, reward:13.0
epsilon:2062, reward:21.0
epsilon:2063, reward:15.0
epsilon:2064, reward:12.0
epsilon:2065, reward:16.0
epsilon:2066, reward:13.0
epsilon:2067, reward:16.0
epsilon:2068, reward:26.0
epsilon:2069, reward:11.0
epsilon:2070, reward:31.0
epsilon:2071, reward:13.0
epsilon:2072, reward:32.0
epsilon:2073, reward:18.0
epsilon:2074, reward:17.0
epsilon:2075, reward:26.0
epsilon:2076, reward:55.0
epsilon:2077, reward:25.0
epsilon:2078, reward:13.0
epsilon:2079, reward:18.0
epsilon:2080, reward:20.0
epsilon:2081, reward:11.0
epsilon:2082

epsilon:2402, reward:60.0
epsilon:2403, reward:18.0
epsilon:2404, reward:15.0
epsilon:2405, reward:15.0
epsilon:2406, reward:20.0
epsilon:2407, reward:44.0
epsilon:2408, reward:11.0
epsilon:2409, reward:18.0
epsilon:2410, reward:18.0
epsilon:2411, reward:20.0
epsilon:2412, reward:23.0
epsilon:2413, reward:14.0
epsilon:2414, reward:18.0
epsilon:2415, reward:10.0
epsilon:2416, reward:13.0
epsilon:2417, reward:16.0
epsilon:2418, reward:25.0
epsilon:2419, reward:30.0
epsilon:2420, reward:25.0
epsilon:2421, reward:17.0
epsilon:2422, reward:20.0
epsilon:2423, reward:23.0
epsilon:2424, reward:21.0
epsilon:2425, reward:9.0
epsilon:2426, reward:25.0
epsilon:2427, reward:9.0
epsilon:2428, reward:40.0
epsilon:2429, reward:10.0
epsilon:2430, reward:16.0
epsilon:2431, reward:22.0
epsilon:2432, reward:17.0
epsilon:2433, reward:50.0
epsilon:2434, reward:9.0
epsilon:2435, reward:20.0
epsilon:2436, reward:36.0
epsilon:2437, reward:25.0
epsilon:2438, reward:32.0
epsilon:2439, reward:11.0
epsilon:2440, r

epsilon:2758, reward:24.0
epsilon:2759, reward:48.0
epsilon:2760, reward:37.0
epsilon:2761, reward:10.0
epsilon:2762, reward:48.0
epsilon:2763, reward:21.0
epsilon:2764, reward:20.0
epsilon:2765, reward:25.0
epsilon:2766, reward:19.0
epsilon:2767, reward:14.0
epsilon:2768, reward:30.0
epsilon:2769, reward:22.0
epsilon:2770, reward:18.0
epsilon:2771, reward:26.0
epsilon:2772, reward:16.0
epsilon:2773, reward:16.0
epsilon:2774, reward:37.0
epsilon:2775, reward:31.0
epsilon:2776, reward:16.0
epsilon:2777, reward:17.0
epsilon:2778, reward:17.0
epsilon:2779, reward:14.0
epsilon:2780, reward:24.0
epsilon:2781, reward:30.0
epsilon:2782, reward:44.0
epsilon:2783, reward:16.0
epsilon:2784, reward:23.0
epsilon:2785, reward:14.0
epsilon:2786, reward:15.0
epsilon:2787, reward:25.0
epsilon:2788, reward:12.0
epsilon:2789, reward:15.0
epsilon:2790, reward:13.0
epsilon:2791, reward:12.0
epsilon:2792, reward:17.0
epsilon:2793, reward:15.0
epsilon:2794, reward:17.0
epsilon:2795, reward:12.0
epsilon:2796

## Result

In [342]:
dir(env)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_action_space',
 '_elapsed_steps',
 '_max_episode_steps',
 '_metadata',
 '_observation_space',
 '_reward_range',
 'action_space',
 'class_name',
 'close',
 'compute_reward',
 'env',
 'metadata',
 'observation_space',
 'render',
 'reset',
 'reward_range',
 'seed',
 'spec',
 'step',
 'unwrapped']

In [343]:
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)