# Deep Q Learning Algorithm

## Introduction

This notebook reproduces the Deep Q-Learning (DQN) network and then uses the algorithm to control the "CartPole" game.

The code is based on the implementations by [MorvanZhou](https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/blob/master/contents/5_Deep_Q_Network/DQN_modified.py) and [ljp](https://github.com/ljpzzz/machinelearning/blob/master/reinforcement-learning/dqn.py).

## Import Library

In [54]:
import numpy as np
import tensorflow.compat.v1 as tf

In [55]:
# The DeepQLearning class below is assembled from tf.placeholder / tf.Session,
# which are graph-mode constructs, so eager execution is switched off up front.
tf.disable_eager_execution()

In [56]:
# Display the installed TensorFlow version so the run can be reproduced.
tf.__version__

'2.6.0'

## Code Project

In [57]:
class DeepQLearning:
    """Minimal Deep Q-Network agent built on the TensorFlow 1.x graph API.

    A single two-layer dense network maps an observation to one Q-value per
    action. Transitions are kept in a fixed-size ring buffer and the network
    is trained on random mini-batches drawn from it.

    Each replay-memory row is laid out as
    ``[s (n_features), a, r, s_ (n_features)]``. A next-state ``s_`` that
    contains NaN is treated as terminal: its bootstrap value is 0.

    Parameters
    ----------
    n_features : int
        Length of one observation vector.
    n_actions : int
        Number of actions (width of the network output).
    learning_rate : float
        Step size passed to the Adam optimizer.
    reward_decay : float
        Discount factor (gamma) applied to the bootstrapped next-state value.
    e_greedy : float
        Probability that ``predict`` returns the greedy action; with
        probability ``1 - e_greedy`` it returns a uniformly random action.
    memory_size : int
        Capacity of the replay memory, in transitions.
    batch_size : int
        Number of transitions sampled (with replacement) per training step.
    """

    def __init__(self, n_features, n_actions, learning_rate = 0.01 ,reward_decay = 0.9, e_greedy = 0.9, memory_size = 500, batch_size = 32):
        self.n_features = n_features
        self.n_actions = n_actions
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.memory_size = memory_size
        self.batch_size = batch_size

        # Build into a private graph so that creating several agents (or
        # re-running the notebook cell) never collides with ops already
        # present in the process-wide default graph.
        self.graph = tf.Graph()
        with self.graph.as_default():
            self._build_network()
            self._train_network()
            init_op = tf.global_variables_initializer()

        self.sess = tf.Session(graph=self.graph)
        self.sess.run(init_op)

        # Ring buffer of transitions; memory_counter counts every transition
        # ever stored (it is not capped at memory_size).
        self.memory = np.zeros((memory_size, 2 * n_features + 2))
        self.memory_counter = 0

    def _build_network(self):
        """Create the observation placeholder and the Q-value network."""
        self.observations = tf.placeholder(tf.float32, shape=(None, self.n_features))

        # Hidden layer of 8 ReLU units followed by a linear Q-value head.
        hidden = tf.layers.dense(
            self.observations, 8, tf.nn.relu,
            kernel_initializer=tf.random_normal_initializer(0., 0.3),
            bias_initializer=tf.constant_initializer(0.1))
        self.q_eval = tf.layers.dense(
            hidden, self.n_actions,
            kernel_initializer=tf.random_normal_initializer(0., 0.3),
            bias_initializer=tf.constant_initializer(0.1))

    def _train_network(self):
        """Create the training placeholders, the TD loss and the train op."""
        self.actions = tf.placeholder(tf.int32, shape=(None,))
        # float32 so that fractional rewards are not truncated when fed.
        self.rewards = tf.placeholder(tf.float32, shape=(None,))
        # max_a' Q(S', a'), computed outside the graph and fed in, so no
        # gradient flows through the target.
        self.q_next = tf.placeholder(tf.float32, shape=(None,))

        # Q(S, A): pick the Q-value of the action that was actually taken.
        action_mask = tf.one_hot(self.actions, self.n_actions)
        q_taken = tf.reduce_sum(tf.multiply(self.q_eval, action_mask), axis=1)

        # One-step TD target: r + gamma * max_a' Q(S', a').
        q_target = self.rewards + self.gamma * self.q_next

        self.loss = tf.losses.mean_squared_error(labels=q_target, predictions=q_taken)
        self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

    def store_transition(self, s, a, r, s_):
        """Append one transition, overwriting the oldest once the buffer is full.

        Parameters
        ----------
        s : array-like of length n_features
            Observation before the action.
        a : int
            Action taken.
        r : float
            Reward received.
        s_ : array-like of length n_features
            Observation after the action; fill with NaN to mark a terminal
            transition.
        """
        row = np.hstack([s, [a, r], s_])
        # While the buffer is filling, counter % size == counter, so a single
        # modulo covers both the filling and the wrapping case.
        self.memory[self.memory_counter % self.memory_size] = row
        self.memory_counter += 1

    def train(self):
        """Run one gradient step on a random mini-batch from replay memory.

        Does nothing if no transition has been stored yet.
        """
        filled = min(self.memory_counter, self.memory_size)
        if filled == 0:
            return

        sample_index = np.random.choice(filled, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        s = batch_memory[:, :self.n_features]
        a = batch_memory[:, self.n_features]
        r = batch_memory[:, self.n_features + 1]
        s_ = batch_memory[:, self.n_features + 2:]

        # Terminal transitions (NaN next-state) keep a bootstrap value of 0.
        q_next_eval = np.zeros_like(r)
        non_terminal = ~np.isnan(s_).any(axis=1)

        if non_terminal.any():
            next_q = self.sess.run(self.q_eval, feed_dict={
                self.observations: s_[non_terminal]
            })
            # Greedy value of the next state.
            q_next_eval[non_terminal] = np.max(next_q, axis=1)

        self.sess.run(self.train_op, feed_dict={
            self.observations: s,
            # Actions are stored as floats in the memory array.
            self.actions: a.astype(np.int32),
            self.rewards: r,
            self.q_next: q_next_eval,
        })

    def predict(self, s):
        """Choose an action for observation ``s`` with an epsilon-greedy policy.

        With probability ``self.epsilon`` the action with the highest
        predicted Q-value is returned; otherwise a uniformly random action.

        Parameters
        ----------
        s : array-like of length n_features
            Current observation.

        Returns
        -------
        int
            Index of the chosen action.
        """
        if np.random.uniform() < self.epsilon:
            q_values = self.sess.run(self.q_eval, feed_dict={
                self.observations: np.asarray(s)[np.newaxis, :]
            })
            return np.argmax(q_values)
        return np.random.randint(self.n_actions)
            
        
        

## Experiment

In [58]:
import gym

In [59]:
# Number of training episodes. Despite the name, this is the episode count
# iterated by the training loop, not the exploration epsilon.
EPSILON_NUM = 30
# Per-episode step cap: the training loop abandons an episode once its step
# counter exceeds this value.
THRESHOLD = 10000

In [60]:
# Environment the agent is trained on.
env = gym.make("CartPole-v1")

In [61]:
# Drop any ops left in TensorFlow's default graph by earlier runs of this notebook.
# NOTE(review): the saved output of this cell is an AssertionError about nested
# graphs — presumably stale kernel state; confirm with Restart Kernel -> Run All.
tf.reset_default_graph()

AssertionError: Do not use tf.reset_default_graph() to clear nested graphs. If you need a cleared graph, exit the nesting and create a new graph.

In [62]:
# Size the agent from the environment: one network output per action and one
# network input per observation component. All other hyperparameters use the
# class defaults.
# NOTE(review): the saved output of this cell is a RuntimeError about capturing
# an EagerTensor — presumably stale kernel state; confirm on a fresh kernel.
model = DeepQLearning(n_actions = env.action_space.n, n_features = env.observation_space.shape[0])

RuntimeError: Attempting to capture an EagerTensor without building a function.

In [63]:
# Train for EPSILON_NUM episodes; THRESHOLD caps the number of steps per episode.
for ep in range(EPSILON_NUM):
    last_observation = env.reset()
    total_rewards = 0
    index = 0
    while True:
        action = model.predict(last_observation)
        observation, reward, done, info = env.step(action)
        # Accumulate the reward the environment actually returned.
        total_rewards += reward
        index += 1
        # DeepQLearning.train() treats a next-state containing NaN as terminal
        # (bootstrap value 0), so the final transition of an episode is stored
        # with a NaN next-state instead of the last observation.
        next_state = np.full(np.shape(observation), np.nan) if done else observation
        model.store_transition(last_observation, action, reward, next_state)
        last_observation = observation
        if done:
            print(f"episode:{ep}, reward:{total_rewards}")
            model.train()
            break
        if index > THRESHOLD:
            break

[-0.0005265  -0.15344673  0.04717619  0.31150812]
[-0.00359544 -0.34920794  0.05340636  0.61868787]
[-0.0105796  -0.5450336   0.06578011  0.9277016 ]
[-0.02148027 -0.7409791   0.08433414  1.2403095 ]
[-0.03629985 -0.9370765   0.10914034  1.5581752 ]
[-0.05504138 -1.133323    0.14030384  1.8828169 ]
[-0.07770784 -1.3296661   0.17796017  2.2155519 ]
[-0.10430116 -1.5259888   0.22227122  2.5574296 ]
[ 0.00845116 -0.17191142 -0.02345716  0.2672434 ]
[ 0.00501293 -0.36669087 -0.01811229  0.5524364 ]
[-0.00232089 -0.56155384 -0.00706356  0.8393582 ]
[-0.01355196 -0.7565786   0.00972361  1.1298114 ]
[-0.02868354 -0.9518266   0.03231983  1.4255282 ]
[-0.04772007 -1.1473327   0.0608304   1.7281346 ]
[-0.07066672 -1.3430948   0.09539309  2.0391078 ]
[-0.09752861 -1.5390601   0.13617525  2.3597224 ]
[-0.12830982 -1.7351091   0.1833697   2.6909854 ]
[-0.163012  -1.9310374  0.2371894  3.0335598]
[ 0.02622542 -0.16632874 -0.02603409  0.29898548]
[ 0.02289885 -0.3610701  -0.02005438  0.5833454 ]
[ 0.

[ 0.02169101 -0.16214462 -0.03603952  0.3306894 ]
[ 0.01844812 -0.35673553 -0.02942573  0.61179286]
[ 0.01131341 -0.5514341  -0.01718988  0.8950644 ]
[ 2.8472734e-04 -7.4631882e-01  7.1141386e-04  1.1822947e+00]
[-0.01464165 -0.94145     0.02435731  1.4752005 ]
[-0.03347065 -1.136861    0.05386132  1.7753904 ]
[-0.05620787 -1.3325468   0.08936913  2.084321  ]
[-0.0828588  -1.5284512   0.13105555  2.4032445 ]
[-0.11342783 -1.7244499   0.17912044  2.7331417 ]
[-0.14791682 -1.9203327   0.23378326  3.074645  ]
[ 0.01276363 -0.20435172  0.04960406  0.29947105]
[ 0.00867659 -0.40014434  0.05559348  0.60737634]
[ 0.00067371 -0.20584193  0.06774101  0.33270907]
[-0.00344313 -0.4018594   0.07439519  0.6459615 ]
[-0.01148032 -0.59793484  0.08731442  0.96111315]
[-0.02343902 -0.40408802  0.10653669  0.6970885 ]
[-0.03152078 -0.6005134   0.12047846  1.0213186 ]
[-0.04353105 -0.7970165   0.14090483  1.3492718 ]
[-0.05947138 -0.9935995   0.16789027  1.6825129 ]
[-0.07934336 -1.190221    0.20154051  

## Result