# Deep Q Learning Algorithm

## Introduction

This notebook reproduces the Deep Q-Learning (DQN) algorithm and then uses it to control the "CartPole" game.

The code draws on the reference implementations by [MorvanZhou](https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/blob/master/contents/5_Deep_Q_Network/DQN_modified.py) and [ljp](https://github.com/ljpzzz/machinelearning/blob/master/reinforcement-learning/dqn.py).

## Import Library

In [3]:
import numpy as np
import tensorflow.compat.v1 as tf

In [4]:
# The DeepQLearning class below is written against TF1-style placeholders and
# sessions, so eager execution is switched off before any graph is built.
tf.disable_eager_execution()

In [5]:
# Display the installed TensorFlow version as the cell output.
tf.__version__

'2.6.0'

## Code Project

In [36]:
class DeepQLearning:
    """Minimal Deep Q-Network agent built on the TensorFlow 1.x graph API.

    A two-layer dense network maps an observation to one Q-value per action.
    Transitions are kept in a fixed-size ring buffer, one row per transition,
    laid out as ``[s (n_features), a, r, s_ (n_features)]``. A terminal
    transition is marked by NaN values in its ``s_`` columns; its bootstrapped
    value is treated as zero during training. The same network is used both
    for the prediction and for the bootstrap target (no separate target net).

    Args:
        n_features: Length of one observation vector.
        n_actions: Number of discrete actions.
        learning_rate: Step size passed to the Adam optimizer.
        reward_decay: Discount factor (gamma) applied to the bootstrapped value.
        e_greedy: Probability of taking the greedy action in ``predict``;
            a uniformly random action is taken otherwise.
        memory_size: Capacity of the replay buffer, in transitions.
        batch_size: Number of transitions sampled per ``train`` call.
    """

    def __init__(self, n_features, n_actions, learning_rate = 0.01 ,reward_decay = 0.9, e_greedy = 0.9, memory_size = 500, batch_size = 32):
        self.n_features = n_features
        self.n_actions = n_actions
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.memory_size = memory_size
        self.batch_size = batch_size

        # Replay buffer: one row per transition, [s, a, r, s_].
        self.memory = np.zeros((memory_size, 2 * n_features + 2))
        # Total number of transitions ever stored (may exceed memory_size).
        self.memory_counter = 0

        # Each agent owns its graph so that creating a second agent (or
        # re-running the notebook cell) neither piles duplicate ops onto the
        # default graph nor re-initialises another agent's weights.
        self.graph = tf.Graph()
        with self.graph.as_default():
            self._build_network()
            self._train_network()
            init_op = tf.global_variables_initializer()

        self.sess = tf.Session(graph=self.graph)
        self.sess.run(init_op)

    def _build_network(self):
        """Create the observation placeholder and the Q-value network.

        Sets ``self.observations`` (float32, shape ``(None, n_features)``) and
        ``self.q_eval`` (shape ``(None, n_actions)``).
        """
        self.observations = tf.placeholder(
            tf.float32, shape=(None, self.n_features), name="observations")

        kernel_init = tf.random_normal_initializer(0., 0.3)
        bias_init = tf.constant_initializer(0.1)

        # DNN network: one hidden ReLU layer followed by a linear Q-value head.
        fc1 = tf.layers.dense(self.observations, 8, tf.nn.relu,
                              kernel_initializer=kernel_init,
                              bias_initializer=bias_init)
        self.q_eval = tf.layers.dense(fc1, self.n_actions,
                                      kernel_initializer=kernel_init,
                                      bias_initializer=bias_init)

    def _train_network(self):
        """Create the training placeholders, the TD loss and the train op.

        Sets ``self.actions``, ``self.rewards``, ``self.q_next``,
        ``self.loss`` and ``self.train_op``.
        """
        self.actions = tf.placeholder(tf.int32, shape=(None,), name="actions")
        # Rewards are real-valued and are added to float Q-values below.
        self.rewards = tf.placeholder(tf.float32, shape=(None,), name="rewards")
        # max_A' Q(S', A'), computed outside the graph and fed in; 0 for
        # terminal transitions.
        self.q_next = tf.placeholder(tf.float32, shape=(None,), name="q_next")

        # Q(S, A): select, per sample, the Q-value of the action actually taken.
        action_mask = tf.one_hot(self.actions, self.n_actions, dtype=tf.float32)
        q_taken = tf.reduce_sum(self.q_eval * action_mask, axis=1)

        # TD target: r + gamma * max_A' Q(S', A').
        q_target = self.rewards + self.gamma * self.q_next

        self.loss = tf.losses.mean_squared_error(labels=q_target, predictions=q_taken)
        self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

    def store_transition(self, s, a, r, s_):
        """Append one transition to the replay buffer, overwriting the oldest.

        Args:
            s: Observation before the action (length ``n_features``).
            a: Index of the action taken.
            r: Reward received.
            s_: Observation after the action. Pass ``None`` (or a vector of
                NaN) to mark a terminal transition.
        """
        if s_ is None:
            s_ = np.full(self.n_features, np.nan)

        transition = np.hstack([s, [a, r], s_])

        # Ring buffer: wrap around once the buffer is full.
        index = self.memory_counter % self.memory_size
        self.memory[index] = transition
        self.memory_counter += 1

    def train(self):
        """Run one optimisation step on a random batch from the replay buffer.

        Sampling is with replacement from the rows filled so far. Does nothing
        when no transition has been stored yet.
        """
        stored = min(self.memory_counter, self.memory_size)
        if stored == 0:
            return

        sample_index = np.random.choice(stored, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        # Split the rows back into their [s, a, r, s_] columns.
        n = self.n_features
        s = batch_memory[:, :n]
        a = batch_memory[:, n].astype(np.int32)
        r = batch_memory[:, n + 1]
        s_ = batch_memory[:, n + 2:]

        # Terminal transitions carry NaN in s_; their bootstrap value stays 0.
        non_terminal = ~np.isnan(s_).any(axis=1)
        q_next_eval = np.zeros(len(batch_memory), dtype=np.float32)

        if non_terminal.any():
            # Evaluate every action in the next state and keep the best one.
            next_q = self.sess.run(self.q_eval, feed_dict={
                self.observations: s_[non_terminal]
            })
            q_next_eval[non_terminal] = next_q.max(axis=1)

        # Learn from the sampled batch.
        self.sess.run(self.train_op, feed_dict={
            self.observations: s,
            self.actions: a,
            self.rewards: r,
            self.q_next: q_next_eval,
        })

    def predict(self, s):
        """Choose an action for observation ``s`` with an epsilon-greedy policy.

        With probability ``self.epsilon`` the action with the highest predicted
        Q-value is returned; otherwise a uniformly random action is returned.

        Args:
            s: One observation (length ``n_features``).

        Returns:
            The chosen action index as an ``int``.
        """
        # epsilon-greedy
        if np.random.uniform() < self.epsilon:
            obs = np.asarray(s, dtype=np.float32)[np.newaxis, :]
            q_values = self.sess.run(self.q_eval, feed_dict={
                self.observations: obs
            })
            return int(np.argmax(q_values))
        return int(np.random.randint(self.n_actions))
            
        
        

## Experiment

In [7]:
# Scratch check: slicing with [:2] keeps the first two elements of the array.
np.array([1,2,3,4])[:2]

array([1, 2])

## Result