# Deep Q Learning Algorithm

## Introduction

This notebook reproduce the DeepQLearning network and then use this algorithm to control "CartPole" game.

Also, this code refers [MorvanZhou](https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/blob/master/contents/5_Deep_Q_Network/DQN_modified.py) and [ljp](https://github.com/ljpzzz/machinelearning/blob/master/reinforcement-learning/dqn.py)

## Import Library

In [64]:
import numpy as np
import tensorflow.compat.v1 as tf

In [77]:
tf.disable_eager_execution()
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [78]:
tf.__version__

'2.6.0'

## Code Project

In [79]:
class DeepQLearning:
    def __init__(self, n_features, n_actions, learning_rate = 0.01 ,reward_decay = 0.9, e_greedy = 0.9, memory_size = 500, batch_size = 32):
        self.n_features = n_features
        self.n_actions = n_actions
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.memory_size = memory_size
        self.batch_size = batch_size
        
        self.sess = tf.Session()
        
        self._build_network()
        
        self._train_network()
        
        self.sess.run(tf.global_variables_initializer())
        
        self.memory = np.zeros((memory_size, 2 * n_features + 2))
        self.memory_counter = 0 
        
        
    def _build_network(self):
        self.observations = tf.placeholder(tf.float32, shape = (None, self.n_features))
        
        tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)
        # DNN network
        fc1 = tf.layers.dense(self.observations, 8, tf.nn.relu, 
                              kernel_initializer=tf.random_normal_initializer(0., 0.3), 
                              bias_initializer=tf.constant_initializer(0.1))
        self.q_eval = tf.layers.dense(fc1, self.n_actions,  
                              kernel_initializer=tf.random_normal_initializer(0., 0.3), 
                              bias_initializer=tf.constant_initializer(0.1))
      
    def _train_network(self):
        
        self.actions = tf.placeholder(tf.int32, shape = (None,))
        
        self.rewards = tf.placeholder(tf.int32, shape = (None,))
        
        # Q(S',argmax_A(S',A))
        self.q_next = tf.placeholder(tf.float32, shape = (None,))
        
        hot_code_actions = tf.one_hot(self.actions, self.n_actions)
        
        # Q(S,A)
        q_eval = tf.reduce_sum(tf.multiply(self.q_eval, hot_code_actions), axis = 1)
    
        loss = tf.losses.mean_squared_error(labels = self.q_next + tf.cast(self.rewards, tf.float32), predictions = q_eval)
        
        self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss)
    
    def store_transition(self, s, a, r, s_):
        now = np.hstack([s, [a, r], s_])
        
        if self.memory_counter < self.memory_size:
            self.memory[self.memory_counter] = now
        else:
            index = self.memory_counter % self.memory_size
            self.memory[index] = now
        
        self.memory_counter += 1
    
    def train(self):
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size, size = self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter, size = self.batch_size)
            
        batch_memory = self.memory[sample_index, :]
        
        s = batch_memory[:, :self.n_features]
        
        a = batch_memory[:, self.n_features]
        
        r = batch_memory[:, self.n_features+1]
        
        s_ = batch_memory[:, self.n_features+2:]
                
        q_next_eval = np.zeros_like(r)
        
        eval_sample = []
        
        for i in range(q_next_eval.shape[0]):
            if not True in np.isnan(s_[i]):
                eval_sample.append(i)
        
        # eval the actions
        eval_Q = self.sess.run(self.q_eval, feed_dict = {
            self.observations: s_[eval_sample]
        })

        # choose best actions
        
        q_next_eval[eval_sample] = np.max(eval_Q, axis = 1)  
       
        # learning from data
        self.sess.run(self.train_op, feed_dict = {
            self.observations: s,
            self.actions: a,
            self.rewards: r,
            self.q_next: q_next_eval,
        })
        
    def predict(self, s):
        
        # eplison - greedy
        if np.random.uniform() < self.epsilon:
            Q = self.sess.run(self.q_eval, feed_dict = {
            self.observations : s[np.newaxis, :]
        })
            return np.argmax(Q)
        else:
            return np.random.randint(self.n_actions)
            
        
        

## Experiment

In [80]:
import gym

In [81]:
EPSILON_NUM = 30
THRESHOLD = 10000

In [82]:
env = gym.make("CartPole-v1")

In [83]:
# tf.reset_default_graph()

In [84]:
model = DeepQLearning(n_actions = env.action_space.n, n_features = env.observation_space.shape[0])

RuntimeError: Attempting to capture an EagerTensor without building a function.

In [85]:
# env.render()
for ep in range(EPSILON_NUM):
    last_observation = env.reset()
    total_rewards = 0
    index = 0
    while True:
        action = model.predict(last_observation)
        observation, reward, done, info = env.step(action)
        # print(observation)
        total_rewards += 1 
        index += 1
        model.store_transition(last_observation, action, reward, observation)
        last_observation = observation
        if done:
            print(f"epsilon:{ep}, reward:{total_rewards}")
            model.train()
            total_rewards = 0
            break
        if index > THRESHOLD:
            break

epsilon:0, reward:10
epsilon:1, reward:9
epsilon:2, reward:12
epsilon:3, reward:12
epsilon:4, reward:11
epsilon:5, reward:11
epsilon:6, reward:10
epsilon:7, reward:9
epsilon:8, reward:9
epsilon:9, reward:9
epsilon:10, reward:14
epsilon:11, reward:9
epsilon:12, reward:10
epsilon:13, reward:10
epsilon:14, reward:10
epsilon:15, reward:9
epsilon:16, reward:9
epsilon:17, reward:9
epsilon:18, reward:11
epsilon:19, reward:10
epsilon:20, reward:10
epsilon:21, reward:10
epsilon:22, reward:9
epsilon:23, reward:10
epsilon:24, reward:9
epsilon:25, reward:11
epsilon:26, reward:10
epsilon:27, reward:9
epsilon:28, reward:10
epsilon:29, reward:11


## Result