# DQN Agent

In cases where the number of possible states becomes very large it becomes computationally inefficient to store the Q Values for each state in a table.

To overcome this we use a neural network as a function approximator. We pass the input state to the network and it outputs an approximated Q value for every action; the Q value of the action that was actually taken is then selected with a one-hot mask.

Since Q-learning is an off-policy algorithm, it is able to learn from old experiences. We create a simple replay buffer using a deque that stores the last 2000 transitions (state, action, reward, next state, done). At each timestep we sample a minibatch from the buffer for training.


In [2]:
import gym
import time
from IPython.display import clear_output
from matplotlib import pyplot as plt
import random
import gym
import numpy as np
import tensorflow as tf
from collections import deque
%matplotlib inline

### Visualize random agent

In [3]:
# Please note: to render the CartPole environment on a remote server you need to run
# the Jupyter notebook using a virtual display. Use the following command:
# xvfb-run -a -s "-screen 0 1400x900x24 +extension RANDR" -- jupyter notebook --no-browser

env = gym.make("CartPole-v1")

state = env.reset()
done = False
total_reward = 0
try:
    # Play one episode with uniformly random actions, redrawing the frame each step.
    while not done:
        clear_output(wait=True)
        plt.imshow(env.render(mode='rgb_array'))
        plt.show()
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        time.sleep(.1)  # slow the animation down so it can be followed by eye
        total_reward += reward
finally:
    # Always release the renderer, even when rendering fails (e.g. no display available).
    env.close()
print("Time steps survived when using a Random Agent: ", total_reward)

NoSuchDisplayException: Cannot connect to "None"

### Building the Network

In [3]:

# Number of training episodes; the training loop iterates over `range(EPISODES)`.
EPISODES = 100

class DQAgent:
    """Deep Q-learning agent with an experience-replay buffer (TensorFlow 1.x graph API).

    The network maps a state to one Q-value per action. Training minimises the
    squared error between the Q-value of the action that was taken and a
    bootstrapped target ``reward + gamma * max_a Q(next_state, a)``.

    The graph is built under the variable scope ``"DQNetwork"`` in the current
    default graph, so the caller is expected to reset the default graph before
    constructing a second agent.

    Args:
        state_size: Number of features in a single observation.
        action_size: Number of discrete actions.
        memory_size: Maximum number of transitions kept in the replay buffer.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial exploration probability for the epsilon-greedy policy.
        epsilon_min: Lower bound the caller should respect when decaying epsilon.
        epsilon_decay: Multiplicative decay factor the caller applies to epsilon.
        learning_rate: Learning rate of the Adam optimizer.
    """

    def __init__(self, state_size, action_size, memory_size=2000, gamma=0.95,
                 epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.999,
                 learning_rate=0.0001):
        self.state_size = state_size
        self.action_size = action_size
        # Bounded buffer: the oldest transitions are dropped once it is full.
        self.memory = deque(maxlen=memory_size)
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate

        with tf.variable_scope("DQNetwork"):
            # Batch of states, one row per state.
            self.inputs_ = tf.placeholder(tf.float32,
                                          [None, state_size],
                                          name="inputs")
            # One-hot encoding of the single action being trained on.
            self.actions_ = tf.placeholder(tf.float32,
                                           action_size,
                                           name="actions")
            # Scalar regression target for that action's Q-value.
            self.target_Q = tf.placeholder(tf.float32,
                                           (),
                                           name="target_q")

            self.fc1 = tf.layers.dense(inputs=self.inputs_,
                                       units=24,
                                       activation=tf.nn.elu,
                                       kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                       name="fc1")

            self.fc2 = tf.layers.dense(inputs=self.fc1,
                                       units=24,
                                       activation=tf.nn.elu,
                                       kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                       name="fc2")

            # Linear output layer: one Q-value per action.
            self.output = tf.layers.dense(inputs=self.fc2,
                                          units=action_size,
                                          activation=None,
                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          name="output")

            # The one-hot mask keeps only the taken action's Q-value. The sum runs over
            # every element, so this is only meaningful for a single state per run.
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_))
            self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))

            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition ``(state, action, reward, next_state, done)`` to the buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, sess):
        """Choose an action with an epsilon-greedy policy.

        Args:
            state: Array fed to the network input; expected shape ``(1, state_size)``.
            sess: Session in which the network variables have been initialised.

        Returns:
            The index of the chosen action as an ``int``.
        """
        # Strict `<`: np.random.rand() samples from [0, 1), so epsilon == 0.0 never
        # explores and epsilon == 1.0 always does.
        if np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)
        act_values = sess.run(self.output, feed_dict={self.inputs_: state})
        return int(np.argmax(act_values[0]))

    def replay(self, batch_size, sess):
        """Train on ``batch_size`` transitions sampled uniformly from the buffer.

        Every sampled transition triggers its own optimizer step (the graph is
        built for a single state per run).

        Args:
            batch_size: Number of transitions to sample without replacement.
            sess: Session in which the network variables have been initialised.

        Returns:
            The mean loss over the sampled transitions as a ``float``, or ``None``
            when no training happened because ``batch_size`` is not positive or
            the buffer holds fewer than ``batch_size`` transitions.
        """
        if batch_size <= 0 or len(self.memory) < batch_size:
            return None

        one_hot_actions = np.eye(self.action_size)
        losses = []
        for state, action, reward, next_state, done in random.sample(self.memory, batch_size):
            # Terminal transitions have no future value; otherwise bootstrap from
            # the best Q-value the current network predicts for the next state.
            target = reward
            if not done:
                next_q = sess.run(self.output, feed_dict={self.inputs_: next_state})
                target = reward + self.gamma * np.amax(next_q[0])

            loss, _ = sess.run([self.loss, self.optimizer], feed_dict={
                self.inputs_: state,
                self.target_Q: target,
                self.actions_: one_hot_actions[action],
            })
            losses.append(loss)
        return float(np.mean(losses))


            

### Train Agent

In [4]:
env = gym.make("CartPole-v1")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Start from an empty graph so re-running this cell does not clash with the
# variables created by a previous agent.
tf.reset_default_graph()
agent = DQAgent(state_size, action_size)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
batch_size = 32
max_steps = 200  # cap on the number of steps played per training episode

for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, (1, state_size))
    # NOTE: the loop variable must not be called `time`; that would shadow the
    # `time` module imported at the top of the notebook and break earlier cells.
    for step in range(max_steps):
        action = agent.act(state, sess)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, (1, state_size))
        # Penalise the transition that ends the episode.
        reward = reward if not done else -10.0
        agent.remember(state, action, reward, next_state, done)
        state = next_state

        # Train at every timestep once the buffer holds more than one minibatch,
        # then decay exploration towards its floor.
        if len(agent.memory) > batch_size:
            agent.replay(batch_size, sess)
            if agent.epsilon > agent.epsilon_min:
                agent.epsilon *= agent.epsilon_decay
        if done:
            break
    # The score is the zero-based index of the last step played in the episode.
    print("episode {}/{}, score: {}, e: {:.2}".format(e, EPISODES, step, agent.epsilon))

    

episode 0/100, score: 12, e: 1.0
episode 1/100, score: 20, e: 1.0
episode 2/100, score: 19, e: 0.98
episode 3/100, score: 26, e: 0.95
episode 4/100, score: 27, e: 0.93
episode 5/100, score: 15, e: 0.91
episode 6/100, score: 41, e: 0.87
episode 7/100, score: 8, e: 0.87
episode 8/100, score: 15, e: 0.85
episode 9/100, score: 36, e: 0.82
episode 10/100, score: 11, e: 0.81
episode 11/100, score: 26, e: 0.79
episode 12/100, score: 19, e: 0.77
episode 13/100, score: 43, e: 0.74
episode 14/100, score: 40, e: 0.71
episode 15/100, score: 28, e: 0.69
episode 16/100, score: 83, e: 0.63
episode 17/100, score: 43, e: 0.61
episode 18/100, score: 33, e: 0.59
episode 19/100, score: 19, e: 0.58
episode 20/100, score: 19, e: 0.56
episode 21/100, score: 41, e: 0.54
episode 22/100, score: 14, e: 0.53
episode 23/100, score: 22, e: 0.52
episode 24/100, score: 42, e: 0.5
episode 25/100, score: 15, e: 0.49
episode 26/100, score: 24, e: 0.48
episode 27/100, score: 20, e: 0.47
episode 28/100, score: 35, e: 0.45

### Evaluating Agent

In [5]:
# Evaluation run: exploration is switched off by setting epsilon to zero, and the
# reward is accumulated over all episodes to report a per-episode average.
episodes = 100
agent.epsilon = 0.0

total_reward = 0
for _ in range(episodes):
    done = False
    state = env.reset()
    while not done:
        # Reshape to a single-row batch before querying the agent.
        batched_state = np.reshape(state, (1, state_size))
        action = agent.act(batched_state, sess)
        state, reward, done, _ = env.step(action)
        total_reward += reward

print(f"Results after {episodes} episodes:")
print(f"Average Reward per episode: {total_reward / episodes}")

Results after 100 episodes:
Average Reward per episode: 442.69


In [6]:
# Release the TensorFlow session created in the training cell.
sess.close()