# Building an Agent to Play Atari games using Deep Q Network

First, we import all the necessary libraries.


In [1]:
import os
import pickle
import random
import time
from collections import deque, Counter
from datetime import datetime

import gym
import numpy as np
import tensorflow as tf
from tensorflow.contrib.layers import flatten, conv2d, fully_connected

Now we define a function called preprocess_observation for preprocessing our input game screen. We reduce the image size
and convert the image into greyscale.

In [2]:
# Mean of one specific RGB colour; greyscale pixels equal to it are blanked out below
color = np.array([210, 164, 74]).mean()

def preprocess_observation(obs):
    """Turn a raw RGB game frame into a small single-channel image.

    Args:
        obs: array indexed as (row, column, channel).

    Returns:
        Array of shape (88, 80, 1) with values scaled by 1/255.
    """
    # Keep rows 20..195 and take every second pixel along both axes
    cropped = obs[20:196:2, ::2]

    # Average the colour channels to obtain a greyscale image (this creates a new array)
    grey = cropped.mean(axis=2)

    # Improve contrast: pixels whose grey level equals `color` are set to 0
    grey[grey == color] = 0

    # Scale by 1/255 (0..1 for 8-bit input) and add a trailing channel axis
    return (grey / 255).reshape(88, 80, 1)

 Let us initialize our gym environment

In [3]:
# Create the Space Invaders environment through gym
env = gym.make("SpaceInvaders-v0")
# Size of the action space; used as the width of the Q-network output layer
n_outputs = env.action_space.n

Now we define a function called q_network for building our Q network. We input the game state
to the Q network and get the Q values for all the actions in that state. <br><br>
We build the Q network with three convolutional layers with same padding, followed by a fully connected hidden layer and a fully connected output layer. 

In [4]:
tf.reset_default_graph()

def q_network(X, name_scope):
    """Build a convolutional Q-network inside the variable scope `name_scope`.

    Args:
        X: input tensor holding a batch of game states.
        name_scope: name of the variable scope that will own the network's variables.

    Returns:
        A tuple `(params, q_values)`: `params` maps each trainable variable's name
        (with the scope-name prefix stripped) to the variable, and `q_values` is the
        output tensor with one value per action (`n_outputs` columns).
    """
    # One variance-scaling initializer shared by every layer
    weight_init = tf.contrib.layers.variance_scaling_initializer()

    with tf.variable_scope(name_scope) as scope:

        # Three SAME-padded convolutions; each activation is logged as a histogram summary
        conv_a = conv2d(X, num_outputs=32, kernel_size=(8,8), stride=4, padding='SAME', weights_initializer=weight_init)
        tf.summary.histogram('layer_1', conv_a)

        conv_b = conv2d(conv_a, num_outputs=64, kernel_size=(4,4), stride=2, padding='SAME', weights_initializer=weight_init)
        tf.summary.histogram('layer_2', conv_b)

        conv_c = conv2d(conv_b, num_outputs=64, kernel_size=(3,3), stride=1, padding='SAME', weights_initializer=weight_init)
        tf.summary.histogram('layer_3', conv_c)

        # Flatten the last feature map and feed it to the hidden fully connected layer
        hidden = fully_connected(flatten(conv_c), num_outputs=128, weights_initializer=weight_init)
        tf.summary.histogram('fc', hidden)

        # Output layer without activation: one Q value per action
        q_values = fully_connected(hidden, num_outputs=n_outputs, activation_fn=None, weights_initializer=weight_init)
        print(q_values.name)
        tf.summary.histogram('output', q_values)

        # Collect this scope's trainable variables, keyed by name without the scope prefix
        prefix_len = len(scope.name)
        trainables = tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
        params = {}
        for variable in trainables:
            params[variable.name[prefix_len:]] = variable
        return params, q_values

Next we define a function called epsilon_greedy for performing epsilon greedy policy. In epsilon greedy policy we either select the best action with probability 1 - epsilon or a random action with
probability epsilon.

We use decaying epsilon greedy policy where value of epsilon will be decaying over time as we don't want to explore
forever. So over time our policy will be exploiting only good actions.

In [5]:
# NOTE: epsilon_greedy computes its own local `epsilon` and never reads this
# module-level value; it is kept only so other cells that reference it keep working.
epsilon = 0.5
# Lower and upper bounds of the exploration rate
eps_min = 0.05
eps_max = 1.0
# Number of steps over which epsilon decays linearly from eps_max to eps_min
eps_decay_steps = 720000

def epsilon_greedy(action, step):
    """Pick an action with a linearly decaying epsilon-greedy policy.

    Epsilon starts at `eps_max` for step 0, decreases linearly with `step` and is
    clamped at `eps_min` once `step` reaches `eps_decay_steps`.

    Args:
        action: the greedy action proposed by the network.
        step: number of steps taken so far (drives the decay).

    Returns:
        A random action index in [0, n_outputs) with probability epsilon,
        otherwise `action` unchanged.
    """
    epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step/eps_decay_steps)
    # A single random draw decides between exploring and exploiting
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs)
    return action

Now, we initialize our experience replay buffer of length 20000 which holds the experience.

We store each of the agent's experiences, i.e. (state, action, next state, reward, done), in the experience replay buffer
and we sample a minibatch of experiences from this buffer for training the network.

In [6]:
# Maximum number of transitions kept in the replay buffer
buffer_len = 20000
# A deque with maxlen discards its oldest entry automatically once it is full
exp_buffer = deque(maxlen=buffer_len)

Next, we define a function called sample_memories for sampling experiences from the memory. Batch size is the number of experience sampled
from the memory.


In [7]:
def sample_memories(batch_size):
    """Draw a random minibatch (without replacement) from the replay buffer.

    Only the sampled transitions are gathered, instead of converting the whole
    buffer to an array on every call.

    Args:
        batch_size: maximum number of transitions to return; fewer are returned
            when the buffer holds fewer entries.

    Returns:
        A tuple of five 1-D object arrays, one per field of the stored
        transitions, in the order the fields were stored.
    """
    # Same random draw as before: a permutation of buffer positions, truncated to the batch
    picked = np.random.permutation(len(exp_buffer))[:batch_size]
    batch = [exp_buffer[i] for i in picked]

    columns = []
    for field in range(5):
        # Explicit object dtype keeps every element as stored (arrays stay arrays,
        # scalars stay scalars) and avoids implicit ragged-array conversion
        column = np.empty(len(batch), dtype=object)
        for row, experience in enumerate(batch):
            column[row] = experience[field]
        columns.append(column)
    return tuple(columns)

Now we define our network hyperparameters:

In [8]:
# Number of episodes to play
num_episodes = 1000
# Number of transitions sampled from the replay buffer per training step
batch_size = 64
# NOTE(review): same value as X_shape and not referenced in the cells shown here — confirm it is still needed
input_shape = (None, 88, 80, 1)
# Learning rate handed to the optimizer
learning_rate = 0.00025
# Shape of the state placeholder: any batch size of 88x80x1 preprocessed frames
X_shape = (None, 88, 80, 1)
# Discount applied to the bootstrapped next-state value in the training target
discount_factor = 0.9

# Count of environment steps taken so far, across all episodes
global_step = 0
# Interval, in steps, between runs of the network weight-copy op
copy_steps = 100
# A training update is run every `steps_train` steps
steps_train = 4
# Training (and weight copying) only starts after this many steps
start_steps = 2000

 Now let us build our primary and target Q network

In [9]:
# Directory where the summary event files are written
logdir = 'logs'

# Placeholder for a batch of preprocessed game states
X = tf.placeholder(tf.float32, shape=X_shape, name="X")

# Boolean placeholder that the training loop feeds on every run.
# NOTE(review): nothing in this cell connects it to either network — confirm whether it is still needed
in_training_mode = tf.placeholder(tf.bool, name="in_training_mode")

# Build the first Q network; it maps the input X to Q values for all the actions
mainQ, mainQ_outputs = q_network(X, 'mainQ')

# Build a second network with the same architecture under its own scope
targetQ, targetQ_outputs = q_network(X, 'targetQ')

# Placeholder for the index of the action taken in each sampled transition
X_action = tf.placeholder(tf.int32, shape=(None,))
# Pick Q(s, a) for the taken action by masking with a one-hot vector; the result has
# shape (batch, 1) to match `y`. `keepdims` replaces the deprecated `keep_dims` argument.
# NOTE(review): the loss is built from targetQ_outputs, so gradient updates go to the
# 'targetQ' variables, while 'mainQ' only changes through the copy op below. The roles are
# therefore the reverse of what the scope names suggest — confirm this is intended.
Q_action = tf.reduce_sum(targetQ_outputs * tf.one_hot(X_action, n_outputs), axis=-1, keepdims=True)

# Assign each 'targetQ' variable's value to the matching 'mainQ' variable (target -> main)
copy_op = [tf.assign(main_var, targetQ[var_name]) for var_name, main_var in mainQ.items()]
copy_target_to_main = tf.group(*copy_op)

# Placeholder for the training targets, one value per sampled transition, shape (batch, 1)
y = tf.placeholder(tf.float32, shape=(None,1))

# Mean squared error between the training targets and the selected Q values
loss = tf.reduce_mean(tf.square(y - Q_action))

# Adam optimizer minimizing the loss
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()

# Summaries: the scalar loss plus every histogram registered inside q_network
loss_summary = tf.summary.scalar('LOSS', loss)
merge_summary = tf.summary.merge_all()
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.flatten instead.
mainQ/fully_connected_1/BiasAdd:0
targetQ/fully_connected_1/BiasAdd:0
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Use tf.cast instead.


 Now we start the tensorflow session and run the model,

In [None]:
with tf.Session() as sess:
    saver = tf.train.Saver()
    init.run()
    rewards = []

    # Create the output directories up front so the first checkpoint / reward dump cannot
    # fail on a missing folder after an episode has already been played
    os.makedirs('./models', exist_ok=True)
    os.makedirs('./rewards', exist_ok=True)

    # for each episode
    for i in range(num_episodes):
        done = False
        # `obs` always holds the *preprocessed* current frame; each frame is preprocessed once
        obs = preprocess_observation(env.reset())
        epoch = 0
        episodic_reward = 0
        actions_counter = Counter()
        episodic_loss = []

        # while the state is not the terminal state
        while not done:

            # feed the game screen and get the Q values for each action
            actions = mainQ_outputs.eval(feed_dict={X:[obs], in_training_mode:False})

            # greedy action as a plain Python int (the batch holds exactly one state),
            # so env.step and the replay buffer always receive a scalar action
            action = int(np.argmax(actions, axis=-1)[0])
            actions_counter[str(action)] += 1

            # select the action using epsilon greedy policy
            action = epsilon_greedy(action, global_step)

            # now perform the action and move to the next state, next_obs, receive reward
            next_obs, reward, done, _ = env.step(action)
            next_obs = preprocess_observation(next_obs)

            # Store this transition as an experience in the replay buffer
            exp_buffer.append([obs, action, next_obs, reward, done])

            # After certain steps, we train our Q network with samples from the experience replay buffer
            if global_step % steps_train == 0 and global_step > start_steps:

                # sample experience
                o_obs, o_act, o_next_obs, o_rew, o_done = sample_memories(batch_size)

                # states and next states as plain lists so they can be fed as a batch
                o_obs = list(o_obs)
                o_next_obs = list(o_next_obs)

                # Q values of the next states, used to bootstrap the training target
                next_act = mainQ_outputs.eval(feed_dict={X:o_next_obs, in_training_mode:False})

                # target: reward plus the discounted best next-state value (zeroed when the episode ended)
                y_batch = o_rew + discount_factor * np.max(next_act, axis=-1) * (1-o_done)
                train_feed = {X:o_obs, y:np.expand_dims(y_batch, axis=-1), X_action:o_act}

                # merge all summaries and write to the file
                summary_feed = dict(train_feed)
                summary_feed[in_training_mode] = False
                mrg_summary = merge_summary.eval(feed_dict=summary_feed)
                file_writer.add_summary(mrg_summary, global_step)

                # now we train the network and calculate loss
                train_feed[in_training_mode] = True
                train_loss, _ = sess.run([loss, training_op], feed_dict=train_feed)
                episodic_loss.append(train_loss)

            # after some interval, run the weight-copy op defined with the graph
            if (global_step+1) % copy_steps == 0 and global_step > start_steps:
                copy_target_to_main.run()

            obs = next_obs
            epoch += 1
            global_step += 1
            episodic_reward += reward
        print('Episode', i,'Epoch', epoch, 'Reward', episodic_reward, 'Global Step', global_step)
        rewards.append(episodic_reward)
        if i % 50 == 0:
            saver.save(sess, './models/trained-model-'+ str(i))
            # `with` closes the file so the pickle is fully written to disk
            with open('./rewards/trained-model-'+str(i)+'.pck', 'wb+') as rewards_file:
                pickle.dump(rewards, rewards_file)
    saver.save(sess, './trained-model')
    with open('./rewards.pck', 'wb+') as rewards_file:
        pickle.dump(rewards, rewards_file)
    # push any buffered summary events to disk before finishing
    file_writer.flush()
    env.close()
    
    

Episode 0 Epoch 723 Reward 120.0 Global Step 723
Episode 1 Epoch 698 Reward 155.0 Global Step 1421
Episode 2 Epoch 542 Reward 65.0 Global Step 1963
Episode 3 Epoch 392 Reward 35.0 Global Step 2355
Episode 4 Epoch 425 Reward 65.0 Global Step 2780
Episode 5 Epoch 463 Reward 35.0 Global Step 3243
Episode 6 Epoch 736 Reward 140.0 Global Step 3979
Episode 7 Epoch 628 Reward 210.0 Global Step 4607
Episode 8 Epoch 595 Reward 80.0 Global Step 5202
Episode 9 Epoch 663 Reward 180.0 Global Step 5865
Episode 10 Epoch 684 Reward 140.0 Global Step 6549
Episode 11 Epoch 1209 Reward 445.0 Global Step 7758
Episode 12 Epoch 686 Reward 410.0 Global Step 8444
Episode 13 Epoch 841 Reward 165.0 Global Step 9285
Episode 14 Epoch 637 Reward 110.0 Global Step 9922
Episode 15 Epoch 411 Reward 100.0 Global Step 10333
Episode 16 Epoch 634 Reward 80.0 Global Step 10967
Episode 17 Epoch 636 Reward 85.0 Global Step 11603
Episode 18 Epoch 641 Reward 80.0 Global Step 12244
Episode 19 Epoch 1050 Reward 490.0 Global Ste

Episode 158 Epoch 813 Reward 230.0 Global Step 108062
Episode 159 Epoch 991 Reward 345.0 Global Step 109053
Episode 160 Epoch 632 Reward 135.0 Global Step 109685
Episode 161 Epoch 868 Reward 440.0 Global Step 110553
Episode 162 Epoch 657 Reward 105.0 Global Step 111210
Episode 163 Epoch 492 Reward 60.0 Global Step 111702
Episode 164 Epoch 498 Reward 60.0 Global Step 112200
Episode 165 Epoch 653 Reward 80.0 Global Step 112853
Episode 166 Epoch 700 Reward 335.0 Global Step 113553
Episode 167 Epoch 920 Reward 180.0 Global Step 114473
Episode 168 Epoch 815 Reward 185.0 Global Step 115288
Episode 169 Epoch 783 Reward 155.0 Global Step 116071
Episode 170 Epoch 607 Reward 105.0 Global Step 116678
Episode 171 Epoch 639 Reward 120.0 Global Step 117317
Episode 172 Epoch 635 Reward 105.0 Global Step 117952
Episode 173 Epoch 831 Reward 180.0 Global Step 118783
Episode 174 Epoch 526 Reward 80.0 Global Step 119309
Episode 175 Epoch 703 Reward 80.0 Global Step 120012
Episode 176 Epoch 963 Reward 110.

Episode 305 Epoch 820 Reward 225.0 Global Step 211934
Episode 306 Epoch 721 Reward 170.0 Global Step 212655
Episode 307 Epoch 457 Reward 25.0 Global Step 213112
Episode 308 Epoch 398 Reward 50.0 Global Step 213510
Episode 309 Epoch 808 Reward 130.0 Global Step 214318
Episode 310 Epoch 759 Reward 210.0 Global Step 215077
Episode 311 Epoch 683 Reward 85.0 Global Step 215760
Episode 312 Epoch 700 Reward 135.0 Global Step 216460
Episode 313 Epoch 802 Reward 285.0 Global Step 217262
Episode 314 Epoch 674 Reward 110.0 Global Step 217936
Episode 315 Epoch 568 Reward 70.0 Global Step 218504
Episode 316 Epoch 1170 Reward 465.0 Global Step 219674
Episode 317 Epoch 497 Reward 105.0 Global Step 220171
Episode 318 Epoch 441 Reward 65.0 Global Step 220612
Episode 319 Epoch 634 Reward 155.0 Global Step 221246
Episode 320 Epoch 369 Reward 50.0 Global Step 221615
Episode 321 Epoch 437 Reward 75.0 Global Step 222052
Episode 322 Epoch 968 Reward 240.0 Global Step 223020
Episode 323 Epoch 1099 Reward 290.

Episode 458 Epoch 806 Reward 325.0 Global Step 316372
Episode 459 Epoch 806 Reward 215.0 Global Step 317178
Episode 460 Epoch 648 Reward 70.0 Global Step 317826
Episode 461 Epoch 330 Reward 35.0 Global Step 318156
Episode 462 Epoch 673 Reward 125.0 Global Step 318829
Episode 463 Epoch 409 Reward 70.0 Global Step 319238
Episode 464 Epoch 956 Reward 185.0 Global Step 320194
Episode 465 Epoch 904 Reward 130.0 Global Step 321098
Episode 466 Epoch 877 Reward 175.0 Global Step 321975
Episode 467 Epoch 639 Reward 105.0 Global Step 322614
Episode 468 Epoch 953 Reward 260.0 Global Step 323567
Episode 469 Epoch 1444 Reward 410.0 Global Step 325011
Episode 470 Epoch 787 Reward 215.0 Global Step 325798
Episode 471 Epoch 797 Reward 135.0 Global Step 326595
Episode 472 Epoch 583 Reward 30.0 Global Step 327178
Episode 473 Epoch 461 Reward 50.0 Global Step 327639
Episode 474 Epoch 464 Reward 125.0 Global Step 328103
Episode 475 Epoch 838 Reward 270.0 Global Step 328941
Episode 476 Epoch 611 Reward 105

Episode 611 Epoch 1301 Reward 360.0 Global Step 427326
Episode 612 Epoch 950 Reward 200.0 Global Step 428276
Episode 613 Epoch 750 Reward 200.0 Global Step 429026
Episode 614 Epoch 608 Reward 125.0 Global Step 429634
Episode 615 Epoch 414 Reward 35.0 Global Step 430048
Episode 616 Epoch 696 Reward 135.0 Global Step 430744
Episode 617 Epoch 395 Reward 55.0 Global Step 431139
Episode 618 Epoch 408 Reward 65.0 Global Step 431547
Episode 619 Epoch 373 Reward 60.0 Global Step 431920
Episode 620 Epoch 984 Reward 210.0 Global Step 432904
Episode 621 Epoch 621 Reward 115.0 Global Step 433525
Episode 622 Epoch 949 Reward 200.0 Global Step 434474
Episode 623 Epoch 818 Reward 205.0 Global Step 435292
Episode 624 Epoch 557 Reward 65.0 Global Step 435849
Episode 625 Epoch 505 Reward 65.0 Global Step 436354
Episode 626 Epoch 986 Reward 275.0 Global Step 437340
Episode 627 Epoch 442 Reward 50.0 Global Step 437782
Episode 628 Epoch 946 Reward 190.0 Global Step 438728
Episode 629 Epoch 352 Reward 30.0 

Episode 764 Epoch 808 Reward 185.0 Global Step 541470
Episode 765 Epoch 699 Reward 75.0 Global Step 542169
Episode 766 Epoch 831 Reward 175.0 Global Step 543000
Episode 767 Epoch 795 Reward 155.0 Global Step 543795
Episode 768 Epoch 1056 Reward 255.0 Global Step 544851
Episode 769 Epoch 804 Reward 210.0 Global Step 545655
Episode 770 Epoch 759 Reward 115.0 Global Step 546414
Episode 771 Epoch 662 Reward 120.0 Global Step 547076
Episode 772 Epoch 1265 Reward 390.0 Global Step 548341
Episode 773 Epoch 817 Reward 210.0 Global Step 549158
Episode 774 Epoch 958 Reward 285.0 Global Step 550116
Episode 775 Epoch 678 Reward 60.0 Global Step 550794
Episode 776 Epoch 683 Reward 110.0 Global Step 551477
Episode 777 Epoch 805 Reward 180.0 Global Step 552282
Episode 778 Epoch 862 Reward 210.0 Global Step 553144
Episode 779 Epoch 1411 Reward 250.0 Global Step 554555
Episode 780 Epoch 989 Reward 260.0 Global Step 555544
Episode 781 Epoch 622 Reward 110.0 Global Step 556166
Episode 782 Epoch 1079 Rewa

In [None]:
# Sanity check: evaluates to True when the placeholder X belongs to the current default graph
X.graph == tf.get_default_graph()