# Space Invaders with DQL 

In [1]:
import tensorflow as tf
import numpy as np
import gym

from skimage import transform
from skimage.color import rgb2gray
import matplotlib.pyplot as plt
from collections import deque
import random
import warnings

#warnings.filterwarnings('ignore')  #ignores skimage warnings during training

In [2]:
env = gym.make('SpaceInvaders-v0')
print("The size of our frame is: ", env.observation_space)
print("The action size is: ", env.action_space.n)
print("Action Meanings: ", env.get_action_meanings())
#One Hot encoded version of our actions
possible_actions = np.array(np.identity(env.action_space.n, dtype=int).tolist())
print("\nOH Actions: ", possible_actions)

The size of our frame is:  Box(210, 160, 3)
The action size is:  6
Action Meanings:  ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']

OH Actions:  [[1 0 0 0 0 0]
 [0 1 0 0 0 0]
 [0 0 1 0 0 0]
 [0 0 0 1 0 0]
 [0 0 0 0 1 0]
 [0 0 0 0 0 1]]


## Preprocessing
1. Grayscale
2. Crop 
3. Normalize pixel values
4. Resize the frame

In [3]:
def preprocess_frame(frame):
    gray = rgb2gray(frame)
                 #[up: down, left:right]
    cropped_frame = gray[8:-12, 4:-12]  #crop the lower part; trim from the sides
    normalized_frame = cropped_frame / 255.0
    preprocessed_frame = transform.resize(normalized_frame, [110, 84])
    return preprocessed_frame  #110x84x1 frame

## Stacking frames
We skip 4 frames each timestep i.e. we only stack every fourth frame.

In [4]:
stack_size = 4

#Initialize deque with zero-images. One array for each image.
stacked_frames = deque([np.zeros((110, 84), dtype=np.int) for i in range(stack_size)], maxlen=4)

def stack_frames(stacked_frames, state, is_new_episode):
    frame = preprocess_frame(state)
    
    if is_new_episode:
        #Clear all old frames
        stacked_frames = deque([np.zeros((110, 84), dtype=np.int) for i in range(stack_size)], maxlen=4)
        
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        
        #Stack 'em
        stacked_state = np.stack(stacked_frames, axis=2)
    else:
        stacked_frames.append(frame)
        stacked_state = np.stack(stacked_frames, axis=2)
    
    return stacked_state, stacked_frames


## Hyperparameters

In [6]:
# Model Hyperparams
state_size = [110, 84, 4]
action_size = env.action_space.n  #6 in our case
learning_rate = 0.00025

#Training Hyperparams
total_episodes = 50
max_steps = 50000
batch_size = 64

#Policy params
explore_start = 1.0
explore_stop = 0.01
decay_rate = 1e-5
gamma = 0.9  #for QL

pretrain_length = batch_size  #No. of experiences stored in the memory when initialized at the first time
memory_size = 1000000
stack_size = 4
training = True #False if you wanna see trained agent in action
episode_render = False

 ## Deep Q net
 
 Stack of 4 frames as input, 3 conv layers, flatten, 2 FC
 
 Output a Q Value for each action

In [7]:
class DQNetwork:
    def __init__(self, state_size, action_size, learning_rate, name='DQNetwork'):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        
        with tf.variable_scope(name):
            self.inputs_ = tf.placeholder(tf.float32, [None, *state_size], name='inputs') #*state_size (star) indicates it is a tuple
            self.actions_ = tf.placeholder(tf.float32, [None, self.action_size], name='actions_')
            self.target_Q = tf.placeholder(tf.float32, [None], name='target')
            
            #ConvNet
            self.conv1 = tf.layers.conv2d(inputs=self.inputs_, filters=32, kernel_size=[8, 8], 
                                          strides=[4, 4], padding='VALID', 
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(), name='conv1')
            self.conv1_out = tf.nn.elu(self.conv1, name='conv1_out')
            
            self.conv2 = tf.layers.conv2d(inputs=self.conv1_out, filters=64, kernel_size=[4, 4],
                                         strides=[2, 2], padding='VALID',
                                         kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(), name='conv2')
            self.conv2_out = tf.nn.elu(self.conv2, name='conv2_out')
            
            self.conv3 = tf.layers.conv2d(inputs=self.conv2_out, filters= 64, kernel_size=[3, 3], 
                                         strides=[2, 2], padding='VALID', 
                                         kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(), name='conv3')
            self.conv3_out = tf.nn.elu(self.conv3, name='conv3_out')

            self.flatten = tf.contrib.layers.flatten(self.conv3_out)
            
            self.fc = tf.layers.dense(inputs=self.flatten, units=512, activation=tf.nn.elu, kernel_initializer=tf.contrib.layers.xavier_initializer(), name='fc1')
            self.output = tf.layers.dense(inputs=self.fc, kernel_initializer=tf.contrib.layers.xavier_initializer(), units=self.action_size, activation=None)
            
            #Q is the predicted value
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_))
            #Loss is the difference between predicted vs. target Q values
            self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))
            
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
            

In [7]:
tf.reset_default_graph()
#Instantiate the Network
DQNetwork = DQNetwork(state_size, action_size, learning_rate)


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.conv2d instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use keras.layers.dense instead.


## Experience Replay

In [8]:
class Memory():
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)
    def add(self, experience):
        self.buffer.append(experience)
    def sample(self, batch_size):
        buffer_size = len(self.buffer)
        index = np.random.choice(np.arange(buffer_size), size=batch_size, replace=False)
        return [self.buffer[i] for i in index]

### Empty memory problem
We pre-populate our memory by taking random, actions and storing experience

In [13]:
memory = Memory(max_size=memory_size)
for i in range(pretrain_length):  #batch_size
    if i == 0:
        state = env.reset()
        state, stacked_frames = stack_frames(stacked_frames, state, True)
    
    choice = random.randint(1, len(possible_actions)) - 1
    action = possible_actions[choice]
    #action = choice
    next_state, reward, done, _ = env.step(choice)
    
    #env.render()
    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
    
    if done:
        #if episode is over then there is no next state
        next_state = np.zeros(state.shape)
        #add episode to memory when it finishes
        memory.add((state, action, reward, next_state, done))
        #start a new episode
        state = env.reset()
        state, stacked_frames = stack_frames(stacked_frames, state, True)
    else:
        #if episode is still continuing
        memory.add((state, action, reward, next_state, done))
        #new state is next state
        state = next_state

## Tensorboard

In [10]:
writer = tf.summary.FileWriter('./tensorboard/dql/1')
tf.summary.scalar('Loss', DQNetwork.loss)
write_op = tf.summary.merge_all()

## Let's train now

























In [14]:
def predict_action(explore_start, explore_stop, decay_rate, decay_step, state, actions):
    exp_tradeoff = np.random.rand()
    
    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)
    
    if explore_probability > exp_tradeoff:
        #make a random choice [EXPLORE]
        choice = random.randint(1, len(possible_actions)) - 1
        action = possible_actions[choice]
        #action = choice
    else: #[EXPLOIT]
        Qs = sess.run(DQNetwork.output, feed_dict={DQNetwork.inputs_:state.reshape((1, *state.shape))})

        choice = np.argmax(Qs)
        action = possible_actions[choice]
        #action = choice
        
    return action, explore_probability

def predict_action_e_greedy(state, actions):
    epsilon = 0.2
    if np.random.rand() < epsilon:#EXPLORE
        choice = randon.randint(1, len(possible_actions)) - 1 #0-5
        action = possible_actions[choice]
    else:#EXPLOIT
        Qs = sess.run(DQNetwork.output, feed_dict={DQNetwork.inputs_:state.reshape((1, *state.shape))})
        choice = np.argmax(Qs)
        action = possible_actions[choice]

In [None]:
saver = tf.train.Saver()


if training == True:
    with tf.Session() as sess:
        #Initialize the variables
        sess.run(tf.global_variables_initializer())
        decay_step = 0
        rewards_list = []
        
        for episode in range(total_episodes):
            step = 0
            
            episode_rewards = []
            state = env.reset()
            #Process the state
            state, stacked_frames = stack_frames(stacked_frames, state, True)
            
            while step < max_steps:
                step += 1
                decay_step += 1
                
                action, explore_probability = predict_action(explore_start, explore_stop, decay_rate, decay_step, state, possible_actions)
                next_state, reward, done, _ = env.step(np.argmax(action))
                
                if episode_render:
                    env.render()
                episode_rewards.append(reward)
                
                if done:
                    next_state = np.zeros((110, 84), dtype=np.int)
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    
                    #set step=max_steps to end the episode i.e end while loop
                    step = max_steps
                    total_reward = np.sum(episode_rewards)
                    
                    print('Episode: {}'.format(episode), 'Total reward: {}'.format(total_reward),
                                  'Explore P: {:.4f}'.format(explore_probability),
                                'Training Loss {:.4f}'.format(loss))
                    rewards_list.append((episode, total_reward))
                    # Store transition <st,at,rt+1,st+1> in memory D
                    memory.add((state, action, reward, next_state, done))
                    
                else:
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    # Add experience to memory
                    memory.add((state, action, reward, next_state, done))
                    state = next_state
                    
                #LEARNING
                batch = memory.sample(batch_size)
                states_mb = np.array([each[0] for each in batch], ndmin=3)
                actions_mb = np.array([each[1] for each in batch])
                rewards_mb = np.array([each[2] for each in batch]) 
                next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                dones_mb = np.array([each[4] for each in batch])

                target_Qs_batch = []
                #Get Q values for next_state 
                Qs_next_state = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: next_states_mb})
                
                #Set Q_target = r if the episode ends at s+1, otherwise set Q_target = r + gamma*maxQ(s', a')
                for i in range(0, len(batch)):
                    terminal = dones_mb[i]

                    # If we are in a terminal state, only equals reward
                    if terminal:
                        target_Qs_batch.append(rewards_mb[i])
                        
                    else:
                        target = rewards_mb[i] + gamma * np.max(Qs_next_state[i])
                        target_Qs_batch.append(target)
                        

                targets_mb = np.array([each for each in target_Qs_batch])

                loss, _ = sess.run([DQNetwork.loss, DQNetwork.optimizer],
                                        feed_dict={DQNetwork.inputs_: states_mb,
                                                   DQNetwork.target_Q: targets_mb,
                                                   DQNetwork.actions_: actions_mb})
                # Write TF Summaries
                summary = sess.run(write_op, feed_dict={DQNetwork.inputs_: states_mb,
                                                       DQNetwork.target_Q: targets_mb,
                                                       DQNetwork.actions_: actions_mb})
                writer.add_summary(summary, episode)
                writer.flush()
                
            # Save model every 5 episodes
            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/model.ckpt")
                print("Model Saved")


Episode: 0 Total reward: 210.0 Explore P: 0.9932 Training Loss 0.0993
Model Saved
Episode: 1 Total reward: 110.0 Explore P: 0.9858 Training Loss 0.4786
Episode: 2 Total reward: 120.0 Explore P: 0.9797 Training Loss 1.9359
Episode: 3 Total reward: 180.0 Explore P: 0.9726 Training Loss 1.8997
Episode: 4 Total reward: 185.0 Explore P: 0.9658 Training Loss 9.6552
Episode: 5 Total reward: 105.0 Explore P: 0.9601 Training Loss 1.5480
Model Saved
Episode: 6 Total reward: 315.0 Explore P: 0.9467 Training Loss 3.4678
Episode: 7 Total reward: 230.0 Explore P: 0.9380 Training Loss 625.0986
Episode: 8 Total reward: 380.0 Explore P: 0.9268 Training Loss 0.0368
Episode: 9 Total reward: 150.0 Explore P: 0.9208 Training Loss 18.5401
Episode: 10 Total reward: 110.0 Explore P: 0.9152 Training Loss 0.1006
Model Saved
Episode: 11 Total reward: 60.0 Explore P: 0.9104 Training Loss 0.0327
Episode: 12 Total reward: 55.0 Explore P: 0.9058 Training Loss 11.1173
Episode: 13 Total reward: 120.0 Explore P: 0.8990

# Testing our agent, watch it play!

In [21]:
with tf.Session() as sess:
    total_test_rewards = []
    saver.restore(sess, "./models/model.ckpt")
    
    for episode in range(1):
        total_rewards = 0
        state = env.reset()
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        print("************************************************\n")
        print("EPISODE: ", episode)
        
        while True:
            #Reshape the state
            state = state.reshape((1, *state_size))
            #Get action from Qnet
            Qs = sess.run(DQNetwork.output, feed_dict={DQNetwork.inputs_:state})
            
            #Take the biggest Q value
            choice = np.argmax(Qs)
            action = possible_actions[choice]
            
            next_state, reward, done, _ = env.step(np.argmax(action))
            env.render()
            
            total_rewards += reward
            
            if done:
                print ("Score", total_rewards)
                total_test_rewards.append(total_rewards)
                break
                 
            next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
            state = next_state
            
    env.close()
                

INFO:tensorflow:Restoring parameters from ./models/model.ckpt
************************************************

EPISODE:  0
Score 270.0
