In [0]:
# Reinforcement Learning
# Deep Q Learning for Atari Games

In [15]:
# installing retro:
!apt-get install pkg-config lua5.1 build-essential libav-tools git

!pip install tqdm retrowrapper gym-retro
!pip install -U git+git://github.com/frenchie4111/dumbrain.git

Reading package lists... 0%Reading package lists... 0%Reading package lists... 0%Reading package lists... 8%Reading package lists... 8%Reading package lists... 8%Reading package lists... 8%Reading package lists... 70%Reading package lists... 75%Reading package lists... 75%Reading package lists... 76%Reading package lists... 76%Reading package lists... 81%Reading package lists... 81%Reading package lists... 82%Reading package lists... 82%Reading package lists... 90%Reading package lists... 90%Reading package lists... 90%Reading package lists... 90%Reading package lists... 90%Reading package lists... 90%Reading package lists... 90%Reading package lists... 90%Reading package lists... 92%Reading package lists... 92%Reading package lists... 92%Reading package lists... 92%Reading package lists... 93%Reading package lists... 93%Reading package lists... 93%Reading package lists... 93%Reading package lists... 94%Reading package 

In [0]:
# import libraries:
import tensorflow as tf      # Deep Learning Framework
import numpy as np           # Handling Matrics
import retro                 # Retro Environment

from skimage import transform
from skimage.color import rgb2grey

import matplotlib.pyplot as plt

from collections import deque

import random
import warnings

warnings.filterwarnings('ignore')

In [17]:
# Download the Sonic ROM archive and import the games into gym-retro
# (the environment itself is created in a later cell):
!python -m dumbrain.rl.retro_contest.install_games http://aiml.mikelyons.org/datasets/sonic/Sonic%20Roms.zip 

Namespace(download_url='http://aiml.mikelyons.org/datasets/sonic/Sonic%20Roms.zip', romdir='data/roms/')
3694592it [00:00, 6669823.91it/s]                 
100% 5767168/5767168 [00:00<00:00, 92112350.10it/s]
Importing SonicAndKnuckles3-Genesis
Importing SonicTheHedgehog2-Genesis
Importing SonicTheHedgehog-Genesis
Imported 3 games


In [0]:
# Close any environment left over from an earlier run of this notebook.
# On a fresh kernel `env` does not exist yet, so skip the call instead of
# failing with NameError (keeps Restart & Run All working).
if 'env' in globals():
  env.close()

In [19]:
# Build the Sonic environment, then report its observation and action dimensions:
env = retro.make('SonicTheHedgehog-Genesis')

print('The Size of our Frame is : ', env.observation_space)
print('The action_size is : ', env.action_space.n)


The Size of our Frame is :  Box(224, 320, 3)
The action_size is :  12


In [20]:
# One-hot action matrix: row i is the action vector with a 1 only in position i.
possible_actions = np.eye(env.action_space.n, dtype=int)
print(possible_actions)

[[1 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 1]]


In [0]:
# Define the Pre-Processing Function:
# Grayscale each of our frames (because color does not add important information ).
# We normalize pixel values
# Finally we resize the preprocessed frame

In [0]:
def preprocess_frame(frame):
  """Convert a raw game frame into a small greyscale image.

  Args:
    frame: frame returned by the environment
      (assumed to be a uint8 RGB array - TODO confirm).

  Returns:
    A 110x84 array of grey levels.
  """
  # rgb2grey rescales integer RGB images to floats in [0, 1] on its own, so no
  # extra division by 255 is applied: dividing again would squash every pixel
  # into [0, 1/255] and leave the network almost no contrast to learn from.
  grey = rgb2grey(frame)

  return transform.resize(grey, [110, 84])

In [0]:
# Stacking Frames:

# First we preprocess frame
# Then we append the frame to the deque that automatically removes the oldest frame
# Finally we build the stacked state

stack_size = 4     # number of consecutive frames stacked into one state

# Start with a deque of blank frames. maxlen follows stack_size (instead of a
# hard-coded 4) so the deque always holds exactly one state's worth of frames.
stacked_frames = deque([np.zeros((110,84),dtype=int) for _ in range(stack_size)],maxlen=stack_size)

def stack_frames(stacked_frames,state,is_new_episode):
  """Preprocess a raw frame and fold it into the rolling frame stack.

  Args:
    stacked_frames: deque holding the most recent preprocessed frames.
    state: raw frame from the environment.
    is_new_episode: when True the previous history is discarded and the stack
      is filled with copies of this frame.

  Returns:
    (stacked_state, stacked_frames): the frames stacked along a new last axis,
    and the deque to pass to the next call.
  """
  frame = preprocess_frame(state)

  if is_new_episode:
    # No history exists yet, so the first frame fills every slot. The size
    # comes from stack_size rather than a hard-coded 4.
    stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
  else:
    # A bounded deque drops its oldest frame automatically on append.
    stacked_frames.append(frame)

  stacked_state = np.stack(stacked_frames, axis=2)

  return stacked_state, stacked_frames

In [0]:
# Hyper-parameters, grouped by the part of the pipeline that consumes them.

# --- Model ---
state_size = [110, 84, 4]          # stack of 4 preprocessed frames: (height, width, channels)
action_size = env.action_space.n   # one network output per action (12 for this game)
learning_rate = 2.5e-4             # alpha, the optimizer step size

# --- Training ---
total_episodes = 50                # episodes to train for
max_steps = 50000                  # upper bound on steps within one episode
batch_size = 64                    # experiences per gradient update

# --- Epsilon-greedy exploration ---
explore_start = 1.0                # exploration probability at the start
explore_stop = 0.01                # floor for the exploration probability
decay_rate = 1e-5                  # exponential decay rate of the exploration probability

# --- Q-learning ---
gamma = 0.9                        # discount factor

# --- Replay memory ---
pretrain_length = batch_size       # experiences collected before training starts
memory_size = 1000000              # maximum number of experiences kept

# --- Pre-processing ---
stack_size = 4                     # frames per stacked state

# Set to False to skip training and only watch the agent.
training = True

# Set to True to render the environment while training.
episode_render = True

In [0]:
# Creating Deep Q Learning Neural Network:

# We take a stack of 4 frames as input
# It passes through 3 convolutional layers
# Then it is flattened
# Finally it passes through 2 fully connected (FC) layers
# It outputs a Q value for each action

In [0]:
class DQNetwork:
  """Convolutional Q-network: maps a stack of frames to one Q-value per action.

  Attributes read by the rest of the notebook:
    inputs_:   placeholder for a batch of states, shape [None, *state_size].
    actions_:  placeholder for the actions taken, shape [None, action_size]
               (one-hot rows).
    target_Q:  placeholder for the regression targets, shape [None].
    output:    predicted Q-values, one column per action.
    Q:         predicted Q-value of the action taken, one entry per sample.
    loss:      mean squared error between target_Q and Q.
    optimizer: Adam training op that minimises `loss`.
  """
  def __init__(self,state_size,action_size,learning_rate,name='DQNetwork'):
    self.state_size = state_size
    self.action_size = action_size
    self.learning_rate = learning_rate
    
    with tf.variable_scope(name):
      # *state_size unpacks the list, so the shape is [None, 110, 84, 4]
      # for the state_size used in this notebook.
      self.inputs_ = tf.placeholder(tf.float32,[None,*state_size],name='inputs_')
      self.actions_ = tf.placeholder(tf.float32,[None,action_size],name='actions_')
      
      # target_Q is R(s,a) + gamma * max_a' Qhat(s', a')
      self.target_Q = tf.placeholder(tf.float32,[None],name='target_Q')
      
      # First convolution block: conv -> ELU (Exponential Linear Unit)
      self.conv1 = tf.layers.conv2d(inputs = self.inputs_,
                                    filters = 32,
                                    kernel_size = [8,8],
                                    strides = [4,4],
                                    padding = "VALID",
                                    kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                    name = "conv1")
      
      self.conv1_out = tf.nn.elu(self.conv1, name="conv1_out")
      
      # Second convolution block: conv -> ELU
      self.conv2 = tf.layers.conv2d(inputs = self.conv1_out,
                                    filters = 64,
                                    kernel_size = [4,4],
                                    strides = [2,2],
                                    padding = "VALID",
                                    kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                    name = "conv2")
      
      self.conv2_out = tf.nn.elu(self.conv2,name="conv2_out")
      
      # Third convolution block: conv -> ELU
      self.conv3 = tf.layers.conv2d(inputs = self.conv2_out,
                                    filters = 64,
                                    kernel_size = [3,3],
                                    strides = [2,2],
                                    padding = "VALID",
                                    kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                    name = "conv3")
      
      self.conv3_out = tf.nn.elu(self.conv3,name="conv3_out")
      
      # Flatten the feature maps before the fully connected layers.
      self.flatten = tf.contrib.layers.flatten(self.conv3_out)
      
      # Hidden fully connected layer.
      self.fc = tf.layers.dense(inputs=self.flatten,
                                units=512,
                                activation=tf.nn.elu,
                                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                name="fc1")
      
      # Linear output layer: one Q-value per action.
      self.output = tf.layers.dense(inputs = self.fc,
                                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                    units = self.action_size,
                                    activation=None)
      
      # Q-value of the action actually taken. Multiplying by the action row
      # zeroes every other column; the sum must run over axis 1 only so that
      # each sample keeps its own value. Summing over all axes would merge the
      # whole batch into a single scalar and corrupt the loss.
      self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_), axis=1)
      
      # Mean squared error between the targets and the predicted Q-values.
      self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))
      
      self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

In [0]:
# Start from an empty graph so re-running this cell does not pile up duplicate ops.
tf.reset_default_graph()

# This cell rebinds the name `DQNetwork` from the class to the network instance
# (later cells read DQNetwork.output, DQNetwork.loss, ...). Recover the class
# first so the cell still works when it is executed a second time; calling the
# instance would otherwise raise TypeError.
_dqn_class = DQNetwork if isinstance(DQNetwork, type) else type(DQNetwork)
DQNetwork = _dqn_class(state_size,action_size,learning_rate)

In [0]:
# Experience Replay:

# Here we'll create the Memory object that wraps a bounded deque.
# A deque (double ended queue) created with a maxlen discards its oldest element whenever a new element is added to a full deque.

In [0]:
class Memory():
  """Fixed-capacity replay buffer of experiences."""
  def __init__(self,max_size):
    # A bounded deque discards its oldest entry once max_size is reached.
    self.buffer = deque(maxlen=max_size)
    
  def add(self,experience):
    """Append one experience to the buffer."""
    self.buffer.append(experience)
    
  def sample(self,batch_size):
    """Return batch_size distinct experiences chosen uniformly at random.

    Raises:
      ValueError: if batch_size exceeds the number of stored experiences
        (sampling is done without replacement).
    """
    buffer_size = len(self.buffer)
    # np.arange (the original spelling `np.arrange` does not exist in numpy).
    index = np.random.choice(np.arange(buffer_size),
                            size=batch_size,
                            replace=False)
    
    return [self.buffer[i] for i in index]

In [0]:
# Here we'll deal with the empty memory problem: 
# we pre-populate our memory by taking random actions and storing the experience (state, action, reward, next_state).

In [0]:
# Fill the replay memory with experiences gathered from random play:
memory = Memory(max_size=memory_size)

for pretrain_step in range(pretrain_length):
  if pretrain_step == 0:
    # very first step: start an episode and build the initial frame stack
    state = env.reset()
    state, stacked_frames = stack_frames(stacked_frames, state, True)

  # take a uniformly random action and observe the outcome
  choice = random.randrange(len(possible_actions))
  action = possible_actions[choice]
  next_state, reward, done, _ = env.step(action)

  # fold the new frame into the rolling stack
  next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

  if done:
    # terminal transition: the next state is recorded as an all-zero stack
    next_state = np.zeros(state.shape)

  memory.add((state, action, reward, next_state, done))

  if done:
    # begin a fresh episode with a freshly filled frame stack
    state = env.reset()
    state, stacked_frames = stack_frames(stacked_frames, state, True)
  else:
    # carry on from where this step ended
    state = next_state

In [0]:
# TensorBoard logging: an event-file writer plus a scalar summary of the training loss.
writer = tf.summary.FileWriter(logdir="/tensorboard/dqn/1")

tf.summary.scalar(name="Loss", tensor=DQNetwork.loss)

# single op that evaluates every summary registered so far
write_op = tf.summary.merge_all()

In [0]:
# Train our agent:


def predict_action(explore_start,explore_stop,decay_rate,decay_step,state,actions):
  """Pick an action with an epsilon-greedy policy.

  With probability epsilon a random action is selected; otherwise the action
  with the largest predicted Q-value is selected: a_t = argmax_a Q(s_t, a).

  Args:
    explore_start: exploration probability at decay_step 0.
    explore_stop: floor that the exploration probability decays towards.
    decay_rate: exponential decay rate of the exploration probability.
    decay_step: number of decay steps taken so far.
    state: stacked state to evaluate when exploiting.
    actions: indexable collection of available actions; the returned action
      is one of its entries.

  Returns:
    (action, explore_probability)
  """
  # epsilon decays exponentially from explore_start towards explore_stop
  explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)
  
  if explore_probability > np.random.rand():
    # exploration: uniformly random action
    choice = random.randrange(len(actions))
    
  else:
    # exploitation: query the Q-network; relies on the notebook-level
    # `sess` and `DQNetwork` being available when this branch runs
    Qs = sess.run(DQNetwork.output,feed_dict = {DQNetwork.inputs_ : state.reshape((1,*state.shape))})
    
    # the largest Q-value marks the best action
    choice = np.argmax(Qs)
    
  # index the `actions` argument (the original referenced an undefined name here)
  action = actions[choice]
  
  return action,explore_probability

In [0]:
saver = tf.train.Saver()      # checkpoints the model; also used later to restore it

if training == True:
  with tf.Session() as sess:
    # initialize the variables:
    sess.run(tf.global_variables_initializer())
    
    decay_step = 0            # global step counter that drives the epsilon decay
    rewards_list = []         # (episode, total_reward) for every finished episode
    loss = float('nan')       # lets the episode summary print before the first update
    
    for episode in range(total_episodes):
      step = 0
      
      # rewards collected during this episode:
      episode_rewards = []
      
      # Make a new episode and observe the first state:
      state = env.reset()
      state , stacked_frames = stack_frames(stacked_frames,state,True)
      
      while step < max_steps:
        step += 1
        decay_step += 1
        
        # predict the action to take, and take it:
        action , explore_probability = predict_action(explore_start,explore_stop,decay_rate,decay_step,state,possible_actions)
        
        next_state,reward,done,_ = env.step(action)
        
        if episode_render:
          env.render()
          
        episode_rewards.append(reward)
        
        if done:
          # the episode ended, so there is no real next frame: push a blank one
          next_state = np.zeros((110,84),dtype=int)
          next_state , stacked_frames = stack_frames(stacked_frames,next_state,False)
          
          # force the while loop to stop after this iteration
          step = max_steps
          
          total_reward = np.sum(episode_rewards)
          
          print('Episode: {}'.format(episode),
               'Total Reward: {}'.format(total_reward),
               'Explore P: {:.4f}'.format(explore_probability),
               'Training Loss {:.4f}'.format(loss))
          
          rewards_list.append((episode,total_reward))
          
          # store transition <st,at,rt+1,st+1> in memory D
          memory.add((state,action,reward,next_state,done))
          
        else:
          # stack the frame of the next state:
          next_state , stacked_frames = stack_frames(stacked_frames,next_state,False)
          
          memory.add((state,action,reward,next_state,done))
          
          # st+1 is now our current state:
          state = next_state
          
        ## Learning part: one gradient step on a random mini-batch
        batch = memory.sample(batch_size)
        states_mb = np.array([each[0] for each in batch],ndmin=3)
        actions_mb = np.array([each[1] for each in batch])
        rewards_mb = np.array([each[2] for each in batch])
        next_states_mb = np.array([each[3] for each in batch],ndmin=3)
        dones_mb = np.array([each[4] for each in batch])
        
        # Q-values of the next states, one row per sample:
        Qs_next_state = sess.run(DQNetwork.output,feed_dict={DQNetwork.inputs_:next_states_mb})
        
        # Q_target = r for terminal transitions, else r + gamma * max_a' Q(s',a')
        targets_mb = np.where(dones_mb,
                              rewards_mb,
                              rewards_mb + gamma * np.max(Qs_next_state,axis=1))
        
        # One update per mini-batch (not per sample), logging the loss summary
        # from the same run.
        loss,_,summary = sess.run([DQNetwork.loss,DQNetwork.optimizer,write_op],
                                  feed_dict={DQNetwork.inputs_:states_mb,
                                             DQNetwork.target_Q:targets_mb,
                                             DQNetwork.actions_:actions_mb})
        
        writer.add_summary(summary,episode)
        writer.flush()
        
      # save the model every 5 episodes:
      # NOTE(review): the checkpoint path is a hard-coded Windows-style location
      # and must match the path restored by the test cell - confirm it exists.
      if episode%5 == 0:
        save_path = saver.save(sess,"C:/timru/dqn.ckpt")
        print("Model Saved")

In [0]:
# Test and watch our agent play:

# tf.Session must be instantiated; the bare class is not a context manager.
with tf.Session() as sess:
  total_test_rewards = []
  
  # Load the model saved by the training cell:
  saver.restore(sess,"C:/timru/dqn.ckpt")
  
  for episode in range(1):
    total_rewards = 0
    
    state = env.reset()
    state ,stacked_frames = stack_frames(stacked_frames , state ,True)
    
    print("Episode ",episode)
    
    while True:
      # add the batch dimension expected by the network input:
      state = state.reshape((1,*state_size))
      
      # get the Q-values from the network:
      Qs = sess.run(DQNetwork.output,feed_dict={DQNetwork.inputs_ : state})
      
      # the largest Q-value marks the best action
      choice = np.argmax(Qs)
      action = possible_actions[choice]
      
      # perform the action and get the next state, reward and done information:
      next_state , reward , done, _ = env.step(action)
      env.render()
      
      total_rewards += reward
      
      if done:
        print("Score = ",total_rewards)
        total_test_rewards.append(total_rewards)
        break
        
      # stack_frames takes (stacked_frames, state, is_new_episode); this is a
      # continuation of the episode, so is_new_episode is False.
      next_state , stacked_frames = stack_frames(stacked_frames , next_state , False)
      state = next_state
      
  env.close()