<a href="https://colab.research.google.com/github/kulka193/SpaceInvaders/blob/pb2/DQN_SpaceIvaders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SpaceInvaders DQN

#### Import dependencies

In [0]:
!pip install wandb -qq
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential, load_model, save_model
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout, Lambda
from keras.optimizers import Adam, RMSprop
import os
import cv2
import wandb
import math
from skimage.color import rgb2gray
from skimage.transform import resize
from keras import backend as KB
from google.colab import files
from random import randrange

In [0]:
# Mount Google Drive at /content/drive; `output_dir` in the hyperparameter cell
# points below this mount point, so this cell has to run before that one.
from google.colab import drive
drive.mount('/content/drive')

# REQUIRES GOOGLE DRIVE SIGN-IN

#### Set hyperparameters

In [0]:
#wandb.init(project="qualcomm")

### Hyperparameters & Constants

In [0]:

# Training configuration shared by the cells below.
batch_size = 32
n_episodes = 10000
output_dir = '/content/drive/My Drive/RL_modified_runs/run4'  #change the directory according to your local file system
# exist_ok avoids the check-then-create race and is a no-op when the folder is already there
os.makedirs(output_dir, exist_ok=True)
stack_size = 4 # We stack 4 composite frames in total
FRAME_SKIP_SIZE = 4
min_observations_to_train = 10000
# Seed both generators: NumPy drives the epsilon-greedy coin flip, while the
# stdlib `random` module picks random actions and replay samples.
np.random.seed(10)
random.seed(10)
input_nn_shape = (84,84)


### Things to implement to improve performance

* Try out Dueling (DDQN) and Advantage actor-critic variants like A2C and A3C (big-ticket item)
* Model-based learning methods like planning (MuZero)
* Using RAM and spaceship lives as a part of the state-space features -> phi(state)

* https://colab.research.google.com/drive/1DggF1gE3FjRu4ftYhYoxQCxLIOaxwVyw
* https://becominghuman.ai/beat-atari-with-deep-reinforcement-learning-part-2-dqn-improvements-d3563f665a2c
* https://nihit.github.io/resources/spaceinvaders.pdf


## Things implemented:
* Model-free control using Q-learning
* Frame stacking to achieve temporal correlation (modify to look like: https://github.com/gsurma/atari/blob/master/gym_wrappers.py)
* Image preprocessing (gray scale, normalization, crop & resize)
* Use Experience Replay
* Function Approximation of Q-learning agent using a Conv-Net based network on batchwise state-action pairs
* Hyperparameter tuning
*  Use of Target network for better convergence
* Use RMSProp optimization with a very small learning rate (done)
* Use the same set of hyperparameters as the DeepMind paper (done)
* Reward clipping -> (reward clipping and Huber loss aim to achieve similar things)
* Huber loss function (done)

#### Image Preprocessing

Utility functions

In [0]:
def preprocess(observation):
    """Turn a colour frame into a binarised (84, 84, 1) array.

    Steps: resize with dsize (84, 110), convert to grayscale, keep rows
    26..109, threshold at 1 (pixels above 1 become 255, the rest 0) and add a
    trailing channel axis.
    """
    resized = cv2.resize(observation, (84, 110))
    # NOTE(review): the frame is converted with the BGR weights; confirm the
    # channel order of the frames passed in.
    gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
    cropped = gray[26:110, :]
    _, binary = cv2.threshold(cropped, 1, 255, cv2.THRESH_BINARY)
    return np.reshape(binary, (84, 84, 1))

# Mean of the three values 210, 164, 74 (not referenced by the other visible cells).
color = np.mean([210, 164, 74])

def preprocess_frame(obs):
    """Convert an RGB frame to a grayscale uint8 image of size `input_nn_shape`.

    `rgb2gray` and `resize` produce floating-point intensities in [0, 1];
    casting those straight to uint8 truncates almost every pixel to 0, so the
    image is rescaled to 0..255 before the cast. No cropping is performed.

    Args:
        obs: RGB frame as returned by the environment.

    Returns:
        2-D uint8 array with shape `input_nn_shape`.
    """
    gray = resize(rgb2gray(obs), input_nn_shape, mode='constant')
    # clip guards against tiny interpolation overshoots before the uint8 cast
    return np.uint8(np.clip(gray * 255, 0, 255))

def convert_process_buffer(process_buffer):
    """Merge the frames in `process_buffer` into one multi-channel sample.

    Each frame is converted to grayscale, resized with dsize (84, 90), cut
    down to rows 1..84 and given a trailing axis; the results are concatenated
    along that last axis.
    """
    channels = []
    for frame in process_buffer:
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        shrunk = cv2.resize(gray, (84, 90))
        channels.append(shrunk[1:85, :, np.newaxis])
    return np.concatenate(channels, axis=2)

#### Frame Stacking

In [0]:


# Initialize the deque with one zero image per stacked frame. With maxlen set, appending a new
# entry to a full deque automatically discards the oldest one.
# `np.int` was removed from NumPy; the builtin `int` is the equivalent dtype.
stacked_frames  =  deque([np.zeros(input_nn_shape, dtype=int) for _ in range(stack_size)], maxlen=stack_size)

def stack_frames(stacked_frames, state, is_new_episode):
    """Push a new frame into the rolling frame stack and build the stacked state.

    Args:
        stacked_frames: deque holding the most recent preprocessed frames. It
            is ignored (and replaced) when `is_new_episode` is true.
        state: raw frame; it is passed through `preprocess_frame`.
        is_new_episode: when true, the stack is restarted with `stack_size`
            copies of the new frame.

    Returns:
        (stacked_state, stacked_frames): the frames stacked along axis 2, and
        the deque to pass to the next call.
    """
    frame = preprocess_frame(state)

    if is_new_episode:
        # A fresh episode has no history yet, so the first frame fills every slot.
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        # maxlen makes the deque drop its oldest frame automatically.
        stacked_frames.append(frame)

    stacked_state = np.stack(stacked_frames, axis=2)
    return stacked_state, stacked_frames

In [0]:
def huber_loss(a, b):
    """Element-wise Huber loss between `a` and `b` with threshold 1.

    Where |a - b| > 1 the value is |a - b| - 1/2, otherwise (a - b)^2 / 2.
    The two branches are blended with a float32 mask rather than a
    conditional.
    """
    error = a - b
    abs_error = abs(error)
    is_large = KB.cast(abs_error > 1.0, 'float32')
    squared_branch = error * error / 2
    linear_branch = abs_error - 1/2
    return is_large * linear_branch + (1 - is_large) * squared_branch

#### Define agent

In [0]:
class DQN:
    """Online Q-network plus a target network used to compute bootstrap targets.

    Attributes:
        state_size: stored as given; not used by the methods of this class.
        action_size: number of Q-value outputs of the network.
        gamma: discount factor applied to the bootstrapped next-state value.
        learning_rate: step size handed to the RMSprop optimizer.
        model: network that is trained and used for greedy action selection.
        target_model: network that evaluates next states; refreshed by
            `target_train`.
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.99 # discount rate for future rewards
        self.learning_rate = 0.0005 # optimizer step size
        self.model = self._build_model() # private method
        self.target_model = self._build_model()
        self.num_experiences = 0
        # Both networks are initialised independently; start the target as an
        # exact copy so the first targets are not based on unrelated weights.
        self.target_train()

    def _build_model(self):
        """Build and compile the convolutional Q-network.

        Input shape is `input_nn_shape + (stack_size,)`. A Lambda layer scales
        the input by 1/255, followed by three convolutions, two dense hidden
        layers and a linear output with one unit per action.
        """
        model = Sequential()
        model.add(Lambda(lambda x: x/255.0,input_shape=input_nn_shape + (stack_size,)))
        model.add(Conv2D(32, (8,8), strides= (4,4), activation='relu', padding='valid'))
        model.add(Conv2D(64, (4,4), strides= (2,2), activation='relu', padding='valid'))
        model.add(Conv2D(64, (3,3),  strides= (1,1), activation='relu', padding='valid'))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dense(64,activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss=huber_loss, optimizer=RMSprop(learning_rate=self.learning_rate, rho=0.95), metrics=['accuracy'])
        # summary() prints by itself and returns None, so it must not be wrapped in print()
        model.summary()
        return model

    def train_network(self, data):
        """Run one gradient update on a batch of transitions.

        Args:
            data: iterable of (state, action, reward, next_state, done)
                entries. An empty batch is ignored.

        For each transition the target equals the online prediction for the
        state, except at the taken action, where it is `reward` (terminal) or
        `reward + gamma * max_a Q_target(next_state, a)` (non-terminal).
        """
        data = list(data)
        if not data:
            return
        count = len(data)
        states = np.asarray([point[0] for point in data], dtype=np.float64)
        actions = np.asarray([point[1] for point in data], dtype=np.int64)
        rewards = np.asarray([point[2] for point in data], dtype=np.float64)
        next_states = np.asarray([point[3] for point in data], dtype=np.float64)
        dones = np.asarray([bool(point[4]) for point in data])

        # One batched forward pass per network instead of two calls per sample.
        targets = np.array(self.model.predict(states, verbose=0), dtype=np.float64)
        next_q = np.asarray(self.target_model.predict(next_states, verbose=0))
        bootstrap = np.where(dones, 0.0, np.max(next_q, axis=1))
        targets[np.arange(count), actions] = rewards + self.gamma * bootstrap

        self.model.fit(states, targets, batch_size=count, epochs=1, verbose=0)

    def predict_returns(self, s_t, target=False):
        """Return the Q-values for a single state as an array of shape (1, n_actions).

        Args:
            s_t: one state without a batch axis.
            target: when true, query the target network instead of the online one.
        """
        state = np.reshape(s_t, (1,) + s_t.shape)
        if target:
            return self.target_model.predict(state, batch_size = 1)
        return self.model.predict(state, batch_size = 1)

    def save(self, name):
        """Save the online network to the file `name`."""
        save_model(self.model, name)

    def target_train(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def load(self, name, custom_objects=None):
        """Load a saved model from `name` and return it.

        The loaded model is returned, not assigned to `self.model`.

        Args:
            name: path of the saved model file.
            custom_objects: mapping of custom names needed to deserialize the
                model. Defaults to the `huber_loss` function this class
                compiles its models with.
        """
        if custom_objects is None:
            custom_objects = {'huber_loss': huber_loss}
        return load_model(name, custom_objects=custom_objects)

In [0]:
class Agent:
    """Epsilon-greedy agent with a bounded replay memory wrapped around a `DQN`.

    Attributes:
        memory: deque of (state, action, reward, next_state, done) tuples,
            capped at `max_memory_size` entries.
        num_experiences: number of transitions stored so far, saturating at
            the memory capacity.
        nn: the `DQN` instance holding the online and target networks.
        epsilon: current exploration probability; starts at 1.0 and is
            lowered linearly down to `epsilon_min`.
    """

    def __init__(self, max_memory_size, action_size, state_size, epsilon_min=0.1, epsilon_decay=1e-04):
        self.max_memory_size = max_memory_size
        self.memory = deque(maxlen=max_memory_size)
        self.num_experiences = 0
        self.action_size = action_size
        self.state_size = state_size
        self.nn = DQN(state_size, action_size)
        self.epsilon_decay = epsilon_decay
        self.epsilon=1.0
        self.epsilon_min=epsilon_min

    def get_size(self):
        """Return the number of transitions currently held in memory."""
        return len(self.memory)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        if len(self.memory) < self.memory.maxlen:
            self.num_experiences += 1
        # The deque is bounded, so appending to a full memory evicts the oldest entry.
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action: random with probability `epsilon`, otherwise greedy.

        The greedy choice is the argmax of the online network's Q-values for
        `state`.
        """
        if np.random.random() <= self.epsilon:
            return random.randrange(self.action_size)
        return int(np.argmax(self.nn.predict_returns(state)[0]))

    def sample_memories(self, batch_size):
        """Draw `batch_size` transitions uniformly at random, with replacement.

        Returns a list of transition tuples (empty when nothing is stored).
        A list is used because the tuples mix arrays and scalars, which cannot
        be packed into a regular NumPy array.
        """
        stored = len(self.memory)
        if stored == 0:
            return []
        return [self.memory[randrange(stored)] for _ in range(batch_size)]

    def decay_epsilon(self):
        """Lower epsilon by one linear step, never going below `epsilon_min`."""
        self.epsilon = max(self.epsilon_min, self.epsilon -((1-self.epsilon_min) * self.epsilon_decay))

    def set_target(self):
        """Synchronise the target network with the online network."""
        self.nn.target_train()

    def train_agent(self, batch_size):
        """Train the network on one sampled batch, then decay epsilon once."""
        batch_data = self.sample_memories(batch_size=batch_size)
        self.nn.train_network(batch_data)
        self.decay_epsilon()

#### Interact with environment

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display as ipythondisplay

In [0]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

In [0]:
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1

In [0]:
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display

In [0]:
# Start a pyvirtualdisplay Display with visible=0 and a 1400x900 screen.
# NOTE(review): presumably needed so the gym Monitor wrapper can record video
# on a machine without a physical display — confirm.
display = Display(visible=0, size=(1400,900))
display.start()

In [0]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)""
"""

def show_video():
  """Embed the first recording found under video/*.mp4 in the notebook output.

  The file is base64-encoded into an HTML <video> tag. Prints a message
  instead when no recording exists.
  """
  mp4list = glob.glob('video/*.mp4')
  if not mp4list:
    print("Could not find video")
    return
  # Read-only access is enough, and the context manager closes the handle.
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())
  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
  """Return `env` wrapped in a Monitor that writes to ./video (force=True)."""
  return Monitor(env, './video', force=True)

In [0]:
# Running sum of episode rewards inside the current 100-episode window.
cumulative_reward = 0


def evaluate(episodic_reward, ep):
  """Accumulate an episode's reward and report the average every 100 episodes.

  Arguments:
    episodic_reward - reward received after playing the current episode
    ep - zero-based episode index; whenever (ep + 1) is a multiple of 100 the
         accumulated sum is divided by 100, printed, and reset to zero

  The wandb logging calls that used to live here are disabled.
  """
  global cumulative_reward

  cumulative_reward = cumulative_reward + episodic_reward

  window_complete = (ep + 1) % 100 == 0
  if not window_complete:
    return

  cumulative_avg_reward = cumulative_reward / 100
  print('cumulative_avg_reward: ', cumulative_avg_reward)
  # start a fresh window
  cumulative_reward = 0

In [0]:
'''
batch_size = 32
no_op_steps =10  #number of steps at the start of each episode for which NoOP  is performed
C=10   # target network update frequency
start_ep=0  # keep this 0 if you're training a fresh model
load = True  # keep this False if you havent saved your graph
if load:
    load_dir = '/content/drive/My Drive/RL_modified_runs/run3'
    agent.model = agent.load(os.path.join(load_dir,'model_10000.h5'), custom_objects={'huber_loss': huber_loss})  #model you want to load #override
    start_ep = 0  #traiing to resume from which episode? #override
    agent.epsilon = 1.0  # value of epsilon when the training was left off previously #override
    print('resuming training... \n')
for e in range(start_ep,n_episodes + 1): # iterate over episodes of gameplay
    state = env.reset() # reset state at start of each new episode of the game
    #state = preprocess_frame(state)
    done = False
    agent.state_size = np.reshape(state,[-1,]).shape[0]
    #print(agent.state_size)
    for _ in range(no_op_steps):
        state, _, _, _ = env.step(0)
    state,stacked_frames= stack_frames(stacked_frames,state,True)
    time = 0 # time represents a frame of the episode
    total_rewards = 0
    while not done:
        action = agent.act(state) # action is either 0 or 1 (move cart left or right); decide on one or other here
        reward = 0
        for _ in range(0,FRAME_SKIP_SIZE-1):  # perform frame skipping
            next_state, r_frame, done, _ = env.step(action) # agent interacts with env, gets feedback;
            reward += r_frame
            if done:  # if episode ends: 
                print("episode: {}/{}, frames observed: {}, e: {:.2}" # print the episode's score and agent's epsilon
                  .format(e, n_episodes-1, total_frames_seen, agent.epsilon))
                break
        # reward = np.sign(reward)       # reward clipping
        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)  #stack last 4 frames seen
        agent.remember(state, action, reward, next_state, done) # store experience 
        state = next_state  # set "current state" for upcoming iteration to the current next state        
        time += 1
        total_rewards += reward
        total_frames_seen += 4
    print('total rewards this episode: ', total_rewards)
    evaluate(total_rewards, e)
    if agent.size() > min_observations_to_train:
        agent.train(batch_size) # train the agent by replaying the experiences of the episode
    if e % C == 0 and e != 0:
        agent.target_train()   #target network training every C updates for stability
    if e % 500 == 0 and e!=0 :
        show_video()
        agent.save(os.path.join(output_dir,"model_"+str(int(e))+".h5"))
        #wandb.save(os.path.join(wandb.run.dir, "model.h5"))
        # agent.save(output_dir + "weights_"  + '{:04d}'.format(e) + ".hdf5")
env.close()
'''

In [0]:
def get_frames(current, obs):
    """Drop the first entry of `current` along axis 0 and append `obs` at the end."""
    newest = np.asarray(obs)[np.newaxis]
    return np.concatenate((current[1:], newest), axis=0)

def preprocess_frame(obs):
    """Convert an RGB frame to a grayscale uint8 image of size `input_nn_shape`.

    `rgb2gray` and `resize` produce floating-point intensities in [0, 1];
    casting those straight to uint8 truncates almost every pixel to 0, so the
    image is rescaled to 0..255 before the cast. No cropping is performed.

    Args:
        obs: RGB frame as returned by the environment.

    Returns:
        2-D uint8 array with shape `input_nn_shape`.
    """
    gray = resize(rgb2gray(obs), input_nn_shape, mode='constant')
    # clip guards against tiny interpolation overshoots before the uint8 cast
    return np.uint8(np.clip(gray * 255, 0, 255))

In [0]:
class PlayGame(object):
    """Drives the training loop: environment interaction, replay and checkpoints.

    Args:
        batch_size: number of transitions sampled per training step.
        n_epsiodes: number of the last episode index to play (the spelling of
            this keyword is kept for existing callers).
        update_freq: train the network every `update_freq` steps of an episode.
        target_update_freq: synchronise the target network whenever the global
            frame counter is a multiple of this value.
        load: stored on the instance; not otherwise used here.
        noOp: number of action-0 steps taken at the start of every episode.
    """

    def __init__(self, batch_size, n_epsiodes, update_freq, target_update_freq, load=False, noOp=30):
        self.batch_size = batch_size
        # Use the constructor argument rather than the notebook-level global.
        self.n_episodes = n_epsiodes
        self.load = load
        self.target_update_freq = target_update_freq
        self.update_freq = update_freq
        self.noOp = noOp
        # Always defined so game() also works when load=True.
        self.start_ep = 0
        self._set_env()
        self.agent = Agent(max_memory_size=500000, state_size=self.env.observation_space.shape[0], action_size=self.env.action_space.n)

    def _set_env(self):
        """Create the SpaceInvaders environment wrapped for video recording."""
        self.env = wrap_env(gym.make("SpaceInvaders-v0"))

    def game(self, show_every):
        """Play episodes `start_ep`..`n_episodes`, training the agent as it goes.

        Args:
            show_every: every `show_every` episodes (except episode 0) the
                recorded video is displayed and the model is saved to
                `output_dir`.
        """
        total_frames_seen = 0
        for e in range(self.start_ep, self.n_episodes + 1):
            state = self.env.reset()
            done = False
            self.agent.state_size = np.reshape(state, [-1]).shape[0]
            for _ in range(self.noOp):
                state, _, done, _ = self.env.step(0)
                if done:
                    break
            frame = preprocess_frame(state)
            # Frames go on the LAST axis so the state matches the network's
            # input shape, input_nn_shape + (stack_size,).
            current_framed_state = np.stack([frame] * stack_size, axis=-1)
            total_frames_seen += 1
            time_steps = 0
            total_rewards = 0
            frame_counter = 0
            while not done:
                action = self.agent.act(current_framed_state)
                next_state, reward, done, _ = self.env.step(action)
                total_frames_seen += 1
                frame_counter += 1
                total_rewards += reward
                next_frame = preprocess_frame(next_state)
                # Slide the window: drop the oldest frame, append the newest.
                next_framed_state = np.append(current_framed_state[..., 1:], next_frame[..., np.newaxis], axis=-1)
                # Store the transition BEFORE leaving the loop so terminal
                # transitions (done=True) reach the replay memory too.
                self.agent.remember(current_framed_state, action, reward, next_framed_state, done)
                buffer_ready = self.agent.get_size() >= min_observations_to_train
                if buffer_ready and time_steps % self.update_freq == 0:
                    self.agent.train_agent(self.batch_size)
                # Checked on every step so a sync is not skipped just because
                # it does not coincide with a training step.
                if buffer_ready and total_frames_seen % self.target_update_freq == 0:
                    self.agent.set_target()
                current_framed_state = next_framed_state
                time_steps += 1
            print("episode: {}/{}, frames observed: {}, frames till now: {}, e: {:.2}, total_rewards: {}"
                  .format(e, self.n_episodes, frame_counter, total_frames_seen, self.agent.epsilon, total_rewards))
            # Checkpoint once per qualifying episode, after it has finished.
            if e % show_every == 0 and e != 0:
                show_video()
                self.agent.nn.save(os.path.join(output_dir, "model_" + str(int(e)) + ".h5"))
        self.env.close()

In [0]:
# Repeats the Drive mount from the top of the notebook; the training loop
# writes its checkpoints to `output_dir`, which lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Build the training driver and start training. `show_every=500` is the
# interval (in episodes) at which game() shows the video and saves the model.
# NOTE: `n_epsiodes` is the constructor's actual (misspelled) keyword name.
playgame = PlayGame(batch_size=32, n_epsiodes=10000, update_freq=4, target_update_freq=10000)
playgame.game(show_every=500)

# New Section

In [0]:
from google.colab import files
# The episode counter `e` is local to PlayGame.game and does not exist at
# notebook level, so download the most recently written checkpoint instead.
saved_models = sorted(glob.glob(os.path.join(output_dir, "model_*.h5")), key=os.path.getmtime)
if saved_models:
    files.download(saved_models[-1])
else:
    print("No saved model found in", output_dir)

# New Section

In [0]:
# No notebook-level `agent` exists; the network wrapper that owns save() is
# reachable through the PlayGame instance created above.
playgame.agent.nn.save('model.h5')

In [0]:
# load() returns the restored model instead of installing it, and the model
# was compiled with the custom `huber_loss`, which must be supplied on load.
restored_model = playgame.agent.nn.load('model.h5', {'huber_loss': huber_loss})
playgame.agent.nn.model = restored_model
restored_model.summary()

In [0]:
# wandb.run is None unless wandb.init() was called (it is commented out above).
if wandb.run is not None:
    playgame.agent.nn.save(os.path.join(wandb.run.dir, "model.h5"))
else:
    print("wandb run not initialised; skipping save to the wandb run directory")

In [0]:
from keras.models import load_model

In [0]:
# wandb.run is None unless wandb.init() was called (it is commented out above).
if wandb.run is not None:
    wandb.save(os.path.join(wandb.run.dir, "model.h5"))
else:
    print("wandb run not initialised; nothing uploaded")

In [0]:
# Scratch check: list.remove() deletes the first element equal to the given tuple.
s = [(0,1),(1,2)]
s.remove((0,1))

In [0]:
# Scratch inspection of a notebook-level `state` variable.
# NOTE(review): no visible cell defines `state` at notebook level, so this
# fails on a fresh kernel — confirm and remove.
state

In [0]:
# No notebook-level `env` exists; the environment is owned by the PlayGame instance.
playgame.env.reset().shape

Evaluate and play

In [0]:
api = wandb.Api()
# load() needs the custom loss to deserialize the model and returns it rather
# than installing it; there is also no notebook-level `agent`.
# NOTE(review): the run directory below is specific to one past session.
playgame.agent.nn.model = playgame.agent.nn.load(
    '/content/wandb/run-20200422_031633-24ees0sk/model.h5',
    {'huber_loss': huber_loss},
)