# SpaceInvaders DQN

#### Import dependencies

In [0]:
!pip install wandb -qq
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential, load_model, save_model
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout, Lambda
from keras.optimizers import Adam, RMSprop
import os
import cv2
import wandb
import math
from skimage.color import rgb2gray
from skimage.transform import resize
from keras import backend as KB
from google.colab import files

#### Set hyperparameters

In [0]:
#wandb.init(project="qualcomm")

In [0]:
env = gym.make('SpaceInvaders-v0') # initialize environment

In [0]:
state_size = env.observation_space.shape[0]
state_size

In [0]:
action_size = env.action_space.n
action_size

In [0]:
# Global training configuration for the notebook.
batch_size = 32  # number of transitions passed to agent.train()
n_episodes = 10000  # the training loop iterates over range(n_episodes + 1)
output_dir = 'model_output/spaceinvaders/'
# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(output_dir, exist_ok=True)
stack_size = 3 # We stack 3 composite frames in total
total_frames_seen = 0  # running count of environment steps across all episodes
final_epsilon = 0.1  # NOTE(review): not referenced by any visible code; the agent keeps its own epsilon_min

min_observations_to_train = 10000  # replay-buffer size required before training starts
np.random.seed(12345)
# The agent and the training loop also draw from the stdlib `random` module
# (randrange / sample / randint), which np.random.seed does not affect.
random.seed(12345)
TAU=0.01  # blend factor used by DQNAgent.target_train

In [0]:
env.observation_space.shape


### Things to implement to improve performance
1. Use a target network
2. Use RMSProp with a very low learning rate (done)
3. Use the same hyperparameters as the DeepMind paper
4. Frame skipping and reward clipping -> (reward clipping and Huber loss are mutually exclusive)
5. Huber loss optimization (done)
6. Try out Dueling DQN (DDQN) and A2C/A3C
7. Use RAM and spaceship lives as part of the state-space features

* https://colab.research.google.com/drive/1DggF1gE3FjRu4ftYhYoxQCxLIOaxwVyw
* https://becominghuman.ai/beat-atari-with-deep-reinforcement-learning-part-2-dqn-improvements-d3563f665a2c
* https://nihit.github.io/resources/spaceinvaders.pdf


## Things implemented:
* Frame stacking to achieve temporal correlation (modify to look like: https://github.com/gsurma/atari/blob/master/gym_wrappers.py)
* Image preprocessing (grayscale, normalization, crop & resize)
* Use experience replay buffers
* Function approximation of the Q-learning agent using a ConvNet-based network on batch-wise state-action pairs
* Hyperparameter tuning

#### Image Preprocessing

In [0]:
input_nn_shape = (84,84)

In [0]:
def preprocess(observation):
    """Turn a raw colour frame into a binary image of shape (84, 84, 1).

    The frame is resized to 84 columns by 110 rows, converted to grayscale,
    cropped to rows 26..109, and thresholded so that every pixel brighter
    than 1 becomes 255 and everything else becomes 0.
    """
    # cv2.resize takes its target size as (width, height).
    resized = cv2.resize(observation, (84, 110))
    # NOTE(review): BGR2GRAY is used here while convert_process_buffer uses
    # RGB2GRAY -- confirm which channel order the frames actually have.
    gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
    cropped = gray[26:110, :]
    _, binary = cv2.threshold(cropped, 1, 255, cv2.THRESH_BINARY)
    return np.reshape(binary, (84, 84, 1))

color = np.array([210, 164, 74]).mean()

def preprocess_frame(obs):
    """Convert an RGB frame into a grayscale uint8 image of shape input_nn_shape.

    Args:
        obs: RGB image array as returned by the environment.

    Returns:
        A uint8 array of shape ``input_nn_shape`` with intensities in 0..255.
        No cropping is performed; the whole frame is resized.
    """
    gray = resize(rgb2gray(obs), input_nn_shape, mode='constant')
    # rgb2gray produces floating-point luminance in [0, 1]. Casting that
    # straight to uint8 truncates almost every pixel to 0, so rescale to the
    # 0..255 range first (the network divides its input by 255 again).
    return np.uint8(gray * 255)

def convert_process_buffer(process_buffer):
    """Merge the RGB frames in ``process_buffer`` into one multi-channel sample.

    Each frame is converted to grayscale, resized to 84 columns by 90 rows,
    trimmed to rows 1..84 and given a trailing channel axis; the results are
    concatenated along that axis, one channel per frame.
    """
    channels = []
    for rgb_frame in process_buffer:
        gray = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2GRAY)
        # cv2.resize takes (width, height), so this is 90 rows x 84 columns.
        shrunk = cv2.resize(gray, (84, 90))
        channels.append(shrunk[1:85, :, np.newaxis])
    return np.concatenate(channels, axis=2)

#### Frame Stacking

In [0]:


# Rolling window of the most recent `stack_size` frames, seeded with all-zero
# images. A deque created with maxlen drops its oldest (leftmost) entry when a
# new one is appended on the right.
# np.int was removed from NumPy; uint8 matches what preprocess_frame returns.
stacked_frames  =  deque([np.zeros(input_nn_shape, dtype=np.uint8) for i in range(stack_size)], maxlen=stack_size)

def stack_frames(stacked_frames, state, is_new_episode):
    """Push a new frame into the rolling frame window and build the network input.

    Args:
        stacked_frames: deque holding the previous preprocessed frames. It is
            appended to in place unless ``is_new_episode`` is true, in which
            case a fresh deque is created and the argument is left untouched.
        state: raw environment frame; it is passed through preprocess_frame.
        is_new_episode: when true, the window is refilled with ``stack_size``
            copies of the current frame so no frames from the previous
            episode leak into the new one.

    Returns:
        A ``(stacked_state, stacked_frames)`` tuple: the frames stacked along
        a new last axis, and the deque to pass to the next call.
    """
    frame = preprocess_frame(state)

    if is_new_episode:
        # Start the episode with the first frame repeated in every slot.
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        # maxlen makes the deque discard its oldest frame automatically.
        stacked_frames.append(frame)

    # The last axis indexes the frames, oldest first.
    stacked_state = np.stack(stacked_frames, axis=2)
    return stacked_state, stacked_frames


def temp_bufferize(env, action=0, num_frames=None, process_buffer=None):
    """Repeat one action for several steps, collecting frames and reward.

    The previous body referenced names that do not exist at module level
    (``NUM_FRAMES``, ``predict_movement``, ``self``) and accumulated into
    unbound locals, so it could never run. Those inputs are now explicit,
    defaulted parameters, and the collected results are returned.

    Args:
        env: object with a gym-style ``step(action)`` method returning
            ``(observation, reward, done, info)``.
        action: action passed to every ``env.step`` call.
        num_frames: number of steps to take; defaults to the module-level
            ``stack_size``.
        process_buffer: list to append observations to; a new list is
            created when omitted.

    Returns:
        ``(process_buffer, reward, done)``: the buffer with the new
        observations appended, the summed reward, and whether any step
        reported the end of the episode.
    """
    if num_frames is None:
        num_frames = stack_size
    if process_buffer is None:
        process_buffer = []

    reward = 0
    done = False
    for _ in range(num_frames):
        temp_observation, temp_reward, temp_done, _info = env.step(action)
        reward += temp_reward
        process_buffer.append(temp_observation)
        done = done or bool(temp_done)
        if done:
            # Do not keep stepping an environment that has already finished.
            break
    return process_buffer, reward, done

In [0]:
def huber_loss(a, b):
    """Element-wise Huber loss between ``a`` and ``b`` with a threshold of 1.

    Where the absolute difference exceeds 1 the loss is ``|a - b| - 1/2``;
    elsewhere it is ``(a - b)**2 / 2``. No reduction is applied here.
    """
    residual = a - b
    magnitude = abs(residual)
    # 1.0 where the linear branch applies, 0.0 where the quadratic one does.
    linear_mask = KB.cast(magnitude > 1.0, 'float32')
    linear_branch = magnitude - 1/2
    quadratic_branch = residual*residual / 2
    return linear_mask * linear_branch + (1-linear_mask) * quadratic_branch

#### Define agent

In [0]:
class DQNAgent:
    """DQN agent with an experience-replay buffer and a separate target network.

    The online network (``model``) picks actions and is trained; the target
    network (``target_model``) supplies the bootstrap values in ``train`` and
    is moved towards the online network by ``target_train``.
    """

    def __init__(self, state_size, action_size, epsilon_norm):
        """Create the agent and build both networks.

        Args:
            state_size: stored on the instance; the networks take their input
                shape from the module-level ``input_nn_shape``/``stack_size``.
            action_size: number of discrete actions (width of the output layer).
            epsilon_norm: accepted for interface compatibility; currently unused.
        """
        self.state_size = state_size
        self.action_size = action_size
        # Replay buffer; a deque with maxlen evicts its oldest entry by itself.
        self.memory = deque(maxlen=800000)
        self.gamma = 0.99 # discount applied to the bootstrapped future value
        self.epsilon = 1.0 # probability of taking a random action in act()
        self.epsilon_min = 0.1 # floor for epsilon
        # Each train() call lowers epsilon by (1 - epsilon_min) * epsilon_decay.
        self.epsilon_decay = 1e-04
        print('Choosing epsilon decay rate=',self.epsilon_decay)
        self.learning_rate = 0.00025
        self.model = self._build_model()
        self.target_model = self._build_model()
        # Both networks must start from the same weights; otherwise the first
        # targets come from an unrelated random initialisation.
        self.target_model.set_weights(self.model.get_weights())
        self.num_experiences = 0

    def _build_model(self):
        """Build and compile the convolutional Q-network.

        Input is a stack of ``stack_size`` frames of shape ``input_nn_shape``;
        a Lambda layer divides it by 255 before the convolutions. The output
        layer is linear with one unit per action.
        """
        model = Sequential()
        model.add(Lambda(lambda x: x/255.0,input_shape=input_nn_shape + (stack_size,)))
        model.add(Conv2D(32, (8,8), strides= (4,4), activation='relu', padding='same'))
        model.add(Conv2D(64, (4,4), strides= (2,2), activation='relu', padding='same'))
        model.add(Conv2D(64, (3,3),  strides= (1,1), activation='relu', padding='same'))
        model.add(Flatten())
        model.add(Dropout(0.1))
        model.add(Dense(512, activation='relu'))
        model.add(Dense(64,activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # NOTE(review): 'accuracy' is kept so train_on_batch's return shape is
        # unchanged, but it is probably not meaningful for Q-value regression.
        model.compile(loss=huber_loss, optimizer=RMSprop(learning_rate=self.learning_rate, rho=0.95), metrics=['accuracy'])
        # summary() prints by itself; wrapping it in print() emitted a stray "None".
        model.summary()
        return model

    def remember(self, state, action, reward, next_state, done):
        """Store one transition, evicting the oldest one when the buffer is full."""
        self.memory.append((state, action, reward, next_state, done))
        self.num_experiences = len(self.memory)

    def size(self):
        """Return the number of transitions currently stored."""
        return self.num_experiences

    def train(self, batch_size):
        """Run one gradient step on a minibatch sampled from the replay buffer.

        For each sampled transition the target of the taken action is
        ``reward`` plus, for non-terminal transitions, ``gamma`` times the
        largest target-network value of the next state. The targets of all
        other actions equal the online network's current predictions.
        Epsilon is decayed linearly after the step.

        Args:
            batch_size: maximum number of transitions to sample.

        Returns:
            The value returned by ``model.train_on_batch``, or ``None`` when
            the replay buffer is empty.
        """
        if not self.memory:
            return None
        states, actions, rewards, next_states, dones = self.sample_memories(batch_size)
        # The buffer may hold fewer than batch_size transitions.
        n_samples = len(states)
        # One batched forward pass per network instead of two calls per sample.
        targets = np.array(self.model.predict(states, batch_size=n_samples, verbose=0))
        next_q = np.asarray(self.target_model.predict(next_states, batch_size=n_samples, verbose=0))
        not_done = 1.0 - np.asarray(dones, dtype=np.float32)
        td_targets = rewards + self.gamma * next_q.max(axis=1) * not_done
        targets[np.arange(n_samples), actions] = td_targets
        loss = self.model.train_on_batch(states, targets)
        self.epsilon = max(self.epsilon_min, self.epsilon -((1-self.epsilon_min) * self.epsilon_decay))
        return loss

    def predict_returns(self, s_t, target=False):
        """Predict Q-values for a single state.

        Args:
            s_t: one state array; a leading batch axis of size 1 is added.
            target: use the target network instead of the online network.

        Returns:
            The network output with a leading batch axis of size 1.
        """
        state = np.reshape(s_t, (1,) + s_t.shape)
        network = self.target_model if target else self.model
        return network.predict(state, batch_size=1, verbose=0)

    def sample_memories(self, batch_size):
        """Sample up to ``batch_size`` transitions without replacement.

        Returns:
            Five arrays in storage order:
            ``(states, actions, rewards, next_states, dones)``.
        """
        batch = random.sample(self.memory, min(batch_size, len(self.memory)))
        # Transpose the list of transitions into one array per field.
        states, actions, rewards, next_states, dones = (np.array(column) for column in zip(*batch))
        return states, actions, rewards, next_states, dones

    def act(self, state):
        """Pick an action epsilon-greedily with respect to the online network."""
        if np.random.random() <= self.epsilon:
            return random.randrange(self.action_size)
        return np.argmax(self.predict_returns(state)[0])

    def save(self, name):
        """Save the online network to ``name``."""
        save_model(self.model, name)

    def target_train(self):
        """Move the target weights towards the online weights by a factor TAU."""
        blended = [
            TAU * online + (1 - TAU) * target
            for online, target in zip(self.model.get_weights(), self.target_model.get_weights())
        ]
        self.target_model.set_weights(blended)

    def load(self, name, custom_objects=None):
        """Load a saved model, install it as the online network, and return it.

        Args:
            name: path of the saved model file.
            custom_objects: mapping forwarded to ``load_model``. Defaults to
                the ``huber_loss`` function the networks are compiled with,
                so ``agent.load(path)`` works without extra arguments.

        Returns:
            The loaded model, which is also assigned to ``self.model``.
        """
        if custom_objects is None:
            custom_objects = {'huber_loss': huber_loss}
        self.model = load_model(name, custom_objects=custom_objects)
        # Keep the target network consistent with the restored weights.
        self.target_model.set_weights(self.model.get_weights())
        return self.model

#### Interact with environment

In [0]:
agent = DQNAgent(state_size, action_size, n_episodes) # initialise agent

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display as ipythondisplay

In [0]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

In [0]:
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1

In [0]:
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display

In [0]:
display = Display(visible=0, size=(1400,900))
display.start()

In [0]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)""
"""

def show_video():
  """Embed the first .mp4 found under ./video in the notebook output.

  The file is read, base64-encoded and displayed as an inline HTML
  <video> element. Prints a message when no .mp4 file is found.
  """
  mp4list = glob.glob('video/*.mp4')
  if not mp4list:
    print("Could not find video")
    return

  mp4 = mp4list[0]
  # Read-only access is enough, and the context manager closes the handle
  # (the previous 'r+b' open was never closed).
  with open(mp4, 'rb') as video_file:
    video = video_file.read()
  encoded = base64.b64encode(video)
  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
  """Return ``env`` wrapped in a gym Monitor that writes to ./video (force=True)."""
  return Monitor(env, './video', force=True)

In [0]:
env = wrap_env(gym.make("SpaceInvaders-v0"))

In [0]:
# Reward accumulated since the last 100-episode report.
cumulative_reward = 0


def evaluate(episodic_reward, ep):
  '''
  Accumulate the reward of one episode and report a 100-episode average.

  The reward is added to the module-level ``cumulative_reward``. Whenever
  ``ep + 1`` is a multiple of 100, the accumulated total divided by 100 is
  printed and the accumulator is reset to zero. Nothing is logged to wandb.

  Arguments:
    episodic_reward - reward received after playing the current episode
    ep - zero-based index of the episode that just finished
  '''
  global cumulative_reward

  cumulative_reward += episodic_reward

  # Nothing to report until a full window of 100 episodes has been played.
  if (ep + 1) % 100 != 0:
    return

  window_average = cumulative_reward/100
  print('cumulative_avg_reward: ',window_average)
  cumulative_reward = 0

In [0]:
batch_size = 32
no_op_steps = 30
C=10000  # minimum number of frames between two target-network updates
# Value of total_frames_seen at the most recent target update. The previous
# check (`total_frames_seen % C == 0`) ran only once per episode, so it fired
# only if an episode happened to end on an exact multiple of C.
last_target_update = total_frames_seen
for e in range(n_episodes + 1): # iterate over episodes of gameplay
    state = env.reset() # reset the environment at the start of each episode
    done = False
    #agent.model = agent.load('model_1000.h5',custom_objects={'huber_loss':huber_loss})
    agent.state_size = np.reshape(state,[-1,]).shape[0]
    # Take a random number of initial steps so episodes do not all start alike.
    # NOTE(review): action 1 is sent here; confirm it is the intended "no-op"
    # for this environment (index 0 is usually NOOP in Atari action sets).
    for _ in range(random.randint(1, no_op_steps)):
        state, _, _, _ = env.step(1)
    state, stacked_frames = stack_frames(stacked_frames, state, True)
    total_rewards = 0
    while not done:
        action = agent.act(state) # epsilon-greedy action index
        next_state, reward, done, _ = env.step(action) # next frame, reward and end-of-episode flag
        # reward = np.sign(reward)       # reward clipping
        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
        agent.remember(state, action, reward, next_state, done) # store experience
        state = next_state
        # Count the final transition as well; breaking out before these
        # updates dropped the last step's reward and frame.
        total_rewards += reward
        total_frames_seen += 1
    print("episode: {}/{}, frames observed: {}, e: {:.2}" # print the episode's progress and agent's epsilon
          .format(e, n_episodes-1, total_frames_seen, agent.epsilon))
    print('total rewards this episode: ', total_rewards)
    evaluate(total_rewards, e)
    # NOTE(review): this performs a single minibatch update per episode, not
    # one per environment step -- confirm that this cadence is intended.
    if agent.size() > min_observations_to_train:
        agent.train(batch_size)
    if total_frames_seen - last_target_update >= C:
        agent.target_train()   # target network update at least C frames apart, for stability
        last_target_update = total_frames_seen
    if e % 100 == 0 and e!=0 :
        show_video()
        agent.save("model_"+str(int(e))+".h5")
        #wandb.save(os.path.join(wandb.run.dir, "model.h5"))
        # agent.save(output_dir + "weights_"  + '{:04d}'.format(e) + ".hdf5")
        # Optional (Colab): files.download("model_"+str(int(e))+".h5") every 1000 episodes.
env.close()


In [0]:
agent.save('model.h5')

In [0]:
84*84

In [0]:
agent.load('model.h5')
agent.model.summary()

In [0]:
agent.save(os.path.join(wandb.run.dir, "model.h5"))

In [0]:
from keras.models import load_model

In [0]:
wandb.save(os.path.join(wandb.run.dir, "model.h5"))

In [0]:
s = [(0,1),(1,2)]
s.remove((0,1))

In [0]:
state

In [0]:
env.reset().shape

#### Evaluate and play

In [0]:
api = wandb.Api()
agent.load('/content/wandb/run-20200422_031633-24ees0sk/model.h5')

In [0]:
# Running totals used by evaluate_play.
cumulative_reward = 0
episode = 0

def evaluate_play(episodic_reward, reset=False):
  '''
  Record the reward of one evaluation episode and log running statistics.

  The module-level ``episode`` counter is incremented on every call. For the
  first 100 episodes the episodic reward and the running average
  (``cumulative_reward / episode``) are sent to wandb via ``wandb.log``;
  later episodes only print a notice and are not logged or accumulated.

  Arguments:
    episodic_reward - reward received after playing current episode
    reset - when true, both counters are zeroed before this episode is counted

  Returns:
    The cumulative average reward so far, or None once more than 100
    episodes have been played.
  '''
  global episode
  global cumulative_reward

  if reset:
    episode = 0
    cumulative_reward = 0

  episode += 1
  print("Episode: %d"%(episode))

  # Only the first 100 episodes count towards the logged metric.
  if episode <= 100:
    wandb.log({'episodic_reward': episodic_reward})
    cumulative_reward += episodic_reward
    running_average = cumulative_reward/episode
    wandb.log({'cumulative_avg_reward': running_average})
    return running_average

  print("Scores from episodes > 100 won't be logged in wandb.")
  return None

In [0]:
env = gym.make('SpaceInvaders-v0')

In [0]:
agent.model.train_on_batch()

In [0]:
print("Final score: ", np.mean(cumulative_avg_rewards))

In [0]:
cumulative_avg_rewards

In [0]:
1/300000

In [0]:
1 -((1-0.05) * 1e-3)

In [0]:
z = np.random.randn(5,10)
z

In [0]:
z[4,::2]