## Imports

In [None]:
import numpy as np
import gym
from gym import wrappers
import matplotlib.pyplot as plt
%matplotlib inline

# Imports specifically so we can render outputs in Jupyter.
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display


def display_frames_as_gif(frames):
    """
    Show a sequence of image frames inline as a looping animation with controls.

    `frames` is an indexable sequence of images accepted by `plt.imshow`; the
    first frame creates the image artist and every later frame is swapped in
    through `set_data`.
    """
    # Optional: size the figure to the frame's pixel dimensions.
    #plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi = 72)
    image = plt.imshow(frames[0])
    plt.axis('off')

    def draw_frame(index):
        # Replace the pixel data of the existing artist instead of re-plotting.
        image.set_data(frames[index])

    figure = plt.gcf()
    movie = animation.FuncAnimation(figure, draw_frame, frames=len(frames), interval=50)
    widget = display_animation(movie, default_mode='loop')
    display(widget)

## Hyperparameters

In [None]:
# Environment
environment = 'RocketLander-v0'     # Gym environment id passed to gym.make below

# Agent
gamma = 0.99                        # Reward discount factor
learning_rate = 5e-5                # Learning rate
num_episodes = 10                   # number of training episodes
max_steps_ep = 1000                 # default max number of steps per episode (unless env has a lower hardcoded limit)
update_target = 100                 # number of steps to use slow target as target before updating it to latest weights
epsilon_start = 1.0                 # probability of random action at start
epsilon_end = 0.05                  # minimum probability of random action after linear decay period
epsilon_decay_length = 1e5          # number of steps over which to linearly decay epsilon
epsilon_decay_exp = 0.97            # exponential decay rate after reaching epsilon_end (per episode)

# Brain
huber_loss_delta = 1.0              # Huber loss delta (threshold between the quadratic and linear regimes)
save_model_episode = 100            # interval, in episodes, at which the model is saved

# Memory
batch_size = 1024                   # size of batch from experience replay memory for updates
memory_capacity = int(1e6)   # capacity of experience replay memory

# Start environment
env = gym.make(environment)

# State and action variables, read from the inner env object (env.env)
# before the Monitor wrapper is applied further down.
stateCnt  = env.env.observation_space.shape[0]
actionCnt = env.env.action_space.n

# Set the environment and NumPy seeds to 0.
# NOTE(review): Python's `random` module is not seeded in this cell — confirm
# whether reproducibility is expected for code that draws from `random`.
env.seed(0)
np.random.seed(0)

# Prepare monitoring: wrap the env so that a video is recorded for every
# episode whose id is a multiple of 100, written under `monitorDir`.
monitorDir = 'videos'
env = wrappers.Monitor(env, monitorDir, force=True, video_callable=lambda episode_id: episode_id%100==0)

# Directory used as the path prefix for saved models
modelDir = 'models'

## Brain

The `Brain` class encapsulates the neural network. It is defined with 4 hidden layers of 512 neurons each, using the `ReLU` activation function. The number of input neurons equals the dimension of the state vector, and the number of output neurons equals the number of actions.

This network is trained to approximate the Q function. The target model is a copy of the model whose weights are updated less frequently.

The loss is the Huber loss, a loss function used in robust regression that is less sensitive to outliers in the data than the squared error loss.

In [None]:
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import *
import tensorflow as tf

# Huber loss function
def huber_loss(y_true, y_pred):
    """
    Mean Huber loss between `y_true` and `y_pred`.

    Element-wise, the loss is quadratic (0.5 * err^2) where the absolute error
    is below `huber_loss_delta` and linear (delta * (|err| - 0.5 * delta))
    elsewhere; the mean over all elements is returned.
    """
    delta = huber_loss_delta
    residual = y_true - y_pred
    magnitude = K.abs(residual)

    # Quadratic branch for small errors, linear branch for large ones.
    quadratic = 0.5 * K.square(residual)
    linear = delta * (magnitude - 0.5 * delta)

    return K.mean(tf.where(magnitude < delta, quadratic, linear))

class Brain:
    """
    Holds the online Q-network and its target copy.

    Both networks share the same architecture (see `createModel`). The online
    model is the one that gets trained; the target model only changes when
    `updateTargetModel` copies the online weights into it.
    """

    # Initialize brain
    def __init__(self, stateCnt, actionCnt, batchSize):
        self.stateCnt = stateCnt                 # number of state dimensions (network inputs)
        self.actionCnt = actionCnt               # number of actions (network outputs)
        self.batchSize = batchSize               # default mini-batch size used by train()

        self.model = self.createModel()          # online model
        self.targetModel = self.createModel()    # target model

    # Create model
    def createModel(self):
        """
        Build and compile a network with 4 hidden ReLU layers of 512 units.

        Input and output sizes come from this instance (`self.stateCnt`,
        `self.actionCnt`) rather than from module-level globals, so the
        constructor arguments are what actually shape the network.
        """
        model = Sequential()
        model.add(Dense(units=512, activation='relu', input_dim=self.stateCnt))
        for _ in range(3):
            model.add(Dense(units=512, activation='relu'))
        model.add(Dense(units=self.actionCnt, activation='linear'))
        model.compile(loss=huber_loss, optimizer=RMSprop(lr=learning_rate))
        return model

    # Train model using batch of random examples
    def train(self, x, y, batchSize=None, epochs=1, verbose=0):
        """
        Fit the online model on inputs `x` and targets `y`.

        `batchSize` defaults to the batch size given to the constructor; pass
        an explicit value to override it for a single call.
        """
        if batchSize is None:
            batchSize = self.batchSize
        self.model.fit(x, y, batch_size=batchSize, epochs=epochs, verbose=verbose)

    # Predict using normal or target model given a batch of states
    def predict(self, s, target=False):
        """Return predictions for the batch `s` from the target model if `target`, else the online model."""
        network = self.targetModel if target else self.model
        return network.predict(s)

    # Predict given only one state
    def predictOne(self, s, target=False):
        """Return the flattened prediction for a single state `s` (reshaped to a batch of one)."""
        return self.predict(s.reshape(1, self.stateCnt), target=target).flatten()

    # Update target model
    def updateTargetModel(self):
        """Copy the online model's weights into the target model."""
        self.targetModel.set_weights(self.model.get_weights())

    # Save target model
    def saveModel(self, episode):
        """
        Save the target model to `<modelDir>/<environment><episode>.h5`.

        The output directory is created first if it is missing, so the first
        save does not fail on a fresh checkout.
        """
        import os

        os.makedirs(modelDir, exist_ok=True)
        self.targetModel.save(modelDir + "/" + environment + str(episode) + ".h5")

## Memory

The `Memory` class is necessary to store the experience that will be used for experience replay. A `deque` was chosen because it is a list-like container with fast appends and pops on either end.

Each experience saved to the memory will have the following information:

`(current_state, action, reward, next_state)`

In [None]:
from collections import deque
import random

class Memory:
    """
    Bounded experience-replay buffer backed by a `deque`.

    Once `capacity` entries are stored, appending a new one drops the oldest.
    """

    def __init__(self, capacity):
        """Create an empty buffer that keeps at most `capacity` samples."""
        self.capacity = capacity
        self.samples = deque(maxlen=capacity)

    def add(self, sample):
        """Append one sample to the buffer."""
        self.samples.append(sample)

    def sample(self, n):
        """Return up to `n` stored samples chosen at random without replacement."""
        available = self.numberSamples()
        count = n if n < available else available
        return random.sample(self.samples, count)

    def numberSamples(self):
        """Return how many samples are currently stored."""
        return len(self.samples)

## Agent

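The `replay` function is responsible for sampling a random batch of experiences from memory, computing the Q-learning targets with the target model, and training the model on that batch.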

In [None]:
class Agent:
    """
    Epsilon-greedy agent that learns from replayed experience.

    The agent owns a `Brain` (online and target networks) and a `Memory`
    (replay buffer). Exploration state (`steps`, `epsilon`) is kept per
    instance, and the batch size given to the constructor is the one used
    when sampling and training in `replay`.
    """

    # Initialize agent
    def __init__(self, stateCnt, actionCnt, memoryCapacity, updateTarget, batchSize):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt
        self.updateTarget = updateTarget       # steps between target-model refreshes
        self.batchSize = batchSize             # replay batch size

        # Exploration state lives on the instance so that separate agents
        # never share counters.
        self.steps = 0
        self.epsilon = epsilon_start
        self.epsilon_linear_step = (epsilon_start - epsilon_end) / epsilon_decay_length

        self.brain = Brain(stateCnt, actionCnt, batchSize)     # initialize brain
        self.memory = Memory(memoryCapacity)                   # initialize memory

    # Act based on epsilon
    def act(self, s):
        """Return a random action with probability `epsilon`, else the greedy action for state `s`."""
        if random.random() < self.epsilon:
            return random.randint(0, self.actionCnt-1)
        return np.argmax(self.brain.predictOne(s))

    # Save experience and update target model if necessary
    def observe(self, sample):
        """Store `sample` in memory and refresh the target model every `updateTarget` steps."""
        self.memory.add(sample)

        if self.steps % self.updateTarget == 0:
            self.brain.updateTargetModel()

    # Decrement epsilon
    def decrementEpsilon(self, done):
        """
        Advance the step counter and decay epsilon.

        Epsilon decreases linearly for the first `epsilon_decay_length` steps;
        after that it is multiplied by `epsilon_decay_exp` each time an
        episode ends (`done` is true).
        """
        self.steps += 1

        # linearly decay epsilon from epsilon_start to epsilon_end over epsilon_decay_length steps
        if self.steps < epsilon_decay_length:
            self.epsilon -= self.epsilon_linear_step
        # then exponentially decay it every episode
        elif done:
            self.epsilon *= epsilon_decay_exp

    # Replay saved data
    def replay(self):
        """
        Train the online model on one random batch from memory.

        Each stored sample is `(state, action, reward, next_state)`, with
        `next_state` set to None for terminal transitions. For every sample
        the target for the taken action is `reward` (terminal) or
        `reward + gamma * max(target_model(next_state))`; targets for the
        other actions are the online model's current predictions.
        """
        batch = self.memory.sample(self.batchSize)
        if not batch:
            # Nothing stored yet, so there is nothing to learn from.
            return
        batchLen = len(batch)

        # Terminal transitions have no next state; feed zeros so the batch
        # stays rectangular (their prediction is never used below).
        no_state = np.zeros(self.stateCnt)

        states = np.array([o[0] for o in batch])
        states_ = np.array([no_state if o[3] is None else o[3] for o in batch])

        p = self.brain.predict(states)
        p_ = self.brain.predict(states_, target=True)

        x = np.zeros((batchLen, self.stateCnt))
        y = np.zeros((batchLen, self.actionCnt))

        for i, (s, a, r, s_) in enumerate(batch):
            t = p[i]
            if s_ is None:
                t[a] = r
            else:
                t[a] = r + gamma * np.amax(p_[i])

            x[i] = s
            y[i] = t

        self.brain.train(x, y, batchSize=self.batchSize)

## Main

In [None]:
# Initialize agent
agent = Agent(stateCnt, actionCnt, memory_capacity, update_target, batch_size)

# Populate memory with transitions from a uniformly random policy until it
# holds more than one batch worth of samples.
while agent.memory.numberSamples() <= batch_size:
    state = env.reset()
    done = False

    while not done:
        # act
        action = random.randint(0, actionCnt-1)

        # execute action
        next_state, reward, done, _ = env.step(action)

        # observe (next state is None for terminal transitions)
        agent.memory.add((state, action, reward, None if done else next_state))

        # advance to the new observation so that the next stored transition
        # starts from the state the environment is actually in
        state = next_state
    
# Train for num_episodes episodes, each capped at max_steps_ep steps.
for episode in range(num_episodes):
    total_reward = 0
    steps_in_episode = 0

    state = env.reset()

    for frame in range(max_steps_ep):
        # act
        action = agent.act(state)

        # execute action
        next_state, reward, done, _ = env.step(action)

        # update total reward
        total_reward += reward

        # observe (next state is None for terminal transitions)
        agent.observe((state, action, reward, None if done else next_state))

        # decrement epsilon
        agent.decrementEpsilon(done)

        # replay
        agent.replay()

        # update variables
        state = next_state
        steps_in_episode += 1

        if done:
            break

    # save model once per qualifying episode, after its last step, instead of
    # re-writing the same file on every step of that episode
    if episode % save_model_episode == 0:
        agent.brain.saveModel(episode)

    print('Episode %2i, Reward: %7.3f, Steps: %i, Next eps: %7.3f'%(episode,total_reward,steps_in_episode, agent.epsilon))