# Double Deep Q-Learning & OpenAI Gym: Intro

## The OpenAI Gym Lunar Lander Environment - Double Deep Q-Learning

### Loading Libraries

In [83]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from mdptoolbox import mdp
from itertools import product

# Time & Path
import time
from pathlib import Path
from time import process_time

# Warnings
import warnings

# TensorFlow
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

# OpenAI Gym
import gym
from gym import wrappers

In [85]:
# Silence all warnings and use seaborn's 'whitegrid' theme without grid lines.
warnings.filterwarnings('ignore')
sns.set_style('whitegrid', rc={'axes.grid': False})

# Seed the TensorFlow and NumPy global random generators with the same value.
tf.random.set_seed(42)
np.random.seed(42)

In [87]:
# Ask TensorFlow which GPUs it can see; an empty list means CPU-only.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')

if not gpu_devices:
    print('Using CPU')
else:
    print('Using GPU')
    # Memory growth is switched on for the first GPU only.
    tf.config.experimental.set_memory_growth(gpu_devices[0], True)

Using GPU


#### Helper Function to Format Elapsed Time

In [90]:
def format_time(t):
    """Format a duration in seconds as a zero-padded ``HH:MM:SS`` string.

    The duration is rounded to whole seconds *before* it is split into
    hours, minutes and seconds, so a value such as 59.6 carries over into
    the minutes field ('00:01:00') instead of producing '00:00:60'.

    Args:
        t: Elapsed time in seconds (int or float).

    Returns:
        The formatted string. Hours are not wrapped at 24.
    """
    total_seconds = int(round(t))
    minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return '{:02d}:{:02d}:{:02d}'.format(hours, minutes, seconds)

#### Enable Virtual Display to Run from Docker Container

In [95]:
# from pyvirtualdisplay import Display
# virtual_display = Display(visible=0, size=(1400, 900))
# virtual_display.start()

### Defining DDQN Agent

#### Replay Buffer

In [98]:
class Memory():
    """Fixed-capacity replay buffer backed by preallocated NumPy arrays.

    Transitions are written into a ring: once ``capacity`` transitions have
    been stored, each new one overwrites the oldest slot. ``idx`` counts
    every transition ever stored (it is not reset on wrap-around), so
    callers can keep using it to check how much experience was collected.
    """

    def __init__(self, capacity, state_dims):
        """Allocate storage.

        Args:
            capacity: Maximum number of transitions held at once.
            state_dims: Length of a single state vector.
        """
        self.capacity = capacity
        self.idx = 0  # total number of transitions stored so far

        self.state_memory = np.zeros(shape=(capacity, state_dims),
                                     dtype=np.float32)
        self.new_state_memory = np.zeros_like(self.state_memory)

        self.action_memory = np.zeros(capacity, dtype=np.int32)
        # Rewards are real-valued; an integer array would silently truncate
        # them on assignment.
        self.reward_memory = np.zeros(capacity, dtype=np.float32)
        self.done = np.zeros(capacity, dtype=np.int32)

    def store(self, state, action, reward, next_state, done):
        """Write one transition, overwriting the oldest slot when full.

        ``done`` is stored inverted (1 while the episode continues, 0 on the
        final step) so it can be used directly as a multiplicative mask.
        """
        slot = self.idx % self.capacity
        self.state_memory[slot, :] = state
        self.new_state_memory[slot, :] = next_state
        self.reward_memory[slot] = reward
        self.action_memory[slot] = action
        self.done[slot] = 1 - int(done)
        self.idx += 1

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct stored transitions uniformly at random.

        Returns:
            Tuple ``(states, actions, rewards, next_states, done)`` of arrays
            with ``batch_size`` rows; ``done`` is the inverted flag written by
            ``store``.

        Raises:
            ValueError: If fewer than ``batch_size`` transitions are
                available (raised by ``np.random.choice``).
        """
        # Only slots that have actually been written are eligible.
        filled = min(self.idx, self.capacity)
        batch = np.random.choice(filled, batch_size, replace=False)

        states = self.state_memory[batch]
        next_states = self.new_state_memory[batch]
        rewards = self.reward_memory[batch]
        actions = self.action_memory[batch]
        done = self.done[batch]
        return states, actions, rewards, next_states, done

#### Agent Class

In [101]:
class DDQNAgent:
    """Double deep Q-learning agent with ε-greedy exploration and experience replay.

    Two identically shaped dense networks are kept. The online network is
    trained and selects actions; the target network, whose weights are copied
    from the online network every ``tau`` environment steps, supplies the
    value of the selected next action when the TD target is computed.
    """

    def __init__(self,
                 state_dim,
                 num_actions,
                 gamma,
                 epsilon_start,
                 epsilon_end,
                 epsilon_decay_steps,
                 epsilon_exponential_decay,
                 learning_rate,
                 architecture,
                 l2_reg,
                 replay_capacity,
                 tau,
                 batch_size,
                 results_dir,
                 log_every=10,
                 experiment=None):
        """Build the networks, replay buffer, optimizer and bookkeeping state.

        Args:
            state_dim: Length of a state vector (input size of the networks).
            num_actions: Number of discrete actions (output size).
            gamma: Discount factor used in the TD target.
            epsilon_start: Initial exploration probability.
            epsilon_end: Value reached at the end of the linear decay phase.
            epsilon_decay_steps: Number of episodes of linear ε decay.
            epsilon_exponential_decay: Factor ε is multiplied by after each
                episode once the linear phase is over.
            learning_rate: Adam learning rate.
            architecture: Iterable with the unit count of each hidden layer.
            l2_reg: L2 factor for the hidden layers' kernel regularizer.
            replay_capacity: Capacity of the replay buffer.
            tau: Target network update interval, in environment steps.
            batch_size: Number of transitions per training step.
            results_dir: Directory (``Path`` or string) for TensorBoard logs
                and ``results.csv``.
            log_every: Print a progress line every this many episodes.
            experiment: Optional experiment label stored on the agent. When
                omitted, a module-level ``experiment`` variable is used if one
                exists, otherwise ``None``.
        """
        self.state_dim = state_dim
        self.num_actions = num_actions

        self.architecture = architecture
        self.l2_reg = l2_reg
        self.learning_rate = learning_rate
        self.experience = Memory(replay_capacity,
                                 state_dim)
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.idx = np.arange(batch_size, dtype=np.int32)

        self.online_network = self.build_model()
        self.target_network = self.build_model(trainable=False)
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        self.optimizer = Adam(learning_rate=learning_rate)
        self.update_target()

        self.epsilon = epsilon_start
        self.epsilon_decay_steps = epsilon_decay_steps
        self.epsilon_decay = (epsilon_start - epsilon_end) / epsilon_decay_steps
        self.epsilon_exponential_decay = epsilon_exponential_decay
        self.epsilon_history = []

        self.total_steps = self.train_steps = 0
        self.episodes = self.episode_length = self.train_episodes = 0
        self.steps_per_episode = []
        self.episode_reward = 0
        self.rewards_history = []

        # Accept plain strings as well as Path objects.
        self.results_dir = Path(results_dir)
        # Keep the old behaviour of picking up a module-level `experiment`
        # when none is passed, but without failing if it is not defined.
        if experiment is None:
            experiment = globals().get('experiment')
        self.experiment = experiment
        self.log_every = log_every

        self.summary_writer = (tf.summary
                               .create_file_writer(self.results_dir.as_posix()))
        # `time` is imported as a module in this file, so call time.time().
        self.start = time.time()
        self.train = True

    def build_model(self, trainable=True):
        """Return a Sequential network: ReLU hidden layers plus a linear Q-value head.

        Args:
            trainable: Passed to every layer; ``False`` is used for the
                target network.
        """
        layers = []
        for i, units in enumerate(self.architecture, 1):
            layers.append(Dense(units=units,
                                input_dim=self.state_dim if i == 1 else None,
                                activation='relu',
                                kernel_regularizer=l2(self.l2_reg),
                                trainable=trainable))
        layers.append(Dense(units=self.num_actions,
                            trainable=trainable))
        return Sequential(layers)

    def update_target(self):
        """Copy the online network's weights into the target network."""
        self.target_network.set_weights(self.online_network.get_weights())

    def epsilon_greedy_policy(self, state):
        """Pick an action: random with probability ε, otherwise the online argmax.

        Also increments ``total_steps``, which drives the target update
        schedule.

        Args:
            state: Batch of states fed to the online network; the caller in
                this file passes a single reshaped state.

        Returns:
            The chosen action index.
        """
        self.total_steps += 1
        if np.random.rand() <= self.epsilon:
            return np.random.choice(self.num_actions)
        # verbose=0 avoids a progress bar on every environment step.
        q = self.online_network.predict(state, verbose=0)
        return np.argmax(q, axis=1).squeeze()

    def decay_epsilon(self):
        """Decay ε once per episode: linearly at first, then exponentially.

        Does nothing when ``self.train`` is false.
        """
        if self.train:
            if self.episodes < self.epsilon_decay_steps:
                self.epsilon -= self.epsilon_decay
            else:
                self.epsilon *= self.epsilon_exponential_decay

    def log_progress(self):
        """Record the finished episode and write TensorBoard and console output."""
        self.rewards_history.append(self.episode_reward)
        self.steps_per_episode.append(self.episode_length)

        avg_steps_100 = np.mean(self.steps_per_episode[-100:])
        avg_steps_10 = np.mean(self.steps_per_episode[-10:])
        max_steps_10 = np.max(self.steps_per_episode[-10:])
        avg_rewards_100 = np.mean(self.rewards_history[-100:])
        avg_rewards_10 = np.mean(self.rewards_history[-10:])
        max_rewards_10 = np.max(self.rewards_history[-10:])

        with self.summary_writer.as_default():
            tf.summary.scalar('Episode Reward', self.episode_reward, step=self.episodes)
            tf.summary.scalar('Episode Rewards (MA 100)', avg_rewards_100, step=self.episodes)
            tf.summary.scalar('Episode Steps', self.episode_length, step=self.episodes)
            tf.summary.scalar('Epsilon', self.epsilon, step=self.episodes)

        if self.episodes % self.log_every == 0:
            template = '{:03} | {} | Rewards {:4.0f} {:4.0f} {:4.0f} | ' \
                       'Steps: {:4.0f} {:4.0f} {:4.0f} | Epsilon: {:.4f}'
            print(template.format(self.episodes, format_time(time.time() - self.start),
                                  avg_rewards_100, avg_rewards_10, max_rewards_10,
                                  avg_steps_100, avg_steps_10, max_steps_10,
                                  self.epsilon))

    def memorize_transition(self, s, a, r, s_prime, done):
        """Store a transition and, when the episode ends, do per-episode bookkeeping.

        On ``done`` the current ε is recorded, ε is decayed, the episode
        counter is advanced, progress is logged and the per-episode reward
        and length counters are reset.
        """
        self.experience.store(s, a, r, s_prime, done)
        self.episode_reward += r
        self.episode_length += 1

        if done:
            self.epsilon_history.append(self.epsilon)
            self.decay_epsilon()
            self.episodes += 1
            self.log_progress()
            self.episode_reward = 0
            self.episode_length = 0

    def experience_replay(self):
        """Run one double-DQN gradient step on a sampled minibatch.

        Returns immediately while the buffer holds fewer than ``batch_size``
        transitions. After the step, the target network is re-synced whenever
        ``total_steps`` is a multiple of ``tau``.
        """
        # not enough experience yet
        if self.batch_size > self.experience.idx:
            return

        # sample minibatch; `not_done` is 0 for terminal transitions, 1 otherwise
        states, actions, rewards, next_states, not_done = self.experience.sample(self.batch_size)

        # select best next action (online)
        next_action = tf.argmax(self.online_network.predict(next_states,
                                                            batch_size=self.batch_size,
                                                            verbose=0),
                                axis=1, name='next_action')
        # predict next q values (target)
        next_q_values = self.target_network.predict(next_states,
                                                    batch_size=self.batch_size,
                                                    verbose=0)
        # get q values for best next action
        target_q = (tf.math.reduce_sum(next_q_values *
                                       tf.one_hot(next_action,
                                                  self.num_actions),
                                       axis=1, name='target_q'))
        # compute td target; the mask removes the bootstrap term on terminal steps
        td_target = rewards + not_done * self.gamma * target_q

        with tf.GradientTape() as tape:
            q_values = self.online_network(states)
            q_values = tf.math.reduce_sum(q_values * tf.one_hot(actions, self.num_actions), axis=1, name='q_values')
            loss = tf.math.reduce_mean(tf.square(td_target - q_values))
            # In a custom training step the layers' kernel regularizers only
            # take effect if their penalties are added to the loss explicitly.
            regularization = self.online_network.losses
            if regularization:
                loss += tf.add_n(regularization)

        # run back propagation
        variables = self.online_network.trainable_variables
        gradients = tape.gradient(loss, variables)

        self.optimizer.apply_gradients(zip(gradients, variables))
        with self.summary_writer.as_default():
            tf.summary.scalar('Loss', loss, step=self.train_steps)
        self.train_steps += 1

        if self.total_steps % self.tau == 0:
            self.update_target()

    def store_results(self):
        """Write per-episode rewards, steps and ε to ``results.csv`` in ``results_dir``."""
        result = pd.DataFrame({'Rewards': self.rewards_history,
                               'Steps'  : self.steps_per_episode,
                               'Epsilon': self.epsilon_history},
                              index=list(range(1, len(self.rewards_history) + 1)))

        result.to_csv(self.results_dir / 'results.csv', index=False)

### Running Experiment

In [104]:
# Experiment number; it becomes part of the results directory name.
experiment = 0

In [106]:
# Per-experiment output folder: results/lunar_lander/experiment_<n>.
results_dir = Path('results', 'lunar_lander', 'experiment_{}'.format(experiment))

# exist_ok=True makes this a no-op when the folder already exists and avoids
# the race between a separate exists() check and the creation.
results_dir.mkdir(parents=True, exist_ok=True)

#### Set Up the OpenAI Gym Lunar Lander Environment

In [114]:
# Create the Lunar Lander environment and read the sizes the agent needs.
env = gym.make('LunarLander-v2')

state_dim = env.observation_space.shape[0]  # length of the observation vector
num_actions = env.action_space.n  # number of discrete actions
max_episode_steps = env.spec.max_episode_steps  # step limit per episode from the env spec

# Seeded reset; as the last expression of the cell its return value is displayed.
env.reset(seed=42)

(array([ 0.00229702,  1.4181306 ,  0.23264714,  0.32046658, -0.00265488,
        -0.05269808,  0.        ,  0.        ], dtype=float32),
 {})

In [116]:
# Folder for the monitor wrapper's output, inside the experiment's results directory.
monitor_path = results_dir / 'monitor'
# A video is requested for every episode whose count is a multiple of this value.
video_freq = 500

In [124]:
# Wrap the environment so that episodes selected by `video_callable` are
# recorded under `monitor_path`.
# NOTE(review): the recorded output of env.reset(seed=42) earlier in this file
# is an (observation, info) tuple, which suggests a newer gym API than the one
# `wrappers.Monitor` belongs to - confirm the installed gym still provides it.
env = wrappers.Monitor(env,
                       directory=monitor_path.as_posix(),
                       video_callable=lambda count: count % video_freq == 0,
                      force=True)

### Defining Hyperparameters

#### Discount Factor

In [127]:
# Discount applied to the bootstrapped next-state value in the agent's TD target.
gamma = .99

#### Q-Network Parameters

In [130]:
# Learning rate handed to the agent's Adam optimizer.
learning_rate = 0.0001

In [132]:
architecture = (256, 256)  # units per hidden Dense layer, one entry per layer

l2_reg = 1e-6  # L2 factor for the hidden layers' kernel regularizer

#### Replay Buffer & Target Network Parameters

In [135]:
tau = 100  # target network is re-synced when total_steps is a multiple of this

# Number of transitions the replay buffer preallocates room for.
replay_capacity = int(1e6)

# Transitions sampled per training step; no training happens until this many are stored.
batch_size = 1024

#### ε-greedy Policy

In [138]:
# Initial exploration probability.
epsilon_start = 1.0

# Together with epsilon_start and epsilon_decay_steps this sets the linear decay step size.
epsilon_end = 0.01

# Number of episodes during which epsilon is decayed linearly.
epsilon_decay_steps = 100

# Per-episode multiplicative decay applied once the linear phase is over.
epsilon_exponential_decay = .99

### Instantiate DDQN Agent

In [143]:
# Build the agent from the hyperparameters defined in the cells above;
# keyword arguments follow the order of the constructor signature.
agent = DDQNAgent(
    state_dim=state_dim,
    num_actions=num_actions,
    gamma=gamma,
    epsilon_start=epsilon_start,
    epsilon_end=epsilon_end,
    epsilon_decay_steps=epsilon_decay_steps,
    epsilon_exponential_decay=epsilon_exponential_decay,
    learning_rate=learning_rate,
    architecture=architecture,
    l2_reg=l2_reg,
    replay_capacity=replay_capacity,
    tau=tau,
    batch_size=batch_size,
    results_dir=results_dir,
)

### Training & Testing Agent

In [146]:
# Reset Keras' global state.
# NOTE(review): this runs after the agent's networks were built - confirm that
# ordering is intended.
tf.keras.backend.clear_session()

In [148]:
# Upper bound on the number of training episodes.
max_episodes = 2500

# NOTE(review): not referenced anywhere else in the visible part of this file.
test_episodes = 0

In [150]:
# Train until the episode budget is used up or the average reward over the
# most recent (up to 100) episodes exceeds 200.
while agent.episodes < max_episodes:
    # Newer gym releases return (observation, info) from reset(); older ones
    # return the bare observation. Accept both.
    reset_result = env.reset()
    this_state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False
    while not done:
        action = agent.epsilon_greedy_policy(this_state.reshape(-1, state_dim))
        # Likewise, step() returns either the 5-tuple
        # (obs, reward, terminated, truncated, info) or the older 4-tuple
        # (obs, reward, done, info).
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result
        agent.memorize_transition(this_state, action, reward, next_state, done)
        agent.experience_replay()
        this_state = next_state
    if np.mean(agent.rewards_history[-100:]) > 200:
        break

# Persist the per-episode log and release the environment.
agent.store_results()
env.close()

### Evaluating Results

In [153]:
# Read back the per-episode log saved in the results directory.
results = pd.read_csv(results_dir / 'results.csv')

# Rolling mean of the rewards over 100 episodes; undefined until 25 episodes exist.
results['MA100'] = results['Rewards'].rolling(window=100, min_periods=25).mean()

In [155]:
fig, axes = plt.subplots(ncols=2, figsize=(16, 4), sharex=True)

# Left panel: episode rewards with their moving average and a reference line at 200.
results[['Rewards', 'MA100']].plot(ax=axes[0])
axes[0].set(ylabel='Rewards', xlabel='Episodes')
axes[0].axhline(200, c='k', ls='--', lw=1)

# Right panel: episode length, with epsilon on a secondary y-axis.
results[['Steps', 'Epsilon']].plot(secondary_y='Epsilon', ax=axes[1])
axes[1].set_xlabel('Episodes')

fig.suptitle('Double Deep Q-Network Agent | Lunar Lander', fontsize=16)
fig.tight_layout()
fig.subplots_adjust(top=.9)  # leave room for the title

# NOTE(review): the file name looks carried over from another notebook -
# confirm before renaming, in case something else reads it.
fig.savefig(results_dir / 'trading_agent_2ed', dpi=300)
plt.show()

### Tensorboard

In [158]:
# IPython line magic: load the TensorBoard notebook extension.
%load_ext tensorboard

In [160]:
# Open TensorBoard on the results directory of experiment 0, where the agent's summary writer logs.
%tensorboard --logdir results/lunar_lander/experiment_0