# Deep Q-Network

Reference: [Landing a Rocket with Reinforcement Learning](https://towardsdatascience.com/ai-learning-to-land-a-rocket-reinforcement-learning-84d61f97d055)

```
initialize replay memory R
initialize action-value function Q (with random weights)
observe initial state s
repeat
	select an action a
		with probability ϵ select a random action
		otherwise select a = argmax_a′ Q(s, a′)
	carry out action a
	observe reward r and new state s′
	store experience <s, a, r, s′> in replay memory R
	sample random transitions <ss, aa, rr, ss′> from replay memory R
	calculate target for each minibatch transition
		if ss′ is terminal state then tt = rr, otherwise tt = rr + γ max_aa′ Q(ss′, aa′)
	train the Q network using (tt − Q(ss, aa))² as loss
	s=s′
until terminated
```

In [None]:
%%capture
# Install the packages this notebook imports: gym (with Box2D, which the
# LunarLander environment used below depends on), wandb and knockknock.
# NOTE(review): the Box2D / box2d / box2d-py installs and the gym[all] /
# gym[Box_2D] installs look redundant — confirm which ones the runtime needs.
!pip install Box2D
!pip install box2d
!pip install box2d-py
!pip install gym[all]
!pip install gym[Box_2D]
!pip install wandb
!pip install knockknock

In [None]:
# Mount Google Drive (Colab-only API) and change into the project directory.
# NOTE(review): presumably required so the project-local `utils` and
# `model.dqn` imports and the relative save paths resolve — confirm.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/Othercomputers/razer13/0-assignments/ca2/part-b

In [None]:
import gym
from knockknock import telegram_sender

import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import load_model

from utils import seed_everything, Experience, ReplayBuffer
from collections import deque
from model.dqn import DQN

In [None]:
import wandb
# Authenticate with Weights & Biases, then start a run in the
# "dqn-lunar-lander" project under the "onsen" entity.
wandb.login()
wandb.init(
    entity="onsen",
    project="dqn-lunar-lander"
)
# NOTE(review): a 40-character hex string that looked like an API key was
# stored here as a comment and has been removed. If it was a real key, revoke
# it and supply credentials through the wandb.login() prompt or the
# WANDB_API_KEY environment variable instead of committing them.

In [None]:
# Set random seed to 1 so repeated runs are comparable.
# NOTE(review): seed_everything comes from the project-local `utils` module;
# which random number generators it seeds is not visible here — confirm.
seed_everything(1)

In [None]:
import os

# Environment the agent is trained on.
env = gym.make('LunarLander-v2')
# env = gym.make('CartPole-v1')

# set seeds
env.seed(1)

# setting up params
lr = 0.001
epsilon = 1.0
epsilon_decay = 0.995
gamma = 0.99
training_episodes = 2000
noisy = True

# create new deep q-network instance
model = DQN(env=env, lr=lr, gamma=gamma, epsilon=epsilon, epsilon_decay=epsilon_decay, log_wandb=True, noisy=noisy)

# The Telegram bot token is a credential and must never be committed to the
# notebook. Export TELEGRAM_BOT_TOKEN before running this cell; a missing
# variable fails fast with a KeyError instead of silently skipping the
# notification.
# NOTE(review): the token that used to be hard-coded here should be revoked.
telegram_token = os.environ["TELEGRAM_BOT_TOKEN"]


# training
@telegram_sender(token=telegram_token, chat_id=943489922)
def train_model(model):
    """Train ``model`` for ``training_episodes`` episodes with mean-based stopping enabled.

    The ``telegram_sender`` decorator wraps the call so a Telegram message is
    sent to the configured chat about the run.
    """
    model.train(training_episodes, mean_stopping=True)


train_model(model)

In [None]:
# Complete training: mark the active Weights & Biases run as finished.
wandb.finish()

In [None]:
# Persist the trained agent under ./saved-models/dqn.
# NOTE(review): `model` is the project's DQN object; what save() actually
# writes (presumably a Keras HDF5 file, given the .h5 suffix) is not visible
# here — confirm against model/dqn.py.
model.save('./saved-models/dqn/toasty_plant_44.h5')

In [None]:
def plot_df(df, chart_name, title, x_axis_label, y_axis_label):
    """Plot a series together with its 100-row rolling mean and save the chart.

    Args:
        df: DataFrame whose first column holds the values to plot. It is not
            modified; the rolling-mean column is added to an internal copy.
        chart_name: Path passed to ``Figure.savefig``.
        title: Accepted for interface compatibility; the chart is currently
            saved without a title.
        x_axis_label: Label for the x axis.
        y_axis_label: Label for the y axis.
    """
    plt.rcParams.update({'font.size': 17})

    # Work on a copy so the helper column does not leak into the caller's frame.
    plot_data = df.copy()
    plot_data['rolling_mean'] = plot_data[plot_data.columns[0]].rolling(100).mean()

    # DataFrame.plot creates its own figure when no `ax` is given, so no
    # placeholder plt.figure() is needed (it would only leave an empty
    # figure behind).
    axes = plot_data.plot(linewidth=1.5, figsize=(15, 8))
    axes.set_xlabel(x_axis_label)
    axes.set_ylabel(y_axis_label)
    axes.legend().set_visible(False)
    axes.get_figure().savefig(chart_name)

def plot_df2(df, chart_name, title, x_axis_label, y_axis_label):
    """Plot a series together with its overall mean on fixed axes and save the chart.

    The axes are clamped to x in [0, 100] and y in [0, 300].

    Args:
        df: DataFrame whose first column holds the values to plot. It is not
            modified; the constant mean column is added to an internal copy.
        chart_name: Path passed to ``Figure.savefig``.
        title: Accepted for interface compatibility; the chart is currently
            saved without a title.
        x_axis_label: Label for the x axis.
        y_axis_label: Label for the y axis.
    """
    # Work on a copy so the helper column does not leak into the caller's frame.
    plot_data = df.copy()
    plot_data['mean'] = plot_data[plot_data.columns[0]].mean()

    plt.rcParams.update({'font.size': 17})

    # DataFrame.plot creates its own figure when no `ax` is given, so no
    # placeholder plt.figure() is needed (it would only leave an empty
    # figure behind).
    axes = plot_data.plot(linewidth=1.5, figsize=(15, 8))
    axes.set_xlabel(x_axis_label)
    axes.set_ylabel(y_axis_label)
    axes.set_ylim((0, 300))
    axes.set_xlim((0, 100))
    axes.legend().set_visible(False)
    axes.get_figure().savefig(chart_name)

def plot_experiments(df, chart_name, title, x_axis_label, y_axis_label, y_limit):
    """Plot every column of ``df`` as one line on a titled chart and save it.

    Args:
        df: DataFrame with one column per experiment configuration.
        chart_name: Path passed to ``Figure.savefig``.
        title: Title drawn above the chart.
        x_axis_label: Label for the x axis.
        y_axis_label: Label for the y axis.
        y_limit: ``(bottom, top)`` limits applied to the y axis.
    """
    plt.rcParams.update({'font.size': 17})

    # DataFrame.plot creates its own figure when no `ax` is given, so no
    # placeholder plt.figure() is needed (it would only leave an empty
    # figure behind).
    axes = df.plot(linewidth=1, figsize=(15, 8), title=title)
    axes.set_xlabel(x_axis_label)
    axes.set_ylabel(y_axis_label)
    axes.set_ylim(y_limit)
    axes.get_figure().savefig(chart_name)

In [None]:
# test the model
# NOTE(review): `save_dir` and `test_already_trained_model` are not defined at
# top level anywhere in the visible notebook, so this cell raises NameError
# unless they come from an earlier session — define or import them first.
trained_model = load_model(save_dir + "trained_model_new.h5")
test_rewards = test_already_trained_model(trained_model)

# Persist the rewards and read them back. The context managers guarantee each
# file handle is flushed and closed instead of relying on garbage collection.
with open(save_dir + "test_rewards_new.p", "wb") as rewards_file:
    pickle.dump(test_rewards, rewards_file)
with open(save_dir + "test_rewards_new.p", "rb") as rewards_file:
    test_rewards = pickle.load(rewards_file)

plot_df2(
    pd.DataFrame(test_rewards),
    "Figure 2: Reward for each testing episode",
    "Reward for each testing episode",
    "Episode",
    "Reward"
)

print("Training and Testing Completed...!")

# Hyperparameter Experiments

In [None]:
def run_experiment_for_gamma():
    """Train one DQN per gamma value and chart the per-episode rewards.

    Each value in ``gamma_list`` gets a fresh ``DQN`` trained for 1000
    episodes on the same LunarLander environment. The collected reward lists
    are pickled to ``rewards_list_for_gammas.p`` and plotted with
    ``plot_experiments``.
    """
    print('Running Experiment for gamma...')
    env = gym.make('LunarLander-v2')

    # set seeds
    env.seed(21)
    np.random.seed(21)

    # setting up params
    lr = 0.001
    epsilon = 1.0
    epsilon_decay = 0.995
    gamma_list = [0.99, 0.9, 0.8, 0.7]
    training_episodes = 1000

    rewards_list_for_gammas = []

    for gamma_value in gamma_list:
        model = DQN(env, lr, gamma_value, epsilon, epsilon_decay)
        print("Training model for Gamma: {}".format(gamma_value))
        # NOTE(review): the second positional argument is presumably
        # `mean_stopping` — confirm against DQN.train.
        model.train(training_episodes, False)
        rewards_list_for_gammas.append(model.rewards_list)

    # Context managers make sure the file is flushed and closed before it is
    # read back, rather than relying on garbage collection of the handle.
    with open("rewards_list_for_gammas.p", "wb") as rewards_file:
        pickle.dump(rewards_list_for_gammas, rewards_file)
    with open("rewards_list_for_gammas.p", "rb") as rewards_file:
        rewards_list_for_gammas = pickle.load(rewards_file)

    # One row per episode (1-based index), one column per gamma value.
    gamma_rewards_pd = pd.DataFrame(index=pd.Series(range(1, training_episodes + 1)))

    for gamma_value, rewards in zip(gamma_list, rewards_list_for_gammas):
        gamma_rewards_pd["gamma=" + str(gamma_value)] = rewards

    plot_experiments(
        gamma_rewards_pd,
        "Figure 4: Rewards per episode for different gamma values",
        "Figure 4: Rewards per episode for different gamma values",
        "Episodes",
        "Reward",
        (-600, 300)
    )

def run_experiment_for_lr():
    """Train one DQN per learning rate and chart the per-episode rewards.

    Each value in ``lr_values`` gets a fresh ``DQN`` trained for 1000
    episodes on the same LunarLander environment. The collected reward lists
    are pickled to ``rewards_list_for_lrs.p`` and plotted with
    ``plot_experiments``.
    """
    print('Running Experiment for learning rate...')
    env = gym.make('LunarLander-v2')

    # set seeds
    env.seed(21)
    np.random.seed(21)

    # setting up params
    lr_values = [0.0001, 0.001, 0.01, 0.1]
    epsilon = 1.0
    epsilon_decay = 0.995
    gamma = 0.99
    training_episodes = 1000
    rewards_list_for_lrs = []

    for lr_value in lr_values:
        model = DQN(env, lr_value, gamma, epsilon, epsilon_decay)
        print("Training model for LR: {}".format(lr_value))
        # NOTE(review): the second positional argument is presumably
        # `mean_stopping` — confirm against DQN.train.
        model.train(training_episodes, False)
        rewards_list_for_lrs.append(model.rewards_list)

    # Context managers make sure the file is flushed and closed before it is
    # read back, rather than relying on garbage collection of the handle.
    with open("rewards_list_for_lrs.p", "wb") as rewards_file:
        pickle.dump(rewards_list_for_lrs, rewards_file)
    with open("rewards_list_for_lrs.p", "rb") as rewards_file:
        rewards_list_for_lrs = pickle.load(rewards_file)

    # One row per episode (1-based index), one column per learning rate.
    lr_rewards_pd = pd.DataFrame(index=pd.Series(range(1, training_episodes + 1)))

    for lr_value, rewards in zip(lr_values, rewards_list_for_lrs):
        lr_rewards_pd["lr=" + str(lr_value)] = rewards

    plot_experiments(
        lr_rewards_pd,
        "Figure 3: Rewards per episode for different learning rates",
        "Figure 3: Rewards per episode for different learning rates",
        "Episodes",
        "Reward",
        (-2000, 300)
    )

def run_experiment_for_ed():
    """Train one DQN per epsilon-decay value and chart the per-episode rewards.

    Each value in ``ed_values`` gets a fresh ``DQN`` trained for 1000
    episodes on the same LunarLander environment. The collected reward lists
    are pickled to ``rewards_list_for_ed.p`` and plotted with
    ``plot_experiments``.
    """
    print('Running Experiment for epsilon decay...')
    env = gym.make('LunarLander-v2')

    # set seeds
    env.seed(21)
    np.random.seed(21)

    # setting up params
    lr = 0.001
    epsilon = 1.0
    ed_values = [0.999, 0.995, 0.990, 0.9]
    gamma = 0.99
    training_episodes = 1000

    rewards_list_for_ed = []

    for ed in ed_values:
        model = DQN(env, lr, gamma, epsilon, ed)
        print("Training model for ED: {}".format(ed))
        # NOTE(review): the second positional argument is presumably
        # `mean_stopping` — confirm against DQN.train.
        model.train(training_episodes, False)
        rewards_list_for_ed.append(model.rewards_list)

    # Context managers make sure the file is flushed and closed before it is
    # read back, rather than relying on garbage collection of the handle.
    with open("rewards_list_for_ed.p", "wb") as rewards_file:
        pickle.dump(rewards_list_for_ed, rewards_file)
    with open("rewards_list_for_ed.p", "rb") as rewards_file:
        rewards_list_for_ed = pickle.load(rewards_file)

    # One row per episode (1-based index), one column per epsilon-decay value.
    ed_rewards_pd = pd.DataFrame(index=pd.Series(range(1, training_episodes + 1)))

    for ed, rewards in zip(ed_values, rewards_list_for_ed):
        ed_rewards_pd["epsilon_decay = " + str(ed)] = rewards

    plot_experiments(
        ed_rewards_pd,
        "Figure 5: Rewards per episode for different epsilon(ε) decay",
        "Figure 5: Rewards per episode for different epsilon(ε) decay values",
        "Episodes",
        "Reward",
        (-600, 300)
    )

In [None]:
# Run the hyper-parameter sweeps one after another:
# learning rate first, then epsilon decay, then gamma.
for experiment in (
    run_experiment_for_lr,
    run_experiment_for_ed,
    run_experiment_for_gamma,
):
    experiment()