# 0. Install and Import Dependencies

In [None]:
# Mount Google Drive at /content/drive so that later cells can change into
# folders stored on the Drive.
from google.colab import drive

drive.mount('/content/drive')

In [None]:
# Switch the working directory to the RLProject folder on the mounted Drive.
%cd /content/drive/My Drive/Github/RLProject/

In [None]:
# %tensorflow_version 1.x
# !pip install tensorflow==1.15.0 tensorflow-gpu==1.15.0 stable_baselines gym box2d-py --user
# %tensorflow_version 1.x

In [None]:
# Stable Baselines only supports tensorflow 1.x for now
%tensorflow_version 1.x;
# System packages installed through apt: ffmpeg, freeglut3-dev and xvfb.
!apt-get install ffmpeg freeglut3-dev xvfb;  # For visualization
# Stable Baselines pinned to release 2.10.2, installed with its "mpi" extra.
!pip install stable-baselines[mpi]==2.10.2;

In [None]:
# !pip install swig
# Box2D Python bindings; presumably needed by the LunarLander-v2 environment
# used below -- TODO confirm. Note the version is not pinned.
!pip install box2d-py;

In [None]:
# Show the installed Stable Baselines version as the cell output
# (the install cell pins 2.10.2).
import stable_baselines
stable_baselines.__version__

In [None]:
import gym 
from stable_baselines import A2C,PPO2,DQN
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common.callbacks import EvalCallback,  StopTrainingOnRewardThreshold

import pandas as pd
import numpy as np
import time
import tensorflow as tf

In [None]:
# Switch the working directory to the DQNModelData folder of the project.
%cd /content/drive/My Drive/Github/RLProject/DQNModelData

In [None]:
def _gridSearch(modelClass, env, paramGrid, totalTimesteps, paramsFile):
  """
  Trains and evaluates one model per combination of a hyper-parameter grid,
  keeps the combination with the highest mean evaluation reward and writes
  its values to a file.

  Parameters
  ----------
  modelClass: Stable Baselines model class to instantiate (A2C, PPO2 or DQN)

  env: vectorised environment used for both training and evaluation

  paramGrid: dict
    Maps a constructor keyword of modelClass to the candidate values to try.

  totalTimesteps: int
    Training budget passed to ``learn`` for every combination.

  paramsFile: str
    Path of the text file that receives the winning values, one per line,
    in the key order of paramGrid.

  Returns
  -------
  bestModel: the trained model that reached the highest mean reward

  Raises
  ------
  RuntimeError
    If no combination produced a mean reward greater than -inf (for example
    when every evaluation returned NaN), so there is nothing to report.
  """
  import itertools

  names = list(paramGrid.keys())
  bestReward = -np.inf
  bestParams = None
  bestModel = None

  # itertools.product varies the last key fastest, which is the same visiting
  # order as nested for-loops written in key order.
  combos = itertools.product(*(paramGrid[name] for name in names))
  for run, values in enumerate(combos, start=1):
    # Progress indicator: index of the combination within this search.
    print(run)
    model = modelClass('MlpPolicy',
                       env,
                       verbose=0,
                       tensorboard_log="./compareModels_tensorboard/",
                       seed=42,
                       **dict(zip(names, values)))
    model.learn(total_timesteps=totalTimesteps)
    meanReward, _ = evaluate_policy(model,
                                    env,
                                    n_eval_episodes=100,
                                    render=False)
    if meanReward > bestReward:
      bestReward = meanReward
      bestParams = list(values)
      bestModel = model

  if bestParams is None:
    raise RuntimeError("No hyper-parameter combination produced a valid "
                       "mean reward for %s" % modelClass.__name__)

  # Written once, after the whole grid has been evaluated.
  with open(paramsFile, 'w') as f:
    for item in bestParams:
      f.write("%s\n" % item)

  return bestModel


def findHypParams():
  """
  Finds the optimal hyper-parameter configuration for each model by looping through
  parameter dictionaries. Returns the optimal models and writes the parameters to
  a file (optimalDQNParams.txt, optimalPPO2Params.txt and optimalA2CParams.txt,
  relative to the current working directory).

  The searches run in the order DQN, PPO2, A2C. DQN is trained for 1000
  timesteps per configuration, PPO2 and A2C for 10000; every configuration is
  evaluated over 100 episodes on LunarLander-v2.

  Returns
  -------
  optimalA2CModel: <class 'stable_baselines.a2c.a2c.A2C'>

  optimalPPO2Model: <class 'stable_baselines.ppo2.ppo2.PPO2'>

  optimalDQNModel: <class 'stable_baselines.deepq.dqn.DQN'>
  """
  # Keep the raw environment under its own name so the factory lambda does
  # not close over a variable that is rebound to the vectorised wrapper.
  baseEnv = gym.make('LunarLander-v2')
  env = DummyVecEnv([lambda: baseEnv])

  PPO2ParamDict = {'gamma':(0.99,0.95),
                   'n_steps':(128,132),
                   'ent_coef':(0.01,0.02),
                   'learning_rate':(0.0005,0.00025)
  }
  A2CParamDict = {'gamma':(0.99,0.95),
                  'n_steps':(5,7),
                  'ent_coef':(0.01,0.02),
                  'learning_rate':(0.0007,0.0005)
  }
  DQNParamDict = {'gamma':(0.99,0.95),
                  'double_q':(True,False),
                  'batch_size':(32,64),
                  'learning_rate':(0.0007,0.0005)
  }

  optimalDQNModel = _gridSearch(DQN, env, DQNParamDict,
                                1000, 'optimalDQNParams.txt')
  optimalPPO2Model = _gridSearch(PPO2, env, PPO2ParamDict,
                                 10000, 'optimalPPO2Params.txt')
  optimalA2CModel = _gridSearch(A2C, env, A2CParamDict,
                                10000, 'optimalA2CParams.txt')

  return optimalA2CModel,optimalPPO2Model,optimalDQNModel

In [None]:
# %cd DQNModelData
# optimalA2CModel,optimalPPO2Model,optimalDQNModel=findHypParams()

# Model training

In [None]:
# Switch the working directory to the SavedModels folder of the project.
%cd /content/drive/My Drive/Github/RLProject/SavedModels

In [None]:
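# Every model is trained for at most 100000 timesteps.
# All of them use the same callback setup: periodic evaluation, with training
# stopped early once the reward threshold is reached.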

In [None]:
#Path to save checkpoints and validation data
logPath = '/content/drive/My Drive/Github/RLProject/SavedModels/A2CLogs/'
eval_env = gym.make('LunarLander-v2')
#Stops training when envrionment is 'completed'
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)
#Saves training and validation data
eval_callback = EvalCallback(eval_env, best_model_save_path=logPath,
                             callback_on_new_best=callback_on_best,
                             log_path=logPath, eval_freq=500,
                             deterministic=True, render=False,
                             verbose=1)

In [None]:
#Initalise environment 
env = gym.make('LunarLander-v2')
#OpenAI Gym recommends using the dummy vec env
env = DummyVecEnv([lambda: env])
A2CModel = A2C('MlpPolicy', 
                      env, 
                      verbose = 0,
                      tensorboard_log="./lunarlander_tensorboard/",
                      seed=42,
                      gamma=0.95,
                      n_steps=5,
                      ent_coef=0.01,
                      learning_rate=0.0007)
A2CModel.learn(total_timesteps=100000,
            callback=eval_callback)
modelPath = '/content/drive/My Drive/Github/RLProject/SavedModels/'
#Save trained model
A2CModel.save((modelPath + 'finalA2CModel'))

In [None]:
# Dedicated environment instance handed to the evaluation callback
eval_env = gym.make('LunarLander-v2')
# Folder that receives the PPO2 best-model checkpoint and evaluation logs
logPath = '/content/drive/My Drive/Github/RLProject/SavedModels/PPO2Logs/'
# Ends training once the reward threshold of 200 is reached
callback_on_best = StopTrainingOnRewardThreshold(
    reward_threshold=200,
    verbose=1,
)
# Evaluation callback (eval_freq=500) writing its results under logPath
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=logPath,
    log_path=logPath,
    eval_freq=500,
    callback_on_new_best=callback_on_best,
    deterministic=True,
    render=False,
    verbose=1,
)

In [None]:
# Training environment: a single LunarLander-v2 instance wrapped in a DummyVecEnv
env = DummyVecEnv([lambda: gym.make('LunarLander-v2')])
PPO2Model = PPO2(
    'MlpPolicy',
    env,
    gamma=0.99,
    n_steps=132,
    ent_coef=0.02,
    learning_rate=0.00025,
    seed=42,
    verbose=0,
    tensorboard_log="./lunarlander_tensorboard/",
)
# Train with the evaluation callback defined in the previous cell
PPO2Model.learn(total_timesteps=100000, callback=eval_callback)
# Save the trained model
modelPath = '/content/drive/My Drive/Github/RLProject/SavedModels/'
PPO2Model.save(modelPath + 'finalPPO2Model')

In [None]:
# Dedicated environment instance handed to the evaluation callback
eval_env = gym.make('LunarLander-v2')
# Folder that receives the DQN best-model checkpoint and evaluation logs
logPath = '/content/drive/My Drive/Github/RLProject/SavedModels/DQNLogs/'
# Ends training once the reward threshold of 200 is reached
callback_on_best = StopTrainingOnRewardThreshold(
    reward_threshold=200,
    verbose=1,
)
# Evaluation callback (eval_freq=500) writing its results under logPath
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=logPath,
    log_path=logPath,
    eval_freq=500,
    callback_on_new_best=callback_on_best,
    deterministic=True,
    render=False,
    verbose=1,
)

In [None]:
# Training environment: a single LunarLander-v2 instance wrapped in a DummyVecEnv
env = DummyVecEnv([lambda: gym.make('LunarLander-v2')])
DQNModel = DQN(
    'MlpPolicy',
    env,
    gamma=0.99,
    double_q=True,
    batch_size=32,
    learning_rate=0.0007,
    seed=42,
    verbose=0,
    tensorboard_log="./lunarlander_tensorboard/",
)
# Train with the evaluation callback defined in the previous cell
DQNModel.learn(total_timesteps=100000, callback=eval_callback)
# Save the trained model
modelPath = '/content/drive/My Drive/Github/RLProject/SavedModels/'
DQNModel.save(modelPath + 'finalDQNModel')