<a href="https://colab.research.google.com/github/kparnis3/Deep-RL-Assignment/blob/main/code/REINFORCE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installation of packages


In [None]:
%%bash

# Install SWIG first (presumably needed as a build dependency of pybox2d below).
# -y answers the confirmation prompt, which this non-interactive cell cannot do.
apt-get install -y swig

# Build and install the Box2D Python bindings from source.
git clone https://github.com/pybox2d/pybox2d
cd pybox2d
python setup.py build
python setup.py install

# X virtual framebuffer, used together with pyvirtualdisplay.
apt-get install -y xvfb

# Pinned versions of the Python dependencies.
# The extras specifier is quoted so the shell never treats [box2d] as a glob pattern.
pip install \
    gym==0.21 \
    "gym[box2d]==0.21" \
    pytorch-lightning==1.6.0 \
    optuna==2.7.0 \
    pyglet==1.5.27 \
    pyvirtualdisplay

#### Setup of Virtual Display

In [None]:
# Start an invisible 1400x900 virtual display
# (presumably so environment frames can be rendered without a physical screen).
from pyvirtualdisplay import Display
Display(visible=False, size=(1400, 900)).start()

#### Packages used for Gym / Pytorch Lightning / Optuna

In [None]:
import copy
import gym 
import torch
import random
import statistics
import optuna

import numpy as np
import torch.nn.functional as F

from collections import deque, namedtuple
from IPython.display import HTML
from base64 import b64encode

from torch import Tensor, nn #create the Neural network
from torch.utils.data import DataLoader #load training data
from torch.utils.data.dataset import IterableDataset #Define where we get our data
from torch.optim import AdamW #optimizer

from pytorch_lightning import LightningModule, Trainer 
from pytorch_lightning.callbacks import EarlyStopping

from gym.wrappers import RecordVideo, RecordEpisodeStatistics, TimeLimit, NormalizeObservation, NormalizeReward

from optuna.integration import PyTorchLightningPruningCallback
     
# Torch device string used by the policy: the first CUDA GPU when one is available, otherwise the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu' # Run code on GPU (if possible)
# Number of CUDA devices torch can see; passed to the Lightning Trainer's `gpus` argument.
num_gpus = torch.cuda.device_count()

### Helper functions to display video in notebook and test trained model

In [None]:
@torch.no_grad()
def test_env(env_name, policy, obs_rms, **kwargs):
  """Run ten episodes of `policy` in a freshly created, video-recorded environment.

  Every episode is recorded into the 'videos' folder. Observations are passed
  through a `NormalizeObservation` wrapper that is given `obs_rms` as its
  running statistics.

  Args:
    env_name: Gym environment id passed to `gym.make`.
    policy: Callable mapping an observation to action probabilities; one
      action is sampled from them with `multinomial`.
    obs_rms: Observation statistics assigned to the normalization wrapper.
    **kwargs: Extra keyword arguments forwarded to `gym.make`.
  """
  env = gym.make(env_name, **kwargs)
  try:
    env = RecordVideo(env, 'videos', episode_trigger=lambda e: True)
    env = NormalizeObservation(env)
    # NOTE(review): the wrapper receives the caller's statistics object itself,
    # not a copy — confirm whether evaluation is meant to leave it untouched.
    env.obs_rms = obs_rms

    for _ in range(10):
      done = False
      obs = env.reset()
      while not done:
        action = policy(obs).multinomial(1).cpu().item()
        obs, _, done, _ = env.step(action)
  finally:
    # Close explicitly (even when an episode raises) instead of relying on
    # `del`, which only unbinds the local name.
    env.close()


def display_video(episode=0):
  """Return an inline HTML video player for one recorded episode.

  Args:
    episode: Index of the episode whose recording
      `/content/videos/rl-video-episode-{episode}.mp4` should be shown.

  Returns:
    An `IPython.display.HTML` object embedding the video as a base64 data URL.

  Raises:
    OSError: If the video file cannot be opened (for example, it does not exist).
  """
  video_path = f'/content/videos/rl-video-episode-{episode}.mp4'
  # Read-only binary mode is enough here, and the context manager guarantees
  # the file handle is closed once the bytes have been read.
  with open(video_path, "rb") as video_file:
    encoded_video = b64encode(video_file.read()).decode()
  video_url = f"data:video/mp4;base64,{encoded_video}"
  return HTML(f"<video width=600 controls><source src='{video_url}'></video>")

### Creating the gradient policy

In [None]:
class GradientPolicy(nn.Module):
  """Two-hidden-layer MLP that maps observations to action probabilities.

  Args:
    obs_size: Number of features in a single observation.
    out_dims: Number of discrete actions (size of the output distribution).
    hidden_size: Width of both hidden layers.
  """

  def __init__(self, obs_size, out_dims, hidden_size=128):
    super().__init__()

    self.layer1 = nn.Linear(obs_size, hidden_size)
    self.layer2 = nn.Linear(hidden_size, hidden_size)
    self.layer_final = nn.Linear(hidden_size, out_dims)

  def forward(self, x):
    """Return a probability distribution over actions for each observation.

    Args:
      x: Observation(s) as a numpy array, a tensor or anything else
        `torch.as_tensor` accepts. The last axis holds the features, so a
        batch from parallel environments looks like
        [[x1, x2, x3], [y1, y2, y3], ...].

    Returns:
      A float32 tensor with the same leading shape as `x` whose last axis
      sums to 1, e.g. [[p1, p2], [py1, py2], ...].
    """
    # `as_tensor` accepts arrays and tensors alike without the copy (and the
    # copy-construct warning) that `torch.tensor` produces for tensor input.
    # The input is placed on the device of the weights so the forward pass
    # works wherever the module currently lives.
    x = torch.as_tensor(x, dtype=torch.float32, device=self.layer1.weight.device)
    x = F.relu(self.layer1(x))
    x = F.relu(self.layer2(x))
    # Softmax over the last axis turns the logits into per-observation probabilities.
    return F.softmax(self.layer_final(x), dim=-1)

### Checking gym

In [None]:
# Build a vectorized LunarLander-v2 with a single sub-environment for a quick sanity check.
env = gym.vector.make('LunarLander-v2', num_envs=1)

In [None]:
# Reset the environment; being the cell's last expression, the result is displayed as output.
env.reset()

In [None]:
# Show the observation and action spaces of the vectorized environment.
env.observation_space, env.action_space

### Creating the environment

In [None]:
def create_environment(env_name, num_envs):
  """Build a synchronous vectorized environment wrapped for training.

  The wrappers are applied innermost first: episode statistics (keeps a
  history of episode rewards), observation normalization, then reward
  normalization.

  Args:
    env_name: Gym environment id.
    num_envs: Number of parallel sub-environments.

  Returns:
    The wrapped vectorized environment.
  """
  wrapped = gym.vector.make(env_name, num_envs, asynchronous=False)
  for wrapper in (RecordEpisodeStatistics, NormalizeObservation, NormalizeReward):
    wrapped = wrapper(wrapped)
  return wrapped

#### Creating the Dataset

In [None]:
class RLDataset(IterableDataset):
  """Iterable dataset that collects a fresh rollout every time it is iterated.

  Each pass steps `env` `samples_per_epoch` times with actions sampled from
  `policy`, computes the discounted return of every step, and yields the
  resulting (observation, action, return) samples in random order.

  Args:
    env: Vectorized environment exposing `reset`, `step` and `num_envs`.
    policy: Callable mapping a batch of observations to action probabilities.
    samples_per_epoch: Number of environment steps gathered per pass.
    gamma: Discount factor.
  """

  def __init__(self, env, policy, samples_per_epoch, gamma):
    self.env, self.policy = env, policy
    self.samples_per_epoch = samples_per_epoch
    self.gamma = gamma  # discount factor
    self.obs = env.reset()

  @torch.no_grad()
  def __iter__(self):
    horizon = self.samples_per_epoch
    observations, actions, rewards, dones = [], [], [], []

    # Roll the policy forward, pairing each action with the observation it was chosen from.
    for _ in range(horizon):
      probs = self.policy(self.obs)
      sampled = probs.multinomial(1).cpu().numpy()
      following_obs, reward, done, _ = self.env.step(sampled.flatten())
      observations.append(self.obs)
      actions.append(sampled)
      rewards.append(reward)
      dones.append(done)
      self.obs = following_obs

    obs_b = np.stack(observations)
    action_b = np.stack(actions)
    reward_b = np.stack(rewards)
    done_b = np.stack(dones)

    # Per-step discount, zeroed wherever an episode ended so returns never leak across episodes.
    discount_b = (1 - done_b) * self.gamma

    # Single backwards pass (last step to first) accumulating the return of every timestep.
    tail_return = np.zeros(self.env.num_envs, dtype=np.float32)
    return_b = np.zeros_like(reward_b)
    for t in reversed(range(horizon)):
      tail_return = reward_b[t] + discount_b[t] * tail_return
      return_b[t] = tail_return

    # Merge the time and environment axes into one sample axis.
    total = horizon * self.env.num_envs
    obs_b, action_b, return_b = (
      arr.reshape(total, -1) for arr in (obs_b, action_b, return_b)
    )

    order = list(range(total))
    random.shuffle(order)

    for sample in order:
      yield obs_b[sample], action_b[sample], return_b[sample]


#### The REINFORCE algorithm

In [None]:
class Reinforce(LightningModule):
  """REINFORCE policy-gradient agent with an entropy bonus, trained via Lightning.

  Args:
    env_name: Gym environment id used to build the vectorized training environment.
    num_envs: Number of parallel sub-environments.
    batch_size: Number of samples per optimisation step.
    hidden_size: Width of the policy network's hidden layers.
    policy_lr: Learning rate of the policy optimizer.
    samples_per_epoch: Environment steps collected per epoch (per sub-environment).
    loss_fn: Not used by this class; it is only stored with the other hyperparameters.
    gamma: Discount factor used when computing returns.
    optim: Optimizer class instantiated in `configure_optimizers`.
    entropy_coef: Weight of the entropy bonus subtracted from the loss.
  """

  def __init__(self, env_name, num_envs=64, batch_size=256, hidden_size=64, policy_lr=1e-4, 
               samples_per_epoch=100, loss_fn=F.smooth_l1_loss,
               gamma=0.99, optim=AdamW, entropy_coef=0.01):
    super().__init__()
    self.env = create_environment(env_name, num_envs=num_envs)
    self.obs = self.env.reset()

    obs_size = self.env.single_observation_space.shape[0]
    action_dims = self.env.single_action_space.n  # discrete action space
    self.policy = GradientPolicy(obs_size, action_dims, hidden_size)

    self.dataset = RLDataset(self.env, self.policy, samples_per_epoch, gamma)
    self.save_hyperparameters()

  def configure_optimizers(self):
    """Create the single optimizer that updates the policy network."""
    return self.hparams.optim(self.policy.parameters(), lr=self.hparams.policy_lr)

  def train_dataloader(self):
    """Return the loader that batches samples produced by the rollout dataset."""
    return DataLoader(dataset=self.dataset, batch_size=self.hparams.batch_size)

  def training_step(self, batch, batch_idx):
    """Compute the policy-gradient loss (with entropy bonus) for one batch.

    Args:
      batch: Tuple `(obs_b, action_b, return_b)` as yielded by the dataset.
      batch_idx: Index of the batch within the epoch (unused).

    Returns:
      The scalar loss to minimise.
    """
    obs_b, action_b, return_b = batch

    prob_b = self.policy(obs_b)
    # The small constant keeps the logarithm finite when a probability is exactly zero.
    log_prob_b = torch.log(prob_b + 1e-6)
    # Log-probability of the action that was actually taken in each sample.
    action_log_prob_b = log_prob_b.gather(1, action_b)

    entropy = - torch.sum(prob_b * log_prob_b, dim=-1, keepdim=True)

    # Minimising the negative is the same as maximising log-prob weighted by return.
    pg_loss = - action_log_prob_b * return_b
    loss = (pg_loss - self.hparams.entropy_coef * entropy).mean()

    self.log("episode/Policy Loss", pg_loss.mean())
    self.log("episode/Entropy", entropy.mean())

    return loss

  def training_epoch_end(self, training_step_outputs):
    """Log the return of the most recently recorded episode, if there is one."""
    # The queue is empty until an episode has finished; indexing it
    # unconditionally would raise IndexError in that case.
    if self.env.return_queue:
      self.log("episode/Return", self.env.return_queue[-1])

#### Purge logs and run the visualization tool (Tensorboard)

In [None]:
# Delete the log and video directories left over from earlier runs.
!rm -r /content/lightning_logs/
!rm -r /content/videos/
# Load the TensorBoard notebook extension and point it at the Lightning log directory.
%load_ext tensorboard
%tensorboard --logdir /content/lightning_logs/

#### Training using REINFORCE

In [None]:
# Agent with default hyperparameters on LunarLander-v2.
algorithm = Reinforce('LunarLander-v2')
# Early stopping watches the logged 'episode/Return' metric and treats higher as better.
# strict=False is intended to tolerate epochs where the metric has not been logged.
callback = EarlyStopping(
    monitor='episode/Return',
    patience=300,
    strict=False,
    verbose=1,
    mode='max'
)
# Train on every visible GPU (num_gpus is 0 on CPU-only machines) for at most 5,000 epochs.
trainer = Trainer(gpus=num_gpus,
                  max_epochs=5_000,
                  log_every_n_steps=10,
                  callbacks=[callback])
trainer.fit(algorithm)

In [None]:
# Silence all warnings for the remainder of the notebook session.
import warnings
warnings.filterwarnings('ignore')

# Record evaluation episodes of the trained policy, passing along the
# observation statistics held by the training environment.
test_env('LunarLander-v2', algorithm.policy, algorithm.env.obs_rms)

In [None]:
# Show the recording of episode index 9 (the last of the ten evaluation episodes).
display_video(episode=9)

In [None]:
# Show the recording of episode index 8.
display_video(episode=8)