In [None]:
################################################################################
# BELOW IS CODE ADAPTED FROM:
# 
# Escape Velocity Labs, 28/08/22, 'Advanced Reinforcement Learning in Python: from DQN to SAC', 
# Accessed at https://www.udemy.com/course/advanced-reinforcement/ 
#
# Note: There is a section towards the bottom which is purely self coded (all except for the Training function used), this has been labelled clearly.
################################################################################

In [None]:
### Install dependencies
### Note: you need to restart the runtime after you run this cell

!apt-get install -y xvfb # Used to display videos and GUI's in this notebook

!pip install \
    gym==0.21 \
    gym[box2d] \
    pytorch-lightning==1.6.0 \
    pyvirtualdisplay

# What each package is used for in this notebook:
# gym -> Provides the environment (pinned to 0.21)
# gym[box2d] -> Extra dependencies for the box2d category of environments in gym
# pytorch-lightning -> Supplies the LightningModule / Trainer classes that run the training loop
# pyvirtualdisplay -> Python wrapper used to start the virtual display (xvfb) for rendering

In [None]:
### Create a display to render the environment in google servers

from pyvirtualdisplay import Display
# Start a hidden (visible=False) virtual display of 1400x900 pixels
Display(visible=False, size=(1400, 900)).start()

In [None]:
### Import remaining packages and set up variables related to hardware

import copy
import gym
import torch
import random
import itertools
import matplotlib.pyplot as plt
import pandas as pd

import numpy as np
import torch.nn.functional as F

from collections import deque, namedtuple
from IPython.display import HTML
from base64 import b64encode

from torch import Tensor, nn
from torch.utils.data import DataLoader
from torch.utils.data.dataset import IterableDataset
from torch.optim import AdamW

from pytorch_lightning import LightningModule, Trainer

from pytorch_lightning.callbacks import EarlyStopping

from gym.wrappers import RecordVideo, RecordEpisodeStatistics, TimeLimit

# Preferred device: the first CUDA GPU if one is available, otherwise the CPU
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Number of CUDA devices PyTorch can see (0 on a CPU-only runtime); passed to the Trainer as `gpus`
num_gpus = torch.cuda.device_count()

In [None]:
### Create a function which takes episode number as input, and outputs a video of the environment of the given episode

def display_video(episode=0):
  """Return an inline HTML video player for a recorded episode.

  Reads `/content/videos/rl-video-episode-{episode}.mp4` and embeds its bytes
  in a base64 `data:` URL inside a `<video>` element.

  Args:
    episode: Number of the recorded episode whose video should be shown.

  Returns:
    An `HTML` display object containing the `<video>` element.

  Raises:
    FileNotFoundError: If no video file exists for `episode`.
  """
  # Read-only binary mode is all that is needed; the context manager closes the handle
  with open(f'/content/videos/rl-video-episode-{episode}.mp4', "rb") as video:
    video_bytes = video.read()
  video_url = f"data:video/mp4;base64,{b64encode(video_bytes).decode()}"
  return HTML(f"<video width=600 controls><source src='{video_url}'></video>")

In [None]:
### Create the actor network class

class Actor(nn.Module):
  """Deterministic policy network whose actions are bounded by `min` / `max`.

  The network is a ReLU multi-layer perceptron that ends in `Tanh`; its output
  is multiplied by the upper action bound. `num_layers` is the number of
  hidden-to-hidden linear layers placed between the input layer and the output
  layer (so `num_layers=1` gives three linear layers in total).

  Args:
    hidden_size: Width of every hidden layer.
    obs_size: Number of observation features.
    out_dims: Number of action dimensions.
    min: Array of lower action bounds.
    max: Array of upper action bounds.
    num_layers: Positive integer number of hidden-to-hidden layers.

  Raises:
    ValueError: If `num_layers` is not a positive integer.
  """

  def __init__(self, hidden_size, obs_size, out_dims, min, max, num_layers):
    super().__init__()
    depth = int(num_layers)
    if depth != num_layers or depth < 1:
      # Fail here rather than later with a missing `self.net` attribute
      raise ValueError(f"num_layers must be a positive integer, got {num_layers!r}")

    # Registered as buffers so the bounds move together with the module (.to / .cuda / .cpu);
    # non-persistent so the state_dict still only contains the `net.*` entries
    self.register_buffer("min", torch.as_tensor(min).clone(), persistent=False)
    self.register_buffer("max", torch.as_tensor(max).clone(), persistent=False)

    # Same layer order as the hand-written 1/2/3-layer variants, generalised to any depth
    layers = [nn.Linear(obs_size, hidden_size), nn.ReLU()]
    for _ in range(depth):
      layers += [nn.Linear(hidden_size, hidden_size), nn.ReLU()]
    layers += [nn.Linear(hidden_size, out_dims), nn.Tanh()]
    self.net = nn.Sequential(*layers)

  # This function takes a state as input and outputs the highest return action (as a tensor, no noise)
  def mu(self, x):
    if isinstance(x, np.ndarray):
      # Place the input on whatever device the module currently lives on
      x = torch.from_numpy(x).to(self.max.device)
    return self.net(x.float()) * self.max

  # This function takes a state as input, and returns the highest return action with clipped noise
  def forward(self, x, epsilon=0.0, noise_clip=None):
    """Return a noisy action as a numpy array, clipped to the action bounds.

    Args:
      x: State(s) as a numpy array or tensor.
      epsilon: Standard deviation of the zero-mean Gaussian noise added to the action.
      noise_clip: If given, the noise is clamped to [-noise_clip, noise_clip] before it is added.
    """
    mu = self.mu(x)
    noise = torch.normal(0, epsilon, mu.size(), device=mu.device)
    if noise_clip is not None:
      noise = torch.clamp(noise, -noise_clip, noise_clip)
    mu = mu + noise
    # Keep the noisy action inside [min, max]
    action = torch.max(torch.min(mu, self.max), self.min)
    # detach() is required: numpy() cannot be called on a tensor that is part of the autograd graph
    action = action.detach().cpu().numpy()
    return action


In [None]:
### Create the Q-Network class

class DQN(nn.Module):
  """Q-network that scores a (state, action) pair with a single value.

  The network is a ReLU multi-layer perceptron whose input is the state and
  action concatenated. `num_layers` is the number of hidden-to-hidden linear
  layers placed between the input layer and the output layer (so
  `num_layers=1` gives three linear layers in total).

  Args:
    hidden_size: Width of every hidden layer.
    obs_size: Number of observation features.
    out_dims: Number of action dimensions.
    num_layers: Positive integer number of hidden-to-hidden layers.

  Raises:
    ValueError: If `num_layers` is not a positive integer.
  """

  def __init__(self, hidden_size, obs_size, out_dims, num_layers):
    super().__init__()
    depth = int(num_layers)
    if depth != num_layers or depth < 1:
      # Fail here rather than later with a missing `self.net` attribute
      raise ValueError(f"num_layers must be a positive integer, got {num_layers!r}")

    # Same layer order as the hand-written 1/2/3-layer variants, generalised to any depth
    layers = [nn.Linear(obs_size + out_dims, hidden_size), nn.ReLU()]
    for _ in range(depth):
      layers += [nn.Linear(hidden_size, hidden_size), nn.ReLU()]
    layers.append(nn.Linear(hidden_size, 1))
    self.net = nn.Sequential(*layers)

  # This function takes a state-action pair and returns a Q-value (predicted expected return)
  def forward(self, state, action):
    # Numpy inputs are placed on whatever device the module currently lives on
    module_device = self.net[0].weight.device
    if isinstance(state, np.ndarray):
      state = torch.from_numpy(state).to(module_device)
    if isinstance(action, np.ndarray):
      action = torch.from_numpy(action).to(module_device)
    # Concatenate state and action column-wise to form the network input
    in_vector = torch.hstack((state, action))
    return self.net(in_vector.float())


In [None]:
### Create the ReplayBuffer class, from which we make our buffer

class ReplayBuffer:
  """Bounded store of experiences; once full, the oldest entries are discarded."""

  def __init__(self, capacity):
    # A deque with maxlen drops items from the opposite end when a new one is appended at capacity
    self.buffer = deque(maxlen=capacity)

  def __len__(self):
    """Number of experiences currently held."""
    stored = self.buffer
    return len(stored)

  def append(self, experience):
    """Add one experience at the newest end of the buffer."""
    self.buffer.append(experience)

  def sample(self, batch_size):
    """Draw `batch_size` experiences at random, without replacement."""
    return random.sample(self.buffer, k=batch_size)

In [None]:
### Create the RLDataset class, which is used to create a dataset object out of samples from the buffer to be used as input for the training step

class RLDataset(IterableDataset):
  """Iterable dataset that serves a fresh random sample of the buffer on every pass."""

  def __init__(self, buffer, sample_size=200):
    self.buffer, self.sample_size = buffer, sample_size

  # Each iteration draws `sample_size` experiences from the buffer and hands them out one at a time
  def __iter__(self):
    yield from self.buffer.sample(self.sample_size)


In [None]:
### Create a function to incrementally move the weights of the target network closer to those of the main network

def polyak_average(net, target_net, tau=0.01):
    """Blend `net`'s parameters into `target_net` in place.

    Every target parameter becomes `tau * source + (1 - tau) * target`, so a
    small `tau` moves the target only slightly towards the main network.
    Parameters are paired in the order returned by `parameters()`.
    """
    keep = 1 - tau
    for source, target in zip(net.parameters(), target_net.parameters()):
        blended = tau * source.data + keep * target.data
        target.data.copy_(blended)

In [None]:
### Create a function which wraps the default environment from gym in layers of additional functionality

def create_environment(name, max_steps):
  """Build the gym environment `name`, wrapped with a step limit and episode statistics.

  Args:
    name: Environment id passed to `gym.make`.
    max_steps: Maximum number of steps per episode, enforced by `TimeLimit`.

  Returns:
    The environment wrapped in `TimeLimit` and then `RecordEpisodeStatistics`.
  """
  # Video recording is currently switched off; to enable it, wrap the time-limited env with:
  #   RecordVideo(env, video_folder='./videos', episode_trigger=lambda x: x % 50 == 0)
  time_limited = TimeLimit(gym.make(name), max_episode_steps=max_steps)
  return RecordEpisodeStatistics(time_limited)

In [None]:
### This class enacts the TD3 algorithm

class TD3(LightningModule):
  """TD3 agent (two Q-networks, one actor, target copies of each) trained with PyTorch Lightning.

  The module owns its environment and replay buffer. On construction the
  buffer is pre-filled with random-action episodes until it holds at least
  `samples_per_epoch` experiences. After every training epoch one episode is
  played with the current actor and its return is recorded in `ep_returns`.

  Args:
    env_name: Environment id passed to `create_environment`.
    capacity: Maximum number of experiences kept in the replay buffer.
    batch_size: Batch size used by the training dataloader.
    actor_lr: Learning rate of the actor optimizer.
    critic_lr: Learning rate of the shared Q-network optimizer.
    hidden_size: Width of the hidden layers of every network.
    gamma: Discount factor applied to the next state-action value.
    loss_fn: Loss used between predicted and expected Q-values.
    optim: Optimizer class used for both optimizers.
    eps_start: Starting value of the exploration-noise schedule.
    eps_end: Lower bound of the exploration-noise schedule.
    eps_last_episode: Number of epochs over which epsilon decays from `eps_start`.
    samples_per_epoch: Experiences sampled from the buffer per epoch (also the pre-fill size).
    tau: Polyak averaging coefficient for the target networks.
    epsilon: Initial noise scale used until the first epoch ends.
    max_steps: Maximum number of steps per episode.
    num_layers: Number of hidden-to-hidden layers in every network.
  """

  def __init__(self, env_name, capacity=50_000, batch_size=100, 
               actor_lr=1e-3, critic_lr=1e-3, hidden_size=128, gamma=0.995,
               loss_fn=F.smooth_l1_loss, optim=AdamW, eps_start=1.0, eps_end=0.1,
               eps_last_episode=100, samples_per_epoch=1_000, tau=0.004, epsilon=1, max_steps=400, num_layers=1):
    
    super().__init__()

    self.env = create_environment(env_name, max_steps)
    obs_size = self.env.observation_space.shape[0]
    action_dims = self.env.action_space.shape[0]
    max_action = self.env.action_space.high
    min_action = self.env.action_space.low

    self.q_net1 = DQN(hidden_size, obs_size, action_dims, num_layers)
    self.q_net2 = DQN(hidden_size, obs_size, action_dims, num_layers)
    self.actor = Actor(hidden_size, obs_size, action_dims, min_action, max_action, num_layers)

    # Target networks start as exact copies and are moved towards the main networks by polyak_average
    self.target_q_net1 = copy.deepcopy(self.q_net1)
    self.target_q_net2 = copy.deepcopy(self.q_net2)
    self.target_actor = copy.deepcopy(self.actor)

    self.epsilon = epsilon

    self.buffer = ReplayBuffer(capacity=capacity)

    # Return of the episode played at the end of every epoch
    self.ep_returns = []

    # Best end-of-epoch return seen so far and a snapshot of the actor that achieved it.
    # The threshold starts at 0, so only an episode with a positive return can replace the initial snapshot.
    self.max_return = 0
    self.best_actor = copy.deepcopy(self.actor)

    self.save_hyperparameters()

    # Fill the buffer up with experience (1000 experiences - i.e. self.hparams.samples_per_epoch)
    while len(self.buffer) < self.hparams.samples_per_epoch:
      print(f"{len(self.buffer)} samples in experience buffer. Filling...")
      self.play_episode(epsilon=self.hparams.eps_start)

  # This function plays a single full episode and stores all of the experience in the buffer
  @torch.no_grad()
  def play_episode(self, actor=None, epsilon=0.):
    """Play one episode, using `actor` with noise `epsilon`, or random actions when `actor` is None."""
    state = self.env.reset()
    done = False

    while not done:
      # Explicit None check rather than relying on the truthiness of the module
      if actor is not None:
        action = actor(state, epsilon=epsilon)
      else:
        action = self.env.action_space.sample()
      next_state, reward, done, info = self.env.step(action)
      exp = (state, action, reward, done, next_state)
      self.buffer.append(exp)
      state = next_state


  # This function returns the actor's action for the given state (called with the actor's default epsilon of 0)
  def forward(self, x):
    output = self.actor(x)
    return output
  
  # This function configures the optimizers before the process begins
  def configure_optimizers(self):
    # Index 0: one optimizer shared by both Q-networks; index 1: the actor optimizer
    q_net_params = itertools.chain(self.q_net1.parameters(), self.q_net2.parameters())
    q_net_optimizer = self.hparams.optim(q_net_params, lr=self.hparams.critic_lr)
    actor_optimizer = self.hparams.optim(self.actor.parameters(), lr=self.hparams.actor_lr)
    return [q_net_optimizer, actor_optimizer]
  
  # This function creates the dataloader, which feeds experience to the training_step() function.
  def train_dataloader(self):
    dataset = RLDataset(self.buffer, self.hparams.samples_per_epoch)
    dataloader = DataLoader(
        dataset=dataset,
        batch_size = self.hparams.batch_size
    )
    return dataloader
  
  # Training step to update the neural networks
  def training_step(self, batch, batch_idx, optimizer_idx):
    """Compute the loss for the optimizer selected by `optimizer_idx`.

    Returns the summed Q-loss for optimizer 0, the actor loss for optimizer 1
    on even-numbered batches, and None otherwise (the actor is updated only on
    every second batch).
    """

    states, actions, rewards, dones, next_states = batch
    # Add a trailing dimension so rewards/dones line up with the (batch, 1) Q-values
    rewards = rewards.unsqueeze(1)
    dones = dones.unsqueeze(1)

    # Update the target networks (this runs on every call, i.e. once per optimizer per batch)
    polyak_average(self.q_net1, self.target_q_net1, tau=self.hparams.tau)
    polyak_average(self.q_net2, self.target_q_net2, tau=self.hparams.tau)
    polyak_average(self.actor, self.target_actor, tau=self.hparams.tau)

    # If the q_net optimizer is selected, update the Q-network
    if optimizer_idx == 0:
      state_action_values1 = self.q_net1(states, actions)
      state_action_values2 = self.q_net2(states, actions)

      # The bootstrap target only involves the target networks, whose parameters are not in any
      # optimizer, so it is built without autograd to avoid accumulating unused gradients on them
      with torch.no_grad():
        next_actions = self.target_actor(next_states, epsilon=self.epsilon, noise_clip=0.05)
        # Select the lowest next state/action value
        next_state_action_values = torch.min(
            self.target_q_net1(next_states, next_actions),
            self.target_q_net2(next_states, next_actions)
        )
        # Set all terminal states to have a value of 0
        next_state_action_values[dones] = 0.0
        expected_state_action_values = rewards + self.hparams.gamma * next_state_action_values

      q_loss1 = self.hparams.loss_fn(state_action_values1, expected_state_action_values)
      q_loss2 = self.hparams.loss_fn(state_action_values2, expected_state_action_values)
      total_loss = q_loss1 + q_loss2
      self.log_dict({"episode/Q-Loss": total_loss})
      return total_loss

    # If the actor optimizer is selected, update the Actor network
    elif optimizer_idx == 1 and batch_idx % 2 == 0:
      # Maximise the first critic's value of the actor's own action (hence the minus sign)
      mu = self.actor.mu(states)
      actor_loss = - self.q_net1(states, mu).mean()
      self.log_dict({"episode/Actor Loss": actor_loss})
      return actor_loss


  # This function is called at the end of each epoch
  def training_epoch_end(self, training_step_outputs):

    # Decrease epsilon value as time goes on to reduce exploration and increase reward
    self.epsilon = max(
        self.hparams.eps_end,
        self.hparams.eps_start - self.current_epoch / self.hparams.eps_last_episode
    )

    # An episode is played out in full at the end of each epoch, therefore one epoch lasts as long as one episode
    self.play_episode(actor=self.actor, epsilon=self.epsilon)
    episode_return = self.env.return_queue[-1]
    self.log('episode/Return', episode_return)

    self.ep_returns.append(episode_return)

    # Keep a snapshot of the actor only when it beats the best return so far, and record the new
    # best; without this update any later episode above the starting threshold would overwrite it
    if episode_return > self.max_return:
      self.max_return = episode_return
      self.best_actor = copy.deepcopy(self.actor)





In [None]:
### Setting up the tensorboard to display the results

# Remove logs and videos left over from a previous run (rm reports an error if a folder does not exist yet)
!rm -r /content/lightning_logs/
!rm -r /content/videos/
# Load the TensorBoard notebook extension and point it at the Lightning log directory
%load_ext tensorboard
%tensorboard --logdir /content/lightning_logs/

In [None]:
################################################################################
# BELOW IS MY CODE FOR HPS-RL
################################################################################

In [None]:
### The fitness function of the GA algorithm

def fitness_prob(algoA_actor, algoB_actor, algoC_actor, algoD_actor, num_tests, gene_A, gene_B, gene_C, gene_D, gen):
  """Evaluate four actors and turn their test returns into selection probabilities.

  Each actor plays `num_tests` noise-free episodes of 'BipedalWalker-v3'
  (at most 1600 steps each). The per-episode returns are written to
  `TD3_GA_Generation_{gen}_fitness_test.csv`, one column per gene.

  Args:
    algoA_actor, algoB_actor, algoC_actor, algoD_actor: Callables invoked as
      `actor(state, epsilon=0)` that return an action for the environment.
    num_tests: Number of test episodes per actor (must be at least 1).
    gene_A, gene_B, gene_C, gene_D: Genes of the four actors, used for the CSV column names.
    gen: Generation number, used in the CSV file name.

  Returns:
    `(algoA_prob, algoB_prob, algoC_prob, algoD_prob, la_test_returns)` where the
    probabilities are the normalised fitnesses and `la_test_returns[i]` is the
    list of episode returns of actor i.

  Raises:
    ValueError: If `num_tests` is smaller than 1.
  """
  if num_tests < 1:
    raise ValueError(f"num_tests must be at least 1, got {num_tests!r}")

  env_name = 'BipedalWalker-v3'
  max_steps = 1600
  actors_list = [algoA_actor, algoB_actor, algoC_actor, algoD_actor]
  genes_list = [gene_A, gene_B, gene_C, gene_D]
  la_test_returns = []
  per_gene_frames = []

  # Run `num_tests` test episodes with each actor:
  # la_test_returns = [[returns of algoA], [returns of algoB], [returns of algoC], [returns of algoD]]
  env = create_environment(env_name, max_steps)
  try:
    for actor, gene in zip(actors_list, genes_list):
      gene_returns = []
      for _ in range(num_tests):
        state = env.reset()
        ep_return = 0
        done = False
        while not done:
          # Evaluation only: no gradients are needed for action selection
          with torch.no_grad():
            action = actor(state, epsilon=0)
          state, reward, done, info = env.step(action)
          ep_return += reward
        gene_returns.append(ep_return)
      la_test_returns.append(gene_returns)
      per_gene_frames.append(pd.DataFrame(gene_returns, columns=['Gene: ' + f'{gene}']))
  finally:
    # Release the environment even if an episode fails part-way through
    env.close()

  # Store the results of the test (concatenated once; duplicate gene names keep separate columns)
  results = pd.concat(per_gene_frames, axis=1)
  results.to_csv(f'TD3_GA_Generation_{gen}_fitness_test.csv')

  # Calculate the fitness of each algorithm: total return shifted by 150 per episode so that
  # moderately negative scores still give a positive fitness.
  # NOTE(review): the leading (1/num_tests) term is added, not multiplied; kept exactly as written.
  # Any fitness that is still negative after the shift is clamped to 0.
  fitness_list = [
      max((1/num_tests) + sum(returns) + 150*num_tests, 0)
      for returns in la_test_returns
  ]

  fitness_total = sum(fitness_list)

  if fitness_total > 0:
    probabilities = [fitness / fitness_total for fitness in fitness_list]
  else:
    # Every fitness was clamped to 0: fall back to a uniform choice instead of dividing by zero
    probabilities = [1 / len(fitness_list)] * len(fitness_list)

  algoA_prob, algoB_prob, algoC_prob, algoD_prob = probabilities

  return (algoA_prob,algoB_prob, algoC_prob, algoD_prob, la_test_returns)

In [None]:
### Randomly select a gene by considering its fitness

def _gene_for_draw(draw, genes, probs):
  """Map a uniform draw onto a gene by walking the cumulative probabilities."""
  cumulative = 0
  for gene, prob in zip(genes, probs):
    cumulative += prob
    if draw <= cumulative:
      return gene
  # Rounding can leave the cumulative total just below the draw; use the last gene, not a placeholder
  return genes[-1]


def randomly_select(gene_A, algoA_prob, gene_B, algoB_prob, gene_C, algoC_prob, gene_D, algoD_prob):
  """Pick two parent genes by fitness-proportional (roulette wheel) selection.

  Parent 1 is drawn according to the four probabilities. Parent 2 is drawn
  from the genes whose value differs from parent 1, with the same
  probabilities renormalised over those genes.

  Degenerate cases:
    * If the differing genes all have zero probability, parent 2 is chosen
      uniformly among them.
    * If no gene differs from parent 1 (all four genes are equal), parent 1
      is returned for both parents.

  Args:
    gene_A, gene_B, gene_C, gene_D: Candidate genes.
    algoA_prob, algoB_prob, algoC_prob, algoD_prob: Selection probability of each gene.

  Returns:
    `(parent_11, parent_22)`.
  """
  genes = [gene_A, gene_B, gene_C, gene_D]
  probs = [algoA_prob, algoB_prob, algoC_prob, algoD_prob]

  # Get parent 1
  parent_11 = _gene_for_draw(random.uniform(0, 1), genes, probs)

  # Get parent 2: only genes whose value differs from parent 1 are eligible
  candidates = [(gene, prob) for gene, prob in zip(genes, probs) if gene != parent_11]
  if not candidates:
    # Nothing else to pick; re-drawing until the parents differ would never terminate
    return (parent_11, copy.deepcopy(parent_11))

  eligible_genes = [gene for gene, _ in candidates]
  weights = [max(prob, 0) for _, prob in candidates]
  if sum(weights) > 0:
    parent_22 = random.choices(eligible_genes, weights=weights, k=1)[0]
  else:
    # All remaining genes have zero probability: choose uniformly rather than looping forever
    parent_22 = random.choice(eligible_genes)

  return (parent_11, parent_22)

  


In [None]:
### Perform crossover

def crossover(parent_11, parent_22):
  """Exchange one hyperparameter between two parent genes.

  Both possible outcomes contain the same two mixed genes; the random index
  only decides which of them is returned first.

  Returns:
    `(child_11, child_22)` as tuples.
  """
  # One mix takes parent 2's first value with parent 1's second; the other is the mirror image
  second_first = (parent_22[0], parent_11[1])
  first_second = (parent_11[0], parent_22[1])
  # Select which hyperparameter to switch
  index = random.choice([0, 1])
  return (second_first, first_second) if index == 0 else (first_second, second_first)
  


In [None]:
### Perform mutation

def mutate(child_11, child_22, max_hidden_size, min_hidden_size, max_num_layers, min_num_layers):
  """Create two mutated genes, one from each input child.

  For each child, one of its two hyperparameters is picked at random and
  redrawn uniformly from its allowed range (bounds inclusive); the other
  hyperparameter is carried over unchanged.

  Returns:
    `(child_33, child_44)`: the mutated versions of `child_11` and `child_22`.
  """
  def _mutated(gene):
    # 0 -> redraw the hidden size, 1 -> redraw the number of layers
    if random.choice([0, 1]) == 0:
      return (random.randint(min_hidden_size, max_hidden_size), gene[1])
    return (gene[0], random.randint(min_num_layers, max_num_layers))

  # Order matters for the random stream: child_11 is mutated before child_22
  child_33 = _mutated(child_11)
  child_44 = _mutated(child_22)
  return (child_33, child_44)


In [None]:
### Start the GA process

# Set values
# num_gens   -> number of GA generations to run
# num_tests  -> number of test episodes per actor in the fitness test
# eps_per_it -> max_epochs for every training run (one episode is played at the end of each epoch)
num_gens = 13
num_tests = 100
eps_per_it = 3000
# One entry per generation: the four selection probabilities returned by fitness_prob
fitness_list = []

# Initialize genes, GA parameters, and set limits
# The limits are the inclusive bounds used when a hyperparameter is mutated
max_hidden_size = 500
min_hidden_size = 50

max_num_layers = 3
min_num_layers = 1

# A gene is a (hidden_size, num_layers) pair
gene_A = (500, 1)
gene_B = (250, 2)
gene_C = (50, 3)
gene_D = (128, 1)
# NOTE(review): gene_history holds only the starting genes; nothing below appends to it
gene_history = [gene_A,  gene_B, gene_C, gene_D]


for gen in range(num_gens):
  
  # Reset returns lists
  gene_A_returns = []
  gene_B_returns = []
  gene_C_returns = []
  gene_D_returns = []

  # Train the models with gene A
  algoA = TD3('BipedalWalker-v3', hidden_size=gene_A[0], num_layers=gene_A[1])
  trainer = Trainer(
      gpus=num_gpus,
      max_epochs=eps_per_it
  )
  trainer.fit(algoA)
  gene_A_returns.append(algoA.ep_returns)

  # Train the models with gene B
  algoB = TD3('BipedalWalker-v3', hidden_size=gene_B[0], num_layers=gene_B[1])
  trainer = Trainer(
      gpus=num_gpus,
      max_epochs=eps_per_it
  )
  trainer.fit(algoB)
  gene_B_returns.append(algoB.ep_returns)

  # Train the models with gene C
  algoC = TD3('BipedalWalker-v3', hidden_size=gene_C[0], num_layers=gene_C[1])
  trainer = Trainer(
      gpus=num_gpus,
      max_epochs=eps_per_it
  )
  trainer.fit(algoC)
  gene_C_returns.append(algoC.ep_returns)

  # Train the models with gene D
  algoD = TD3('BipedalWalker-v3', hidden_size=gene_D[0], num_layers=gene_D[1])
  trainer = Trainer(
      gpus=num_gpus,
      max_epochs=eps_per_it
  )
  trainer.fit(algoD)
  gene_D_returns.append(algoD.ep_returns)

  # Test the fitness of the models
  # The actor as it stands at the end of training is evaluated (algoX.actor, not algoX.best_actor)
  algoA_prob, algoB_prob, algoC_prob, algoD_prob, la_test_returns  = fitness_prob(algoA.actor, algoB.actor, algoC.actor, algoD.actor, num_tests, gene_A, gene_B, gene_C, gene_D, gen)
  fitness_list.append([algoA_prob, algoB_prob, algoC_prob, algoD_prob])

  # Store the results of the fitness of the models
  # fitness_results gains one column per generation, holding that generation's four probabilities
  this_Algo = pd.DataFrame([algoA_prob, algoB_prob, algoC_prob, algoD_prob])
  this_Algo.columns = ['Algo generation: ' +  f'{gen}']
  if gen==0:
    fitness_results = this_Algo
  else:
    fitness_results = pd.concat([fitness_results, this_Algo], axis=1)
  
  # Select the next parents
  parent_1, parent_2 = randomly_select(gene_A, algoA_prob, gene_B, algoB_prob, gene_C, algoC_prob, gene_D, algoD_prob)

  # Perform crossover
  child_1, child_2 = crossover(copy.deepcopy(parent_1), copy.deepcopy(parent_2))

  # Perform mutation
  # The mutated genes are derived from the crossover children, not from the parents
  child_3, child_4 = mutate(copy.deepcopy(child_1), copy.deepcopy(child_2), max_hidden_size, min_hidden_size, max_num_layers, min_num_layers)

  # Write one CSV file per algorithm with the episode returns recorded during this generation's training
  algo_A_df = pd.DataFrame(gene_A_returns[0])
  algo_A_df.columns = ['Algo generation: ' +  f'{gen}' + ', gene: ' + f'{gene_A}']
  algo_A_df.to_csv(f'TD3_GA_returns_gen_{gen}_gene_[{gene_A[0]},{gene_A[1]}].csv')
  algo_B_df = pd.DataFrame(gene_B_returns[0])
  algo_B_df.columns = ['Algo generation: ' +  f'{gen}' + ', gene: ' + f'{gene_B}']
  algo_B_df.to_csv(f'TD3_GA_returns_gen_{gen}_gene_[{gene_B[0]},{gene_B[1]}].csv')
  algo_C_df = pd.DataFrame(gene_C_returns[0])
  algo_C_df.columns = ['Algo generation: ' +  f'{gen}' + ', gene: ' + f'{gene_C}']
  algo_C_df.to_csv(f'TD3_GA_returns_gen_{gen}_gene_[{gene_C[0]},{gene_C[1]}].csv')
  algo_D_df = pd.DataFrame(gene_D_returns[0])
  algo_D_df.columns = ['Algo generation: ' +  f'{gen}' + ', gene: ' + f'{gene_D}']
  algo_D_df.to_csv(f'TD3_GA_returns_gen_{gen}_gene_[{gene_D[0]},{gene_D[1]}].csv')

  # Report the mean test return and selection probability of each gene
  print(f"Generation: {gen}, Gene [{gene_A[0]},{gene_A[1]}], mean 100 ep score = {sum(la_test_returns[0])/num_tests}, probability = {algoA_prob}")
  print(f"Generation: {gen}, Gene [{gene_B[0]},{gene_B[1]}], mean 100 ep score = {sum(la_test_returns[1])/num_tests}, probability = {algoB_prob}")
  print(f"Generation: {gen}, Gene [{gene_C[0]},{gene_C[1]}], mean 100 ep score = {sum(la_test_returns[2])/num_tests}, probability = {algoC_prob}")
  print(f"Generation: {gen}, Gene [{gene_D[0]},{gene_D[1]}], mean 100 ep score = {sum(la_test_returns[3])/num_tests}, probability = {algoD_prob}")

  # Set the genes for the next round of training
  # Next generation = the two crossover children plus their two mutated versions
  gene_A = copy.deepcopy(child_1)
  gene_B = copy.deepcopy(child_2)
  gene_C = copy.deepcopy(child_3)
  gene_D = copy.deepcopy(child_4)

  print(f"Next genes are: [{gene_A[0]},{gene_A[1]}] + [{gene_B[0]},{gene_B[1]}] + [{gene_C[0]},{gene_C[1]}] + [{gene_D[0]},{gene_D[1]}]")

# Write the per-generation selection probabilities to a CSV file
fitness_results.to_csv(f'TD3_GA_fitness_generation.csv')



