# Getting started with PPO and ProcGen

Here's a bit of code that should help you get started on your projects.

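The cell below installs `procgen` and downloads two helper files: a small `utils.py` script that contains some utility functions, and `TransformLayer.py` from the RAD repository, which provides the color-jitter layer used for data augmentation further down. You may want to inspect the files for more details.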

In [None]:
# Install the ProcGen environments and fetch the helper modules imported below.
# NOTE: re-running this cell does not overwrite existing files; wget saves
# numbered copies instead (see `utils.py.6` in the output), so the `utils.py`
# that gets imported is the one from the first download.
!pip install procgen
!wget https://raw.githubusercontent.com/nicklashansen/ppo-procgen-utils/main/utils.py
# Alternative augmentation source (RAD for ProcGen), currently not downloaded:
# !wget https://raw.githubusercontent.com/pokaxpoka/rad_procgen/346fb852fa16e739601ca998eebd1d56b95aa2e8/train_procgen/data_augs.py
!wget https://raw.githubusercontent.com/MishaLaskin/rad/master/TransformLayer.py

--2020-12-03 14:23:05--  https://raw.githubusercontent.com/nicklashansen/ppo-procgen-utils/main/utils.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14807 (14K) [text/plain]
Saving to: ‘utils.py.6’


2020-12-03 14:23:05 (127 MB/s) - ‘utils.py.6’ saved [14807/14807]

--2020-12-03 14:23:05--  https://raw.githubusercontent.com/MishaLaskin/rad/master/TransformLayer.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7657 (7.5K) [text/plain]
Saving to: ‘TransformLayer.py.6’


2020-12-03 14:23:05 (102 MB/s) - ‘TransformLayer.py.6’ saved [7657/7657]

Hyperparameters. These values should be a good starting point. You can modify them later once you have a working implementation.


In [None]:
# Hyperparameters (the trailing comment on each line is the suggested default)
total_steps = 2e6     # total environment steps per run; tried: 1e4, 8e6
num_envs = 32         # parallel environments (default 32)
num_levels = 100      # number of ProcGen levels passed to make_env (tried 10)
num_steps = 256       # steps collected per environment per iteration (default 256)
num_epochs = 3        # optimisation passes over each collected rollout (default 3)
batch_size = 512      # minibatch size requested from the storage generator (default 512)
eps = 0.2             # clipping range for the policy ratio and the value update (default .2)
grad_eps = 0.5        # max-norm passed to gradient clipping (default .5)
value_coef = 0.5      # weight of the value loss in the total loss (default .5)
entropy_coef = 0.01   # weight of the entropy term in the total loss (default .01)

In [None]:
# Mount Google Drive (Colab only) so the training cell can write its
# checkpoints under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Network definitions. We have defined a policy network for you in advance. It uses the popular `NatureDQN` encoder architecture (see below), while policy and value functions are linear projections from the encodings. There is plenty of opportunity to experiment with architectures, so feel free to do that! Perhaps implement the `Impala` encoder from [this paper](https://arxiv.org/pdf/1802.01561.pdf) (perhaps minus the LSTM).

In [None]:
# Add the necessary libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from utils import make_env, Storage, orthogonal_init
from time import time

In [None]:
# For Impala encoder
def xavier_uniform_init(module, gain=1.0):
    """Xavier-uniform initialise a linear or 2-D convolution layer in place.

    Meant to be used with ``nn.Module.apply``: modules that are neither
    ``nn.Linear`` nor ``nn.Conv2d`` are returned untouched.

    Args:
        module: The module to initialise.
        gain: Scaling factor forwarded to ``nn.init.xavier_uniform_``.

    Returns:
        The same ``module`` object, so the call can be chained.
    """
    if isinstance(module, (nn.Linear, nn.Conv2d)):
        nn.init.xavier_uniform_(module.weight.data, gain)
        # Layers built with bias=False expose `bias` as None.
        if module.bias is not None:
            nn.init.constant_(module.bias.data, 0)
    return module

class ResidualBlock(nn.Module):
    """Pre-activation residual unit: ReLU -> conv -> ReLU -> conv, plus skip.

    Both convolutions are 3x3, stride 1, padding 1 and keep the channel count,
    so the output has the same shape as the input.
    """

    def __init__(self, in_channels):
        super().__init__()
        conv_kwargs = dict(kernel_size=3, stride=1, padding=1)
        self.conv1 = nn.Conv2d(in_channels, in_channels, **conv_kwargs)
        self.conv2 = nn.Conv2d(in_channels, in_channels, **conv_kwargs)

    def forward(self, x):
        residual = self.conv1(F.relu(x))
        residual = self.conv2(F.relu(residual))
        # Skip connection: add the untouched input back.
        return residual + x

class ImpalaBlock(nn.Module):
    """One stage of the Impala encoder.

    A 3x3 convolution changes the channel count, a 3x3 max-pool with stride 2
    downsamples, and two residual blocks refine the result.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.res1 = ResidualBlock(out_channels)
        self.res2 = ResidualBlock(out_channels)

    def forward(self, x):
        features = self.conv(x)
        features = F.max_pool2d(features, kernel_size=3, stride=2, padding=1)
        return self.res2(self.res1(features))

class Encoder(nn.Module):
    """Impala-style convolutional encoder (without the LSTM).

    Three ``ImpalaBlock`` stages (16, 32 and 32 channels) are followed by a
    linear projection to ``out_features``. The linear layer is sized for a
    32 x 8 x 8 feature map, i.e. 64 x 64 inputs after three stride-2 poolings.

    Args:
        in_channels: Number of channels of the input images.
        out_features: Size of the returned feature vector.
        **kwargs: Accepted for interface compatibility and ignored.
    """

    def __init__(self, in_channels, out_features, **kwargs):
        super().__init__()
        self.block1 = ImpalaBlock(in_channels=in_channels, out_channels=16)
        self.block2 = ImpalaBlock(in_channels=16, out_channels=32)
        self.block3 = ImpalaBlock(in_channels=32, out_channels=32)
        self.fc = nn.Linear(in_features=32 * 8 * 8, out_features=out_features)

        # Use the constructor argument; the original read a global
        # `feature_dim` that only exists once the training cell has run.
        self.output_dim = out_features
        self.apply(xavier_uniform_init)

    def forward(self, x):
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        x = F.relu(x)
        # Collapse everything but the batch dimension before the linear layer.
        x = torch.flatten(x, start_dim=1)
        x = self.fc(x)
        return F.relu(x)


In [None]:
# Augmentation code
class RandGray(object):
    """Convert a randomly chosen subset of a batch of RGB images to grayscale.

    A boolean mask with one entry per batch element decides which images are
    converted. The mask is sampled at construction time and can be re-sampled
    per element or as a whole.

    Args:
        batch_size: Number of images per batch (length of the mask).
        p_rand: Probability that an individual image is converted.
    """

    def __init__(self,
                 batch_size,
                 p_rand=0.5,
                 *_args,
                 **_kwargs):
        self.p_gray = p_rand
        self.batch_size = batch_size
        self.random_inds = self._sample_mask(batch_size)

    def _sample_mask(self, size=None):
        """Draw ``size`` booleans that are True with probability ``p_gray``.

        With ``size=None`` a single scalar is returned.
        """
        return np.random.choice([True, False], size,
                                p=[self.p_gray, 1 - self.p_gray])

    def grayscale(self, imgs):
        """Return a 3-channel grayscale version of ``imgs`` (layout b x c x h x w)."""
        gray = imgs[:, 0, :, :] * 0.2989 + imgs[:, 1, :, :] * 0.587 + imgs[:, 2, :, :] * 0.114
        # Re-insert the channel axis and repeat it so the shape stays (b, 3, h, w).
        return np.tile(gray[:, np.newaxis, :, :], (1, 3, 1, 1))

    def do_augmentation(self, images):
        """Gray out the images selected by the current mask.

        ``images`` has layout [B, C, H, W]. It is modified in place and also
        returned.
        """
        if self.random_inds.sum() > 0:
            images[self.random_inds] = self.grayscale(images[self.random_inds])
        return images

    def change_randomization_params(self, index_):
        """Re-sample the mask entry at ``index_``."""
        # Draw a scalar so a single value is written into the single slot.
        self.random_inds[index_] = self._sample_mask()

    def change_randomization_params_all(self):
        """Re-sample the whole mask."""
        self.random_inds = self._sample_mask(self.batch_size)

    def print_parms(self):
        """Print the current mask."""
        print(self.random_inds)


In [None]:
class Center_Crop(object):
    """Deterministic 64 x 64 centre crop.

    NOTE(review): the crop slices axes 1 and 2 and keeps a trailing axis, i.e.
    it treats the batch as B x H x W x C, unlike ``RandGray`` which uses
    B x C x H x W. Confirm the layout before using both together.
    """

    def __init__(self, *_args, **_kwargs):
        self.crop_size = 64

    def do_augmentation(self, image):
        """Return a copy of the central ``crop_size`` square of every image."""
        size = self.crop_size
        top = (image.shape[1] - size) // 2
        left = (image.shape[2] - size) // 2
        rows = slice(top, top + size)
        cols = slice(left, left + size)
        return image[:, rows, cols, :].copy()

    def change_randomization_params(self, index_):
        """No-op: the crop has no random state."""
        return None

    def change_randomization_params_all(self):
        """No-op: the crop has no random state."""
        return None

    def print_parms(self):
        """Print a placeholder, since there are no parameters to show."""
        print('nothing')

In [None]:
# Color-jitter augmentation built from the ColorJitterLayer in the downloaded
# TransformLayer.py. It is wrapped in nn.Sequential so it can be called like
# any other module.
# NOTE(review): p=1.0 presumably means every image in the batch is jittered,
# and stack_size=1 that observations hold a single RGB frame; confirm against
# TransformLayer.py.
from TransformLayer import ColorJitterLayer
transform_module = nn.Sequential(ColorJitterLayer(brightness=0.4, 
                                                  contrast=0.4,
                                                  saturation=0.4, 
                                                  hue=0.5, 
                                                  p=1.0, 
                                                  batch_size=num_envs,
                                                  stack_size=1))

In [None]:
def color_jitter(obs):
  """Apply the module-level color-jitter transform to a batch of observations.

  Args:
    obs: Batch of observations accepted by ``transform_module``.

  Returns:
    The output of ``transform_module(obs)``.

  NOTE(review): ``transform_module`` may also modify ``obs`` in place; confirm
  against TransformLayer.py before relying on either the argument or the
  return value alone.
  """
  # The original copied `obs` to a global `device` into an unused local (a
  # NameError when called before the training cell) and returned nothing.
  return transform_module(obs)


In [None]:
def grayscale(imgs, device):
    """Convert a batch of RGB tensors to 3-channel grayscale.

    Args:
        imgs: Tensor of layout b x c x h x w with c == 3.
        device: Device on which the channel-tiling helper tensor is created;
            it has to match the device of ``imgs``.

    Returns:
        Tensor of shape (b, 3, h, w) whose three channels all hold the
        weighted sum of the input channels.
    """
    batch, _, height, width = imgs.shape

    # The view doubles as a check that there are exactly three channels.
    rgb = imgs.view([batch, 3, height, width])
    red, green, blue = rgb[:, 0, ...], rgb[:, 1, ...], rgb[:, 2, ...]
    luma = red * 0.2989 + green * 0.587 + blue * 0.114

    # Restore the channel axis, then broadcast it to three identical channels.
    luma = luma.unsqueeze(1)
    channel_tiler = torch.ones([1, 3, 1, 1], dtype=luma.dtype).float().to(device)
    return luma * channel_tiler

In [None]:
def random_cutout(imgs, min_cut, max_cut):
    """Zero out one random rectangle per image.

    Args:
        imgs: NumPy array of shape (B, C, H, W).
        min_cut: Lower bound (inclusive) for the sampled cut values.
        max_cut: Upper bound (exclusive) for the sampled cut values.

    Returns:
        A new array of the same shape and dtype; ``imgs`` is left untouched.

    Note:
        One value is sampled per axis and used both as the rectangle's offset
        and as its extent, so the patch covers ``[v, 2 * v)`` along each axis.
    """
    n, c, h, w = imgs.shape
    # Keep this sampling order (widths first) so seeded runs stay reproducible.
    col_cuts = np.random.randint(min_cut, max_cut, n)
    row_cuts = np.random.randint(min_cut, max_cut, n)

    cutouts = np.empty((n, c, h, w), dtype=imgs.dtype)
    for idx in range(n):
        row, col = row_cuts[idx], col_cuts[idx]
        patched = imgs[idx].copy()
        patched[:, row:row + row, col:col + col] = 0
        cutouts[idx] = patched
    return cutouts

In [None]:
class Flatten(nn.Module):
    """Reshape a batch to 2-D, keeping the first (batch) dimension."""

    def forward(self, x):
        batch = x.shape[0]
        return x.view(batch, -1)

class Encoder2(nn.Module):
  """Three-convolution encoder followed by a linear projection.

  The linear layer expects 1024 flattened features, which is what the three
  convolutions produce for 64 x 64 inputs (64 channels x 4 x 4).
  """

  def __init__(self, in_channels, feature_dim):
    super().__init__()
    # (in_channels, out_channels, kernel_size, stride) of each convolution.
    conv_specs = [
        (in_channels, 32, 8, 4),
        (32, 64, 4, 2),
        (64, 64, 3, 1),
    ]
    modules = []
    for c_in, c_out, kernel, stride in conv_specs:
      modules.append(nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=kernel, stride=stride))
      modules.append(nn.ReLU())
    modules.append(Flatten())
    modules.append(nn.Linear(in_features=1024, out_features=feature_dim))
    modules.append(nn.ReLU())

    self.layers = nn.Sequential(*modules)
    self.apply(orthogonal_init)

  def forward(self, x):
    return self.layers(x)


class Policy(nn.Module):
  """Actor-critic head on top of a shared encoder.

  The policy logits and the state value are both linear projections of the
  encoder features.

  Args:
    encoder: Module mapping observations to ``feature_dim`` features.
    feature_dim: Size of the encoder output.
    num_actions: Number of discrete actions.
  """

  def __init__(self, encoder, feature_dim, num_actions):
    super().__init__()
    self.encoder = encoder
    self.policy = orthogonal_init(nn.Linear(feature_dim, num_actions), gain=.01)
    self.value = orthogonal_init(nn.Linear(feature_dim, 1), gain=1.)

  def act(self, x):
    """Sample actions for a batch of observations without tracking gradients.

    Returns:
      Tuple ``(action, log_prob, value)`` of CPU tensors.
    """
    with torch.no_grad():
      # Follow the module's own device instead of assuming CUDA, so the
      # policy also works on CPU-only machines.
      device = next(self.parameters()).device
      x = x.to(device).contiguous()
      dist, value = self.forward(x)
      action = dist.sample()
      log_prob = dist.log_prob(action)

    return action.cpu(), log_prob.cpu(), value.cpu()

  def forward(self, x):
    """Return the action distribution and the value estimate for ``x``."""
    x = self.encoder(x)
    logits = self.policy(x)
    # Drop the trailing singleton dimension of the value head.
    value = self.value(x).squeeze(1)
    dist = torch.distributions.Categorical(logits=logits)

    return dist, value

class Policy_add_FCL(nn.Module):
  """Actor-critic head with one extra fully connected layer per branch.

  The policy and value branches each pass the encoder features through their
  own hidden layer of size ``feature_dim // 2`` (with ReLU) before the final
  linear projection.

  Args:
    encoder: Module mapping observations to ``feature_dim`` features.
    feature_dim: Size of the encoder output.
    num_actions: Number of discrete actions.
  """

  def __init__(self, encoder, feature_dim, num_actions):
    super().__init__()
    hidden_dim = int(feature_dim/2)
    self.encoder = encoder
    # The hidden layers keep PyTorch's default initialisation; only the
    # output layers are orthogonally initialised.
    self.policy_add_FCL = nn.Linear(feature_dim, hidden_dim)
    self.policy = orthogonal_init(nn.Linear(hidden_dim, num_actions), gain=.01)
    self.value_add_FCL = nn.Linear(feature_dim, hidden_dim)
    self.value = orthogonal_init(nn.Linear(hidden_dim, 1), gain=1.)

  def act(self, x):
    """Sample actions for a batch of observations without tracking gradients.

    Returns:
      Tuple ``(action, log_prob, value)`` of CPU tensors.
    """
    with torch.no_grad():
      # Follow the module's own device instead of assuming CUDA, so the
      # policy also works on CPU-only machines.
      device = next(self.parameters()).device
      x = x.to(device).contiguous()
      dist, value = self.forward(x)
      action = dist.sample()
      log_prob = dist.log_prob(action)

    return action.cpu(), log_prob.cpu(), value.cpu()

  def forward(self, x):
    """Return the action distribution and the value estimate for ``x``."""
    x = self.encoder(x)
    pol_add = F.relu(self.policy_add_FCL(x))
    logits = self.policy(pol_add)
    val_add = F.relu(self.value_add_FCL(x))
    # Drop the trailing singleton dimension of the value head.
    value = self.value(val_add).squeeze(1)
    dist = torch.distributions.Categorical(logits=logits)

    return dist, value

In [None]:
def _make_setup(encoder='impala', data_aug='regular', add_fcl=False):
    """Build one experiment configuration in the key order the training loop prints."""
    return {
        'Encoder': encoder,
        'Data_aug': data_aug,
        'Mixreg': False,
        'Policy': None,
        'add_FCL': add_fcl,
    }


# Experiment configurations, keyed by run name.
#   Encoder:  'impala' selects the Impala encoder, anything else the
#             three-convolution encoder.
#   Data_aug: 'regular' (none), 'grayscale', 'color_jitter' or 'random_cutout'.
#   Mixreg:   whether minibatches are mixed before the update.
#   Policy:   stored with the setup; not read by the training loop.
#   add_FCL:  whether the policy gets an extra fully connected layer per head.
setup_run = {
    'ex1': _make_setup(),
    'ex2': _make_setup(),
    'ex3': _make_setup(data_aug='grayscale'),
    'ex4': _make_setup(data_aug='color_jitter'),
    # Was 'cut_out', a name the training loop never matched, so this run
    # silently trained without any augmentation.
    'ex5': _make_setup(data_aug='random_cutout'),
    'ex6': _make_setup(encoder='nature', add_fcl=True),
    'ex7': _make_setup(add_fcl=True),
}

In [None]:
#  ============================       RUNNING LOOP      ==============================
# Trains one PPO agent per entry in `run_setups`, configured by the matching
# entry of `setup_run`, and saves a checkpoint when each run has finished.

# run_setups =['ex1','ex2','ex3','ex4','ex5', 'ex6', 'ex7'] # Specify a list of setups to run
run_setups = ['ex6', 'ex7']

# Both spellings select the cutout augmentation ('cut_out' is the one that
# appears in the experiment table).
cutout_names = ('random_cutout', 'cut_out')

# Number of transitions collected per training iteration.
steps_per_iteration = num_envs * num_steps

for run in run_setups: # Run the training for all our setups
  # Define environment
  # check the utils.py file for info on arguments
  env_name = 'starpilot'
  env = make_env(num_envs, num_levels=num_levels, env_name=env_name)
  print('Observation space:', env.observation_space)
  print('Action space:', env.action_space.n)
  print(f'Run setup for {run}: playing {env_name}')
  for setup_key, setup_value in setup_run[run].items():
    print(f'{setup_key}: {setup_value}', end = '\t')
  print('')

  # Read in the setup
  do_mixreg = setup_run[run]['Mixreg']
  data_aug = setup_run[run]['Data_aug']
  encoder_use = setup_run[run]['Encoder']
  additional_FCL = setup_run[run]['add_FCL']

  # Define network
  feature_dim = 512
  lambda_mix = 0.95 # Mixing weight used when Mixreg is enabled
  num_actions = env.action_space.n
  in_channels = env.observation_space.shape[0]

  # Define the encoder
  if encoder_use == 'impala':
    print('Using Impala') 
    encoder = Encoder(in_channels, feature_dim)
  else:
    encoder = Encoder2(in_channels, feature_dim)

  # Initialize the policy
  if additional_FCL:
    policy = Policy_add_FCL(encoder, feature_dim, num_actions)
  else:
    policy = Policy(encoder, feature_dim, num_actions)
  policy.cuda()

  # Define optimizer
  # these are reasonable values but probably not optimal
  optimizer = torch.optim.Adam(policy.parameters(), lr=5e-4, eps=1e-5)

  # Define temporary storage
  # we use this to collect transitions during each iteration
  storage = Storage(
      env.observation_space.shape,
      num_steps,
      num_envs
  )

  # Run training
  obs = env.reset()
  nenv = env.num_envs
  # Observations are augmented on the CPU; the policy moves them to its own
  # device inside act().
  device = torch.device('cpu')

  # Change the first observations to desired augmentation
  if data_aug == 'grayscale':
    obs = grayscale(obs,device)
  elif data_aug in cutout_names:
    # Initialize as a numpy array then convert to tensor
    obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name)
    obs[:] = env.reset()

    # Do the cutout and transfer to tensor
    obs = random_cutout(obs,12,24)
    obs = torch.from_numpy(obs)
  elif data_aug == 'color_jitter':
    # Use the returned batch when there is one; otherwise keep `obs`, which
    # the transform may have modified in place.
    jittered = color_jitter(obs)
    if jittered is not None:
      obs = jittered

  step = 0
  # Initialize the reward histories that are stored with the checkpoint
  mean_rewards = []
  mean_rewards_done = []
  first_loop = True
  start = time() # Lets measure how long each training task takes
  while step < total_steps:
    # Use policy to collect data for num_steps steps
    policy.eval()
    for _ in range(num_steps):
      # Use policy
      action, log_prob, value = policy.act(obs)
      
      # Take step in environment; the observation is copied into a numpy
      # buffer so the numpy-based augmentations can work on it
      next_obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name)
      next_obs[:], reward, done, info = env.step(action)

      # Store data
      storage.store(obs, action, reward, done, info, log_prob, value)
      
      # Update current observation, applying the configured augmentation
      if data_aug == 'grayscale':
        obs = torch.from_numpy(next_obs)
        obs = grayscale(obs,device)
      elif data_aug in cutout_names:
        obs = random_cutout(next_obs,12,24)
        obs = torch.from_numpy(obs)
      elif data_aug == 'color_jitter':
        obs = torch.from_numpy(next_obs)
        jittered = color_jitter(obs)
        if jittered is not None:
          obs = jittered
      else:
        obs = torch.from_numpy(next_obs)

    # Add the last observation to collected data
    _, _, value = policy.act(obs)
    storage.store_last(obs, value)

    # Compute return and advantage
    storage.compute_return_advantage()

    # Optimize policy
    policy.train()
    for epoch in range(num_epochs):

      # Iterate over batches of transitions
      generator = storage.get_generator(batch_size)
      for batch in generator:
        b_obs, b_action, b_log_prob, b_value, b_returns, b_advantage = batch

        if do_mixreg: 
          # Pair up random samples of this batch. The actual batch length is
          # used because the last batch may be shorter than batch_size, and
          # the exclusive upper bound lets every index be drawn.
          n_batch = b_obs.shape[0]
          index_ij = torch.randint(0, n_batch, (n_batch,2))
          b_obs = lambda_mix*b_obs[index_ij[:,0]] + (1-lambda_mix)*b_obs[index_ij[:,1]]
          b_log_prob = lambda_mix*b_log_prob[index_ij[:,0]] + (1-lambda_mix)*b_log_prob[index_ij[:,1]]
          b_value = lambda_mix*b_value[index_ij[:,0]] + (1-lambda_mix)*b_value[index_ij[:,1]]
          b_returns = lambda_mix*b_returns[index_ij[:,0]] + (1-lambda_mix)*b_returns[index_ij[:,1]]
          b_advantage = lambda_mix*b_advantage[index_ij[:,0]] + (1-lambda_mix)*b_advantage[index_ij[:,1]]
          # Actions are discrete, so take the one from the dominant sample
          if (lambda_mix >= 0.5):
            b_action = b_action[index_ij[:,0]]
          else:
            b_action = b_action[index_ij[:,1]]

        # Get current policy outputs
        new_dist, new_value = policy(b_obs)
        new_log_prob = new_dist.log_prob(b_action)

        # Clipped policy objective
        ratio = torch.exp(new_log_prob - b_log_prob)
        clipped_ratio = ratio.clamp(min=1.0 - eps,max=1.0 + eps)
        pi_loss = -torch.min(ratio * b_advantage,clipped_ratio * b_advantage).mean()

        # Clipped value function objective. The unclipped term must use the
        # current prediction; with the stored b_value it is a constant and
        # contributes no gradient.
        clipped_value = b_value + (new_value - b_value).clamp(min=-eps, max=eps)
        value_loss = 0.5 * torch.max((new_value - b_returns) ** 2, (clipped_value - b_returns) **2).mean()

        # Entropy loss
        entropy_loss = -new_dist.entropy().mean()

        # Backpropagate losses
        loss = pi_loss + value_coef*value_loss + entropy_coef*entropy_loss
        loss.backward()

        # Clip gradients
        torch.nn.utils.clip_grad_norm_(policy.parameters(), grad_eps)

        # Update policy
        optimizer.step()
        optimizer.zero_grad()

    # Update stats
    mean_rewards.append(storage.get_reward())
    done_reward = sum((sum(storage.reward)/(sum(storage.done)+1)))/num_envs

    # TODO: If you never die implement an if statement that doesn't include the plus 1
    mean_rewards_done.append(done_reward)
    step += steps_per_iteration
    print(f'Step: {step}\tMean reward: {storage.get_reward()}, \tMean reward done: {done_reward}')
    if first_loop:
      # Extrapolate from the first iteration using the configured budget
      # rather than hard-coded step counts
      end = time()
      time_total = end-start
      estimated_time = (total_steps/(steps_per_iteration/time_total))/3600
      print(f'Estimated time of completion: {estimated_time} hours')
    first_loop = False
  end = time()
  time_total = end-start
  # Save a checkpoint once the run has finished
  torch.save({
              'Setup': setup_run[run], # The configuration used for this run
              'policy_state_dict': policy.state_dict(), # This is the policy
              'encoder_state_dict': encoder.state_dict(),
              'optimizer_state_dict': optimizer.state_dict(), # The optimizer used
              'Mean Reward': mean_rewards,
              'Mean Reward Done': mean_rewards_done,
              'Training time': time_total,
              }, f'/content/gdrive/MyDrive/Deep Learning Project 2020/data/{run}.pt')
  print(f'Completed training of {run}!')
  torch.save(policy.state_dict(), f'{run}_policy.pt')

print('Completed all runs!')

Observation space: Box(0.0, 1.0, (3, 64, 64), float32)
Action space: 15
Run setup for ex6: playing starpilot
Encoder: nature	Data_aug: regular	Mixreg: False	Policy: None	add_FCL: True	
Step: 8192	Mean reward: 5.28125, 	Mean reward done: 2.391862630844116
Estimated time of completion: 3.250933109989597 hours
Step: 16384	Mean reward: 6.125, 	Mean reward done: 1.333879828453064
Step: 24576	Mean reward: 5.875, 	Mean reward done: 1.3695173263549805
Step: 32768	Mean reward: 6.53125, 	Mean reward done: 1.5071823596954346
Step: 40960	Mean reward: 7.5625, 	Mean reward done: 1.6400834321975708
Step: 49152	Mean reward: 7.5625, 	Mean reward done: 1.65478515625
Step: 57344	Mean reward: 5.25, 	Mean reward done: 1.1469238996505737
Step: 65536	Mean reward: 8.78125, 	Mean reward done: 2.026515007019043
Step: 73728	Mean reward: 7.59375, 	Mean reward done: 1.4697065353393555
Step: 81920	Mean reward: 5.71875, 	Mean reward done: 1.202273964881897
Step: 90112	Mean reward: 7.78125, 	Mean reward done: 1.64345