<a href="https://colab.research.google.com/github/jbpacker/deep-rl-class/blob/main/unit8/ppo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PPO

resources
* [huggingface deep rl class readme](https://github.com/huggingface/deep-rl-class/tree/main/unit8)
* [course example code](https://github.com/huggingface/deep-rl-class/blob/main/unit8/unit8.ipynb)
* [course ppo chapter](https://huggingface.co/blog/deep-rl-ppo)
* [cleanrl ppo](https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/ppo.py)

## Setup

### Installs

In [21]:
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
!pip3 install pyvirtualdisplay

# Virtual display
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(500, 500))
virtual_display.start()

!pip install pybullet
!pip install gym
!pip install stable-baselines3[extra]
!pip install git+https://github.com/ntasfi/PyGame-Learning-Environment.git
!pip install git+https://github.com/qlan3/gym-games.git
!pip install huggingface_hub
!pip install wandb
!pip install imageio-ffmpeg

!pip install pyyaml==6.0 # avoid key error metadata

!pip install pyglet # Virtual Screen

Reading package lists... Done
Building dependency tree       
Reading state information... Done
python-opengl is already the newest version (3.1.0+dfsg-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 19 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
ffmpeg is already the newest version (7:3.4.11-0ubuntu0.1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 19 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
xvfb is already the newest version (2:1.19.6-1ubuntu4.11).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.


### Imports

In [22]:
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import wandb

import pybullet_envs
import gym
import gym_pygame

from huggingface_hub import notebook_login # To log to our Hugging Face account to be able to upload models to the Hub.

import imageio

### device allocation

In [23]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Helper functions



In [24]:
def record_video(env, policy, out_director="/content/out.mp4", fps=30):
    images = []  
    done = False
    state = env.reset()
    img = env.render(mode='rgb_array')
    images.append(img)
    while not done:
        # Take the action (index) that have the maximum expected future reward given that state
        action, _ = policy.act(state)
        state, reward, done, info = env.step(action.item()) # We directly put next_state = state for recording logic
        img = env.render(mode='rgb_array')
        images.append(img)
        action.detach()
    imageio.mimsave(out_directory, [np.array(img) for i, img in enumerate(images)], fps=fps)
    wandb.log({"videos": wandb.Video(out_directory, fps=fps)})

# env_id = "CartPole-v1"
# env = gym.make(env_id)
# policy = PolicyNetwork(num_obs, num_act)
# record_video(env, policy, "/home/out.gif", fps=30)

## Network

In [25]:
class ActorCriticPolicy(nn.Module):
    def __init__(self, num_obs, num_acts):
        super(ActorCriticPolicy, self).__init__()

        self.l1_actor = nn.Linear(num_obs, 128)
        self.l2_actor = nn.Linear(128, 256)
        self.l3_actor = nn.Linear(256, num_acts)

        self.l1_critic = nn.Linear(num_obs, 128)
        self.l2_critic = nn.Linear(128, 256)
        self.l3_critic = nn.Linear(256, 1)

    def forward(self, x):
        x_actor = self.l1_actor(x)
        x_actor = F.relu(x_actor)
        x_actor = self.l2_actor(x_actor)
        x_actor = F.relu(x_actor)
        action_scores = self.l3_actor(x_actor)
        action_probs = F.softmax(action_scores, dim=1)

        x_critic = self.l1_critic(x)
        x_critic = F.relu(x_critic)
        x_critic = self.l2_critic(x_critic)
        x_critic = F.relu(x_critic)
        value = self.l3_critic(x_critic)

        return action_probs, value

## Training

### util classes

In [80]:
class Buffer():
    def __init__(self, env, batch_size, minibatch_size = None, gamma = 0.99):
        self.batch_size = batch_size
        self.gamma = gamma
        if minibatch_size is None:
            self.minibatch_size = batch_size
        else:
            self.minibatch_size = minibatch_size

        assert self.batch_size % self.minibatch_size == 0, "batch size must be evenly divisible by minibatch size"

        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.n
        
        self.reset()
    
    def add(self, state, next_state, action, action_prob, reward, done, value):
        self.states[self.add_idx] = state
        self.next_states[self.add_idx] = next_state
        self.actions[self.add_idx] = action
        self.action_probs[self.add_idx] = action_prob
        self.values[self.add_idx] = value
        self.rewards[self.add_idx] = reward
        self.dones[self.add_idx] = done

        self.add_idx += 1
        assert self.add_idx <= self.batch_size, "adding too many samples to buffer!"
        assert len(self) <= self.batch_size, "adding too many samples to buffer!"

    def reset(self):
        self.states = np.zeros((self.batch_size, self.num_states))
        self.next_states = np.zeros((self.batch_size, self.num_states))
        self.actions = np.zeros(self.batch_size)
        self.action_probs = np.zeros((self.batch_size, self.num_actions))
        self.values = np.zeros(self.batch_size)
        self.rewards = np.zeros(self.batch_size)
        self.dones = np.zeros(self.batch_size, dtype=bool)
        self.advantages = np.zeros(self.batch_size)

        self.add_idx = 0

        self.shuffled_idxs = np.zeros(batch_size, dtype=int)
        self.minibatch_idxs = np.zeros(minibatch_size, dtype=int)

    def __len__(self):
        return len(self.states)

    def calculate_advantages(self, policy, next_state):
        _, next_value = policy(torch.from_numpy(next_state).float().unsqueeze(0))
        next_value = next_value.detach().numpy()
        for i in reversed(range(len(self))):
            mask = 1 - self.dones[i]
            if i < len(self) - 1:
                next_value = self.values[i + 1]
            advantage = self.rewards[i] + self.gamma * mask * next_value - self.values[i]
            self.advantages[i] = advantage

        assert(self.advantages.shape[0] == len(self)), "final adv sizes don't match (batch size: {} adv size {})".format(len(self), self.advantages.shape[0])

    def num_minibatches(self):
        return (int)(len(self) / self.minibatch_size)

    def shuffle_minibatches(self):
        self.minibatch_idx = 0

        
        self.shuffled_idxs = np.arange(len(self))
        np.random.shuffle(self.shuffled_idxs)
        
    def get_minibatch_idxs(self):
        start_idx = self.minibatch_idx * self.minibatch_size
        end_idx = (self.minibatch_idx+1) * self.minibatch_size
        self.minibatch_idxs = self.shuffled_idxs[start_idx:end_idx]
        self.minibatch_idx += 1
        return self.minibatch_idxs

    def print(self):
        for i in range(len(self)):
            print("[{}] s: {} a: {} r: {} d: {}".format(i, self.states[i], self.actions[i], self.rewards[i], self.dones[i]))

## Tests
# steps = 40
# env = gym.make("CartPole-v1")
# b = Buffer(env, steps, 5)
# b.reset()
# policy = ActorCriticPolicy(env.observation_space.shape[0], env.action_space.n)
# next_state = env.reset()
# for i in range(steps):
#     state = next_state

#     # sample action
#     probs, value = policy(torch.from_numpy(state).float().unsqueeze(0))
#     action_dist = Categorical(probs)
#     action = action_dist.sample()

#     next_state, reward, done, info = env.step(action.item())
#     b.add(state, next_state, action, probs.detach().numpy(), reward, done, value)

#     if done:
#         next_state = env.reset()

## advantages
# b.calculate_advantages(policy, next_state)
# for i in range(len(b)):
#     print("[{}] r: {} d: {} value: {} adv: {}".format(i, b.rewards[i], b.dones[i], b.values[i], b.advantages[i]))

## minibatches
# b.shuffle_minibatches()
# print(b.shuffled_idxs)
# for i in range(b.num_minibatches()):
#     print(b.get_minibatch_idxs())

## data is added correctly
# for i in range(len(b)):
#     print("[{}] s: {} a: {} r: {} d: {}".format(i, b.states[i], b.actions[i], b.rewards[i], b.dones[i]))
# b.print()

In [27]:
class RolloutGenerator():
    def __init__(self, env, batch_size, minibatch_size, max_episode_steps, log):
        self.log = log
        self.max_episode_steps = max_episode_steps
        self.buffer = Buffer(env, batch_size, minibatch_size)
        
        self.episode_reward = 0
        self.episode_steps = 1
        self.num_episodes = 1

        self.next_state = env.reset()

    def fill_buffer(self, env, policy):
        self.buffer.reset()
        for step in range(0, self.buffer.batch_size):
            #      (state)
            #  (-->)  o
            state = self.next_state

            # sample action
            probs, value = policy(torch.from_numpy(state).float().unsqueeze(0))
            action_dist = Categorical(probs)
            action = action_dist.sample()

            #      (state)  r,a  (next_state)
            #  (-->)  o ------------> o
            self.next_state, reward, done, info = env.step(action.item())

            self.buffer.add(state, self.next_state, action, probs.detach().numpy(), reward, done, value)

            self.episode_reward += reward
            self.episode_steps += 1

            # If episode is done or past max steps reset the env
            if done or self.episode_steps > self.max_episode_steps:
                if self.log:
                    wandb.log({
                        "episode_steps": self.episode_steps,
                        "episode_reward": self.episode_reward,
                        "num_episodes": self.num_episodes,
                    })

                self.num_episodes += 1
                self.episode_reward = 0
                self.episode_steps = 1
                
                # (next_state)
                #      o
                self.next_state = env.reset()
        
        self.buffer.calculate_advantages(policy, self.next_state)

    def get_buffer(self):
        return self.buffer

# env = gym.make("CartPole-v1")
# policy = ActorCriticPolicy(env.observation_space.shape[0], env.action_space.n)
# r = RolloutGenerator(env, 40, 10, 100, False)
# r.fill_buffer(env, policy)
# r.buffer.print()
    

### Training loop

In [97]:
def train(env_id, log, lr, batch_size, minibatch_size, max_episode_steps, n_epochs, eps = 0.2):
    if log: 
        name = "ppo_" + env_id
        wandb.init(project=name)

    env = gym.make(env_id)
    policy = ActorCriticPolicy(env.observation_space.shape[0], env.action_space.n)

    if log:
        wandb.watch(policy, log_freq=1)

    optimizer = optim.Adam(policy.parameters(), lr=lr)

    rollout = RolloutGenerator(env, batch_size, minibatch_size, max_episode_steps, log)

    # each epoch collects N steps regardless of episode length and trains
    for epoch in range(n_epochs):
        # this also calculates advantages
        rollout.fill_buffer(env, policy)
        rollout.buffer.shuffle_minibatches()

        # Go thru all minibatches
        for i in range(rollout.buffer.num_minibatches()):
            # sample minibatch idxs from buffer
            idxs = rollout.buffer.get_minibatch_idxs()

            # Calculate r(t) = pi(a, s) / pi_old(a, s)
            state = rollout.buffer.states[idxs]
            input = torch.from_numpy(state).float()
            # If a single row, then unsqueeze to make a batch of 1
            if len(input.shape) == 1:
                input = input.unsqueeze(0)


            ## TODO(jef) remove this!!! hack to test policy
            jacl = ActorCriticPolicy(env.observation_space.shape[0], env.action_space.n)
            new_probs, new_value = jacl(input)


            new_probs = new_probs.detach().numpy()
            old_probs = rollout.buffer.action_probs[idxs]
            action = rollout.buffer.actions[idxs].astype('int')

            # Make a list of action probs based on selected action
            new_action_probs = [ new_probs[index,action[index]] for index in range(len(action)) ]
            old_action_probs = [ old_probs[index,action[index]] for index in range(len(action)) ]

            # Finally do the ratio calculation with the probabilities
            r = np.zeros(minibatch_size)
            np.divide(new_action_probs, old_action_probs, r)
            r = np.expand_dims(r, axis=1)
            
            # Find L_clip
            advantage = rollout.buffer.advantages[idxs]
            r_clip = np.clip(r, 1-eps, 1+eps)
            L_clip = np.minimum(np.multiply(advantage, r),
                                np.multiply(advantage, r_clip))
            L_clip = L_clip.mean()
            
            # find L_vf
            new_value = new_value.detach().numpy()
            old_value = rollout.buffer.values[idxs]
            old_value = np.expand_dims(old_value, axis=1)

            # adv = r + gamma*v_next - v
            # so target_v = adv + v = r + gamma*v_next
            target_v = advantage + old_value
            L_vf = ((new_value - target_v)**2).mean()
            
            # find L_entropy
            c_entropy = 0
            L_entropy = 0
            c_vf = 1.0
            
            # Train
            loss = L_clip + c_vf * L_vf + + c_entropy * L_entropy

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # If done next step them reset env
            if log:
                wandb.log({
                    "L_clip": L_clip,
                    "L_vf": L_vf,
                    "L_entropy": L_entropy,
                    "loss": loss,
                })

        # if record_vids and epoch % num_episodes_to_vid == 0:
        #         record_video(env, policy, "/content/out.mp4")


## TODO: policy is giving different results if input is batched!
log = False
batch_size = 4
minibatch_size = 4
max_episode_steps = 100
n_epochs = 1
lr = 1e-3
eps = 0.2
train("CartPole-v1", log, lr, batch_size, minibatch_size, max_episode_steps, n_epochs, eps)

new_value
(4, 1)
[[ 0.00096625]
 [-0.00107259]
 [ 0.0015604 ]
 [-0.00095775]]
old_value
(4, 1)
[[0.04354344]
 [0.04214137]
 [0.04447105]
 [0.04306226]]


AttributeError: ignored