<a href="https://colab.research.google.com/github/jbpacker/deep-rl-class/blob/main/unit5/HuggingFace_unit_5_%F0%9F%92%AA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Unit 5: Code your first Deep Reinforcement Learning Algorithm with PyTorch: Reinforce. And test its robustness 💪

link to [original colab](https://colab.research.google.com/github/huggingface/deep-rl-class/blob/main/unit5/unit5.ipynb)

🎮 Environments: 
- [CartPole-v1](https://www.gymlibrary.ml/environments/classic_control/cart_pole/)
- [PixelCopter](https://pygame-learning-environment.readthedocs.io/en/latest/user/games/pixelcopter.html)
- [Pong](https://pygame-learning-environment.readthedocs.io/en/latest/user/games/pong.html)

## get everything ready

### Step 1: install libraries

In [None]:
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
!pip3 install pyvirtualdisplay

# Virtual display
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(500, 500))
virtual_display.start()

In [2]:
!pip install gym
!pip install git+https://github.com/ntasfi/PyGame-Learning-Environment.git
!pip install git+https://github.com/qlan3/gym-games.git
!pip install huggingface_hub
!pip install wandb

!pip install pyyaml==6.0 # avoid key error metadata

!pip install pyglet # Virtual Screen

### Step 2: import packages

In [3]:
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import wandb

import gym
import gym_pygame

from huggingface_hub import notebook_login # To log to our Hugging Face account to be able to upload models to the Hub.

import imageio

will print the device to be used

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


## Build Architecture

### Step 3: Create the CartPole environment and understand how it works
#### [The environment 🎮](https://www.gymlibrary.ml/environments/classic_control/cart_pole/)

In [5]:
env_id = "CartPole-v1"
env = gym.make(env_id)

### Build Model

fully connected nn obs input and action output

In [6]:
class PolicyNetwork(nn.Module):
    def __init__(self, num_obs, num_act):
        super(PolicyNetwork, self).__init__()
        
        self.num_obs = num_obs
        self.num_act = num_act

        self.l1 = nn.Linear(num_obs, 128)
        self.dropout = nn.Dropout(p=0.6)
        self.l2 = nn.Linear(128, num_act)

    def forward(self, x):
        x = self.l1(x)
        x = self.dropout(x)
        x = F.relu(x)
        action_scores = self.l2(x)
        action_probs = F.softmax(action_scores, dim=1)

        return action_probs

    def act(self, state):
        """
        Given a state, take action
        """
        state = torch.from_numpy(state).float().unsqueeze(0)
        probs = self.forward(state)
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

## Build the Reinforce Training Algorithm

Start with loop that collects an episode and saves it into a replay buffer

In [7]:
def generate_episode_data(policy):
    data = [
            np.empty((0, policy.num_obs), dtype=np.float32), # obs
            np.empty((0, 1), dtype=np.float32), # action
            np.empty((0, 1), dtype=np.float32), # reward
            np.empty((0, 1), dtype=bool),  # done
            np.empty((0, policy.num_obs), dtype=np.float32), # next_obs
            ] 

    log_prob = []

    state = env.reset()
    done = False
    reward = 0

    while not done:
        action, lp = policy.act(state)
        log_prob.append(lp)
        data[0] = np.append(data[0], np.reshape(state, (1,-1)), axis=0)
        data[1] = np.append(data[1], np.reshape(action, (1,-1)), axis=0)
        data[2] = np.append(data[2], np.reshape(reward, (1,-1)), axis=0)
        data[3] = np.append(data[3], np.reshape(done, (1,-1)), axis=0)

        state, reward, done, info = env.step(action)

        data[4] = np.append(data[4], np.reshape(state, (1,-1)), axis=0)

    # The final replay buffer idx won't have a "next_state" or "action"
    data[0] = np.append(data[0], np.reshape(state, (1,-1)), axis=0)
    data[2] = np.append(data[2], np.reshape(reward, (1,-1)), axis=0)
    data[3] = np.append(data[3], np.reshape(done, (1,-1)), axis=0)

    return data, log_prob

## Debug printing
# data, _ = generate_episode_data(policy)
# for i in range(5):
#     print(len(data[i]))
# print(data)


Next a function that takes the replay buffer and calculates cumulative reward

In [8]:
def find_cumulative_reward(data, gamma):
    num_states = len(data[0])
    cumulative_reward = np.empty((num_states, 1), dtype=np.float32)
    cumulative_reward[num_states - 1] = data[2][num_states - 1]
    for i in reversed(range(num_states-1)):
        cumulative_reward[i] = data[2][i] + gamma * cumulative_reward[i + 1]

    return cumulative_reward

## For debugging
# gamma = 0.99
# data, _ = generate_episode_data(policy)
# R = find_cumulative_reward(data, gamma)

# for i in range(len(data[2])):
#     print("[{}] r: {} cr: {}".format(i, data[2][i], R[i]))

**Notes:**

In the huggingface class G(t) is only calculated for the entire episode. Here we calculate G(t) for each state in the episode and sum them together.

This trick is then used to increase performance found [in the pytorch reinforce implementation](https://github.com/pytorch/examples/blob/main/reinforcement_learning/reinforce.py)
```
R(t) = G(t) - mean(G(t)) / std(G(t))
```


In [17]:
def train_single_episode(optimizer, policy):
    data, log_prob = generate_episode_data(policy)
    R = find_cumulative_reward(data, gamma)

    policy_losses = []
    R = torch.tensor(R)

    # This comes from the pytorch reinforce example
    # https://github.com/pytorch/examples/blob/main/reinforcement_learning/reinforce.py
    R = (R - R.mean()) / (R.std() + eps)
    for r, l_p in zip(R, log_prob):
        # Weird for me here that 
        policy_losses.append(-l_p * r)
    
    optimizer.zero_grad()
    # Note here that div by the len seems to degrade performance
    policy_loss = torch.cat(policy_losses).sum()# / len(data[0])

    if log:
        wandb.log({"loss": policy_loss})
        wandb.log({"reward sum": np.sum(data[2])})
        wandb.log({"episode length": len(data[0])})

    policy_loss.backward()
    optimizer.step()

    tensor_obs = torch.from_numpy(data[0])


## Debug - Single Step
# env_id = "CartPole-v1"
# env = gym.make(env_id)

# num_obs = env.observation_space.shape[0]
# num_act = env.action_space.n

# policy = PolicyNetwork(num_obs, num_act)
# optimizer = optim.Adam(policy.parameters(), lr=1e-2)
# # eps = np.finfo(np.float32).eps.item()

# train_single_episode(optimizer, policy)

In [10]:
def record_video(env, policy, out_directory, fps=30):
    images = []  
    done = False
    state = env.reset()
    img = env.render(mode='rgb_array')
    images.append(img)
    while not done:
        # Take the action (index) that have the maximum expected future reward given that state
        action, _ = policy.act(state)
        state, reward, done, info = env.step(action) # We directly put next_state = state for recording logic
        img = env.render(mode='rgb_array')
        images.append(img)
    imageio.mimsave(out_directory, [np.array(img) for i, img in enumerate(images)], fps=fps)
    wandb.log({"video": wandb.Video(out_directory, fps=fps)})

# env_id = "CartPole-v1"
# env = gym.make(env_id)
# policy = PolicyNetwork(num_obs, num_act)
# record_video(env, policy, "/home/out.gif", fps=30)

In [20]:
def train(env_id, policy_cls):
    if log: 
        name = "reinforce_" + env_id
        wandb.init(project=name)
    env = gym.make(env_id)

    num_obs = env.observation_space.shape[0]
    num_act = env.action_space.n

    policy = policy_cls(num_obs, num_act)

    if log: 
        wandb.watch(policy, log_freq=1)  

    optimizer = optim.Adam(policy.parameters(), lr=lr)
    # eps = np.finfo(np.float32).eps.item()

    for i in range(1, steps):
        # print(i)
        if log:
            wandb.log({"epoch": i})
        train_single_episode(optimizer, policy)

        if log and i % 50 == 0:
            record_video(env, policy, "/home/out.mp4", fps=30)

In [12]:
# Discount factor
gamma = 0.99
steps = 500
lr = 1e-3

log = True
eps = np.finfo(np.float32).eps.item()

In [None]:
env_id = "CartPole-v1"
train(env_id, PolicyNetwork)

## Let's try pong and pixelcopter!

seems like I'm coming up against a sparse rewards problem here, and it's having a hard time making it past the first few states.

In [22]:
class PongPolicyNetwork(nn.Module):
    def __init__(self, num_obs, num_act):
        super(PongPolicyNetwork, self).__init__()

        self.num_obs = num_obs
        self.num_act = num_act

        self.dropout1 = nn.Dropout(p=0.4)
        self.dropout2 = nn.Dropout(p=0.3)
        self.l1 = nn.Linear(self.num_obs, 256)
        self.l2 = nn.Linear(256, 512)
        self.l3 = nn.Linear(512, num_act)

    def forward(self, x):
        x = self.l1(x)
        x = self.dropout1(x)
        x = F.relu(x)
        x = self.l2(x)
        x = self.dropout2(x)
        x = F.relu(x)
        action_scores = self.l3(x)
        action_probs = F.softmax(action_scores, dim=1)

        return action_probs

    def act(self, state):
        """
        Given a state, take action
        """
        state = torch.from_numpy(state).float().unsqueeze(0)
        probs = self.forward(state)
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

# Discount factor
gamma = 0.99
steps = 1000
lr = 5e-3

env_id = "Pixelcopter-PLE-v0"
train(env_id, PongPolicyNetwork)

VBox(children=(Label(value='0.030 MB of 0.030 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
episode length,█▂▂▂▂▁▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▂▂▂▁▂▂▂▁▁▂▂▂▁▂▂▂▂▂
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,▁███████████████████████████████████████
reward sum,█▃▁▁▁▁▁▁▁▁▃▃▃▁▁▁▃▁▁▃▁▁▃▃▃▁▃▁▃▁▁▁▃▃▁▃▁▁▃▁

0,1
episode length,23.0
epoch,499.0
loss,0.0
reward sum,-2.0


obs: 7 act: 2
