<a href="https://colab.research.google.com/github/jbpacker/deep-rl-class/blob/main/unit7/a2c.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# My a2c implementation

* [view results](https://wandb.ai/jefsnacker/a2c_CartPole-v1?workspace=user-jefsnacker)

resources:

* [a2c walkthrough](https://towardsdatascience.com/understanding-actor-critic-methods-931b97b6df3f)
* [huggingface class](https://huggingface.co/blog/deep-rl-a2c)
* [huggingface a2c](https://github.com/huggingface/deep-rl-class/blob/main/unit7/unit7.ipynb)
* [a2c comic](https://cdn.discordapp.com/attachments/997489654565712002/1003348192093540462/unknown.png)
* [pytorch implementation](https://github.com/pytorch/examples/blob/main/reinforcement_learning/actor_critic.py)
* [single step example](https://medium.com/deeplearningmadeeasy/advantage-actor-critic-a2c-implementation-944e98616b) with [code](https://github.com/hermesdt/reinforcement-learning/blob/master/a2c/cartpole_a2c_online.ipynb)


## Get Everything Ready

### Install deps

In [None]:
# System packages installed via apt.
# Presumably needed for headless rendering (OpenGL, xvfb) and video encoding (ffmpeg) -- TODO confirm.
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
!pip3 install pyvirtualdisplay

# Virtual display
# Start an invisible 500x500 virtual display for the rest of the session.
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(500, 500))
virtual_display.start()

# Python dependencies for the environments, the SB3 baseline, logging and video export.
# NOTE(review): `gym` is unpinned, while the code below uses `state = env.reset()`,
# a 4-tuple from `env.step()` and `env.render(mode='rgb_array')` -- confirm the
# installed gym release still exposes that API.
!pip install gym
!pip install stable-baselines3[extra]
!pip install git+https://github.com/ntasfi/PyGame-Learning-Environment.git
!pip install git+https://github.com/qlan3/gym-games.git
!pip install huggingface_hub
!pip install wandb
!pip install imageio-ffmpeg

!pip install pyyaml==6.0 # avoid key error metadata

!pip install pyglet # Virtual Screen

### Imports

In [2]:
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import wandb

import gym
import gym_pygame

from huggingface_hub import notebook_login # To log to our Hugging Face account to be able to upload models to the Hub.

import imageio

### Select training device

In [None]:
# Prefer the first CUDA GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

## Networks

### Critic

In [4]:
class CriticNetwork(nn.Module):
    """State-value critic: maps an observation to one scalar estimate V(s).

    The model is a small MLP (num_obs -> 128 -> 256 -> 1) with ReLU between
    the linear layers. The final layer has a single output unit, so this is a
    state-value head, not a per-action Q-value head.
    """

    def __init__(self, num_obs):
        """
        Args:
            num_obs: length of the flat observation vector fed to the first layer.
        """
        super().__init__()

        self.num_obs = num_obs

        self.l1 = nn.Linear(num_obs, 128)
        self.l2 = nn.Linear(128, 256)
        self.l3 = nn.Linear(256, 1)

    def forward(self, x):
        """Return the value estimate for observation(s) ``x``.

        Args:
            x: observation as a numpy array or a torch tensor whose last
               dimension is ``num_obs``; leading batch dimensions are allowed.

        Returns:
            Float tensor with the same leading dimensions as ``x`` and a last
            dimension of size 1.
        """
        # as_tensor accepts numpy arrays as well as tensors, so the critic can
        # also be called on data that is already a tensor.
        x = torch.as_tensor(x, dtype=torch.float32)

        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))

        return self.l3(x)
    
    ## Used if model output is Q
    # def get_all_q(self, state):
    #     state = torch.from_numpy(state).float().unsqueeze(0)
    #     qs = self.forward(state)
    #     return qs

    # def get_q(self, state, action):
    #     return self.get_all_q(state)[:,action]

## Debugging
# env = gym.make("CartPole-v1")
# c = CriticNetwork(env.observation_space.shape[0], env.action_space.n)
# print(c)
# s = env.reset()

# print(c.get_all_q(s))
# print(c.get_q(s, 1))

### Actor

In [5]:
class ActorNetwork(nn.Module):
    """Categorical policy: maps an observation to action probabilities.

    The model is a small MLP (num_obs -> 128 -> 256 -> num_act) followed by a
    softmax over the action dimension.
    """

    def __init__(self, num_obs, num_act):
        """
        Args:
            num_obs: length of the flat observation vector.
            num_act: number of discrete actions (size of the output layer).
        """
        super().__init__()

        self.num_obs = num_obs
        self.num_act = num_act

        self.l1 = nn.Linear(num_obs, 128)
        self.l2 = nn.Linear(128, 256)
        self.l3 = nn.Linear(256, num_act)

    def forward(self, x):
        """Return action probabilities for observation tensor ``x``.

        Args:
            x: float tensor whose last dimension is ``num_obs``.

        Returns:
            Tensor of probabilities that sum to 1 along the last dimension.
        """
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))
        action_scores = self.l3(x)
        # Normalise over the last (action) dimension so that both batched
        # (B, num_obs) and unbatched (num_obs,) inputs are supported.
        return F.softmax(action_scores, dim=-1)

    def act(self, state):
        """Sample an action for a single observation.

        Args:
            state: one observation (numpy array or tensor) of length ``num_obs``.

        Returns:
            Tuple ``(action, log_prob)``: the sampled action index and its
            log-probability under the current policy, each a tensor of
            shape (1,). ``log_prob`` stays attached to the autograd graph so
            it can be used in the policy-gradient loss.
        """
        # Add a batch dimension of size 1 before running the network.
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        probs = self.forward(state)
        dist = Categorical(probs)
        action = dist.sample()
        return action, dist.log_prob(action)

## Training

### utils

In [6]:
def make_networks(env):
    """Build a fresh actor/critic pair sized for ``env``.

    Args:
        env: environment exposing ``observation_space.shape`` and a discrete
             ``action_space.n``.

    Returns:
        Tuple ``(actor, critic)``: an ActorNetwork and a CriticNetwork.
    """
    obs_dim = env.observation_space.shape[0]
    n_actions = env.action_space.n

    # The actor is constructed before the critic (tuple elements evaluate left to right).
    return ActorNetwork(obs_dim, n_actions), CriticNetwork(obs_dim)

In [12]:
def record_video(env, policy, out_directory, fps=30):
    """Roll out one episode with ``policy``, save it as a video and log it to wandb.

    Args:
        env: environment using the API seen in this file (``reset()`` returns
             the state, ``step()`` returns a 4-tuple and
             ``render(mode='rgb_array')`` returns a frame). The environment is
             reset and stepped until ``done``, so the caller must not assume
             its previous state survives this call.
        policy: object with ``act(state) -> (action, log_prob)``.
        out_directory: despite the name, the path of the output video *file*.
        fps: frame rate used for both the saved file and the wandb video.

    Side effects:
        Writes ``out_directory`` and calls ``wandb.log``.
        NOTE(review): presumably requires an active wandb run -- confirm.
    """
    state = env.reset()
    images = [env.render(mode='rgb_array')]
    done = False

    # Pure inference: no autograd graph is needed while recording.
    with torch.no_grad():
        while not done:
            # Sample an action from the policy's distribution for the current state.
            action, _ = policy.act(state)
            state, _, done, _ = env.step(action.item())
            images.append(env.render(mode='rgb_array'))

    imageio.mimsave(out_directory, [np.array(img) for img in images], fps=fps)
    wandb.log({"videos": wandb.Video(out_directory, fps=fps)})

# env_id = "CartPole-v1"
# env = gym.make(env_id)
# policy, _ = make_networks(env)
# record_video(env, policy, "/home/out.gif", fps=30)

### Training Loop

In [8]:
def train(env_id):
    """Train an actor and a critic with one-step (online) advantage updates.

    Every environment step produces one TD-style advantage, which is used
    immediately for one critic update and one actor update.

    Hyperparameters are read from module-level globals: ``log``, ``a_lr``,
    ``c_lr``, ``gamma``, ``n_steps``, ``max_episode_steps``, ``log_rate``,
    ``record_vids`` and ``num_episodes_to_vid``.

    Args:
        env_id: gym environment id passed to ``gym.make``; also used to build
                the wandb project name ``"a2c_" + env_id``.
    """
    if log:
        name = "a2c_" + env_id
        wandb.init(project=name)

    env = gym.make(env_id)

    actor, critic = make_networks(env)

    if log:
        wandb.watch((actor, critic), log_freq=1)

    actor_optimizer = optim.Adam(actor.parameters(), lr=a_lr)
    critic_optimizer = optim.Adam(critic.parameters(), lr=c_lr)

    # (next_state)
    #      o
    next_state = env.reset()

    episode_steps = 0
    episode_reward = 0
    num_episodes = 1

    # Run exactly n_steps environment steps (step is 1-based for logging).
    for step in range(1, n_steps + 1):
        #      (state)
        #  (-->)  o
        state = next_state

        #      (state)  r,a  (next_state)
        #  (-->)  o ------------> o
        action, log_prob = actor.act(state)
        next_state, reward, done, _ = env.step(action.item())

        # One-step advantage: r + gamma * V(s') - V(s), with no bootstrap on terminal steps.
        # NOTE(review): the bootstrap term critic(next_state) is not detached, so the
        # critic loss also back-propagates through the target -- confirm this is intended.
        if done:
            advantage = reward - critic(state)
        else:
            advantage = reward + gamma * critic(next_state) - critic(state)

        episode_steps += 1
        episode_reward += reward

        ## update critic
        critic_loss = advantage.pow(2).mean()
        critic_loss.backward()

        critic_optimizer.step()
        critic_optimizer.zero_grad()

        ## update actor
        # detach advantage so the actor loss does not push gradients into the critic
        actor_loss = -log_prob * advantage.detach()
        actor_loss.backward()

        actor_optimizer.step()
        actor_optimizer.zero_grad()

        ## If the episode ended (or ran too long), log it and reset the env
        if done or episode_steps > max_episode_steps:

            if log:
                wandb.log({"episode_steps": episode_steps,
                           "episode_reward": episode_reward,
                           "num_episodes": num_episodes})

                # Record only at an episode boundary, once per qualifying episode.
                # record_video resets and plays the training env to completion, so
                # it must run before the env.reset() below to keep next_state in sync.
                if record_vids and num_episodes % num_episodes_to_vid == 0:
                    record_video(env, actor, "/out.mp4")

            # (next_state)
            #      o
            next_state = env.reset()
            episode_steps = 0
            episode_reward = 0
            num_episodes += 1

        if log and step % log_rate == 0:
            # Log plain Python floats, not live tensors.
            wandb.log({"step": step,
                       "actor_loss": actor_loss.item(),
                       "critic_loss": critic_loss.item(),
                       "advantage": advantage.item()})

### episodic implementation
[example](https://github.com/hermesdt/reinforcement-learning/blob/master/a2c/cartpole_a2c_episodic.ipynb)

In [None]:
def train_episodic(env_id):
    """Train an actor and a critic with batched advantage updates.

    Each epoch collects ``steps_per_epoch`` environment steps (crossing
    episode boundaries when needed), then performs one critic update and one
    actor update on the whole batch.

    Hyperparameters are read from module-level globals: ``log``, ``a_lr``,
    ``c_lr``, ``gamma``, ``n_epochs``, ``steps_per_epoch``,
    ``max_episode_steps``, ``record_vids`` and ``num_episodes_to_vid``.

    Args:
        env_id: gym environment id passed to ``gym.make``; also used to build
                the wandb project name ``"a2c_" + env_id``.
    """
    if log:
        name = "a2c_" + env_id
        wandb.init(project=name)

    env = gym.make(env_id)

    actor, critic = make_networks(env)

    if log:
        wandb.watch((actor, critic), log_freq=1)

    actor_optimizer = optim.Adam(actor.parameters(), lr=a_lr)
    critic_optimizer = optim.Adam(critic.parameters(), lr=c_lr)

    num_episodes = 1
    episode_steps = 1
    episode_reward = 0
    next_state = env.reset()

    # each epoch collects steps_per_epoch steps and then trains once
    for epoch in range(n_epochs):
        # (next_state)
        #      o
        advantages = []
        log_probs = []

        for _ in range(steps_per_epoch):
            #      (state)
            #  (-->)  o
            state = next_state

            #      (state)  r,a  (next_state)
            #  (-->)  o ------------> o
            action, log_prob = actor.act(state)
            next_state, reward, done, _ = env.step(action.item())

            # One-step advantage: r + gamma * V(s') - V(s), with no bootstrap on terminal steps.
            # NOTE(review): the bootstrap term critic(next_state) is not detached, so the
            # critic loss also back-propagates through the target -- confirm this is intended.
            if done:
                advantage = reward - critic(state)
            else:
                advantage = reward + gamma * critic(next_state) - critic(state)

            advantages.append(advantage)
            log_probs.append(log_prob)

            episode_reward += reward
            episode_steps += 1

            if done or episode_steps > max_episode_steps:
                if log:
                    wandb.log({
                        "episode_steps": episode_steps,
                        "episode_reward": episode_reward,
                        "num_epochs": epoch,
                        "num_episodes": num_episodes,
                    })

                num_episodes += 1
                episode_reward = 0
                episode_steps = 1
                next_state = env.reset()

        ## update critic
        advantage = torch.stack(advantages)

        critic_loss = advantage.pow(2).mean()
        critic_loss.backward()

        critic_optimizer.step()
        critic_optimizer.zero_grad()

        ## update actor
        # detach advantage so the actor loss does not push gradients into the critic
        actor_loss = (-torch.stack(log_probs) * advantage.detach()).mean()
        actor_loss.backward()

        actor_optimizer.step()
        actor_optimizer.zero_grad()

        if log:
            # Log plain Python floats, not live tensors.
            wandb.log({
                "actor_loss": actor_loss.item(),
                "critic_loss": critic_loss.item(),
                "epoch": epoch,
            })
            if record_vids and epoch % num_episodes_to_vid == 0:
                record_video(env, actor, "/content/out.mp4")
                # record_video reset the training env and played it to the end, so
                # the state held in next_state is stale. Start a fresh episode; the
                # interrupted one is dropped and not counted in num_episodes.
                next_state = env.reset()
                episode_reward = 0
                episode_steps = 1

## Compare to stable baselines 3 a2c implementation

### My code


In [None]:

# Globals read by train_episodic(); they must be defined before it is called.
a_lr = 1e-3  # Adam learning rate for the actor
c_lr = 1e-3  # Adam learning rate for the critic
gamma = 0.99  # discount applied to the critic's next-state value
n_epochs = 5000  # number of collect-then-update iterations
steps_per_epoch = 128  # environment steps collected per epoch
max_episode_steps = 500  # episode is reset once its step counter exceeds this

log = True  # enables wandb.init / wandb.watch / wandb.log
num_episodes_to_vid = 100  # in train_episodic this is compared against the epoch index
record_vids = True  # enables record_video (only consulted when log is True)

env_id = "CartPole-v1"
# env_id = "Pixelcopter-PLE-v0"

train_episodic(env_id)



In [None]:
# Globals read by train(); they must be defined before it is called.
n_steps = 1000000  # upper bound of the step loop in train()
a_lr = 1e-3  # Adam learning rate for the actor
c_lr = 1e-3  # Adam learning rate for the critic
gamma = 0.99  # discount applied to the critic's next-state value
max_episode_steps = 500  # episode is reset once its step counter exceeds this

log = True  # enables wandb.init / wandb.watch / wandb.log
log_rate = 1  # per-step metrics are logged when step % log_rate == 0
num_episodes_to_vid = 500  # in train this is compared against the episode counter
record_vids = True  # enables record_video (only consulted when log is True)

# env_id = "CartPole-v1"
env_id = "Pixelcopter-PLE-v0"

train(env_id)

### Stable baselines 3 implementation

In [None]:
from stable_baselines3 import A2C

from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder

from wandb.integration.sb3 import WandbCallback

# Baseline run: train stable-baselines3's A2C on the same environment and log
# it to the same wandb project naming scheme ("a2c_" + env_id) for comparison.
env_id = "CartPole-v1"
# env_id = "Pixelcopter-PLE-v0"
policy = 'MlpPolicy'

config = {
    "env_id": env_id,
    "policy": policy,
}
record_video_every_n_steps = 50000
total_timesteps = 400000


## Set up logging
name = "a2c_" + env_id
run = wandb.init(project=name,
                 config=config,
                 sync_tensorboard=True, # auto-upload sb3's tensorboard metrics
                 monitor_gym=True,  # auto-upload the videos of agents playing the game
                 save_code=True)

## Make the environment
def make_env():
    """Create one monitored environment for ``config["env_id"]``."""
    env = gym.make(config["env_id"])
    env = Monitor(env)  # record stats such as returns
    return env

env = DummyVecEnv([make_env] * 1) # 1 simulation
env = VecVideoRecorder(
    env,
    f"videos/{run.id}",
    record_video_trigger=lambda x: x % record_video_every_n_steps == 0,
    video_length=200
)

# Custom actor (pi) and value function (vf) networks:
# two hidden layers of sizes 128 and 256 each, with ReLU activations
# (the same hidden sizes as ActorNetwork / CriticNetwork above).
policy_kwargs = dict(activation_fn=torch.nn.ReLU,
                     net_arch=[dict(pi=[128, 256], vf=[128, 256])])

## Make the model
# Only one model is built, on the wrapped (monitored + video-recording) env.
# NOTE(review): in SB3's A2C, n_steps is the rollout length per update
# (library default is 5). With n_steps=50000 and total_timesteps=400000 this
# performs only 8 gradient updates -- confirm this value is intended.
model = A2C(
    policy=policy,
    policy_kwargs=policy_kwargs,
    env=env,
    n_steps=50000,
    # learning_rate=linear_schedule(init_learning_rate),
    # batch_size = batch_size,
    tensorboard_log=f"runs/{run.id}"
)

## Train!
model.learn(
    total_timesteps=total_timesteps,
    callback=WandbCallback(
        verbose=2,
        model_save_path=f"models/{run.id}"
    )
)
run.finish()