<a href="https://colab.research.google.com/github/jbpacker/deep-rl-class/blob/main/unit7/a2c.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# My a2c implementation

* [view results](https://wandb.ai/jefsnacker/a2c_CartPole-v1?workspace=user-jefsnacker)

resources:

* [a2c walkthrough](https://towardsdatascience.com/understanding-actor-critic-methods-931b97b6df3f)
* [huggingface class](https://huggingface.co/blog/deep-rl-a2c)
* [huggingface a2c](https://github.com/huggingface/deep-rl-class/blob/main/unit7/unit7.ipynb)
* [a2c commic](https://cdn.discordapp.com/attachments/997489654565712002/1003348192093540462/unknown.png)
* [pytorch implementation](https://github.com/pytorch/examples/blob/main/reinforcement_learning/actor_critic.py)
* [single step example](https://medium.com/deeplearningmadeeasy/advantage-actor-critic-a2c-implementation-944e98616b) with [code](https://github.com/hermesdt/reinforcement-learning/blob/master/a2c/cartpole_a2c_online.ipynb)
* [post on A2C](https://towardsdatascience.com/understanding-actor-critic-methods-931b97b6df3f)


## Get Everything Ready

### Install deps

In [1]:
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
!pip3 install pyvirtualdisplay

# Virtual display
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(500, 500))
virtual_display.start()

!pip install pybullet
!pip install gym
!pip install stable-baselines3[extra]
!pip install git+https://github.com/ntasfi/PyGame-Learning-Environment.git
!pip install git+https://github.com/qlan3/gym-games.git
!pip install huggingface_hub
!pip install wandb
!pip install imageio-ffmpeg

!pip install pyyaml==6.0 # avoid key error metadata

!pip install pyglet # Virtual Screen

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
Suggested packages:
  libgle3
The following NEW packages will be installed:
  python-opengl
0 upgraded, 1 newly installed, 0 to remove and 19 not upgraded.
Need to get 496 kB of archives.
After this operation, 5,416 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 python-opengl all 3.1.0+dfsg-1 [496 kB]
Fetched 496 kB in 0s (4,377 kB/s)
Selecting previously unselected package python-opengl.
(Reading database ... 155680 files and directories currently installed.)
Preparing to unpack .../python-opengl_3.1.0+dfsg-1_all.deb ...
Unpacking python-opengl (3.1.0+dfsg-1) ...
Setting up python-opengl (3.1.0+dfsg-1) ...
Reading package lists... Done
Building dependency tree       
Reading state information... Done
ff

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/ntasfi/PyGame-Learning-Environment.git
  Cloning https://github.com/ntasfi/PyGame-Learning-Environment.git to /tmp/pip-req-build-wc7_jm6q
  Running command git clone -q https://github.com/ntasfi/PyGame-Learning-Environment.git /tmp/pip-req-build-wc7_jm6q
Building wheels for collected packages: ple
  Building wheel for ple (setup.py) ... [?25l[?25hdone
  Created wheel for ple: filename=ple-0.0.1-py3-none-any.whl size=50791 sha256=7a18acf638d68d0da11947f360bb16eb024d5fb29b4b8e0fef43298da5692b05
  Stored in directory: /tmp/pip-ephem-wheel-cache-46juc_gr/wheels/cd/51/18/46ce3a7c7b4a75d9ba91594b40e028f98b2001414f6c1da798
Successfully built ple
Installing collected packages: ple
Successfully installed ple-0.0.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/qlan3/gym-games.g

### Imports

In [2]:
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import wandb

import pybullet_envs
import gym
import gym_pygame

from huggingface_hub import notebook_login # To log to our Hugging Face account to be able to upload models to the Hub.

import imageio

### Select training device

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


## Networks

### Critic

In [4]:
class CriticNetwork(nn.Module):
    def __init__(self, num_obs):
        """
        Takes the state as input and outputs Q(s), which is
        a vector of Q values for all possible actions
        """
        super(CriticNetwork, self).__init__()
        
        self.num_obs = num_obs

        self.l1 = nn.Linear(num_obs, 128)
        self.l2 = nn.Linear(128, 256)
        self.l3 = nn.Linear(256, 1)

    def forward(self, x):
        x = self.l1(x)
        x = F.relu(x)

        x = self.l2(x)
        x = F.relu(x)
        
        return self.l3(x)
    
    ## Used if model output is Q
    # def get_all_q(self, state):
    #     state = torch.from_numpy(state).float().unsqueeze(0)
    #     qs = self.forward(state)
    #     return qs

    # def get_q(self, state, action):
    #     return self.get_all_q(state)[:,action]

## Debugging
# env = gym.make("CartPole-v1")
# c = CriticNetwork(env.observation_space.shape[0], env.action_space.n)
# print(c)
# s = env.reset()

# print(c.get_all_q(s))
# print(c.get_q(s, 1))

### Actor

In [5]:
class ActorNetwork(nn.Module):
    def __init__(self, num_obs, num_act):
        super(ActorNetwork, self).__init__()
        
        self.num_obs = num_obs
        self.num_act = num_act

        self.l1 = nn.Linear(num_obs, 128)
        self.l2 = nn.Linear(128, 256)
        self.l3 = nn.Linear(256, num_act)

    def forward(self, x):
        x = self.l1(x)
        x = F.relu(x)
        x = self.l2(x)
        x = F.relu(x)
        action_scores = self.l3(x)
        action_probs = F.softmax(action_scores, dim=1)

        return action_probs

    def act(self, state):
        """
        Given a state, take action
        """
        probs = self.forward(state)
        m = Categorical(probs)
        action = m.sample()
        return action, m.log_prob(action)

## Training

### utils

In [6]:
def make_networks(env):
    num_obs = env.observation_space.shape[0]
    num_act = env.action_space.n

    actor = ActorNetwork(num_obs, num_act)
    critic = CriticNetwork(num_obs)

    return actor, critic

In [7]:
def record_video(env, policy, out_directory, fps=30):
    images = []  
    done = False
    state = env.reset()
    img = env.render(mode='rgb_array')
    images.append(img)
    while not done:
        # Take the action (index) that have the maximum expected future reward given that state
        with torch.no_grad():
            action, _ = policy.act(torch.from_numpy(state).float().unsqueeze(0))
            state, reward, done, info = env.step(action.item()) # We directly put next_state = state for recording logic
            img = env.render(mode='rgb_array')
            images.append(img)
    imageio.mimsave(out_directory, [np.array(img) for i, img in enumerate(images)], fps=fps)
    wandb.log({"videos": wandb.Video(out_directory, fps=fps)})

# env_id = "CartPole-v1"
# env = gym.make(env_id)
# policy = PolicyNetwork(num_obs, num_act)
# record_video(env, policy, "/home/out.gif", fps=30)

### Training Loop

In [8]:
def train(env_id):
    if log: 
        name = "a2c_" + env_id
        wandb.init(project=name)

    env = gym.make(env_id)
    reward = 0
    done = False

    actor, critic = make_networks(env)

    if log:
        wandb.watch((actor, critic), log_freq=1)

    actor_optimizer = optim.Adam(actor.parameters(), lr=a_lr)
    critic_optimizer = optim.Adam(critic.parameters(), lr=c_lr)


    # (next_state)
    #      o
    next_state = env.reset()

    episode_steps = 0
    episode_reward = 0
    num_episodes = 1

    for step in range(1, n_steps):
        #      (state)
        #  (-->)  o
        state = next_state

        #      (state)  r,a  (next_state)
        #  (-->)  o ------------> o
        action, log_prob = actor.act(state)
        next_state, reward, done, info = env.step(action.item())
        if done:
            advantage = reward - critic(state)
        else:
            advantage = reward + gamma*critic(next_state) - critic(state)

        episode_steps += 1
        episode_reward += reward

        ## update critic
        critic_loss = advantage.pow(2).mean()
        critic_loss.backward()

        critic_optimizer.step()
        critic_optimizer.zero_grad()

        ## update actor
        # detach advantage to update the 2nd network
        actor_loss = -log_prob * advantage.detach()
        actor_loss.backward()

        actor_optimizer.step()
        actor_optimizer.zero_grad()

        ## If done next step them reset env
        if done or episode_steps > max_episode_steps:

            if log:
                wandb.log({"episode_steps": episode_steps,
                           "episode_reward": episode_reward,
                           "num_episodes": num_episodes})

            # (next_state, next_done)
            #           o
            next_state = env.reset()
            episode_steps = 0
            episode_reward = 0
            num_episodes += 1


        if log and step % log_rate == 0:
            wandb.log({"step": step, 
                       "actor_loss": actor_loss,
                       "critic_loss": critic_loss,
                       "advantage": advantage})
            if record_vids and num_episodes % num_episodes_to_vid == 0:
                record_video(env, actor, "/out.mp4")

### episodic implementation
[example](https://github.com/hermesdt/reinforcement-learning/blob/master/a2c/cartpole_a2c_episodic.ipynb)

In [35]:
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer

class CombinedActorCriticNetwork(nn.Module):
    def __init__(self, env):
        super().__init__()
        num_obs = env.observation_space.shape[0]
        num_actions = env.action_space.n
        self.actor = nn.Sequential(
          layer_init(nn.Linear(num_obs, 64)),
          nn.ReLU(),
        #   layer_init(nn.Linear(64, 64)),
        #   nn.ReLU(),
          layer_init(nn.Linear(64, num_actions), std=1.0)
        )
        self.critic = nn.Sequential(
          layer_init(nn.Linear(num_obs, 64)),
          nn.ReLU(),
        #   layer_init(nn.Linear(64, 64)),
        #   nn.ReLU(),
          layer_init(nn.Linear(64, 1), 0.01)
        )

    def value(self, x):
        return self.critic(x)

    def act(self, x):
        logits = self.actor(x)
        probs = Categorical(logits=logits)
        action = probs.sample()
        return action, probs.log_prob(action)


def train_epi2(env_id, log=False):
    training_cycles = 5000
    num_episodes_to_vid = 100
    lr = 3e-2
    gamma = 0.99
    eps = np.finfo(np.float32).eps.item()

    if log:
        name = "a2c_" + env_id
        wandb.init(project=name)

    env = gym.make(env_id)
    agent = CombinedActorCriticNetwork(env)#.to(device)

    if log:
        wandb.watch(agent, log_freq=1)

    optimizer = optim.Adam(agent.parameters(), lr=lr, eps=1e-5)

    for i in range(training_cycles):
        logprobs = []
        rewards = []
        values = []

        state = env.reset()
        episode_reward = 0
        episode_steps = 0
        for s in range(10000):
            action, logprob = agent.act(torch.Tensor(state).float())
            value = agent.value(torch.Tensor(state).float())


            state, reward, done, info = env.step(action.item())

            logprobs.append(logprob)
            values.append(value)
            rewards.append(reward)

            episode_reward += reward
            episode_steps = s

            if done:
                break

        R = 0
        returns = []
        for r in reversed(rewards):
            R = r + gamma*R
            returns.insert(0, R)

        # convert everybody to tensors!
        returns = torch.Tensor(returns).unsqueeze(axis=1)
        avg_returns = (returns - returns.mean()) / (returns.std() + eps)

        ## NOTE: the for loop is here to preserve the grad_fn from each of the 
        # values/logprobs in the list. Without these backprop doesn't work!
        policy_losses = [] # list to save actor (policy) loss
        value_losses = [] # list to save critic (value) loss
        for logprob, value, r in zip(logprobs, values, avg_returns):
            advantage = r - value.item()
            # calculate actor (policy) loss 
            policy_losses.append(-logprob * advantage)

            # calculate critic (value) loss using L1 smooth loss
            value_losses.append(F.smooth_l1_loss(value, torch.tensor([r])))

        # reset gradients
        optimizer.zero_grad()
        loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()

        # perform backprop
        loss.backward()
        optimizer.step()

        if log:
            wandb.log({
                "episode_steps": episode_steps,
                "episode_reward": episode_reward,
                "num_episodes": i,
                "loss": loss,
            })
            if i % num_episodes_to_vid == 0:
                    record_video(env, agent, "/content/out.mp4")

# env_id = "CartPole-v1"
env_id = "Pixelcopter-PLE-v0"

train_epi2(env_id, log=True)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

pygame 2.1.2 (SDL 2.0.16, Python 3.7.13)
Hello from the pygame community. https://www.pygame.org/contribute.html
couldn't import doomish
Couldn't import doom


In [15]:
def train_episodic(env_id):
    if log: 
        name = "a2c_" + env_id
        wandb.init(project=name)

    eps = np.finfo(np.float32).eps.item()
    env = gym.make(env_id)
    reward = 0
    done = False

    actor, critic = make_networks(env)

    if log:
        wandb.watch((actor, critic), log_freq=1)

    actor_optimizer = optim.Adam(actor.parameters(), lr=a_lr)
    critic_optimizer = optim.Adam(critic.parameters(), lr=c_lr)

    num_episodes = 1
    episode_steps = 1
    episode_reward = 0
    next_state = env.reset()

    # each episode collects 1 episode or N steps and trains
    for epoch in range(n_epochs):
        # (next_state)
        #      o
        rewards = []
        log_probs = []
        values = []

        for step in range(0, max_episode_steps):
            #      (state)
            #  (-->)  o
            state = next_state

            #      (state)  r,a  (next_state)
            #  (-->)  o ------------> o
            action, log_prob = actor.act(torch.from_numpy(state).float().unsqueeze(0))
            next_state, reward, done, info = env.step(action.detach().item())

            values.append(critic(torch.from_numpy(state).float().unsqueeze(0)))
            rewards.append(reward)
            log_probs.append(log_prob)

            episode_reward += reward
            episode_steps += 1

            # print("[{}] a: {} r: {} d: {}".format(step, advantage, reward, done))

            if done or episode_steps > max_episode_steps:
                if log:
                    wandb.log({
                        "episode_steps": episode_steps,
                        "episode_reward": episode_reward,
                        "num_epochs": epoch,
                        "num_episodes": num_episodes,
                    })

                num_episodes += 1
                episode_reward = 0
                episode_steps = 1
                next_state = env.reset()
                break

        R = 0
        returns = []
        for r in reversed(rewards):
            R = r + gamma*R
            returns.insert(0, R)

        returns = torch.Tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + eps)

        # for i in range(len(rewards)):
        #     print("[{}] rew: {} val: {} lp: {} R: {}".format(i, rewards[i], values[i], log_probs[i], returns[i]))

        # print("values")
        # print(values)

        ## update critic
        critic_loss = F.smooth_l1_loss(returns, torch.stack(values))
        critic_loss.backward()

        critic_optimizer.step()
        critic_optimizer.zero_grad()

        ## update actor
        # detach advantage to update the 2nd network
        advantages = returns - torch.Tensor(values)
        actor_loss = (-torch.stack(log_probs) * advantages.detach()).mean()
        actor_loss.backward()

        actor_optimizer.step()
        actor_optimizer.zero_grad()

        ## If done next step them reset env
        if log:
            wandb.log({
                "actor_loss": actor_loss,
                "critic_loss": critic_loss,
                "epoch": epoch,
            })
            if record_vids and epoch % num_episodes_to_vid == 0:
                    record_video(env, actor, "/content/out.mp4")

a_lr = 1e-3
c_lr = 1e-3
gamma = 0.99
n_epochs = 1
max_episode_steps = 500

log = False
record_vids = False

env_id = "CartPole-v1"

train_episodic(env_id)



In [None]:
a_lr = 1e-3
c_lr = 1e-3
gamma = 0.99
n_epochs = 2000
max_episode_steps = 500

log = True
num_episodes_to_vid = 100
record_vids = True

# env_id = "CartPole-v1"
env_id = "Pixelcopter-PLE-v0"

train_episodic(env_id)

### buffer
implementation with buffer to backward calculate advantage

In [8]:
class ActorCriticNetwork(nn.Module):
    def __init__(self, num_obs, num_act):
        super(ActorCriticNetwork, self).__init__()

        self.l1_actor = nn.Linear(num_obs, 64)
        self.l2_actor = nn.Linear(64, 64)
        self.l3_actor = nn.Linear(64, num_act)

        self.l1_critic = nn.Linear(num_obs, 64)
        self.l2_critic = nn.Linear(64, 64)
        self.l3_critic = nn.Linear(64, 1)

    def forward(self, x):
        x_actor = self.l1_actor(x)
        x_actor = F.relu(x_actor)

        x_actor = self.l2_actor(x_actor)
        x_actor = F.relu(x_actor)
        action_scores = self.l3_actor(x_actor)
        action_probs = F.softmax(action_scores, dim=1)

        x_critic = self.l1_critic(x)
        x_critic = F.relu(x_critic)

        x_critic = self.l2_critic(x_critic)
        x_critic = F.relu(x_critic)
        
        value = self.l3_critic(x_critic)

        return action_probs, value

    def act(self, state):
        probs, value = self.forward(state)
        m = Categorical(probs)
        action = m.sample()
        return action, m.log_prob(action), value

### Train with a buffer
I have a feeling that this doesn't work

In [25]:
def train_buffer(env_id):
    if log: 
        name = "a2c_" + env_id
        wandb.init(project=name)

    env = gym.make(env_id)
    reward = 0
    done = False

    actor, critic = make_networks(env)

    if log:
        wandb.watch((actor, critic), log_freq=1)

    actor_optimizer = optim.Adam(actor.parameters(), lr=a_lr)
    critic_optimizer = optim.Adam(critic.parameters(), lr=c_lr)

    num_episodes = 1
    episode_steps = 1
    episode_reward = 0

    # (next_state)
    #      o
    next_state = torch.from_numpy(env.reset())
    next_done = torch.Tensor([False])

    for epoch in range(n_epochs):
        dones = torch.zeros((steps_per_epoch, 1))
        actions = torch.zeros((steps_per_epoch, 1), dtype=int)
        states = torch.zeros((steps_per_epoch, env.observation_space.shape[0]))
        rewards = torch.zeros((steps_per_epoch, 1))
        logprobs = torch.zeros((steps_per_epoch, 1))

        for step in range(0, steps_per_epoch):
            #      (state, done)
            #  (-->)     o
            states[step] = next_state
            dones[step] = next_done

            #      (state, done)  r,a  (next_state, next_done)
            #  (-->)  o ----------------------> o
            actions[step], logprobs[step] = actor.act(states[step].float().unsqueeze(0))
            next_state, rewards[step], next_done_np, _ = env.step(actions[step].detach().item())
            next_state, next_done = torch.from_numpy(next_state), torch.Tensor([next_done_np])

            episode_reward += rewards[step]
            episode_steps += 1

            if next_done or episode_steps > max_episode_steps:
                if log:
                    wandb.log({
                        "episode_steps": episode_steps,
                        "episode_reward": episode_reward,
                        "num_epochs": epoch,
                        "num_episodes": num_episodes,
                    })

                num_episodes += 1
                episode_reward = 0
                episode_steps = 1
                next_state = torch.from_numpy(env.reset())
                next_done = torch.Tensor([True])
            elif dones[step]:
                next_done = torch.Tensor([False])

        # for i in range(steps_per_epoch):
        #     print("[{}] d: {} r: {} state: {} a: {}".format(
        #         i, 
        #         dones[i], 
        #         rewards[i],
        #         states[i],
        #         actions[i]))

        returns = torch.zeros((steps_per_epoch, 1))

        with torch.no_grad():
            next_return = critic(next_state)
            mask = 1 - next_done
            for i in reversed(range(steps_per_epoch)):
                if i < steps_per_epoch - 1:
                    mask = 1 - dones[i + 1]
                    next_return = returns[i+1]
                returns[i] = rewards[i] + mask*gamma*next_return

        # for i in range(steps_per_epoch):
        #     print("[{}] d: {} r: {} R: {}".format(
        #         i, 
        #         dones[i], 
        #         rewards[i],
        #         returns[i]))

        values = critic(states).float()

        ## update critic
        critic_loss = F.smooth_l1_loss(returns, values).sum()
        critic_loss.backward()

        critic_optimizer.step()
        critic_optimizer.zero_grad()

        ## update actor
        _, logprobs = actor.act(states.float().unsqueeze(0))
        advantages = returns - values.detach()
        actor_loss = (-logprobs * advantages).sum()
        actor_loss.backward()

        actor_optimizer.step()
        actor_optimizer.zero_grad()


        ## If done next step them reset env
        if log:
            wandb.log({
                "actor_loss": actor_loss,
                "critic_loss": critic_loss,
                "epoch": epoch,
            })
            if record_vids and epoch % num_episodes_to_vid == 0:
                    record_video(env, actor, "/content/out.mp4")

a_lr = 1e-3
c_lr = 1e-3
gamma = 0.99
n_epochs = 1
steps_per_epoch = 40
max_episode_steps = 500

log = False
num_episodes_to_vid = 100
record_vids = False

train_buffer("CartPole-v1")

## Compare to stable baselines 3 a2c implementation

### My code


In [None]:
a_lr = 1e-3
c_lr = 1e-3
gamma = 0.99
n_epochs = 200
steps_per_epoch = 128
max_episode_steps = 500

log = True
num_episodes_to_vid = 100
record_vids = True

env_id = "CartPole-v1"
# env_id = "Pixelcopter-PLE-v0"

train_buffer(env_id)

VBox(children=(Label(value='0.009 MB of 0.009 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
actor_loss,█▇▅▆▅▃▃▂▂▂▃▃▁▁▁▂▃▂▂▂▁▂▂▃▂▁▁▂▂▃▂▃▂▁▂▂▁▂▂▁
critic_loss,█▇▅▇▅▄▄▃▄▃▅▄▃▂▂▂▃▂▂▂▁▂▃▃▂▂▁▁▂▄▂▃▁▁▂▂▁▁▂▁
episode_reward,▅▃▃▂▃▃▆▄▅▅▂▄▂▂▂▃▂▂▅▃▅▂▂▂▄▄▂▂▄▂▃██▂▅▇▁▁█▂
episode_steps,▅▃▃▂▃▃▆▄▅▅▂▄▂▂▂▃▂▂▅▃▅▂▂▂▄▄▂▂▄▂▃██▂▅▇▁▁█▂
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
num_episodes,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
num_epochs,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███

0,1
actor_loss,-2557.83154
critic_loss,0.68763
episode_reward,8.0
episode_steps,9.0
epoch,199.0
num_episodes,1753.0
num_epochs,199.0




## SB3

In [None]:
env_id = "AntBulletEnv-v0"
env = gym.make(env_id)

print(env.observation_space.shape[0])

28


In [None]:
n_steps = 1000000
a_lr = 1e-3
c_lr = 1e-3
gamma = 0.99
max_episode_steps = 500

log = True
log_rate = 1
num_episodes_to_vid = 500
record_vids = True

# env_id = "CartPole-v1"
env_id = "Pixelcopter-PLE-v0"

train(env_id)

### Stable baselines 3 implementation

In [None]:
from stable_baselines3 import A2C

from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder

from wandb.integration.sb3 import WandbCallback

env_id = "CartPole-v1"
# env_id = "Pixelcopter-PLE-v0"
policy = 'MlpPolicy'

config = {
    "env_id": env_id,
    "policy": policy,
}
record_video_every_n_steps = 50000
total_timesteps = 400000


## Set up logging
name = "a2c_" + env_id
run = wandb.init(project=name, 
                 config=config,
                 sync_tensorboard=True, # auto-upload sb3's tensorboard metrics
                 monitor_gym=True,  # auto-upload the videos of agents playing the game
                 save_code=True)

## Make the environment
def make_env():
    env = gym.make(config["env_id"])
    env = Monitor(env)  # record stats such as returns
    return env

env = DummyVecEnv([make_env] * 1) # 1 simulation
env = VecVideoRecorder(
    env, 
    f"videos/{run.id}", 
    record_video_trigger=lambda x: x % record_video_every_n_steps == 0, 
    video_length=200
)

# Custom actor (pi) and value function (vf) networks
# of two layers of size 32 each with Relu activation function
policy_kwargs = dict(activation_fn=torch.nn.ReLU,
                     net_arch=[dict(pi=[128, 256], vf=[128, 256])])
# Create the agent
model = A2C("MlpPolicy", env_id, policy_kwargs=policy_kwargs, verbose=1)


## Make the model
model = A2C(
    policy = 'MlpPolicy',
    policy_kwargs=policy_kwargs,
    env = env,
    n_steps = 50000,
    # learning_rate=linear_schedule(init_learning_rate),
    # batch_size = batch_size,
    tensorboard_log=f"runs/{run.id}"
) 

## Train!
model.learn(
    total_timesteps=total_timesteps,
    callback=WandbCallback(
        verbose=2,
        model_save_path=f"models/{run.id}"
    )
)
run.finish()