# Implementing Advantage-Actor Critic (A2C) - 2 pts

In [2]:
%%capture
!pip install comet_ml --quiet
!pip install tensorboardX --quiet
!pip install setuptools==65.5.0
!pip install gym[atari,accept-rom-license]==0.21.0

In [158]:
import sys
sys.path.insert(0, '/kaggle/input/utils')


In [459]:
from comet_ml import Experiment
from comet_ml.integration.pytorch import log_model
from tensorboardX import SummaryWriter


experiment = Experiment(
  api_key = "KA73UNUPM3eR56uID83Ii8HH4",
  project_name = "actor_critic_gae",
  workspace="katyanaveka"
)

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/katyanaveka/general/01e1c70662784aa4bd5c7216a4cfd156
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     Episodes/episode_length [442]           : (137.5, 594.75)
[1;38;5;39mCOMET INFO:[0m     Episodes/max_reward [442]               : (125.0, 825.0)
[1;38;5;39mCOMET INFO:[0m     Episodes/min_reward [442]               : (5.0, 135.0)
[1;38;5;39mCOMET INFO:[0m     Episodes/reward_mean_100 [442]          : (138.83460968379447, 182.5)
[1;38;5;39mCOMET INFO:[0m     Episodes/total_reward [442]  

In [460]:
# Helper function to display logged assets in the Comet UI
import comet_ml
def display(tab=None):
    experiment = comet_ml.get_global_experiment()
    experiment.display(tab=tab)
    
write = SummaryWriter(comet_config={"disabled": False})

In this notebook you will implement Advantage Actor Critic algorithm that trains on a batch of Atari 2600 environments running in parallel. 

Firstly, we will use environment wrappers implemented in file `atari_wrappers.py`. These wrappers preprocess observations (resize, grayscal, take max between frames, skip frames, stack them together, prepares for PyTorch and normalizes to [0, 1]) and rewards. Some of the wrappers help to reset the environment and pass `done` flag equal to `True` when agent dies.
File `env_batch.py` includes implementation of `ParallelEnvBatch` class that allows to run multiple environments in parallel. To create an environment we can use `nature_dqn_env` function.

In [461]:
import multiprocessing

multiprocessing.cpu_count()

2

In [462]:
import numpy as np
from atari_wrappers import nature_dqn_env

nenvs = 8    # change this if you have more than 8 CPU ;)

env = nature_dqn_env("SpaceInvadersNoFrameskip-v4", nenvs=nenvs)

n_actions = env.action_space.spaces[0].n
obs = env.reset()
assert obs.shape == (nenvs, 4, 84, 84)
assert obs.dtype == np.float32

Process Process-70:
Process Process-66:
Process Process-65:
Process Process-69:
Process Process-72:
Process Process-71:
Process Process-67:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
Process Process-68:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    

Next, we will need to implement a model that predicts logits of policy distribution and critic value. Use shared backbone. You may use same architecture as in DQN task with one modification: instead of having a single output layer, it must have two output layers taking as input the output of the last hidden layer (one for actor, one for critic). 

Still it may be very helpful to make more changes:
* use orthogonal initialization with gain $\sqrt{2}$ and initialize biases with zeros;
* use more filters (e.g. 32-64-64 instead of 16-32-64);
* use two-layer heads for actor and critic or add a linear layer into backbone;

**Danger:** do not divide on 255, input is already normalized to [0, 1] in our wrappers!

In [1]:
import torch
import torch.nn as nn

class ACnet(nn.Module):
    '''
    input:
        states - tensor, (batch_size x channels x width x height)
    output:
        logits - tensor, logits of action probabilities for your actor policy, (batch_size x num_actions)
        V - tensor, critic estimation, (batch_size)
    '''
    def __init__(self, state_shape, n_actions, device='cpu'):
        super().__init__()
        self.state_shape = state_shape
        self.n_actions = n_actions
        self.device = device
        self.conv = nn.Sequential(
              nn.Conv2d(in_channels=state_shape[1], out_channels=32, kernel_size=3, stride=2),
              nn.ReLU(),
              nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2),
              nn.ReLU(),
              nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=2),
              nn.ReLU(),
              nn.Flatten()
          )
        # actor
        self.classif_v = nn.Sequential(
              nn.Linear(in_features = 5184, out_features = 256),
              nn.ReLU(),
              nn.Linear(in_features = 256, out_features = n_actions),
              nn.Softmax(dim=1)
          ) 
        # critic
        self.classif_Ev = nn.Sequential(
              nn.Linear(in_features = 5184, out_features = 256),
              nn.ReLU(),
              nn.Linear(in_features = 256, out_features = 1)
          )
        self.apply(self._init_weights)
    
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            # module.weight.data.normal_(mean=0.0, std=1.0)
            torch.nn.init.orthogonal_(module.weight.data, gain = 2 ** 0.5)
            if module.bias is not None:
                module.bias.data.zero_()

        
    def forward(self, states):
        pre_v = self.conv(states)
        logits = self.classif_v(pre_v)
        values = self.classif_Ev(pre_v)
        return logits, values

You will also need to define and use a policy that wraps the model. While the model computes logits for all actions, the policy will sample actions and also compute their log probabilities.  `policy.act` should return a **dictionary** of all the arrays that are needed to interact with an environment and train the model.

**Important**: "actions" will be sent to environment, they must be numpy array or list, not PyTorch tensor.

Note: you can add more keys, e.g. it can be convenient to compute entropy right here.

In [465]:
from torch.distributions import Categorical

class Policy:
    def __init__(self, model):
        self.model = model

    def act(self, inputs):
        '''
        input:
            inputs - numpy array, (batch_size x channels x width x height)
        output: dict containing keys ['actions', 'logits', 'log_probs', 'values']:
            'actions' - selected actions, numpy, (batch_size)
            'logits' - actions logits, tensor, (batch_size x num_actions)
            'log_probs' - log probs of selected actions, tensor, (batch_size)
            'values' - critic estimations, tensor, (batch_size)
        '''
        model = self.model
        logits, values = model(torch.tensor(inputs).to(model.device))
        cat_distr = Categorical(logits)
        actions = cat_distr.sample()
        log_probs = cat_distr.log_prob(actions)
        entropy = cat_distr.entropy() # tensor, (batch_size)
        
        return {
            "actions": actions.detach().cpu().numpy(),
            "logits": logits,
            "log_probs": log_probs.squeeze(0),
            "values": values.squeeze(1),
            "entropy": entropy
        }

Next we will pass the environment and policy to a runner that collects rollouts from the environment. 
The class is already implemented for you.

In [466]:
from runners import EnvRunner

This runner interacts with the environment for a given number of steps and returns a dictionary containing
keys 

* 'observations' 
* 'rewards' 
* 'dones'
* 'actions'
* all other keys that you defined in `Policy`

under each of these keys there is a python `list` of interactions with the environment of specified length $T$ &mdash; the size of partial trajectory, or rollout length. Let's have a look at how it works.

In [467]:
model = ACnet(obs.shape, n_actions)
policy = Policy(model)
runner = EnvRunner(env, policy, nsteps=5)

In [468]:
# generates new rollout
trajectory = runner.get_next()

In [470]:
# what is inside
print(trajectory.keys())

dict_keys(['actions', 'logits', 'log_probs', 'values', 'entropy', 'observations', 'rewards', 'dones'])


In [471]:
# Sanity checks
assert 'logits' in trajectory, "Not found: policy didn't provide logits"
assert 'log_probs' in trajectory, "Not found: policy didn't provide log_probs of selected actions"
assert 'values' in trajectory, "Not found: policy didn't provide critic estimations"
assert trajectory['logits'][0].shape == (nenvs, n_actions), "logits wrong shape"
assert trajectory['log_probs'][0].shape == (nenvs,), "log_probs wrong shape"
assert trajectory['values'][0].shape == (nenvs,), "values wrong shape"

for key in trajectory.keys():
    assert len(trajectory[key]) == 5, \
    f"something went wrong: 5 steps should have been done, got trajectory of length {len(trajectory[key])} for '{key}'"

Now let's work with this trajectory a bit. To train the critic you will need to compute the value targets. It will also be used as an estimation of $Q$ for actor training.

You should use all available rewards for value targets, so the formula for the value targets is simple:

$$
\hat v(s_t) = \sum_{t'=0}^{T - 1}\gamma^{t'}r_{t+t'} + \gamma^T \hat{v}(s_{t+T}),
$$

where $s_{t + T}$ is the latest observation of the environment.

Any callable could be passed to `EnvRunner` to be applied to each partial trajectory after it is collected. 
Thus, we can implement and use `ComputeValueTargets` callable. 

**Do not forget** to use `trajectory['dones']` flags to check if you need to add the value targets at the next step when 
computing value targets for the current step.

**Bonus (+0.5 pts):** implement [Generalized Advantage Estimation (GAE)](https://arxiv.org/pdf/1506.02438.pdf) instead; use $\lambda \approx 0.95$ or even closer to 1 in experiment. 

In [472]:
class ComputeValueTargets:
    def __init__(self, policy, gamma=0.99, GAE=False):
        self.policy = policy
        self.gamma = gamma
        self.GAE = GAE

    def __call__(self, trajectory, latest_observation):
        '''
        This method should modify trajectory inplace by adding 
        an item with key 'value_targets' to it
        
        input:
            trajectory - dict from runner
            latest_observation - last state, numpy, (num_envs x channels x width x height)
        '''
        device = self.policy.model.device
        gamma = self.gamma
        next_tr = self.policy.act(latest_observation)
            
        is_done = torch.tensor(trajectory['dones']).to(device)
        rewards = torch.tensor(trajectory['rewards']).to(device)
        T = len(rewards)
        cumulative_rewards = [0 for i in range(T)]
        if not self.GAE:
            G_curr = next_tr['values'].to(device)
            for i in range(T-1, -1,  -1):
                G_curr = rewards[i] + gamma * G_curr * (~ is_done[i])
                cumulative_rewards[i] = G_curr
            
        else:
            factor = gamma * 0.95 # * lambda
            G_curr = 0#rewards - gamma * next_tr['values'].cpu() + trajectory['values'][0] # \psi_1
            v = trajectory['values'].copy()
            v.append(next_tr['values'])
            for i in range(T-1, -1,  -1):
                G_curr = rewards[i] + gamma * v[i+1].to(device) - v[i].to(device) + factor * G_curr * (~is_done[i])
                cumulative_rewards[i] = G_curr
                
        trajectory['value_targets'] = cumulative_rewards



After computing value targets we will transform lists of interactions into tensors
with the first dimension `batch_size` which is equal to `T * nenvs`.

You need to make sure that after this transformation `"log_probs"`, `"value_targets"`, `"values"` are 1-dimensional PyTorch tensors.

In [482]:
class MergeTimeBatch:
    """ Merges first two axes typically representing time and env batch. """
    def __call__(self, trajectory, latest_observation):
        # Modify trajectory inplace.
        T = len(trajectory['rewards'])
        nenvs = trajectory['rewards'][0].shape[0]
        for key, item in trajectory.items():
            if key in {"log_probs", "values", "value_targets", "entropy"}:
                trajectory[key] = torch.stack(trajectory[key]).view(T * nenvs)
            if key == 'fff':
                trajectory[key] = torch.tensor(trajectory[key]).view(T * nenvs)

Let's do more sanity checks!

In [483]:
runner = EnvRunner(env, policy, nsteps=5, transforms=[ComputeValueTargets(policy, GAE=True),
                                                      MergeTimeBatch()])

trajectory = runner.get_next()

In [486]:
# More sanity checks
assert 'value_targets' in trajectory, "Value targets not found"
assert trajectory['log_probs'].shape == (5 * nenvs,)
assert trajectory['value_targets'].shape == (5 * nenvs,)
assert trajectory['values'].shape == (5 * nenvs,)

assert trajectory['log_probs'].requires_grad, "Gradients are not available for actor head!"
assert trajectory['values'].requires_grad, "Gradients are not available for critic head!"

Now is the time to implement the advantage actor critic algorithm itself. You can look into [Mnih et al. 2016](https://arxiv.org/abs/1602.01783) paper, and lectures ([part 1](https://www.youtube.com/watch?v=Ds1trXd6pos&list=PLkFD6_40KJIwhWJpGazJ9VSj9CFMkb79A&index=5), [part 2](https://www.youtube.com/watch?v=EKqxumCuAAY&list=PLkFD6_40KJIwhWJpGazJ9VSj9CFMkb79A&index=6)) by Sergey Levine.

In [487]:
from collections import defaultdict
from torch.nn.utils import clip_grad_norm_

class A2C:
    def __init__(self, policy, optimizer, device = 'cpu', value_loss_coef=0.25, entropy_coef=0.01, max_grad_norm=0.5):
        self.policy = policy
        self.optimizer = optimizer
        self.device = device
        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef
        self.max_grad_norm = max_grad_norm
        self.writer = runner.write #SummaryWriter(comet_config={"disabled": False})
    
    def loss(self, trajectory, write):
        # compute all losses
        # do not forget to use weights for critic loss and entropy loss
        # print(trajectory['value_targets'].device, trajectory['values'].device)
        if not GAE:
            adv = trajectory['value_targets'].to(self.device) - trajectory['values']
        else:
            adv = trajectory['value_targets'].to(self.device)
        critic_loss = torch.mean(adv ** 2)
        
        policy_loss = torch.mean(trajectory['log_probs'] * adv)
        
        entropy_loss = - torch.mean(trajectory['entropy'])
        
        
        # log all losses
        self.writer('losses', {
            'policy loss': policy_loss.detach(),
            'critic loss': critic_loss.detach(),
            'entropy loss': entropy_loss.detach()
        })
        
        # additional logs
        self.writer('critic/advantage', adv.detach())
        self.writer('critic/values', {
            'value predictions': torch.mean(trajectory['values']),
            'value targets':     torch.mean(trajectory['value_targets']),
        })
        
        # return scalar loss
        return - policy_loss + self.value_loss_coef * critic_loss + self.entropy_coef * entropy_loss

    def train(self, runner):
        # collect trajectory using runner
        trajectory = runner.get_next()
        # compute loss and perform one step of gradient optimization
        loss = self.loss(trajectory, self.writer)
        # do not forget to clip gradients
        
        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(self.policy.model.parameters(), self.max_grad_norm)
        optimizer.step()
        optimizer.zero_grad()
        
        # use runner.write to log scalar to tensorboard
        self.writer('gradient norm', grad_norm)

Now you can train your model. For optimization we suggest you use RMSProp with learning rate 7e-4 (you can also linearly decay it to 0), smoothing constant (alpha in PyTorch) equal to 0.99 and epsilon equal to 1e-5.

We recommend to train for at least 10 million environment steps across all batched environments (takes ~3 hours on a single GTX1080 with 8 CPU). It should be possible to achieve *average raw reward over last 100 episodes* (the average is taken over 100 last episodes in each environment in the batch) of about 600. **Your goal is to reach 500**.

Notes:
* if your reward is stuck at ~200 for more than 2M steps then probably there is a bug
* if your gradient norm is >10 something probably went wrong
* make sure your `entropy loss` is negative, your `critic loss` is positive
* make sure you didn't forget `.detach` in losses where it's needed
* `actor loss` should oscillate around zero or near it; do not expect loss to decrease in RL ;)
* you can experiment with `nsteps` ("rollout length"); standard rollout length is 5 or 10. Note that this parameter influences how many algorithm iterations is required to train on 10M steps (or 40M frames --- we used frameskip in preprocessing).

In [488]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

GAE = True

model = ACnet(obs.shape, n_actions, device)
policy = Policy(model)
runner = EnvRunner(env, policy, nsteps=10, transforms=[ComputeValueTargets(policy, GAE=GAE),
                                                      MergeTimeBatch()])

optimizer = torch.optim.RMSprop(model.parameters(), lr=7e-4, eps=1e-5)

a2c = A2C(policy, optimizer, device)

In [489]:
model = model.to(device)

In [490]:
from tqdm import trange
total_steps = 10 * 10**6

# runner.reset()
for i in trange(total_steps):
    a2c.train(runner) # KA73UNUPM3eR56uID83Ii8HH4

  0%|          | 0/10000000 [00:00<?, ?it/s][1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/katyanaveka/actor-critic-gae/4dd1d8e77bc34a7486eb3c299665635c
[1;38;5;39mCOMET INFO:[0m   Uploads:
[1;38;5;39mCOMET INFO:[0m     conda-environment-definition : 1
[1;38;5;39mCOMET INFO:[0m     conda-info                   : 1
[1;38;5;39mCOMET INFO:[0m     conda-specification          : 1
[1;38;5;39mCOMET INFO:[0m     environment details          : 1
[1;38;5;39mCOMET INFO:[0m     filename                     : 1
[1;38;5;39mCOMET INFO:[0m     installed packages           : 1
[1;3

KeyboardInterrupt: 

In [458]:
Experiment.end(experiment)

In [491]:
# save your model just in case 
torch.save(model.state_dict(), "A2C_GAE_rmsprop")  

In [492]:
env.close()

BrokenPipeError: [Errno 32] Broken pipe

## Evaluation

In [493]:
env = nature_dqn_env("SpaceInvadersNoFrameskip-v4", nenvs=None, 
                     clip_reward=False, summaries=False, episodic_life=False)

In [494]:
def evaluate(env, policy, n_games=1, t_max=10000):
    '''
    Plays n_games and returns rewards
    '''
    rewards = []
    
    for _ in range(n_games):
        s = env.reset()
        
        R = 0
        for _ in range(t_max):
            action = policy.act(np.array([s]))["actions"][0]
            
            s, r, done, _ = env.step(action)
            
            R += r
            if done:
                break

        rewards.append(R)
    return np.array(rewards)

In [495]:
# evaluation will take some time!
sessions = evaluate(env, policy, n_games=30)
score = sessions.mean()
print(f"Your score: {score}")

assert score >= 500, "Needs more training?"
print("Well done!")

Your score: 245.0


AssertionError: Needs more training?

In [None]:
env.close()

## Record

In [None]:
env_monitor = nature_dqn_env("SpaceInvadersNoFrameskip-v4", nenvs=None, monitor=True,
                             clip_reward=False, summaries=False, episodic_life=False)

In [None]:
# record sessions
sessions = evaluate(env_monitor, policy, n_games=3)

In [None]:
# rewards for recorded games
sessions

In [None]:
env_monitor.close()