# Policy gradient methods on Pong

The three main differences from the previous example's code are as follows:
- The baseline is estimated as a moving average over the last 1M transitions,
instead of over all examples seen so far
- Several concurrent environments are used
- Gradients are clipped to improve training stability

To make moving average calculations faster, a deque-backed buffer is created.

In [3]:
import gym
import ptan
import numpy as np
import argparse
import collections
from tensorboardX import SummaryWriter
import sys 
sys.path.append("../Chapter11/")

import torch
import torch.nn.functional as F
import torch.nn.utils as nn_utils
import torch.optim as optim

from lib import common

# Discount factor handed to the experience source (gamma=GAMMA).
GAMMA = 0.99
# Step size for the Adam optimizer.
LEARNING_RATE = 0.0001
# Weight of the entropy term in the total loss.
ENTROPY_BETA = 0.01
# Number of transitions accumulated before each optimization step.
BATCH_SIZE = 128

# steps_count passed to the experience source.
REWARD_STEPS = 10
# Capacity of the MeanBuffer used to estimate the reward baseline.
BASELINE_STEPS = 1000000
# Maximum gradient norm passed to clip_grad_norm_.
GRAD_L2_CLIP = 0.1

# Number of environment instances created for the experience source.
ENV_COUNT = 32

In [5]:
def make_env():
    """Build one "PongNoFrameskip-v4" gym environment passed through ptan's wrap_dqn."""
    raw_env = gym.make("PongNoFrameskip-v4")
    return ptan.common.wrappers.wrap_dqn(raw_env)


class MeanBuffer:
    """Sliding window over the most recent values with an O(1) mean.

    A running sum is maintained alongside a bounded deque so that ``mean()``
    does not have to re-sum the whole window on every call.

    Args:
        capacity: Maximum number of values kept in the window. Once the
            window is full, each ``add`` evicts the oldest value. A capacity
            of 0 yields a buffer that stores nothing and whose mean is 0.0.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.deque = collections.deque(maxlen=capacity)
        self.sum = 0.0

    def add(self, val):
        """Append ``val`` to the window, evicting the oldest value if full."""
        if self.capacity == 0:
            # A zero-length deque is simultaneously "empty" and "full"; without
            # this guard the eviction branch below would index an empty deque
            # and raise IndexError.
            return
        if len(self.deque) == self.capacity:
            # The deque drops its leftmost item on append when full, so remove
            # that item's contribution from the running sum first.
            self.sum -= self.deque[0]
        self.deque.append(val)
        self.sum += val

    def mean(self):
        """Return the mean of the stored values, or 0.0 when empty."""
        if not self.deque:
            return 0.0
        return self.sum / len(self.deque)

The second difference in this example is working with multiple environments, and this functionality is supported by the PTAN library. The only action we have to take is to pass the list of Env objects to the experience source class (here, ExperienceSourceFirstLast). All the rest is done automatically. In the case of several environments, the experience source
asks them for transitions in round-robin order, providing us with less-correlated training
samples. The last difference from the CartPole example is gradient clipping, which
is performed using the PyTorch clip_grad_norm_ function from the torch.nn.utils
package.

In [6]:
# Policy-gradient training script: collects transitions from ENV_COUNT
# environments, subtracts a moving-average reward baseline, and performs one
# clipped-gradient Adam step per BATCH_SIZE transitions, logging diagnostics
# to TensorBoard.

# Fall back to the CPU when CUDA is unavailable so the script still runs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

envs = [make_env() for _ in range(ENV_COUNT)]
writer = SummaryWriter(comment="-pong-pg-" + "First")

net = common.AtariPGN(envs[0].observation_space.shape, envs[0].action_space.n).to(device)
print(net)

# The network outputs logits; the agent applies softmax itself to sample actions.
agent = ptan.agent.PolicyAgent(net, apply_softmax=True, device=device)
exp_source = ptan.experience.ExperienceSourceFirstLast(envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE, eps=1e-3)

# NOTE(review): total_rewards, done_episodes, sum_reward and the m_* lists are
# not read by the loop below; they are kept in case other cells reference them.
total_rewards = []
step_idx = 0
done_episodes = 0
train_step_idx = 0
baseline_buf = MeanBuffer(BASELINE_STEPS)

batch_states, batch_actions, batch_scales = [], [], []
m_baseline, m_batch_scales, m_loss_entropy, m_loss_policy, m_loss_total = [], [], [], [], []
m_grad_max, m_grad_mean = [], []
sum_reward = 0.0

# try/finally guarantees the TensorBoard writer is closed even when training
# is interrupted (e.g. KeyboardInterrupt) or fails part-way through.
try:
    with common.RewardTracker(writer, stop_reward=18) as tracker:
        for step_idx, exp in enumerate(exp_source):
            # Baseline is the running mean of the rewards seen so far
            # (bounded by BASELINE_STEPS entries).
            baseline_buf.add(exp.reward)
            baseline = baseline_buf.mean()
            # np.asarray instead of np.array(..., copy=False): under NumPy 2.x
            # copy=False raises ValueError whenever a copy is required.
            batch_states.append(np.asarray(exp.state))
            batch_actions.append(int(exp.action))
            batch_scales.append(exp.reward - baseline)

            # handle new rewards: report every finished episode, not only the
            # first one returned by this pop; any() stops at the first call
            # for which the tracker signals that training should stop.
            new_rewards = exp_source.pop_total_rewards()
            if any(tracker.reward(reward, step_idx) for reward in new_rewards):
                break

            if len(batch_states) < BATCH_SIZE:
                continue

            train_step_idx += 1
            states_v = torch.FloatTensor(np.asarray(batch_states)).to(device)
            batch_actions_t = torch.LongTensor(batch_actions).to(device)

            scale_std = np.std(batch_scales)
            batch_scale_v = torch.FloatTensor(batch_scales).to(device)

            optimizer.zero_grad()
            logits_v = net(states_v)
            log_prob_v = F.log_softmax(logits_v, dim=1)
            # Log-probability of each taken action, scaled by (reward - baseline).
            log_prob_actions_v = batch_scale_v * log_prob_v[range(BATCH_SIZE), batch_actions_t]
            loss_policy_v = -log_prob_actions_v.mean()

            # Entropy bonus: the negative sign makes minimizing the loss
            # increase the policy entropy.
            prob_v = F.softmax(logits_v, dim=1)
            entropy_v = -(prob_v * log_prob_v).sum(dim=1).mean()
            entropy_loss_v = -ENTROPY_BETA * entropy_v
            loss_v = loss_policy_v + entropy_loss_v
            loss_v.backward()
            nn_utils.clip_grad_norm_(net.parameters(), GRAD_L2_CLIP)
            optimizer.step()

            # calc KL-div between the policy before and after the update;
            # purely diagnostic, so no autograd graph is needed.
            with torch.no_grad():
                new_logits_v = net(states_v)
                new_prob_v = F.softmax(new_logits_v, dim=1)
                kl_div_v = -((new_prob_v / prob_v).log() * prob_v).sum(dim=1).mean()
            writer.add_scalar("kl", kl_div_v.item(), step_idx)

            # Gradient statistics are read after clip_grad_norm_, so they
            # describe the clipped gradients.
            grad_max = 0.0
            grad_means = 0.0
            grad_count = 0
            for p in net.parameters():
                grad_max = max(grad_max, p.grad.abs().max().item())
                grad_means += (p.grad ** 2).mean().sqrt().item()
                grad_count += 1

            writer.add_scalar("baseline", baseline, step_idx)
            writer.add_scalar("entropy", entropy_v.item(), step_idx)
            writer.add_scalar("batch_scales", np.mean(batch_scales), step_idx)
            writer.add_scalar("batch_scales_std", scale_std, step_idx)
            writer.add_scalar("loss_entropy", entropy_loss_v.item(), step_idx)
            writer.add_scalar("loss_policy", loss_policy_v.item(), step_idx)
            writer.add_scalar("loss_total", loss_v.item(), step_idx)
            writer.add_scalar("grad_l2", grad_means / grad_count, step_idx)
            writer.add_scalar("grad_max", grad_max, step_idx)

            batch_states.clear()
            batch_actions.clear()
            batch_scales.clear()
finally:
    writer.close()

AtariPGN(
  (conv): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
  )
  (fc): Sequential(
    (0): Linear(in_features=3136, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=6, bias=True)
  )
)
23869: done 1 games, mean reward -21.000, speed 497.59 f/s
24494: done 2 games, mean reward -21.000, speed 506.98 f/s
24593: done 3 games, mean reward -21.000, speed 502.76 f/s
24628: done 4 games, mean reward -21.000, speed 517.76 f/s
24794: done 5 games, mean reward -21.000, speed 476.07 f/s
24811: done 6 games, mean reward -21.000, speed 557.85 f/s
24860: done 7 games, mean reward -21.000, speed 431.39 f/s
24891: done 8 games, mean reward -21.000, speed 474.12 f/s
25810: done 9 games, mean reward -21.000, speed 493.20 f/s
25849: done 10 games, mean reward -21.000, s

KeyboardInterrupt: 