# Policy gradient methods on CartPole

In [1]:
import gym
import ptan
import numpy as np
from tensorboardX import SummaryWriter
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

Besides the already familiar hyperparameters, we have two new ones: the ENTROPY_BETA
value is the scale of the entropy bonus, and the REWARD_STEPS value specifies
how many steps ahead the Bellman equation is unrolled to estimate the discounted
total reward of every transition.

In [2]:
GAMMA = 0.99
LEARNING_RATE = 0.001
ENTROPY_BETA = 0.01
BATCH_SIZE = 8

REWARD_STEPS = 10

The network architecture is exactly the same as in the previous examples for
CartPole: a two-layer network with 128 neurons in the hidden layer. The preparation
code is also the same as before, except the experience source is asked to unroll
the Bellman equation for 10 steps.
The following is the part that differs from 04_cartpole_pg.py:

``` python
exp_source = ptan.experience.ExperienceSourceFirstLast(
env, agent, gamma=GAMMA, steps_count=REWARD_STEPS)
```

In [3]:
class PGN(nn.Module):
    """Policy network: a one-hidden-layer MLP that maps observations to action scores.

    Args:
        input_size: Number of features in a single observation.
        n_actions: Number of outputs, one raw score (logit) per action.
    """

    def __init__(self, input_size, n_actions):
        super().__init__()

        hidden_units = 128
        layers = [
            nn.Linear(input_size, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, n_actions),
        ]
        # Kept under the attribute name ``net`` so parameter names are stable.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack; no softmax is applied here."""
        logits = self.net(x)
        return logits

In [4]:
def smooth(old: Optional[float], val: float, alpha: float = 0.95) -> float:
    """Blend a new observation into an exponentially smoothed running value.

    Args:
        old: Previous smoothed value, or ``None`` when there is none yet.
        val: Newest raw observation.
        alpha: Weight kept on ``old``; ``1 - alpha`` is given to ``val``.

    Returns:
        ``val`` itself when ``old`` is ``None``, otherwise
        ``old * alpha + (1 - alpha) * val``.
    """
    if old is not None:
        carried = old * alpha
        fresh = (1 - alpha) * val
        return carried + fresh
    # First observation: nothing to blend with yet.
    return val

In the loss calculation, we use the same code as before to calculate the policy loss
(which is the negated policy gradient):

``` python
optimizer.zero_grad()
logits_v = net(states_v)
log_prob_v = F.log_softmax(logits_v, dim=1)
log_prob_actions_v = batch_scale_v * log_prob_v[range(BATCH_SIZE),batch_actions_t]
loss_policy_v = -log_prob_actions_v.mean()
```

Then we add the entropy bonus to the loss by calculating the entropy of the batch
and subtracting it from the loss. As entropy has a maximum for a uniform probability
distribution and we want to push the training toward this maximum, we need to
subtract it from the loss: minimizing the loss then increases the entropy.

``` python
prob_v = F.softmax(logits_v, dim=1)
entropy_v = -(prob_v * log_prob_v).sum(dim=1).mean()
entropy_loss_v = -ENTROPY_BETA * entropy_v
loss_v = loss_policy_v + entropy_loss_v
loss_v.backward()
optimizer.step()
```

Then, we calculate the Kullback-Leibler (KL) divergence between the new policy
and the old policy. KL divergence is an information theory measurement of how one
probability distribution diverges from another expected probability distribution. In
our example, it is being used to compare the policy returned by the model before and
after the optimization step. High spikes in KL are usually a bad sign, showing that
our policy was pushed too far from the previous policy, which is a bad idea most of
the time (as our NN is a very nonlinear function in a high-dimensional space, such
large changes in the model weights could have a very strong influence on the policy).

``` python
new_logits_v = net(states_v)
new_prob_v = F.softmax(new_logits_v, dim=1)
kl_div_v = -((new_prob_v / prob_v).log() *
prob_v).sum(dim=1).mean()
writer.add_scalar("kl", kl_div_v.item(), step_idx)
```

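Finally, we calculate the statistics about the gradients on this training step. It's usually good practice to show the graph of the maximum and L2 norm of gradients to get an idea about the training dynamics. (In the code below, the "L2" statistic is the root-mean-square of each parameter tensor's gradient, averaged over the parameter tensors.)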

``` python
grad_max = 0.0
grad_means = 0.0
grad_count = 0
for p in net.parameters():
    grad_max = max(grad_max, p.grad.abs().max().item())
    grad_means += (p.grad ** 2).mean().sqrt().item()
    grad_count += 1
```

In [None]:
# Train a policy-gradient agent on CartPole. Each experience item contributes
# (state, action, reward - running-mean baseline) to a batch; every BATCH_SIZE
# items the policy loss plus an entropy bonus is optimised and diagnostics
# (losses, entropy, KL between pre-/post-update policies, gradient statistics)
# are written to TensorBoard.
env = gym.make("CartPole-v0")
writer = SummaryWriter(comment="-cartpole-pg")

# try/finally so the TensorBoard writer is closed even when training raises
# or is interrupted part-way through.
try:
    net = PGN(env.observation_space.shape[0], env.action_space.n)
    print(net)

    agent = ptan.agent.PolicyAgent(net, preprocessor=ptan.agent.float32_preprocessor,
                                   apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

    total_rewards = []
    step_idx = 0
    done_episodes = 0
    reward_sum = 0.0
    bs_smoothed = entropy = l_entropy = l_policy = l_total = None

    batch_states, batch_actions, batch_scales = [], [], []

    for step_idx, exp in enumerate(exp_source):
        # Baseline = mean of all rewards seen so far; subtracting it from the
        # reward gives the scale applied to the log-probability of the action.
        reward_sum += exp.reward
        baseline = reward_sum / (step_idx + 1)
        writer.add_scalar("baseline", baseline, step_idx)
        batch_states.append(exp.state)
        batch_actions.append(int(exp.action))
        batch_scales.append(exp.reward - baseline)

        # handle new rewards: a non-empty result is counted as one finished
        # episode and only its first entry is used.
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            done_episodes += 1
            reward = new_rewards[0]
            total_rewards.append(reward)
            mean_rewards = float(np.mean(total_rewards[-100:]))
            print("%d: reward: %6.2f, mean_100: %6.2f, episodes: %d" % (
                step_idx, reward, mean_rewards, done_episodes))
            writer.add_scalar("reward", reward, step_idx)
            writer.add_scalar("reward_100", mean_rewards, step_idx)
            writer.add_scalar("episodes", done_episodes, step_idx)
            if mean_rewards > 195:
                print("Solved in %d steps and %d episodes!" % (step_idx, done_episodes))
                break

        if len(batch_states) < BATCH_SIZE:
            continue

        # Stack the per-step states with NumPy first: building a tensor
        # straight from a Python list of arrays is a slow conversion path.
        states_v = torch.as_tensor(np.asarray(batch_states), dtype=torch.float32)
        batch_actions_t = torch.LongTensor(batch_actions)
        batch_scale_v = torch.FloatTensor(batch_scales)

        optimizer.zero_grad()
        logits_v = net(states_v)
        log_prob_v = F.log_softmax(logits_v, dim=1)
        # Pick the log-probability of the action actually taken in each row.
        log_prob_actions_v = batch_scale_v * log_prob_v[range(len(batch_actions)), batch_actions_t]
        loss_policy_v = -log_prob_actions_v.mean()

        # Entropy bonus: negative sign so that minimising the loss raises entropy.
        prob_v = F.softmax(logits_v, dim=1)
        entropy_v = -(prob_v * log_prob_v).sum(dim=1).mean()
        entropy_loss_v = -ENTROPY_BETA * entropy_v
        loss_v = loss_policy_v + entropy_loss_v

        loss_v.backward()
        optimizer.step()

        # calc KL-div between the policy before (prob_v) and after the update.
        # Purely diagnostic, so no autograd graph is needed.
        with torch.no_grad():
            new_logits_v = net(states_v)
            new_prob_v = F.softmax(new_logits_v, dim=1)
            kl_div_v = -((new_prob_v / prob_v).log() * prob_v).sum(dim=1).mean()
        writer.add_scalar("kl", kl_div_v.item(), step_idx)

        # Gradient statistics: largest absolute entry, and the root-mean-square
        # of each parameter tensor's gradient (averaged over tensors below).
        grad_max = 0.0
        grad_means = 0.0
        grad_count = 0
        for p in net.parameters():
            grad_max = max(grad_max, p.grad.abs().max().item())
            grad_means += (p.grad ** 2).mean().sqrt().item()
            grad_count += 1

        bs_smoothed = smooth(bs_smoothed, float(np.mean(batch_scales)))
        entropy = smooth(entropy, entropy_v.item())
        l_entropy = smooth(l_entropy, entropy_loss_v.item())
        l_policy = smooth(l_policy, loss_policy_v.item())
        l_total = smooth(l_total, loss_v.item())

        # "baseline" was already logged for this step at the top of the loop.
        writer.add_scalar("entropy", entropy, step_idx)
        writer.add_scalar("loss_entropy", l_entropy, step_idx)
        writer.add_scalar("loss_policy", l_policy, step_idx)
        writer.add_scalar("loss_total", l_total, step_idx)
        writer.add_scalar("grad_l2", grad_means / grad_count, step_idx)
        writer.add_scalar("grad_max", grad_max, step_idx)
        writer.add_scalar("batch_scales", bs_smoothed, step_idx)

        batch_states.clear()
        batch_actions.clear()
        batch_scales.clear()
finally:
    writer.close()

PGN(
  (net): Sequential(
    (0): Linear(in_features=4, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=2, bias=True)
  )
)
38: reward:  38.00, mean_100:  38.00, episodes: 1
70: reward:  32.00, mean_100:  35.00, episodes: 2
140: reward:  70.00, mean_100:  46.67, episodes: 3
216: reward:  76.00, mean_100:  54.00, episodes: 4
239: reward:  23.00, mean_100:  47.80, episodes: 5
255: reward:  16.00, mean_100:  42.50, episodes: 6
283: reward:  28.00, mean_100:  40.43, episodes: 7
302: reward:  19.00, mean_100:  37.75, episodes: 8
318: reward:  16.00, mean_100:  35.33, episodes: 9
354: reward:  36.00, mean_100:  35.40, episodes: 10
367: reward:  13.00, mean_100:  33.36, episodes: 11
386: reward:  19.00, mean_100:  32.17, episodes: 12
426: reward:  40.00, mean_100:  32.77, episodes: 13
437: reward:  11.00, mean_100:  31.21, episodes: 14
498: reward:  61.00, mean_100:  33.20, episodes: 15
542: reward:  44.00, mean_100:  33.88, episodes: 16
588: reward

14359: reward: 177.00, mean_100: 126.07, episodes: 152
14559: reward: 200.00, mean_100: 127.67, episodes: 153
14697: reward: 138.00, mean_100: 128.57, episodes: 154
14839: reward: 142.00, mean_100: 129.14, episodes: 155
14967: reward: 128.00, mean_100: 130.11, episodes: 156
15135: reward: 168.00, mean_100: 131.38, episodes: 157
15286: reward: 151.00, mean_100: 132.68, episodes: 158
15416: reward: 130.00, mean_100: 133.59, episodes: 159
15566: reward: 150.00, mean_100: 134.71, episodes: 160
15766: reward: 200.00, mean_100: 135.92, episodes: 161
15966: reward: 200.00, mean_100: 137.39, episodes: 162
16085: reward: 119.00, mean_100: 137.84, episodes: 163
16285: reward: 200.00, mean_100: 139.38, episodes: 164
16485: reward: 200.00, mean_100: 141.10, episodes: 165
16685: reward: 200.00, mean_100: 142.68, episodes: 166
16858: reward: 173.00, mean_100: 144.19, episodes: 167
17034: reward: 176.00, mean_100: 145.13, episodes: 168
17234: reward: 200.00, mean_100: 146.96, episodes: 169
17434: rew

41300: reward: 200.00, mean_100: 183.47, episodes: 301
41500: reward: 200.00, mean_100: 183.47, episodes: 302
41700: reward: 200.00, mean_100: 183.47, episodes: 303
41900: reward: 200.00, mean_100: 183.47, episodes: 304
42100: reward: 200.00, mean_100: 183.47, episodes: 305
42300: reward: 200.00, mean_100: 183.47, episodes: 306
42487: reward: 187.00, mean_100: 183.34, episodes: 307
42687: reward: 200.00, mean_100: 183.34, episodes: 308
42860: reward: 173.00, mean_100: 183.07, episodes: 309
42982: reward: 122.00, mean_100: 182.29, episodes: 310
43089: reward: 107.00, mean_100: 181.36, episodes: 311
43220: reward: 131.00, mean_100: 180.67, episodes: 312
43335: reward: 115.00, mean_100: 179.82, episodes: 313
43448: reward: 113.00, mean_100: 178.95, episodes: 314
43571: reward: 123.00, mean_100: 178.18, episodes: 315
43735: reward: 164.00, mean_100: 177.82, episodes: 316
43879: reward: 144.00, mean_100: 177.26, episodes: 317
44079: reward: 200.00, mean_100: 177.48, episodes: 318
44279: rew