In [None]:
import gym
import numpy as np
import torch
from torch import nn

# Environment the agent is trained on.
env = gym.make('CartPole-v1')

# Layer sizes: the input width and the number of actions are read from the
# environment's spaces; both networks share the same hidden width.
hidden_size = 32
observation_dimensions = env.observation_space.shape[0]
n_actions = env.action_space.n

# Policy network: observation -> one unnormalised logit per action.
# The trailing Identity leaves the final linear output untouched.
logits_net = nn.Sequential(
    nn.Linear(in_features=observation_dimensions, out_features=hidden_size),
    nn.Tanh(),
    nn.Linear(in_features=hidden_size, out_features=n_actions),
    nn.Identity(),
)

optimiser = torch.optim.Adam(params=logits_net.parameters(), lr=1e-2)

# Value network: observation -> a single scalar estimate, used as a baseline
# when the policy is updated.
values_net = nn.Sequential(
    nn.Linear(in_features=observation_dimensions, out_features=hidden_size),
    nn.Tanh(),
    nn.Linear(in_features=hidden_size, out_features=1),
    nn.Identity(),
)

values_optimiser = torch.optim.Adam(params=values_net.parameters(), lr=1e-2)


def get_policy(observation_tensor):
    """Return the action distribution the policy assigns to an observation.

    The observation (or a batch of observations along the leading axis) is
    passed through the module-level ``logits_net``; its output is interpreted
    as unnormalised logits of a categorical distribution over actions.
    """
    action_logits = logits_net(observation_tensor)
    return torch.distributions.Categorical(logits=action_logits)


def cumulative(iter):
    """Return the suffix sums of a sequence.

    Element ``i`` of the result is ``iter[i] + iter[i + 1] + ... + iter[-1]``.
    The work is done on ``iter.copy()``, so the argument must provide
    ``copy()``, ``len()`` and item assignment (e.g. a list or an ndarray);
    the result has the same type as the argument.
    """
    totals = iter.copy()
    # Sweep from the second-to-last slot down to the first, folding each
    # already-accumulated right-hand neighbour into the current slot.
    for idx in range(len(totals) - 2, -1, -1):
        totals[idx] += totals[idx + 1]
    return totals


# Training driver: policy gradient with a learned value baseline.
# Each epoch collects exactly one episode, performs one policy step and 100
# value-regression steps, and the loop ends once 100 consecutive episodes have
# each lasted more than 475 steps.
# NOTE: written against the classic Gym API, in which ``env.reset()`` returns
# only the observation and ``env.step()`` returns a 4-tuple.
times_solved = 0
ep_returns = []
epoch = 0
while True:
    # train one epoch: roll out a single episode with the current policy
    observations = []
    actions = []
    rewards = []
    observation = env.reset()
    done = False
    while not done:
        # Store a copy so later changes to the array cannot alter the record.
        observations.append(observation.copy())

        # act: sample an action from the categorical policy
        observation_tensor = torch.as_tensor(observation, dtype=torch.float32)
        action = get_policy(observation_tensor).sample().item()
        actions.append(action)

        observation, reward, done, _ = env.step(action)
        rewards.append(reward)

    # policy gradient update
    optimiser.zero_grad()

    # Stack into a single ndarray first: converting a list of ndarrays
    # directly is slow and makes PyTorch emit a warning.
    observations_tensor = torch.as_tensor(np.asarray(observations), dtype=torch.float32)

    # The value estimates act as constants in the policy loss, so no autograd
    # graph is needed for them.
    with torch.no_grad():
        values = values_net(observations_tensor).squeeze(axis=-1)
        # One-step residuals r_t + V(s_{t+1}) - V(s_t) (no discounting); the
        # final step is dropped because no successor observation was stored.
        deltas = torch.as_tensor(rewards[:-1], dtype=torch.float32) + values[1:] - values[:-1]

    # Advantage at step t is the sum of the residuals from t onwards.
    advantages = cumulative(deltas.numpy())
    # Normalise to zero mean / unit spread. The small constant keeps the
    # division finite when the spread is zero (e.g. a single usable
    # transition); without it the result is NaN, which would propagate
    # through the loss into the policy weights.
    advantages_std = np.std(advantages) + 1e-8
    advantages_tensor = torch.as_tensor((advantages - np.mean(advantages)) / advantages_std)

    actions_tensor = torch.as_tensor(actions, dtype=torch.int32)
    logp = get_policy(observations_tensor[:-1]).log_prob(actions_tensor[:-1])
    loss = -(logp * advantages_tensor).mean()
    loss.backward()

    optimiser.step()

    # value gradient update: regress V(s_t) onto the undiscounted sum of the
    # rewards from step t onwards (same steps as used for the policy update)
    weights_tensor = torch.as_tensor(cumulative(rewards[:-1]), dtype=torch.float32)
    value_inputs = observations_tensor[:-1]
    for _ in range(100):
        values_optimiser.zero_grad()
        values_loss = ((values_net(value_inputs).squeeze(axis=-1) - weights_tensor) ** 2).mean()
        values_loss.backward()
        values_optimiser.step()

    epoch += 1
    ep_returns.append(sum(rewards))
    if epoch % 10 == 0:
        # Report the mean return of the last 10 episodes plus the latest losses.
        print(epoch, np.mean(ep_returns), loss, values_loss)
        ep_returns = []

    # Stop after 100 consecutive episodes longer than 475 steps; any shorter
    # episode resets the streak.
    if len(rewards) > 475:
        times_solved += 1
        if times_solved >= 100:
            break
    else:
        times_solved = 0