In [None]:
%matplotlib notebook
from logger import Plotter

import gym
import numpy as np
import torch
from torch import nn

# Live training dashboard. The series names are listed in the same order in
# which the training loop later passes values to `plotter.update`.
# NOTE(review): presumably Plotter draws one panel per name -- confirm in logger.py.
plotter = Plotter(
    'Return',
    'Length',
    'Entropy',
    'Explained Variance',
    'Policy Loss',
    'Values Loss',
    figsize=(16, 20),
)

# Default widths of the hidden layers used by `mlp` when none are given.
HIDDEN_SIZES = (64, 64)


def mlp(in_dim, out_dim, activation=nn.ReLU, output_activation=nn.Identity, hidden_sizes=None):
    """Build a fully connected feed-forward network as an ``nn.Sequential``.

    Every hidden ``nn.Linear`` is followed by ``activation()``; the final
    ``nn.Linear`` is followed by ``output_activation()``.

    Args:
        in_dim: Number of input features.
        out_dim: Number of output features.
        activation: Zero-argument factory (e.g. a module class) for the
            non-linearity placed after each hidden layer.
        output_activation: Zero-argument factory for the module placed after
            the output layer.
        hidden_sizes: Iterable of hidden layer widths. ``None`` (the default)
            falls back to the module-level ``HIDDEN_SIZES``. An empty iterable
            yields a single linear layer from ``in_dim`` to ``out_dim``.

    Returns:
        The assembled ``nn.Sequential``.
    """
    if hidden_sizes is None:
        hidden_sizes = HIDDEN_SIZES

    # Consecutive pairs of this list give (fan_in, fan_out) for each linear layer.
    widths = [in_dim, *hidden_sizes, out_dim]
    last_layer = len(widths) - 2

    modules = []
    for index, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
        modules.append(nn.Linear(fan_in, fan_out))
        modules.append(output_activation() if index == last_layer else activation())
    return nn.Sequential(*modules)


class CategoricalActor(nn.Module):
    """Policy over a discrete action set whose logits come from an MLP."""

    def __init__(self, obs_dim, act_dim):
        """Create the logits network mapping ``obs_dim`` inputs to ``act_dim`` outputs."""
        super().__init__()
        self.pi = mlp(obs_dim, act_dim)

    def forward(self, obs):
        """Return the unnormalised action logits for ``obs``."""
        action_logits = self.pi(obs)
        return action_logits

    def act(self, obs, test=False):
        """Sample an action for one observation and return it as a Python number.

        The logits are computed without tracking gradients. ``test`` is
        accepted but not consulted: an action is sampled in either mode.
        """
        with torch.no_grad():
            action_logits = self.pi(obs)
        policy = torch.distributions.Categorical(logits=action_logits)
        sampled = policy.sample()
        return sampled.item()

    def log_prob(self, obs, action):
        """Return the log-probability of ``action`` under the policy at ``obs``.

        ``action`` has all size-1 dimensions squeezed away before the lookup.
        """
        policy = torch.distributions.Categorical(logits=self.pi(obs))
        flat_action = action.squeeze()
        return policy.log_prob(flat_action)

class GaussianActor(nn.Module):
    """Gaussian policy for continuous actions with a fixed standard deviation.

    An MLP produces the per-dimension action means; the standard deviation is
    ``exp(log_std)`` for every dimension and is not learned.
    """

    # Default log standard deviation (the value this class previously hard-coded).
    DEFAULT_LOG_STD = -0.5

    def __init__(self, obs_dim, act_dim, log_std=DEFAULT_LOG_STD):
        """Create the mean network.

        Args:
            obs_dim: Number of observation features.
            act_dim: Number of action dimensions.
            log_std: Natural log of the standard deviation shared by all
                action dimensions.
        """
        super().__init__()
        self.pi = mlp(obs_dim, act_dim)
        self.log_std = log_std

    def _distribution(self, means):
        """Return the Normal distribution centred on ``means``."""
        # Build the std on the same dtype/device as the means so the policy
        # keeps working after the module is moved (e.g. with `.to(device)`).
        std = torch.exp(torch.as_tensor(self.log_std, dtype=means.dtype, device=means.device))
        return torch.distributions.Normal(means, std)

    def forward(self, obs):
        """Return the action means for ``obs``."""
        return self.pi(obs)

    def act(self, obs, test=False):
        """Choose an action for ``obs`` without tracking gradients.

        Args:
            obs: Observation tensor fed to the mean network.
            test: When true, return the means themselves; otherwise return a
                sample drawn around them.

        Returns:
            A tensor shaped like the network output.
        """
        with torch.no_grad():
            means = self.pi(obs)
        if test:
            return means
        return self._distribution(means).sample()

    def log_prob(self, obs, acts):
        """Return the log-density of ``acts`` at ``obs``, summed over the last dimension."""
        means = self.pi(obs)
        return self._distribution(means).log_prob(acts).sum(dim=-1)

    
def product(iter):
    """Multiply together every element of ``iter``.

    Returns 1 when ``iter`` yields nothing.
    """
    running = 1
    for factor in iter:
        running *= factor
    return running


# Build the environment and collapse its observation shape into one feature count.
env = gym.make('LunarLander-v2')
obs_dim = product(env.observation_space.shape)

# Discrete control: the policy emits one logit per available action, while each
# stored action is a single number, so the action buffer needs only one column.
act_dim = env.action_space.n
act_dim2 = 1
policy_net = CategoricalActor(obs_dim, act_dim)

# Continuous-control alternative:
# act_dim = env.action_space.shape[0]
# policy_net = GaussianActor(obs_dim, act_dim)

# Critic: maps an observation to a single value estimate.
values_net = mlp(obs_dim, 1)

# Independent Adam optimisers (and learning rates) for the policy and the critic.
PI_LR, V_LR = 3e-4, 1e-3
policy_optimiser = torch.optim.Adam(policy_net.parameters(), lr=PI_LR)
values_optimiser = torch.optim.Adam(values_net.parameters(), lr=V_LR)

def cumulative(iter, discount):
    """Return the discounted reverse running sum of a tensor.

    Working from the end towards the start along the first dimension, each
    entry becomes ``iter[t] + discount * result[t + 1]``; the final entry is
    left as is. The input tensor is not modified: the work is done on a
    detached copy, which is returned.
    """
    totals = iter.detach().clone()
    # Start at the second-to-last index and walk back to 0.
    for t in range(len(totals) - 2, -1, -1):
        totals[t] += discount * totals[t + 1]
    return totals

# --- Training schedule ---
EPOCHS = 10000           # number of collect-then-update rounds
STEPS_PER_EPOCH = 4000   # environment steps gathered per round

# --- Return / advantage estimation ---
GAMMA = 0.99             # reward discount factor
LAMBDA = 0.97            # multiplied with GAMMA when accumulating advantages

# --- Policy update ---
TRAIN_PI_ITERS = 80      # maximum policy gradient steps per epoch
CLIP_RATIO = 0.2         # probability ratio is clamped to [1 - CLIP_RATIO, 1 + CLIP_RATIO]
TARGET_KL = 0.01         # policy updates stop once approx. KL exceeds 1.5x this

# --- Value update ---
TRAIN_V_ITERS = 80       # value-network gradient steps per epoch

# Running count of environment steps taken by the training loop.
steps = 0

# Pre-allocated rollout storage with one row per collected step; the training
# loop overwrites these in place every epoch.
observations = torch.zeros(STEPS_PER_EPOCH, obs_dim, dtype=torch.float32)
actions = torch.zeros(STEPS_PER_EPOCH, act_dim2, dtype=torch.float32)
values, advantages, rewards_to_go = (
    torch.zeros(STEPS_PER_EPOCH, dtype=torch.float32) for _ in range(3)
)
# PPO training loop. Every epoch:
#   1. rolls the current policy out for STEPS_PER_EPOCH environment steps,
#   2. takes clipped-surrogate policy gradient steps (with a KL-based stop),
#   3. regresses the value network onto the discounted rewards-to-go,
#   4. plays 4 evaluation episodes and pushes the statistics to the plotter.
for epoch in range(EPOCHS):
    # ---- 1. Collect experience ----
    rewards = []  # rewards of the episode currently in progress
    obs = env.reset()
    ep_start = 0  # buffer index at which the current episode began
    for i in range(STEPS_PER_EPOCH):
        steps += 1
        obs = torch.as_tensor(obs, dtype=torch.float32)
        observations[i] = obs

        with torch.no_grad():
            values[i] = values_net(obs)
            action = policy_net.act(obs)
        obs, reward, done, _ = env.step(action)

        actions[i] = action
        rewards.append(reward)

        if done or i == STEPS_PER_EPOCH - 1:
            ep_end = ep_start + len(rewards)

            # A finished episode has no future return; a rollout cut off by the
            # end of the epoch is bootstrapped with the critic's estimate for
            # the next observation.
            if done:
                last_val = 0.0
            else:
                with torch.no_grad():
                    # .item() keeps last_val a plain float so it can be mixed
                    # into Python lists / torch.tensor([...]) below.
                    last_val = values_net(torch.as_tensor(obs, dtype=torch.float32)).item()
            rewards_tensor = torch.as_tensor(rewards + [last_val], dtype=torch.float32)

            # GAE: TD residuals r_t + GAMMA * V(s_{t+1}) - V(s_t), accumulated
            # backwards with discount GAMMA * LAMBDA.
            next_values = torch.cat((values[ep_start + 1:ep_end], torch.tensor([last_val])))
            deltas = rewards_tensor[:-1] + GAMMA * next_values - values[ep_start:ep_end]
            adv = cumulative(deltas, GAMMA * LAMBDA)
            advantages[ep_start:ep_end] = adv  # used as is, without normalisation
            # The bootstrap value is appended as a final "reward" and dropped
            # again after discounting.
            rewards_to_go[ep_start:ep_end] = cumulative(rewards_tensor, GAMMA)[:-1]

            rewards = []
            obs = env.reset()
            ep_start = ep_end

    # ---- 2. Policy update (clipped surrogate objective) ----
    with torch.no_grad():
        old_logp = policy_net.log_prob(observations, actions)

    for _ in range(TRAIN_PI_ITERS):
        policy_optimiser.zero_grad()
        logp = policy_net.log_prob(observations, actions)
        ratio = torch.exp(logp - old_logp)
        clipped_ratio = torch.clamp(ratio, 1 - CLIP_RATIO, 1 + CLIP_RATIO)
        policy_loss = -torch.min(ratio * advantages, clipped_ratio * advantages).mean()
        policy_loss.backward()
        policy_optimiser.step()

        # `logp` was computed before the step just taken, so this measures the
        # drift accumulated up to (not including) that step.
        approx_kl = (old_logp - logp).mean().item()
        if approx_kl > 1.5 * TARGET_KL:
            break

    # ---- 3. Value update (mean squared error against rewards-to-go) ----
    for _ in range(TRAIN_V_ITERS):
        values_optimiser.zero_grad()
        vvalues = values_net(observations).squeeze(dim=-1)
        values_loss = ((vvalues - rewards_to_go) ** 2).mean()
        values_loss.backward()
        values_optimiser.step()

    # ---- 4. Evaluation and plotting ----
    if epoch % 1 == 0:  # i.e. every epoch
        lengths = []
        returns = []
        entropies = []
        explained_variances = []
        for _ in range(4):
            vvalues = []
            rewards = []
            ep_len, ep_entropy = 0, 0.0
            obs = env.reset()
            done = False
            while not done:
                ep_len += 1

                obs = torch.as_tensor(obs, dtype=torch.float32)
                with torch.no_grad():
                    # Store plain floats so the numpy statistics below work on
                    # ordinary numeric arrays.
                    vvalues.append(values_net(obs).squeeze(dim=-1).item())
                    action = policy_net.act(obs, True)
                    # Entropy is estimated as the mean negative log-probability
                    # of the actions actually taken.
                    ep_entropy -= policy_net.log_prob(obs, torch.as_tensor(action)).item()

                obs, reward, done, _ = env.step(action)
                rewards.append(reward)

            # Explained variance of the critic: 1 - Var(G - V) / Var(G).
            test_rewards_to_go = np.asarray(cumulative(torch.as_tensor(rewards), GAMMA))
            explained_variance = 1 - (np.array(vvalues) - test_rewards_to_go).var() / test_rewards_to_go.var()
            explained_variances.append(explained_variance)

            lengths.append(ep_len)
            returns.append(sum(rewards))
            entropies.append(ep_entropy / ep_len)

        plotter.update(epoch,
                       (np.mean(returns), min(returns), max(returns)),
                       (np.mean(lengths), min(lengths), max(lengths)),
                       np.mean(entropies),
                       # Average over all evaluation episodes, not just the last one.
                       np.mean(explained_variances),
                       policy_loss.item(),
                       values_loss.item())

In [None]:
# Play one rendered episode with the current policy and print how many steps it lasted.
obs = env.reset()
l = 0  # number of steps taken so far
done = False
try:
    while not done:
        l += 1
        env.render()
        with torch.no_grad():
            action = policy_net.act(torch.as_tensor(obs.flatten(), dtype=torch.float32), True)
        obs, _, done, _ = env.step(action)
finally:
    # Close the environment even when the rollout raises or is interrupted,
    # so the cell never leaves it open.
    env.close()
print(l)