In [1]:
from collections import deque
from copy import deepcopy
from pprint import pprint

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from torch.utils.data import DataLoader

In [2]:
SEED = 1
BATCH_SIZE = 4
LR = 0.0001
EPOCHS = 4
CLIP = 0.1
GAMMA = 0.99
LAMBDA = 0.95
ENT_COEF = 0.01

# set device
use_cuda = torch.cuda.is_available()
print('cuda:', use_cuda)
device = torch.device('cuda' if use_cuda else 'cpu')

# random seed
np.random.seed(SEED)
torch.manual_seed(SEED)
if use_cuda:
    torch.cuda.manual_seed_all(SEED)

cuda: True


In [3]:
class ActorCriticNet(nn.Module):
    def __init__(self, obs_space, action_space):
        super().__init__()

        self.head = nn.Sequential(
            nn.Linear(obs_space, 256),
            nn.SELU()
        )

        self.pol = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, action_space)
        )
        
        self.val = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 1)
        )
        
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        out = self.head(x)
        logit = self.pol(out).reshape(out.shape[0], -1)
        value = self.val(out).reshape(out.shape[0], 1)
        log_probs = self.log_softmax(logit)
        
        return log_probs, value

In [None]:
losses = []


def learn(net, old_net, optimizer, train_memory):
    global action_space

    net.train()
    old_net.train()
    dataloader = DataLoader(train_memory,
                        batch_size=BATCH_SIZE,
                        shuffle=True,
                        pin_memory=use_cuda)
    for i in range(EPOCHS):
#         dataloader = DataLoader(train_memory,
#                                 batch_size=BATCH_SIZE,
#                                 shuffle=True,
#                                 pin_memory=use_cuda)
        for (s, a, r, _s, d, adv) in dataloader:
            s_batch = s.to(device).float()
            a_batch = a.detach().to(device).long()
            _s_batch = _s.to(device).float()
            r_batch = r.to(device).float()
            adv_batch = adv.to(device).float()
#             ret_batch = ret.to(device).float()
            done_mask = 1. - d.to(device).float()

            with torch.no_grad():
                log_p_batch_old, v_batch_old = old_net(s_batch)
                log_p_acting_old = log_p_batch_old[range(BATCH_SIZE), a_batch]

            log_p_batch, v_batch = net(s_batch)
            _, _v_batch = net(_s_batch)
            log_p_acting = log_p_batch[range(BATCH_SIZE), a_batch]
            p_ratio = (log_p_acting - log_p_acting_old).exp()
            p_ratio_clip = torch.clamp(p_ratio, 1. - CLIP, 1. + CLIP)
            clip_p_loss = torch.min(p_ratio * adv_batch,
                                  p_ratio_clip * adv_batch)
            v_loss = (r_batch + GAMMA * done_mask * _v_batch - v_batch).pow(2)
#             clip_v_loss = v_batch_old + torch.clamp(v_batch - v_batch_old, -CLIP, CLIP)
#             v_loss_a = (clip_v_loss - ret_batch).pow(2)
#             v_loss_b = (v_batch - ret_batch).pow(2)
#             v_loss = torch.min(v_loss_a, v_loss_a) 
            entropy = -(log_p_batch.exp() * log_p_batch).sum(dim=1)

            # loss
            loss = -(clip_p_loss - v_loss + ENT_COEF * entropy).mean()
            losses.append(loss)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


def get_action_and_value(obs, old_net):
    old_net.eval()
    with torch.no_grad():
        state = torch.tensor([obs]).to(device).float()
        log_p, v = old_net(state)
        m = Categorical(log_p.exp())
        action = m.sample()

    return action.item(), v.item()


def compute_adv(rewards, values, dones, roll_memory):
    rew = np.array(rewards, 'float')
    val = np.array(values[:-1], 'float')
    _val = np.array(values[1:], 'float')
    done_mask = 1. - np.array(dones, 'float')
    delta = rew + GAMMA * done_mask * _val - val
#     disc_v = delta + val
    gae_dt = np.array(
        [(GAMMA * LAMBDA)**(i) * dt for i, dt in enumerate(delta.tolist())],
        'float')
#     gae_dv = np.array(
#         [(GAMMA * LAMBDA)**(i) * dv for i, dv in enumerate(disc_v.tolist())],
#         'float')
    for i, data in enumerate(roll_memory):
        data.append(sum(gae_dt[i:] / (GAMMA * LAMBDA)**(i)))
#         data.append(sum(gae_dv[i:] / (GAMMA * LAMBDA)**(i)))
#     pprint(roll_memory)
    return roll_memory

## Main

In [None]:
# make an environment
# env = gym.make('CartPole-v0')
# env = gym.make('CartPole-v1')
# env = gym.make('MountainCar-v0')
env = gym.make('LunarLander-v2')

env.seed(SEED)
obs_space = env.observation_space.shape[0]
action_space = env.action_space.n

# hyperparameter
n_episodes = 3000
roll_len = 128
n_eval = env.spec.trials

# global values
steps = 0
learn_steps = 0
ep_rewards = []
reward_eval = deque(maxlen=n_eval)
is_rollout = False
is_solved = False

# make nerual networks
net = ActorCriticNet(obs_space, action_space).to(device)
old_net = deepcopy(net)

# make a optimizer
optimizer = optim.Adam(net.parameters(), lr=LR, eps=1e-5)

# make a rollout memory
train_memory = []
roll_memory = []
rewards = []
values = []
dones = []

In [None]:
use_cuda

True

In [None]:
env.spec.max_episode_steps

1000

In [None]:
env.spec.trials

100

In [None]:
env.spec.reward_threshold

200

In [None]:
# play
for i in range(1, n_episodes + 1):
    obs = env.reset()
    done = False
    ep_reward = 0
    while not done:
        env.render()

        action, value = get_action_and_value(obs, old_net)
        _obs, reward, done, _ = env.step(action)
        
        # store
        roll_memory.append([obs, action, reward, _obs, done])
        rewards.append(reward)
        values.append(value)
        dones.append(done)
        
        obs = _obs
        steps += 1
        ep_reward += reward
        
        if done or steps % roll_len == 0:
            _, _value = get_action_and_value(_obs, old_net)
            values.append(_value)
            train_memory.extend(compute_adv(rewards, values, dones, roll_memory))
            rewards.clear()
            values.clear()
            dones.clear()
            roll_memory.clear()
            
        if steps % roll_len == 0:
#             print('\n============  Start Learning  ============\n')
            learn(net, old_net, optimizer, train_memory)
            learn_steps += 1
            train_memory.clear()

        if learn_steps > 1:
            old_net.load_state_dict(net.state_dict())
            learn_steps = 1
    
    if done:        
        ep_rewards.append(ep_reward)
        print('{:3} Episode in {:5} steps, reward {:.2f}'.format(
            i, steps, ep_reward))

        if len(ep_rewards) >= n_eval:
            if np.mean(list(reversed(ep_rewards))[: n_eval]) >= env.spec.reward_threshold:
                print('\n{} is sloved! {:3} Episode in {:3} steps'.format(
                    env.spec.id, i, steps))
                torch.save(old_net.state_dict(),
                           f'./test/saved_models/{env.spec.id}_ep{i}_clear_model_ppo.pt')
                break
env.close()

  1 Episode in    67 steps, reward -255.99
  2 Episode in   142 steps, reward -107.45
  3 Episode in   222 steps, reward -130.95
  4 Episode in   311 steps, reward -435.51
  5 Episode in   417 steps, reward -219.62
  6 Episode in   514 steps, reward -103.82
  7 Episode in   579 steps, reward -87.73
  8 Episode in   679 steps, reward -155.95
  9 Episode in   791 steps, reward -88.26
 10 Episode in   857 steps, reward -172.01
 11 Episode in   947 steps, reward -287.10
 12 Episode in  1063 steps, reward -476.11
 13 Episode in  1151 steps, reward -91.61
 14 Episode in  1256 steps, reward -122.05
 15 Episode in  1315 steps, reward -93.43
 16 Episode in  1394 steps, reward -211.80
 17 Episode in  1461 steps, reward -114.64
 18 Episode in  1518 steps, reward -105.39
 19 Episode in  1639 steps, reward -154.00
 20 Episode in  1757 steps, reward -175.40
 21 Episode in  1832 steps, reward -229.74
 22 Episode in  1933 steps, reward -217.33
 23 Episode in  2012 steps, reward -72.65
 24 Episode in  

193 Episode in 49561 steps, reward -207.89
194 Episode in 50561 steps, reward -201.13
195 Episode in 51561 steps, reward -171.18
196 Episode in 52561 steps, reward -160.73
197 Episode in 53452 steps, reward -268.97
198 Episode in 53866 steps, reward -155.64
199 Episode in 54504 steps, reward -215.63
200 Episode in 55465 steps, reward -258.56
201 Episode in 56465 steps, reward -219.81
202 Episode in 57012 steps, reward -198.42
203 Episode in 57515 steps, reward -176.91
204 Episode in 58114 steps, reward -242.49
205 Episode in 58801 steps, reward -282.05
206 Episode in 59125 steps, reward -155.79
207 Episode in 59517 steps, reward -178.57
208 Episode in 60263 steps, reward -264.22
209 Episode in 61147 steps, reward -326.67
210 Episode in 61859 steps, reward -268.34
211 Episode in 62586 steps, reward -294.67
212 Episode in 63138 steps, reward -234.33
213 Episode in 64138 steps, reward -213.88
214 Episode in 64599 steps, reward -224.13
215 Episode in 65395 steps, reward -326.83
216 Episode

383 Episode in 217353 steps, reward -46.71
384 Episode in 218353 steps, reward -133.04
385 Episode in 219353 steps, reward -43.71
386 Episode in 220353 steps, reward -68.89
387 Episode in 221353 steps, reward -100.38
388 Episode in 222353 steps, reward -86.75
389 Episode in 223353 steps, reward -91.16
390 Episode in 224353 steps, reward -98.58
391 Episode in 225353 steps, reward -38.16
392 Episode in 226353 steps, reward -87.67
393 Episode in 226568 steps, reward -106.09
394 Episode in 226920 steps, reward -126.39
395 Episode in 227177 steps, reward -105.24
396 Episode in 228177 steps, reward -72.81
397 Episode in 229177 steps, reward -98.73
398 Episode in 229790 steps, reward -154.15
399 Episode in 230790 steps, reward -92.86
400 Episode in 231790 steps, reward -97.00
401 Episode in 232790 steps, reward -91.32
402 Episode in 233790 steps, reward -84.45
403 Episode in 233982 steps, reward -90.34
404 Episode in 234982 steps, reward -145.26
405 Episode in 235982 steps, reward -71.80
406 

In [None]:
plt.figure(figsize=(15, 5))
plt.title('Reward')
plt.plot(ep_rewards)
plt.figure(figsize=(15, 5))
plt.title('Loss')
plt.plot(losses)
plt.show()

In [None]:
[
    ('CartPole-v0', 161, 32, 256),
    ('CartPole-v1', 162, 32, 256),
    ('MountainCar-v0', 660),
    ('LunarLander-v2', 260)
]