In [1]:
import os
from collections import deque
from copy import deepcopy

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from torch.utils.data import DataLoader

In [2]:
# ---- Hyperparameters --------------------------------------------------------
SEED = 1           # seed applied to NumPy and PyTorch below
BATCH_SIZE = 16    # minibatch size given to the DataLoader in learn()
LR = 0.0003        # learning rate passed to the Adam optimizers
EPOCHS = 4         # passes over the collected rollout per learn() call
ALPHA = 0.0001     # NOTE(review): not referenced by any visible cell
CLIP = 0.1         # half-width of the probability-ratio clamp in learn()
GAMMA = 0.99       # discount factor used by compute_adv()
LAMBDA = 0.95      # multiplied with GAMMA when discounting in compute_adv()
ENT_COEF = 0.01    # weight of the entropy term in the learn() loss

# ---- Device -----------------------------------------------------------------
use_cuda = torch.cuda.is_available()
print('cuda:', use_cuda)
device = torch.device('cuda') if use_cuda else torch.device('cpu')

# ---- Reproducibility --------------------------------------------------------
torch.manual_seed(SEED)
np.random.seed(SEED)
if use_cuda:
    torch.cuda.manual_seed_all(SEED)

cuda: True


In [3]:
class ActorCriticNet(nn.Module):
    """Shared-trunk network with one policy head and two scalar value heads.

    A single ``Linear -> SELU`` trunk feeds three ``Linear -> SELU -> Linear``
    heads: ``pol`` (one output per action), ``val_in`` and ``val_ex`` (one
    output each).
    NOTE(review): ``val_in`` / ``val_ex`` are presumably the intrinsic and
    extrinsic value estimates -- confirm against the training code.

    Args:
        obs_space: number of input features per observation.
        action_space: number of discrete actions (width of the policy head).
    """

    def __init__(self, obs_space, action_space):
        super().__init__()
        width = 256

        def make_head(n_out):
            # Layer positions (0, 1, 2) are part of the state_dict key names.
            return nn.Sequential(
                nn.Linear(width, width),
                nn.SELU(),
                nn.Linear(width, n_out),
            )

        # Modules are created in the order trunk, policy, val_in, val_ex so
        # that seeded parameter initialisation is reproducible.
        self.head = nn.Sequential(nn.Linear(obs_space, width), nn.SELU())
        self.pol = make_head(action_space)
        self.val_in = make_head(1)
        self.val_ex = make_head(1)
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        """Return ``(log_probs, value_in, value_ex)`` for a batch ``x``.

        ``log_probs`` is reshaped to ``(rows, -1)`` and both values to
        ``(rows, 1)``, where ``rows`` is the leading size of the trunk output.
        """
        shared = self.head(x)
        rows = shared.shape[0]

        log_probs = self.log_softmax(self.pol(shared).reshape(rows, -1))
        value_in = self.val_in(shared).reshape(rows, 1)
        value_ex = self.val_ex(shared).reshape(rows, 1)

        return log_probs, value_in, value_ex
    
class RandomNet(nn.Module):
    """Small MLP mapping an observation to a 2-dimensional feature vector.

    Layout: ``Linear(obs_space, 256) -> SELU`` followed by
    ``Linear(256, 256) -> SELU -> Linear(256, 2)``.
    NOTE(review): by its name this is presumably a fixed, randomly initialised
    target network; nothing in this class freezes its parameters.

    Args:
        obs_space: number of input features per observation.
    """

    def __init__(self, obs_space):
        super().__init__()
        width, n_features = 256, 2

        self.head = nn.Sequential(nn.Linear(obs_space, width), nn.SELU())
        self.fc = nn.Sequential(
            nn.Linear(width, width),
            nn.SELU(),
            nn.Linear(width, n_features),
        )

    def forward(self, x):
        """Return the features for ``x`` reshaped to ``(rows, -1)``."""
        hidden = self.head(x)
        return self.fc(hidden).reshape(hidden.shape[0], -1)


class PredictNet(nn.Module):
    """MLP mapping an observation to a 2-dimensional feature vector.

    Layout: ``Linear(obs_space, 256) -> SELU`` followed by
    ``Linear(256, 512) -> SELU -> Linear(512, 2)``; the output width matches
    ``RandomNet``'s.

    Args:
        obs_space: number of input features per observation.
    """

    def __init__(self, obs_space):
        super().__init__()
        trunk_width, hidden_width, n_features = 256, 512, 2

        self.head = nn.Sequential(nn.Linear(obs_space, trunk_width), nn.SELU())
        self.fc = nn.Sequential(
            nn.Linear(trunk_width, hidden_width),
            nn.SELU(),
            nn.Linear(hidden_width, n_features),
        )

    def forward(self, x):
        """Return the features for ``x`` reshaped to ``(rows, -1)``."""
        hidden = self.head(x)
        return self.fc(hidden).reshape(hidden.shape[0], -1)

In [None]:
# Scalar loss of every optimisation step, stored as plain Python floats so the
# list can be plotted and does not keep autograd graphs alive.
losses = []


def learn(net, old_net, optimizer, train_memory):
    """Run ``EPOCHS`` passes of clipped-surrogate updates over ``train_memory``.

    Args:
        net: network being optimised. Called as ``net(states)`` and expected to
            return ``(log_probs, value_in, value_ex)``.
        old_net: reference network, queried without gradients for the
            log-probabilities and values the new ones are compared against.
        optimizer: optimizer whose ``zero_grad`` / ``step`` are applied around
            ``loss.backward()``.
        train_memory: sequence of ``[obs, action, advantage]`` entries, batched
            by a shuffling ``DataLoader``.

    Side effects:
        Updates parameters through ``optimizer`` and appends one float per
        minibatch to the module-level ``losses`` list.
    """
    net.train()
    old_net.train()

    # The dataset is fixed for the whole call; shuffle=True still reshuffles
    # each time the loader is iterated, so it only has to be built once.
    dataloader = DataLoader(train_memory,
                            batch_size=BATCH_SIZE,
                            shuffle=True,
                            pin_memory=use_cuda)

    for _ in range(EPOCHS):
        for (s, a, adv) in dataloader:
            s_batch = s.to(device).float()
            a_batch = a.detach().to(device).long()
            adv_batch = adv.to(device).float()

            # Index by the real batch length: the last minibatch may hold
            # fewer than BATCH_SIZE samples.
            rows = torch.arange(a_batch.shape[0], device=device)

            with torch.no_grad():
                # The network returns three outputs; the second value head is
                # the one regressed here (assumed to be the estimate of the
                # environment return -- TODO confirm).
                log_p_batch_old, _, v_ex_old = old_net(s_batch)
                log_p_acting_old = log_p_batch_old[rows, a_batch]
                # Regression target = advantage + baseline value. Regressing
                # onto the old value alone would never use the rewards.
                v_target = adv_batch + v_ex_old.squeeze(-1)

            log_p_batch, _, v_ex = net(s_batch)
            log_p_acting = log_p_batch[rows, a_batch]

            p_ratio = (log_p_acting - log_p_acting_old).exp()
            p_ratio_clip = torch.clamp(p_ratio, 1 - CLIP, 1 + CLIP)
            clip_loss = torch.min(p_ratio * adv_batch,
                                  p_ratio_clip * adv_batch)

            # squeeze so every term has shape (batch,) and nothing broadcasts
            # to (batch, batch)
            v_loss = (v_ex.squeeze(-1) - v_target).pow(2)
            # per-sample entropy, so the bonus does not scale with batch size
            entropy = -(log_p_batch.exp() * log_p_batch).sum(dim=-1)

            # loss
            loss = -(clip_loss - v_loss + ENT_COEF * entropy).mean()
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


def get_action_and_value(obs, old_net):
    """Sample one action for ``obs`` and return it with both value estimates.

    Args:
        obs: a single observation (array-like); it is converted to a float32
            tensor with a leading batch dimension of 1.
        old_net: network called as ``old_net(state)`` that returns
            ``(log_probs, value_in, value_ex)``. It is switched to eval mode.

    Returns:
        ``(action, v_in, v_ex)`` as Python scalars.
    """
    old_net.eval()
    with torch.no_grad():
        # Convert through a single ndarray: building a tensor from a list of
        # ndarrays (torch.tensor([obs])) takes PyTorch's slow element-wise path.
        state = torch.as_tensor(np.asarray(obs, dtype=np.float32),
                                device=device).unsqueeze(0)
        log_p, v_in, v_ex = old_net(state)
        # The network already emits log-probabilities, so pass them as logits
        # instead of exponentiating and letting Categorical renormalise.
        action = Categorical(logits=log_p).sample()

    return action.item(), v_in.item(), v_ex.item()


def compute_adv(rewards, values, roll_memory, gamma=None, lam=None):
    """Append a discounted advantage estimate to every rollout entry.

    For each step ``t`` the TD residual is
    ``delta[t] = rewards[t] + gamma * values[t + 1] - values[t]`` and the
    advantage is ``sum_k (gamma * lam) ** k * delta[t + k]``, computed with a
    single backward pass.

    Args:
        rewards: per-step rewards, length ``n``.
        values: per-step value estimates plus one bootstrap value, length
            ``n + 1``.
        roll_memory: list of ``n`` mutable lists; the advantage for step ``t``
            is appended to ``roll_memory[t]`` in place.
        gamma: discount factor; defaults to the module-level ``GAMMA``.
        lam: trace-decay factor; defaults to the module-level ``LAMBDA``.

    Returns:
        The same ``roll_memory`` object, after mutation.

    Raises:
        ValueError: if the input lengths are inconsistent.
    """
    gamma = GAMMA if gamma is None else gamma
    lam = LAMBDA if lam is None else lam

    rew = np.asarray(rewards, dtype='float')
    val = np.asarray(values, dtype='float')
    n_steps = len(rew)
    if len(val) != n_steps + 1:
        raise ValueError(
            'values must hold one more entry than rewards '
            '(got {} values for {} rewards)'.format(len(val), n_steps))
    if len(roll_memory) != n_steps:
        raise ValueError(
            'roll_memory must hold one entry per reward '
            '(got {} entries for {} rewards)'.format(len(roll_memory), n_steps))

    # The TD residual uses the same discount as the trace, not a literal 0.99.
    delta = rew + gamma * val[1:] - val[:-1]

    # Backward recursion adv[t] = delta[t] + gamma * lam * adv[t + 1]: O(n),
    # and avoids multiplying by (gamma * lam) ** t only to divide it out again.
    decay = gamma * lam
    running = 0.0
    advantages = [0.0] * n_steps
    for t in reversed(range(n_steps)):
        running = float(delta[t]) + decay * running
        advantages[t] = running

    for data, advantage in zip(roll_memory, advantages):
        data.append(advantage)

    return roll_memory

## Main

In [None]:
# Environment under test. Other ids this notebook has been pointed at:
# 'CartPole-v0', 'CartPole-v1', 'LunarLander-v2'.
env = gym.make('MountainCar-v0')
env.seed(SEED)

obs_space = env.observation_space.shape[0]
action_space = env.action_space.n

# Run settings
n_episodes, roll_len, n_init = 1000, 128, 3000
n_eval = env.spec.trials  # number of trailing episodes averaged for the solve check

# Counters and flags shared with the training loop
steps = learn_steps = 0
ep_rewards = []
reward_eval = deque(maxlen=n_eval)
is_rollout = is_solved = False

# Neural networks. Construction order is kept (net, pred_net, rand_net) so the
# seeded parameter initialisation stays the same.
net = ActorCriticNet(obs_space, action_space).to(device)
old_net = deepcopy(net)
pred_net = PredictNet(obs_space).to(device)
rand_net = RandomNet(obs_space).to(device)

# Optimizers
net_optim = optim.Adam(net.parameters(), lr=LR)
pred_optim = optim.Adam(pred_net.parameters(), lr=LR)

# Rollout buffers
train_memory, roll_memory, obs_memory = [], [], []
rewards, values = [], []

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [None]:
# Inspect whether CUDA was detected when the device was selected.
use_cuda

True

In [None]:
# Inspect the step limit the environment spec declares per episode.
env.spec.max_episode_steps

200

In [None]:
# Inspect the spec's trial count; the setup cell stores it as n_eval.
env.spec.trials

100

In [None]:
# Inspect the mean-reward threshold the training loop compares against.
env.spec.reward_threshold

-110.0

In [None]:
# NOTE(review): this cell originally opened with a bare
# `for i in range(n_init):` header that had no body, which is a syntax error.
# The intended warm-up phase is not visible anywhere in the notebook, so it is
# left as a TODO instead of being guessed.
# TODO: implement the n_init warm-up steps if they are still required.

# play!
try:
    for i in range(1, n_episodes + 1):
        obs = env.reset()
        done = False
        ep_reward = 0
        while not done:
            env.render()

            # get_action_and_value returns (action, v_in, v_ex). The rewards
            # stored below come from the environment, so the second value head
            # is used as the baseline (assumed extrinsic -- TODO confirm).
            action, _, value = get_action_and_value(obs, old_net)
            _obs, reward, done, _ = env.step(action)

            # store
            roll_memory.append([obs, action])
            rewards.append(reward)
            values.append(value)

            obs = _obs
            steps += 1
            ep_reward += reward

            if done or steps % roll_len == 0:
                # bootstrap value for the state that follows the last stored step
                _, _, _value = get_action_and_value(_obs, old_net)
                values.append(_value)
                train_memory.extend(compute_adv(rewards, values, roll_memory))
                rewards.clear()
                values.clear()
                roll_memory.clear()

            if steps % roll_len == 0:
                # the optimizer created for `net` in the setup cell is net_optim
                learn(net, old_net, net_optim, train_memory)
                learn_steps += 1
                train_memory.clear()

            if learn_steps > 1:
                old_net.load_state_dict(net.state_dict())
                learn_steps = 1

        # The while loop only exits once the episode is done.
        ep_rewards.append(ep_reward)
        print('{:3} Episode in {:5} steps, reward {:.2f}'.format(
            i, steps, ep_reward))

        if len(ep_rewards) >= n_eval:
            if np.mean(ep_rewards[-n_eval:]) >= env.spec.reward_threshold:
                print('\n{} is sloved! {:3} Episode in {:3} steps'.format(
                    env.spec.id, i, steps))
                # Create the target directory first so a finished run is not
                # lost to a missing-folder error.
                save_dir = './test/saved_models'
                os.makedirs(save_dir, exist_ok=True)
                torch.save(old_net.state_dict(),
                           f'{save_dir}/{env.spec.id}_ep{i}_clear_model_ppo.pt')
                break
finally:
    # Release the environment (and its render window) even if training is
    # interrupted or raises.
    env.close()

  1 Episode in   200 steps, reward -200.00
  2 Episode in   400 steps, reward -200.00
  3 Episode in   600 steps, reward -200.00
  4 Episode in   800 steps, reward -200.00
  5 Episode in  1000 steps, reward -200.00
  6 Episode in  1200 steps, reward -200.00
  7 Episode in  1400 steps, reward -200.00
  8 Episode in  1600 steps, reward -200.00
  9 Episode in  1800 steps, reward -200.00
 10 Episode in  2000 steps, reward -200.00
 11 Episode in  2200 steps, reward -200.00
 12 Episode in  2400 steps, reward -200.00
 13 Episode in  2600 steps, reward -200.00
 14 Episode in  2800 steps, reward -200.00
 15 Episode in  3000 steps, reward -200.00
 16 Episode in  3200 steps, reward -200.00
 17 Episode in  3400 steps, reward -200.00
 18 Episode in  3600 steps, reward -200.00
 19 Episode in  3800 steps, reward -200.00
 20 Episode in  4000 steps, reward -200.00
 21 Episode in  4200 steps, reward -200.00
 22 Episode in  4400 steps, reward -200.00
 23 Episode in  4600 steps, reward -200.00
 24 Episode

In [None]:
# Entries of `losses` may be plain numbers or single-element tensors (possibly
# on the GPU and tracking gradients); matplotlib cannot convert the latter, so
# coerce everything to Python floats first.
loss_values = [float(loss) for loss in losses]

fig_reward, ax_reward = plt.subplots(figsize=(15, 5))
ax_reward.set(title='Reward', xlabel='Episode', ylabel='Episode reward')
ax_reward.plot(ep_rewards)

fig_loss, ax_loss = plt.subplots(figsize=(15, 5))
ax_loss.set(title='Loss', xlabel='Update step', ylabel='Loss')
ax_loss.plot(loss_values)

plt.show()

In [None]:
# Recorded results per environment.
# NOTE(review): the tuple layout is not documented anywhere in the notebook;
# presumably (env id, episodes needed to solve, BATCH_SIZE, roll_len), with the
# last two omitted for some rows -- TODO confirm.
[
    ('CartPole-v0', 126, 16, 128),
    ('CartPole-v1', 225, 64, 256),
    ('MountainCar-v0', 660),
    ('LunarLander-v2', 260)
]