In [1]:
from collections import deque
from copy import deepcopy

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from torch.utils.data import DataLoader

In [2]:
SEED = 1
BATCH_SIZE = 4
LR = 0.0001
EPOCHS = 4
CLIP = 0.1
GAMMA = 0.99
LAMBDA = 0.95
ENT_COEF = 0.01
EX_COEF = 2.0
IN_COEF = 1.0

# set device
use_cuda = torch.cuda.is_available()
print('cuda:', use_cuda)
device = torch.device('cuda' if use_cuda else 'cpu')

# random seed
np.random.seed(SEED)
torch.manual_seed(SEED)
if use_cuda:
    torch.cuda.manual_seed_all(SEED)

cuda: True


In [3]:
class ActorCriticNet(nn.Module):
    def __init__(self, obs_space, action_space):
        super().__init__()

        self.head = nn.Sequential(
            nn.Linear(obs_space, 256),
            nn.SELU()
        )

        self.pol = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, action_space)
        )
        
        self.val_ex = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 1)
        )
        
        self.val_in = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 1)
        )
        
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        out = self.head(x)
        logit = self.pol(out).reshape(out.shape[0], -1)
        value_ex = self.val_ex(out).reshape(out.shape[0], 1)
        value_in = self.val_in(out).reshape(out.shape[0], 1)
        log_probs = self.log_softmax(logit)
        
        return log_probs, value_ex, value_in
    
class RandomNet(nn.Module):
    def __init__(self, obs_space):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(obs_space, 256),
            nn.SELU()
        )

        self.fc = nn.Sequential(
            nn.Linear(256, 512)
        )

    def forward(self, x):
        out = self.head(x)
        obs_feature = self.fc(out).reshape(out.shape[0], -1)

        return obs_feature


class PredictNet(nn.Module):
    def __init__(self, obs_space):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(obs_space, 256),
            nn.SELU()
        )

        self.fc = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 512)
        )

    def forward(self, x):
        out = self.head(x)
        obs_feature = self.fc(out).reshape(out.shape[0], -1)

        return obs_feature

In [None]:
losses = []
m_losses = []
f_losses = []


def learn(net, old_net, pred_net, rand_net, net_optim, pred_optim, train_memory):
    global action_space

    net.train()
    old_net.train()
    dataloader = DataLoader(train_memory,
                        batch_size=BATCH_SIZE,
                        shuffle=True,
                        pin_memory=use_cuda)
    for i in range(EPOCHS):
        for (s, a, r_ex, r_in, _s, d, adv) in dataloader:
            s_batch = s.to(device).float()
            a_batch = a.detach().to(device).long()
            _s_batch = _s.to(device).float()
            _s_norm = normalize_obs(_s.detach().cpu().numpy())
            _s_norm_batch = torch.tensor(_s_norm).to(device).float()
            r_ex_batch = r_ex.to(device).float()
            r_in_batch = r_in.to(device).float()
            adv_batch = adv.to(device).float()
            done_mask = 1. - d.to(device).float()

            with torch.no_grad():
                rand_feature = rand_net(_s_norm_batch)
                log_p_batch_old, _, _ = old_net(s_batch)
                log_p_acting_old = log_p_batch_old[range(BATCH_SIZE), a_batch]

            pred_feature = pred_net(_s_norm_batch)
            log_p_batch, v_ex, v_in = net(s_batch)
            _, _v_ex, _v_in = net(_s_batch)
            log_p_acting = log_p_batch[range(BATCH_SIZE), a_batch]
            p_ratio = (log_p_acting - log_p_acting_old).exp()
            p_ratio_clip = torch.clamp(p_ratio, 1. - CLIP, 1. + CLIP)
            clip_loss = torch.min(p_ratio * adv_batch,
                                  p_ratio_clip * adv_batch)
            
            v_ex_loss = 0.5 * (r_ex_batch + GAMMA * done_mask * _v_ex - v_ex).pow(2)
            v_in_loss = 0.5 * (r_in_batch + GAMMA * done_mask * _v_in - v_in).pow(2)
            v_loss = v_ex_loss + v_in_loss 
            entropy = -(log_p_batch.exp() * log_p_batch).sum(dim=1)

            # loss
            m_loss = -(clip_loss - v_loss + ENT_COEF * entropy).mean()
            m_losses.append(m_loss)

            f_loss = (pred_feature - rand_feature).pow(2).sum(dim=1).mean()
            f_losses.append(f_loss)
            
            loss = m_loss + f_loss
            losses.append(loss)
            
            net_optim.zero_grad()
            pred_optim.zero_grad()
            loss.backward()
            net_optim.step()
            pred_optim.step()


def get_action_and_value(obs, old_net):
    old_net.eval()
    with torch.no_grad():
        state = torch.tensor([obs]).to(device).float()
        log_p, v_ex, v_in = old_net(state)
        m = Categorical(log_p.exp())
        action = m.sample()

    return action.item(), v_ex.item(), v_in.item()


def compute_adv(rewards, values, dones, roll_memory):
    rew = np.array(rewards, 'float')
    val = np.array(values[:-1], 'float')
    _val = np.array(values[1:], 'float')
    done_mask = 1. - np.array(dones, 'float')
    delta = rew + 0.99 * done_mask * _val - val
    disc_dt = np.array([
        (GAMMA * LAMBDA)**(i) * dt for i, dt in enumerate(delta.tolist())],
        'float')
    for i, data in enumerate(roll_memory):
        data.append(sum(disc_dt[i:] / (GAMMA * LAMBDA)**(i)))

    return roll_memory


def get_norm_params(obs_memory):
    global obs_apace

    obses = [[] for _ in range(obs_space)]
    for obs in obs_memory:
        for j in range(obs_space):
            obses[j].append(obs[j])

    mean = np.zeros(obs_space, 'float')
    std = np.zeros(obs_space, 'float')
    for i, obs_ in enumerate(obses):
        mean[i] = np.mean(obs_)
        std[i] = np.std(obs_)

    return mean, std


def normalize_obs(obs):
    global mean, std
#     means = [mean for _ in range(BATCH_SIZE)]
#     stds = [std for _ in range(BATCH_SIZE)]
#     mean_np = np.stack(means)
#     std_np = np.stack(stds)
    norm_obs = (obs - mean) / std

#     return np.clip(norm_obs, -5, 5)
    return norm_obs


def calculate_reward_in(pred_net, rand_net, obs):
    norm_obs = normalize_obs(obs)
    state = torch.tensor([norm_obs]).to(device).float()
    with torch.no_grad():
        pred_obs = pred_net(state)
        rand_obs = rand_net(state)
        reward = (pred_obs - rand_obs).pow(2).sum()
        clipped_reward = torch.clamp(reward, -1, 1)

    return clipped_reward.item()

## Main

In [None]:
# make an environment
# env = gym.make('CartPole-v0')
# env = gym.make('CartPole-v1')
env = gym.make('MountainCar-v0')
# env = gym.make('LunarLander-v2')

env.seed(SEED)
obs_space = env.observation_space.shape[0]
action_space = env.action_space.n

# hyperparameter
n_episodes = 3000
roll_len = 128
n_eval = env.spec.trials

# global values
init_steps = 0
steps = 0
learn_steps = 0
mean = 0.
std = 0.
ep_rewards = []
is_rollout = False
is_solved = False
is_init_roll = True

# make nerual networks
net = ActorCriticNet(obs_space, action_space).to(device)
old_net = deepcopy(net)
pred_net = PredictNet(obs_space).to(device)
rand_net = RandomNet(obs_space).to(device)

# make optimizer
net_optim = torch.optim.Adam(net.parameters(), lr=LR, eps=1e-5)
pred_optim = torch.optim.Adam(pred_net.parameters(), lr=LR, eps=1e-5)

# make a rollout memory
train_memory = []
roll_memory = []
obs_memory = []
rewards = []
values = []
dones = []

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [None]:
use_cuda

True

In [None]:
env.spec.max_episode_steps

200

In [None]:
env.spec.trials

100

In [None]:
env.spec.reward_threshold

-110.0

In [None]:
# simulation
while True:
    obs = env.reset()
    done = False
    while not done:
        env.render()
        action = env.action_space.sample()
        _obs, _, done, _ = env.step(action)
        obs_memory.append(_obs)
        obs = _obs
        init_steps += 1
        if init_steps == roll_len * 50:
            mean, std = get_norm_params(obs_memory)
            obs_memory.clear()
            is_init_roll = False
            break
    if not is_init_roll:
        break

In [None]:
# play!
for i in range(1, n_episodes + 1):
    obs = env.reset()
    done = False
    ep_reward_ex = 0.
    ep_reward_in = 0.
    while not done:
        env.render()

        action, val_ex, val_in = get_action_and_value(obs, old_net)
        _obs, rew_ex, done, _ = env.step(action)
        
        rew_in = calculate_reward_in(pred_net, rand_net, _obs)    
        
        reward = EX_COEF * rew_ex + IN_COEF * rew_in
        value = val_ex + val_in
        
        # store
        roll_memory.append([obs, action, EX_COEF * rew_ex, IN_COEF * rew_in, _obs, done])
        obs_memory.append(_obs)
        rewards.append(reward)
        values.append(value)
        dones.append(done)
        
        obs = _obs
        steps += 1
        ep_reward_ex += rew_ex
        ep_reward_in += rew_in
        
        if done or steps % roll_len == 0:
            _, _val_ex, _val_in = get_action_and_value(_obs, old_net)
            _value = _val_ex + _val_in
            values.append(_value)
            train_memory.extend(compute_adv(rewards, values, dones, roll_memory))
            rewards.clear()
            values.clear()
            dones.clear()
            roll_memory.clear()
            
        if steps % roll_len == 0:
#             print('\n============  Start Learning  ============\n')
            learn(net, old_net, pred_net, rand_net, net_optim, pred_optim, train_memory)
            learn_steps += 1
            train_memory.clear()

        if learn_steps > 1:
            old_net.load_state_dict(net.state_dict())
            learn_steps = 1
        
        if steps % roll_len*50 == 0:
            mean, std = get_norm_params(obs_memory)
            obs_memory.clear()
    
    if done:        
        ep_rewards.append(ep_reward_ex)
        print('{:3} Episode in {:5} steps, reward_ex {:.2f}, reward_in {:.2f}'.format(
            i, steps, ep_reward_ex, ep_reward_in))

        if len(ep_rewards) >= n_eval:
            if np.mean(list(reversed(ep_rewards))[: n_eval]) >= env.spec.reward_threshold:
                print('\n{} is sloved! {:3} Episode in {:3} steps'.format(
                    env.spec.id, i, steps))
                torch.save(old_net.state_dict(),
                           f'./test/saved_models/{env.spec.id}_ep{i}_clear_model_ppo.pt')
                break
env.close()

  1 Episode in   200 steps, reward_ex -200.00, reward_in 188.02
  2 Episode in   400 steps, reward_ex -200.00, reward_in 150.89
  3 Episode in   600 steps, reward_ex -200.00, reward_in 193.58
  4 Episode in   800 steps, reward_ex -200.00, reward_in 182.59
  5 Episode in  1000 steps, reward_ex -200.00, reward_in 197.42
  6 Episode in  1200 steps, reward_ex -200.00, reward_in 145.04
  7 Episode in  1400 steps, reward_ex -200.00, reward_in 107.68
  8 Episode in  1600 steps, reward_ex -200.00, reward_in 153.32
  9 Episode in  1800 steps, reward_ex -200.00, reward_in 73.19
 10 Episode in  2000 steps, reward_ex -200.00, reward_in 120.36
 11 Episode in  2200 steps, reward_ex -200.00, reward_in 81.53
 12 Episode in  2400 steps, reward_ex -200.00, reward_in 109.04
 13 Episode in  2600 steps, reward_ex -200.00, reward_in 51.10
 14 Episode in  2800 steps, reward_ex -200.00, reward_in 44.77
 15 Episode in  3000 steps, reward_ex -200.00, reward_in 75.20
 16 Episode in  3200 steps, reward_ex -200.00

132 Episode in 26400 steps, reward_ex -200.00, reward_in 11.35
133 Episode in 26600 steps, reward_ex -200.00, reward_in 9.07
134 Episode in 26800 steps, reward_ex -200.00, reward_in 10.05
135 Episode in 27000 steps, reward_ex -200.00, reward_in 10.54
136 Episode in 27200 steps, reward_ex -200.00, reward_in 12.63
137 Episode in 27400 steps, reward_ex -200.00, reward_in 7.31
138 Episode in 27600 steps, reward_ex -200.00, reward_in 7.41
139 Episode in 27800 steps, reward_ex -200.00, reward_in 5.37
140 Episode in 28000 steps, reward_ex -200.00, reward_in 8.94
141 Episode in 28200 steps, reward_ex -200.00, reward_in 6.25
142 Episode in 28400 steps, reward_ex -200.00, reward_in 8.24
143 Episode in 28600 steps, reward_ex -200.00, reward_in 46.84
144 Episode in 28800 steps, reward_ex -200.00, reward_in 136.63
145 Episode in 29000 steps, reward_ex -200.00, reward_in 40.44
146 Episode in 29200 steps, reward_ex -200.00, reward_in 5.59
147 Episode in 29400 steps, reward_ex -200.00, reward_in 5.58


264 Episode in 52800 steps, reward_ex -200.00, reward_in 11.73
265 Episode in 53000 steps, reward_ex -200.00, reward_in 3.57
266 Episode in 53200 steps, reward_ex -200.00, reward_in 3.20
267 Episode in 53400 steps, reward_ex -200.00, reward_in 4.14
268 Episode in 53600 steps, reward_ex -200.00, reward_in 68.84
269 Episode in 53800 steps, reward_ex -200.00, reward_in 16.07
270 Episode in 54000 steps, reward_ex -200.00, reward_in 3.41
271 Episode in 54200 steps, reward_ex -200.00, reward_in 4.56
272 Episode in 54400 steps, reward_ex -200.00, reward_in 6.61
273 Episode in 54600 steps, reward_ex -200.00, reward_in 3.60
274 Episode in 54800 steps, reward_ex -200.00, reward_in 2.82
275 Episode in 55000 steps, reward_ex -200.00, reward_in 4.50
276 Episode in 55200 steps, reward_ex -200.00, reward_in 7.89
277 Episode in 55400 steps, reward_ex -200.00, reward_in 3.80
278 Episode in 55600 steps, reward_ex -200.00, reward_in 3.87
279 Episode in 55800 steps, reward_ex -200.00, reward_in 6.74
280 E

396 Episode in 79200 steps, reward_ex -200.00, reward_in 2.51
397 Episode in 79400 steps, reward_ex -200.00, reward_in 2.53
398 Episode in 79600 steps, reward_ex -200.00, reward_in 3.18
399 Episode in 79800 steps, reward_ex -200.00, reward_in 3.42
400 Episode in 80000 steps, reward_ex -200.00, reward_in 2.50
401 Episode in 80200 steps, reward_ex -200.00, reward_in 2.32
402 Episode in 80400 steps, reward_ex -200.00, reward_in 5.13
403 Episode in 80600 steps, reward_ex -200.00, reward_in 3.28
404 Episode in 80800 steps, reward_ex -200.00, reward_in 2.73
405 Episode in 81000 steps, reward_ex -200.00, reward_in 3.02
406 Episode in 81200 steps, reward_ex -200.00, reward_in 6.66
407 Episode in 81400 steps, reward_ex -200.00, reward_in 11.88
408 Episode in 81600 steps, reward_ex -200.00, reward_in 4.25
409 Episode in 81800 steps, reward_ex -200.00, reward_in 3.74
410 Episode in 82000 steps, reward_ex -200.00, reward_in 3.31
411 Episode in 82200 steps, reward_ex -200.00, reward_in 2.68
412 Epi

In [None]:
plt.figure(figsize=(15, 5))
plt.title('Reward')
plt.plot(ep_rewards)
plt.figure(figsize=(15, 5))
plt.title('Loss')
plt.plot(losses)
plt.figure(figsize=(15, 5))
plt.title('m_Loss')
plt.plot(m_losses)
plt.figure(figsize=(15, 5))
plt.title('f_Loss')
plt.plot(f_losses)
plt.show()

In [None]:
[
    ('CartPole-v0', 161, 32, 256),
    ('CartPole-v1', 162, 32, 256),
    ('MountainCar-v0', 660),
    ('LunarLander-v2', 260)
]