In [1]:
from collections import deque
from copy import deepcopy

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from torch.utils.data import DataLoader

In [2]:
SEED = 5
BATCH_SIZE = 4
LR = 0.00025
EPOCHS = 4
CLIP = 0.1
GAMMA = 0.99
LAMBDA = 0.95
ENT_COEF = 0.01
EX_COEF = 2.0
IN_COEF = 1.0

# set device
use_cuda = torch.cuda.is_available()
print('cuda:', use_cuda)
device = torch.device('cuda' if use_cuda else 'cpu')

# random seed
np.random.seed(SEED)
torch.manual_seed(SEED)
if use_cuda:
    torch.cuda.manual_seed_all(SEED)

cuda: True


In [3]:
class ActorCriticNet(nn.Module):
    def __init__(self, obs_space, action_space):
        super().__init__()

        self.head = nn.Sequential(
            nn.Linear(obs_space, 256),
            nn.SELU()
        )

        self.pol = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, action_space)
        )
        
        self.val_ex = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 1)
        )
        
        self.val_in = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 1)
        )
        
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        out = self.head(x)
        logit = self.pol(out).reshape(out.shape[0], -1)
        value_ex = self.val_ex(out).reshape(out.shape[0], 1)
        value_in = self.val_in(out).reshape(out.shape[0], 1)
        log_probs = self.log_softmax(logit)
        
        return log_probs, value_ex, value_in
    
class RandomNet(nn.Module):
    def __init__(self, obs_space):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(obs_space, 256),
            nn.SELU()
        )

        self.fc = nn.Sequential(
            nn.Linear(256, 512)
        )

    def forward(self, x):
        out = self.head(x)
        obs_feature = self.fc(out).reshape(out.shape[0], -1)

        return obs_feature


class PredictNet(nn.Module):
    def __init__(self, obs_space):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(obs_space, 256),
            nn.SELU()
        )

        self.fc = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 512)
        )

    def forward(self, x):
        out = self.head(x)
        obs_feature = self.fc(out).reshape(out.shape[0], -1)

        return obs_feature

In [None]:
losses = []
m_losses = []
f_losses = []


def learn(net, old_net, pred_net, rand_net, net_optim, pred_optim, train_memory):
    net.train()
    old_net.train()
    
    dataloader = DataLoader(
        train_memory,
        shuffle=True,
        batch_size=BATCH_SIZE,
        pin_memory=use_cuda
    )

    for i in range(EPOCHS):
        for (s, a, _s, ret_ex, ret_in, adv) in dataloader:
            s = s.to(device).float()
            a  = a.to(device).long()
            _s = _s.to(device).float()
            _s_norm_np = normalize_obs(_s.detach().cpu().numpy())
            _s_norm = torch.tensor(_s_norm_np).to(device).float()
            ret_ex = ret_ex.to(device).float()
            ret_in = ret_in.to(device).float()
            adv = adv.to(device).float()

            with torch.no_grad():
                rand_f = rand_net(_s_norm)
                log_p_old, v_ex_old, v_in_old = old_net(s)
                log_p_a_old = log_p_old[range(BATCH_SIZE), a]

            pred_f = pred_net(_s_norm)
            log_p, v_ex, v_in = net(s)
            log_p_a = log_p[range(BATCH_SIZE), a]
            p_ratio = (log_p_a - log_p_a_old).exp()
            p_r_clip = torch.clamp(p_ratio, 1. - CLIP, 1. + CLIP)
            p_loss = torch.min(p_ratio * adv, p_r_clip * adv).mean()
            v_ex_loss = 0.5 * (ret_ex - v_ex).pow(2)
            v_in_loss = 0.5 * (ret_in - v_in).pow(2)
            v_loss = (v_ex_loss + v_in_loss).mean() 
            entropy = -(log_p.exp() * log_p).sum(dim=1).mean()

            # loss
            m_loss = -(p_loss - v_loss + ENT_COEF * entropy)
            m_losses.append(m_loss)

            f_loss = (pred_f - rand_f).pow(2).sum(dim=1).mean()
            f_losses.append(f_loss)
            
            loss = m_loss + f_loss
            losses.append(loss)
            
            net_optim.zero_grad()
            pred_optim.zero_grad()
            loss.backward()
            net_optim.step()
            pred_optim.step()


def get_action_and_value(obs, net):
    net.eval()
    with torch.no_grad():
        state = torch.tensor([obs]).to(device).float()
        log_p, v_ex, v_in = net(state)
        m = Categorical(log_p.exp())
        action = m.sample()

    return action.item(), v_ex.item(), v_in.item()


def compute_adv_with_gae(reward_ex, reward_in, values, roll_memory):
    rew_ex = np.array(rewards_ex, 'float')
    rew_in = np.array(rewards_in, 'float')
    rew = rew_ex + rew_in
    val = np.array(values[:-1], 'float')
    _val = np.array(values[1:], 'float')
    delta = rew + GAMMA * _val - val
    dis_r_ex = np.array([GAMMA**(i) * r for i, r in enumerate(reward_ex)], 'float')
    dis_r_in = np.array([GAMMA**(i) * r for i, r in enumerate(reward_in)], 'float')
    gae_dt = np.array([(GAMMA * LAMBDA)**(i) * dt for i, dt in enumerate(delta.tolist())], 'float')
    for i, data in enumerate(roll_memory):
        data.append(sum(dis_r_ex[i:] / GAMMA**(i)))
        data.append(sum(dis_r_in[i:] / GAMMA**(i)))
        data.append(sum(gae_dt[i:] / (GAMMA * LAMBDA)**(i)))

    return roll_memory


def get_norm_params(obs_memory):
    global obs_apace

    obses = [[] for _ in range(obs_space)]
    for obs in obs_memory:
        for j in range(obs_space):
            obses[j].append(obs[j])

    mean = np.zeros(obs_space, 'float')
    std = np.zeros(obs_space, 'float')
    for i, obs_ in enumerate(obses):
        mean[i] = np.mean(obs_)
        std[i] = np.std(obs_)

    return mean, std


def normalize_obs(obs):
    global mean, std
    norm_obs = (obs - mean) / std
#     return np.clip(norm_obs, -5, 5)
    return norm_obs


def calculate_reward_in(pred_net, rand_net, obs):
    norm_obs = normalize_obs(obs)
    state = torch.tensor([norm_obs]).to(device).float()
    with torch.no_grad():
        pred_obs = pred_net(state)
        rand_obs = rand_net(state)
        reward = (pred_obs - rand_obs).pow(2).sum()
        clipped_reward = torch.clamp(reward, -1, 1)

    return clipped_reward.item()

## Main

In [None]:
# make an environment
# env = gym.make('CartPole-v0')
# env = gym.make('CartPole-v1')
env = gym.make('MountainCar-v0')
# env = gym.make('LunarLander-v2')

env.seed(SEED)
obs_space = env.observation_space.shape[0]
action_space = env.action_space.n

# hyperparameter
n_episodes = 3000
roll_len = 128
n_eval = env.spec.trials

# global values
init_steps = 0
steps = 0
learn_steps = 0
mean = 0.
std = 0.
ep_rewards = []
is_rollout = False
is_solved = False
is_init_roll = True

# make a rollout memory
net_memory = deque(maxlen=2)
train_memory = []
roll_memory = []
obs_memory = []
rewards_ex = []
rewards_in = []
values = []

# make nerual networks
net = ActorCriticNet(obs_space, action_space).to(device)
old_net = deepcopy(net)
net_memory.appendleft(net.state_dict())
pred_net = PredictNet(obs_space).to(device)
rand_net = RandomNet(obs_space).to(device)

# make optimizer
net_optim = torch.optim.Adam(net.parameters(), lr=LR, eps=1e-5)
pred_optim = torch.optim.Adam(pred_net.parameters(), lr=LR, eps=1e-5)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [None]:
use_cuda

True

In [None]:
env.spec.max_episode_steps

200

In [None]:
env.spec.trials

100

In [None]:
env.spec.reward_threshold

-110.0

In [None]:
# simulation
while True:
    obs = env.reset()
    done = False
    while not done:
#         env.render()
        action = env.action_space.sample()
        _obs, _, done, _ = env.step(action)
        obs_memory.append(_obs)
        obs = _obs
        init_steps += 1
        if init_steps == roll_len * 50:
            mean, std = get_norm_params(obs_memory)
            obs_memory.clear()
            is_init_roll = False
            break
    if not is_init_roll:
        break

In [None]:
# play!
for i in range(1, n_episodes + 1):
    obs = env.reset()
    done = False
    ep_reward_ex = 0.
    ep_reward_in = 0.
    while not done:
        env.render()

        action, val_ex, val_in = get_action_and_value(obs, net)
        _obs, rew_ex, done, _ = env.step(action)
        
        rew_in = calculate_reward_in(pred_net, rand_net, _obs)    
        value = 0.5 * (val_ex + val_in)
        
        # store
        roll_memory.append([obs, action, _obs])
        obs_memory.append(_obs)
        rewards_ex.append(EX_COEF*rew_ex)
        rewards_in.append(IN_COEF*rew_in)
        values.append(value)
        
        obs = _obs
        steps += 1
        ep_reward_ex += rew_ex
        ep_reward_in += rew_in
        
        if done or steps % roll_len == 0:
            if done:
                _value = 0.
            else:
                _, _val_ex, _val_in = get_action_and_value(_obs, net)
                _value = 0.5*(_val_ex + _val_in)
            
            values.append(_value)
            train_memory.extend(compute_adv_with_gae(rewards_ex, rewards_in, values, roll_memory))
            rewards_ex.clear()
            rewards_in.clear()
            values.clear()
            roll_memory.clear()
            
        if steps % roll_len == 0:
            net_memory.appendleft(net.state_dict())
            old_net.load_state_dict(net_memory.pop())
            learn(net, old_net, pred_net, rand_net, net_optim, pred_optim, train_memory)
            train_memory.clear()
            learn_steps += 1
        
        if steps % roll_len*50 == 0:
            mean, std = get_norm_params(obs_memory)
            obs_memory.clear()
    
    if done:        
        ep_rewards.append(ep_reward_ex)
        print('{:3} Episode in {:5} steps, reward_ex {:.2f}, reward_in {:.2f}'.format(
            i, steps, ep_reward_ex, ep_reward_in))

        if len(ep_rewards) >= n_eval:
            if np.mean(list(reversed(ep_rewards))[: n_eval]) >= env.spec.reward_threshold:
                print('\n{} is sloved! {:3} Episode in {:3} steps'.format(
                    env.spec.id, i, steps))
                torch.save(old_net.state_dict(),
                           f'../test/saved_models/{env.spec.id}_ep{i}_clear_model_ppo_r.pt')
                break
env.close()

  1 Episode in   200 steps, reward_ex -200.00, reward_in 193.96
  2 Episode in   400 steps, reward_ex -200.00, reward_in 167.49
  3 Episode in   600 steps, reward_ex -200.00, reward_in 198.34
  4 Episode in   800 steps, reward_ex -200.00, reward_in 111.43
  5 Episode in  1000 steps, reward_ex -200.00, reward_in 102.17
  6 Episode in  1200 steps, reward_ex -200.00, reward_in 88.35
  7 Episode in  1400 steps, reward_ex -200.00, reward_in 89.48
  8 Episode in  1600 steps, reward_ex -200.00, reward_in 94.37
  9 Episode in  1800 steps, reward_ex -200.00, reward_in 56.75
 10 Episode in  2000 steps, reward_ex -200.00, reward_in 100.91
 11 Episode in  2200 steps, reward_ex -200.00, reward_in 47.76
 12 Episode in  2400 steps, reward_ex -200.00, reward_in 71.74
 13 Episode in  2600 steps, reward_ex -200.00, reward_in 58.55
 14 Episode in  2800 steps, reward_ex -200.00, reward_in 168.36
 15 Episode in  3000 steps, reward_ex -200.00, reward_in 46.63
 16 Episode in  3200 steps, reward_ex -200.00, r

132 Episode in 26400 steps, reward_ex -200.00, reward_in 28.66
133 Episode in 26600 steps, reward_ex -200.00, reward_in 17.03
134 Episode in 26800 steps, reward_ex -200.00, reward_in 7.28
135 Episode in 27000 steps, reward_ex -200.00, reward_in 5.86
136 Episode in 27200 steps, reward_ex -200.00, reward_in 4.10
137 Episode in 27400 steps, reward_ex -200.00, reward_in 4.88
138 Episode in 27600 steps, reward_ex -200.00, reward_in 17.15
139 Episode in 27800 steps, reward_ex -200.00, reward_in 28.67
140 Episode in 28000 steps, reward_ex -200.00, reward_in 81.66
141 Episode in 28200 steps, reward_ex -200.00, reward_in 11.61
142 Episode in 28400 steps, reward_ex -200.00, reward_in 8.97
143 Episode in 28600 steps, reward_ex -200.00, reward_in 13.58
144 Episode in 28800 steps, reward_ex -200.00, reward_in 95.03
145 Episode in 29000 steps, reward_ex -200.00, reward_in 12.32
146 Episode in 29200 steps, reward_ex -200.00, reward_in 17.70
147 Episode in 29400 steps, reward_ex -200.00, reward_in 19.

263 Episode in 52600 steps, reward_ex -200.00, reward_in 5.37
264 Episode in 52800 steps, reward_ex -200.00, reward_in 5.52
265 Episode in 53000 steps, reward_ex -200.00, reward_in 8.66
266 Episode in 53200 steps, reward_ex -200.00, reward_in 10.83
267 Episode in 53400 steps, reward_ex -200.00, reward_in 8.54
268 Episode in 53600 steps, reward_ex -200.00, reward_in 4.23
269 Episode in 53800 steps, reward_ex -200.00, reward_in 22.50
270 Episode in 54000 steps, reward_ex -200.00, reward_in 60.80
271 Episode in 54200 steps, reward_ex -200.00, reward_in 23.02
272 Episode in 54400 steps, reward_ex -200.00, reward_in 13.99
273 Episode in 54600 steps, reward_ex -200.00, reward_in 30.37
274 Episode in 54800 steps, reward_ex -200.00, reward_in 33.61
275 Episode in 55000 steps, reward_ex -200.00, reward_in 11.75
276 Episode in 55200 steps, reward_ex -200.00, reward_in 14.47
277 Episode in 55400 steps, reward_ex -200.00, reward_in 8.35
278 Episode in 55600 steps, reward_ex -200.00, reward_in 4.29

395 Episode in 79000 steps, reward_ex -200.00, reward_in 4.94
396 Episode in 79200 steps, reward_ex -200.00, reward_in 3.49
397 Episode in 79400 steps, reward_ex -200.00, reward_in 11.04
398 Episode in 79600 steps, reward_ex -200.00, reward_in 30.25
399 Episode in 79800 steps, reward_ex -200.00, reward_in 32.48
400 Episode in 80000 steps, reward_ex -200.00, reward_in 5.62
401 Episode in 80200 steps, reward_ex -200.00, reward_in 6.49
402 Episode in 80400 steps, reward_ex -200.00, reward_in 4.32
403 Episode in 80600 steps, reward_ex -200.00, reward_in 3.79
404 Episode in 80800 steps, reward_ex -200.00, reward_in 4.63
405 Episode in 81000 steps, reward_ex -200.00, reward_in 8.02
406 Episode in 81200 steps, reward_ex -200.00, reward_in 6.61
407 Episode in 81400 steps, reward_ex -200.00, reward_in 25.24
408 Episode in 81600 steps, reward_ex -200.00, reward_in 18.49
409 Episode in 81800 steps, reward_ex -200.00, reward_in 7.32
410 Episode in 82000 steps, reward_ex -200.00, reward_in 25.14
41

526 Episode in 105200 steps, reward_ex -200.00, reward_in 27.67
527 Episode in 105400 steps, reward_ex -200.00, reward_in 7.66
528 Episode in 105600 steps, reward_ex -200.00, reward_in 6.85
529 Episode in 105800 steps, reward_ex -200.00, reward_in 6.40
530 Episode in 106000 steps, reward_ex -200.00, reward_in 4.90
531 Episode in 106200 steps, reward_ex -200.00, reward_in 4.82
532 Episode in 106400 steps, reward_ex -200.00, reward_in 5.74
533 Episode in 106600 steps, reward_ex -200.00, reward_in 2.82
534 Episode in 106800 steps, reward_ex -200.00, reward_in 15.10
535 Episode in 107000 steps, reward_ex -200.00, reward_in 10.76
536 Episode in 107200 steps, reward_ex -200.00, reward_in 108.99
537 Episode in 107400 steps, reward_ex -200.00, reward_in 31.54
538 Episode in 107600 steps, reward_ex -200.00, reward_in 10.51
539 Episode in 107800 steps, reward_ex -200.00, reward_in 5.10
540 Episode in 108000 steps, reward_ex -200.00, reward_in 5.38
541 Episode in 108200 steps, reward_ex -200.00, 

656 Episode in 131200 steps, reward_ex -200.00, reward_in 5.40
657 Episode in 131400 steps, reward_ex -200.00, reward_in 4.54
658 Episode in 131600 steps, reward_ex -200.00, reward_in 20.49
659 Episode in 131800 steps, reward_ex -200.00, reward_in 15.16
660 Episode in 132000 steps, reward_ex -200.00, reward_in 113.42
661 Episode in 132200 steps, reward_ex -200.00, reward_in 32.47
662 Episode in 132400 steps, reward_ex -200.00, reward_in 26.91
663 Episode in 132600 steps, reward_ex -200.00, reward_in 5.54
664 Episode in 132800 steps, reward_ex -200.00, reward_in 14.63
665 Episode in 133000 steps, reward_ex -200.00, reward_in 6.02
666 Episode in 133200 steps, reward_ex -200.00, reward_in 6.04
667 Episode in 133400 steps, reward_ex -200.00, reward_in 4.92
668 Episode in 133600 steps, reward_ex -200.00, reward_in 5.78
669 Episode in 133800 steps, reward_ex -200.00, reward_in 5.19
670 Episode in 134000 steps, reward_ex -200.00, reward_in 3.45
671 Episode in 134200 steps, reward_ex -200.00, 

786 Episode in 157200 steps, reward_ex -200.00, reward_in 3.74
787 Episode in 157400 steps, reward_ex -200.00, reward_in 2.26
788 Episode in 157600 steps, reward_ex -200.00, reward_in 2.67
789 Episode in 157800 steps, reward_ex -200.00, reward_in 3.95
790 Episode in 158000 steps, reward_ex -200.00, reward_in 4.83
791 Episode in 158200 steps, reward_ex -200.00, reward_in 3.79
792 Episode in 158400 steps, reward_ex -200.00, reward_in 47.76
793 Episode in 158600 steps, reward_ex -200.00, reward_in 12.86
794 Episode in 158800 steps, reward_ex -200.00, reward_in 16.63
795 Episode in 159000 steps, reward_ex -200.00, reward_in 13.62
796 Episode in 159200 steps, reward_ex -200.00, reward_in 7.04
797 Episode in 159400 steps, reward_ex -200.00, reward_in 124.79
798 Episode in 159600 steps, reward_ex -200.00, reward_in 73.56
799 Episode in 159800 steps, reward_ex -200.00, reward_in 8.14
800 Episode in 160000 steps, reward_ex -200.00, reward_in 14.80
801 Episode in 160200 steps, reward_ex -200.00,

916 Episode in 183200 steps, reward_ex -200.00, reward_in 3.18
917 Episode in 183400 steps, reward_ex -200.00, reward_in 3.41
918 Episode in 183600 steps, reward_ex -200.00, reward_in 4.90
919 Episode in 183800 steps, reward_ex -200.00, reward_in 5.67
920 Episode in 184000 steps, reward_ex -200.00, reward_in 5.47
921 Episode in 184200 steps, reward_ex -200.00, reward_in 6.41
922 Episode in 184400 steps, reward_ex -200.00, reward_in 13.70
923 Episode in 184600 steps, reward_ex -200.00, reward_in 18.58
924 Episode in 184800 steps, reward_ex -200.00, reward_in 4.48
925 Episode in 185000 steps, reward_ex -200.00, reward_in 2.35
926 Episode in 185200 steps, reward_ex -200.00, reward_in 97.65
927 Episode in 185400 steps, reward_ex -200.00, reward_in 66.53
928 Episode in 185600 steps, reward_ex -200.00, reward_in 9.14
929 Episode in 185800 steps, reward_ex -200.00, reward_in 52.61
930 Episode in 186000 steps, reward_ex -200.00, reward_in 17.33
931 Episode in 186200 steps, reward_ex -200.00, r

1045 Episode in 209000 steps, reward_ex -200.00, reward_in 3.18
1046 Episode in 209200 steps, reward_ex -200.00, reward_in 11.22
1047 Episode in 209400 steps, reward_ex -200.00, reward_in 5.01
1048 Episode in 209600 steps, reward_ex -200.00, reward_in 2.94
1049 Episode in 209800 steps, reward_ex -200.00, reward_in 18.52
1050 Episode in 210000 steps, reward_ex -200.00, reward_in 15.37
1051 Episode in 210200 steps, reward_ex -200.00, reward_in 8.85
1052 Episode in 210400 steps, reward_ex -200.00, reward_in 8.76
1053 Episode in 210600 steps, reward_ex -200.00, reward_in 4.06
1054 Episode in 210800 steps, reward_ex -200.00, reward_in 3.69
1055 Episode in 211000 steps, reward_ex -200.00, reward_in 5.04
1056 Episode in 211200 steps, reward_ex -200.00, reward_in 6.72
1057 Episode in 211400 steps, reward_ex -200.00, reward_in 64.94
1058 Episode in 211600 steps, reward_ex -200.00, reward_in 51.36
1059 Episode in 211800 steps, reward_ex -200.00, reward_in 9.89
1060 Episode in 212000 steps, rewar

1173 Episode in 234600 steps, reward_ex -200.00, reward_in 3.00
1174 Episode in 234800 steps, reward_ex -200.00, reward_in 3.66
1175 Episode in 235000 steps, reward_ex -200.00, reward_in 2.89
1176 Episode in 235200 steps, reward_ex -200.00, reward_in 4.26
1177 Episode in 235400 steps, reward_ex -200.00, reward_in 1.95
1178 Episode in 235600 steps, reward_ex -200.00, reward_in 43.95
1179 Episode in 235800 steps, reward_ex -200.00, reward_in 9.82
1180 Episode in 236000 steps, reward_ex -200.00, reward_in 5.12
1181 Episode in 236200 steps, reward_ex -200.00, reward_in 6.59
1182 Episode in 236400 steps, reward_ex -200.00, reward_in 4.72
1183 Episode in 236600 steps, reward_ex -200.00, reward_in 4.27
1184 Episode in 236800 steps, reward_ex -200.00, reward_in 2.57
1185 Episode in 237000 steps, reward_ex -200.00, reward_in 10.51
1186 Episode in 237200 steps, reward_ex -200.00, reward_in 3.78
1187 Episode in 237400 steps, reward_ex -200.00, reward_in 3.35
1188 Episode in 237600 steps, reward_e

In [None]:
plt.figure(figsize=(15, 5))
plt.title('Reward')
plt.plot(ep_rewards)
plt.figure(figsize=(15, 5))
plt.title('Loss')
plt.plot(losses)
plt.figure(figsize=(15, 5))
plt.title('m_Loss')
plt.plot(m_losses)
plt.figure(figsize=(15, 5))
plt.title('f_Loss')
plt.plot(f_losses)
plt.show()

In [None]:
[
    ('CartPole-v0', 161, 32, 256),
    ('CartPole-v1', 162, 32, 256),
    ('MountainCar-v0', 660),
    ('LunarLander-v2', 260)
]