In [1]:
from collections import deque
from copy import deepcopy

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from torch.utils.data import DataLoader

In [2]:
SEED = 5
BATCH_SIZE = 4
LR = 0.00025
EPOCHS = 4
CLIP = 0.1
GAMMA = 0.99
LAMBDA = 0.95
ENT_COEF = 0.01
EX_COEF = 0.0
IN_COEF = 1.0

# set device
use_cuda = torch.cuda.is_available()
print('cuda:', use_cuda)
device = torch.device('cuda' if use_cuda else 'cpu')

# random seed
np.random.seed(SEED)
torch.manual_seed(SEED)
if use_cuda:
    torch.cuda.manual_seed_all(SEED)

cuda: True


In [3]:
class ActorCriticNet(nn.Module):
    def __init__(self, obs_space, action_space):
        super().__init__()

        self.head = nn.Sequential(
            nn.Linear(obs_space, 256),
            nn.SELU()
        )

        self.pol = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, action_space)
        )
        
        self.val_ex = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 1)
        )
        
        self.val_in = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 1)
        )
        
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        out = self.head(x)
        logit = self.pol(out).reshape(out.shape[0], -1)
        value_ex = self.val_ex(out).reshape(out.shape[0], 1)
        value_in = self.val_in(out).reshape(out.shape[0], 1)
        log_probs = self.log_softmax(logit)
        
        return log_probs, value_ex, value_in
    
class RandomNet(nn.Module):
    def __init__(self, obs_space):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(obs_space, 256),
            nn.SELU()
        )

        self.fc = nn.Sequential(
            nn.Linear(256, 512)
        )

    def forward(self, x):
        out = self.head(x)
        obs_feature = self.fc(out).reshape(out.shape[0], -1)

        return obs_feature


class PredictNet(nn.Module):
    def __init__(self, obs_space):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(obs_space, 256),
            nn.SELU()
        )

        self.fc = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 512)
        )

    def forward(self, x):
        out = self.head(x)
        obs_feature = self.fc(out).reshape(out.shape[0], -1)

        return obs_feature

In [None]:
losses = []
m_losses = []
f_losses = []


def learn(net, old_net, pred_net, rand_net, net_optim, pred_optim, train_memory):
    net.train()
    old_net.train()
    
    dataloader = DataLoader(
        train_memory,
        shuffle=True,
        batch_size=BATCH_SIZE,
        pin_memory=use_cuda
    )

    for i in range(EPOCHS):
        for (s, a, _s, ret_ex, ret_in, adv) in dataloader:
            s = s.to(device).float()
            a  = a.to(device).long()
            _s = _s.to(device).float()
            _s_norm_np = normalize_obs(_s.detach().cpu().numpy())
            _s_norm = torch.tensor(_s_norm_np).to(device).float()
            ret_ex = ret_ex.to(device).float()
            ret_in = ret_in.to(device).float()
            adv = adv.to(device).float()

            with torch.no_grad():
                rand_f = rand_net(_s_norm)
                log_p_old, v_ex_old, v_in_old = old_net(s)
                log_p_a_old = log_p_old[range(BATCH_SIZE), a]

            pred_f = pred_net(_s_norm)
            log_p, v_ex, v_in = net(s)
            log_p_a = log_p[range(BATCH_SIZE), a]
            p_ratio = (log_p_a - log_p_a_old).exp()
            p_r_clip = torch.clamp(p_ratio, 1. - CLIP, 1. + CLIP)
            p_loss = torch.min(p_ratio * adv, p_r_clip * adv).mean()
            v_ex_loss = 0.5 * (ret_ex - v_ex).pow(2)
            v_in_loss = 0.5 * (ret_in - v_in).pow(2)
            v_loss = (v_ex_loss + v_in_loss).mean() 
            entropy = -(log_p.exp() * log_p).sum(dim=1).mean()

            # loss
            m_loss = -(p_loss - v_loss + ENT_COEF * entropy)
            m_losses.append(m_loss)

            f_loss = (pred_f - rand_f).pow(2).sum(dim=1).mean()
            f_losses.append(f_loss)
            
            loss = m_loss + f_loss
            losses.append(loss)
            
            net_optim.zero_grad()
            pred_optim.zero_grad()
            loss.backward()
            net_optim.step()
            pred_optim.step()


def get_action_and_value(obs, net):
    net.eval()
    with torch.no_grad():
        state = torch.tensor([obs]).to(device).float()
        log_p, v_ex, v_in = net(state)
        m = Categorical(log_p.exp())
        action = m.sample()

    return action.item(), v_ex.item(), v_in.item()


def compute_adv_with_gae(reward_ex, reward_in, values, roll_memory):
    rew_ex = np.array(rewards_ex, 'float')
    rew_in = np.array(rewards_in, 'float')
    rew = rew_ex + rew_in
    val = np.array(values[:-1], 'float')
    _val = np.array(values[1:], 'float')
    delta = rew + GAMMA * _val - val
    dis_r_ex = np.array([GAMMA**(i) * r for i, r in enumerate(reward_ex)], 'float')
    dis_r_in = np.array([GAMMA**(i) * r for i, r in enumerate(reward_in)], 'float')
    gae_dt = np.array([(GAMMA * LAMBDA)**(i) * dt for i, dt in enumerate(delta.tolist())], 'float')
    for i, data in enumerate(roll_memory):
        data.append(sum(dis_r_ex[i:] / GAMMA**(i)))
        data.append(sum(dis_r_in[i:] / GAMMA**(i)))
        data.append(sum(gae_dt[i:] / (GAMMA * LAMBDA)**(i)))

    return roll_memory


def get_norm_params(obs_memory):
    global obs_apace

    obses = [[] for _ in range(obs_space)]
    for obs in obs_memory:
        for j in range(obs_space):
            obses[j].append(obs[j])

    mean = np.zeros(obs_space, 'float')
    std = np.zeros(obs_space, 'float')
    for i, obs_ in enumerate(obses):
        mean[i] = np.mean(obs_)
        std[i] = np.std(obs_)

    return mean, std


def normalize_obs(obs):
    global mean, std
    norm_obs = (obs - mean) / std
#     return np.clip(norm_obs, -5, 5)
    return norm_obs


def calculate_reward_in(pred_net, rand_net, obs):
    norm_obs = normalize_obs(obs)
    state = torch.tensor([norm_obs]).to(device).float()
    with torch.no_grad():
        pred_obs = pred_net(state)
        rand_obs = rand_net(state)
        reward = (pred_obs - rand_obs).pow(2).sum()
        clipped_reward = torch.clamp(reward, -1, 1)

    return clipped_reward.item()

## Main

In [None]:
# make an environment
# env = gym.make('CartPole-v0')
env = gym.make('CartPole-v1')
# env = gym.make('MountainCar-v0')
# env = gym.make('LunarLander-v2')

env.seed(SEED)
obs_space = env.observation_space.shape[0]
action_space = env.action_space.n

# hyperparameter
n_episodes = 3000
roll_len = 128
n_eval = env.spec.trials

# global values
init_steps = 0
steps = 0
learn_steps = 0
mean = 0.
std = 0.
ep_rewards = []
is_rollout = False
is_solved = False
is_init_roll = True

# make a rollout memory
net_memory = deque(maxlen=2)
train_memory = []
roll_memory = []
obs_memory = []
rewards_ex = []
rewards_in = []
values = []

# make nerual networks
net = ActorCriticNet(obs_space, action_space).to(device)
old_net = deepcopy(net)
net_memory.appendleft(net.state_dict())
pred_net = PredictNet(obs_space).to(device)
rand_net = RandomNet(obs_space).to(device)

# make optimizer
net_optim = torch.optim.Adam(net.parameters(), lr=LR, eps=1e-5)
pred_optim = torch.optim.Adam(pred_net.parameters(), lr=LR, eps=1e-5)

In [None]:
use_cuda

True

In [None]:
env.spec.max_episode_steps

500

In [None]:
env.spec.trials

100

In [None]:
env.spec.reward_threshold

475.0

In [None]:
# simulation
while True:
    obs = env.reset()
    done = False
    while not done:
#         env.render()
        action = env.action_space.sample()
        _obs, _, done, _ = env.step(action)
        obs_memory.append(_obs)
        obs = _obs
        init_steps += 1
        if init_steps == roll_len * 50:
            mean, std = get_norm_params(obs_memory)
            obs_memory.clear()
            is_init_roll = False
            break
    if not is_init_roll:
        break

In [None]:
# play!
for i in range(1, n_episodes + 1):
    obs = env.reset()
    done = False
    ep_reward_ex = 0.
    ep_reward_in = 0.
    while not done:
        env.render()

        action, val_ex, val_in = get_action_and_value(obs, net)
        _obs, rew_ex, done, _ = env.step(action)
        
        rew_in = calculate_reward_in(pred_net, rand_net, _obs)    
        value = 0.5 * (val_ex + val_in)
        
        # store
        roll_memory.append([obs, action, _obs])
        obs_memory.append(_obs)
        rewards_ex.append(EX_COEF*rew_ex)
        rewards_in.append(IN_COEF*rew_in)
        values.append(value)
        
        obs = _obs
        steps += 1
        ep_reward_ex += rew_ex
        ep_reward_in += rew_in
        
        if done or steps % roll_len == 0:
            if done:
                _value = 0.
            else:
                _, _val_ex, _val_in = get_action_and_value(_obs, net)
                _value = 0.5*(_val_ex + _val_in)
            
            values.append(_value)
            train_memory.extend(compute_adv_with_gae(rewards_ex, rewards_in, values, roll_memory))
            rewards_ex.clear()
            rewards_in.clear()
            values.clear()
            roll_memory.clear()
            
        if steps % roll_len == 0:
            net_memory.appendleft(net.state_dict())
            old_net.load_state_dict(net_memory.pop())
            learn(net, old_net, pred_net, rand_net, net_optim, pred_optim, train_memory)
            train_memory.clear()
            learn_steps += 1
        
        if steps % roll_len*50 == 0:
            mean, std = get_norm_params(obs_memory)
            obs_memory.clear()
    
    if done:        
        ep_rewards.append(ep_reward_ex)
        print('{:3} Episode in {:5} steps, reward_ex {:.2f}, reward_in {:.2f}'.format(
            i, steps, ep_reward_ex, ep_reward_in))

        if len(ep_rewards) >= n_eval:
            if np.mean(list(reversed(ep_rewards))[: n_eval]) >= env.spec.reward_threshold:
                print('\n{} is sloved! {:3} Episode in {:3} steps'.format(
                    env.spec.id, i, steps))
                torch.save(old_net.state_dict(),
                           f'./test/saved_models/{env.spec.id}_ep{i}_clear_model_ppo_r.pt')
                break
env.close()

  1 Episode in    10 steps, reward_ex 10.00, reward_in 10.00
  2 Episode in    35 steps, reward_ex 25.00, reward_in 25.00
  3 Episode in    46 steps, reward_ex 11.00, reward_in 11.00
  4 Episode in    58 steps, reward_ex 12.00, reward_in 12.00
  5 Episode in    81 steps, reward_ex 23.00, reward_in 23.00
  6 Episode in   103 steps, reward_ex 22.00, reward_in 22.00
  7 Episode in   116 steps, reward_ex 13.00, reward_in 13.00
  8 Episode in   128 steps, reward_ex 12.00, reward_in 12.00
  9 Episode in   140 steps, reward_ex 12.00, reward_in 12.00
 10 Episode in   157 steps, reward_ex 17.00, reward_in 17.00
 11 Episode in   211 steps, reward_ex 54.00, reward_in 54.00
 12 Episode in   236 steps, reward_ex 25.00, reward_in 24.08
 13 Episode in   255 steps, reward_ex 19.00, reward_in 19.00
 14 Episode in   268 steps, reward_ex 13.00, reward_in 12.84
 15 Episode in   278 steps, reward_ex 10.00, reward_in 9.81
 16 Episode in   291 steps, reward_ex 13.00, reward_in 11.69
 17 Episode in   301 step

135 Episode in 13940 steps, reward_ex 174.00, reward_in 58.66
136 Episode in 14099 steps, reward_ex 159.00, reward_in 39.20
137 Episode in 14240 steps, reward_ex 141.00, reward_in 53.28
138 Episode in 14395 steps, reward_ex 155.00, reward_in 45.39
139 Episode in 14565 steps, reward_ex 170.00, reward_in 19.31
140 Episode in 14708 steps, reward_ex 143.00, reward_in 18.79
141 Episode in 14878 steps, reward_ex 170.00, reward_in 51.32
142 Episode in 15019 steps, reward_ex 141.00, reward_in 21.22
143 Episode in 15182 steps, reward_ex 163.00, reward_in 30.44
144 Episode in 15352 steps, reward_ex 170.00, reward_in 84.85
145 Episode in 15487 steps, reward_ex 135.00, reward_in 55.20
146 Episode in 15620 steps, reward_ex 133.00, reward_in 42.82
147 Episode in 15772 steps, reward_ex 152.00, reward_in 30.58
148 Episode in 15940 steps, reward_ex 168.00, reward_in 37.70
149 Episode in 16112 steps, reward_ex 172.00, reward_in 28.59
150 Episode in 16261 steps, reward_ex 149.00, reward_in 25.54
151 Epis

267 Episode in 53423 steps, reward_ex 500.00, reward_in 280.96
268 Episode in 53923 steps, reward_ex 500.00, reward_in 262.81
269 Episode in 54423 steps, reward_ex 500.00, reward_in 263.40
270 Episode in 54923 steps, reward_ex 500.00, reward_in 255.61
271 Episode in 55423 steps, reward_ex 500.00, reward_in 308.09
272 Episode in 55923 steps, reward_ex 500.00, reward_in 369.12
273 Episode in 56423 steps, reward_ex 500.00, reward_in 301.21
274 Episode in 56923 steps, reward_ex 500.00, reward_in 205.92
275 Episode in 57423 steps, reward_ex 500.00, reward_in 241.82
276 Episode in 57923 steps, reward_ex 500.00, reward_in 320.17
277 Episode in 58423 steps, reward_ex 500.00, reward_in 186.18
278 Episode in 58923 steps, reward_ex 500.00, reward_in 183.53
279 Episode in 59423 steps, reward_ex 500.00, reward_in 213.89
280 Episode in 59923 steps, reward_ex 500.00, reward_in 367.98
281 Episode in 60423 steps, reward_ex 500.00, reward_in 213.67
282 Episode in 60923 steps, reward_ex 500.00, reward_in

In [None]:
plt.figure(figsize=(15, 5))
plt.title('Reward')
plt.plot(ep_rewards)
plt.figure(figsize=(15, 5))
plt.title('Loss')
plt.plot(losses)
plt.figure(figsize=(15, 5))
plt.title('m_Loss')
plt.plot(m_losses)
plt.figure(figsize=(15, 5))
plt.title('f_Loss')
plt.plot(f_losses)
plt.show()

In [None]:
[
    ('CartPole-v0', 161, 32, 256),
    ('CartPole-v1', 162, 32, 256),
    ('MountainCar-v0', 660),
    ('LunarLander-v2', 260)
]