In [1]:
import os
from collections import deque
from copy import deepcopy
from pprint import pprint

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from torch.utils.data import DataLoader

In [2]:
# --- PPO hyperparameters ---------------------------------------------------
SEED = 1          # seed handed to numpy, torch and the environment
BATCH_SIZE = 4    # minibatch size of the DataLoader built in learn()
LR = 0.00025      # Adam learning rate
EPOCHS = 4        # passes over the collected samples per learn() call
CLIP = 0.1        # clipping range for the probability ratio (and value delta)
GAMMA = 0.99      # reward discount factor
LAMBDA = 0.95     # GAE decay factor
ENT_COEF = 0.01   # weight of the entropy term in the loss
V_CLIP = False    # switch for the clipped value loss in learn()

# --- device selection ------------------------------------------------------
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
print('cuda:', use_cuda)

# --- reproducibility -------------------------------------------------------
torch.manual_seed(SEED)
np.random.seed(SEED)
if use_cuda:
    torch.cuda.manual_seed_all(SEED)

cuda: True


In [3]:
class ActorCriticNet(nn.Module):
    """Actor-critic network with a shared first layer and two separate heads.

    Args:
        obs_space: number of input features per observation.
        action_space: number of discrete actions (width of the policy output).

    ``forward`` returns ``(log_p, value)``: per-action log-probabilities of
    shape ``(batch, action_space)`` and a state value of shape ``(batch, 1)``.
    """

    def __init__(self, obs_space, action_space):
        super().__init__()
        hidden = 256

        # Shared feature extractor feeding both heads.
        self.head = nn.Sequential(nn.Linear(obs_space, hidden), nn.SELU())

        # Policy head: one logit per action.
        self.pol = nn.Sequential(
            nn.Linear(hidden, hidden),
            nn.SELU(),
            nn.Linear(hidden, action_space),
        )

        # Value head: a single scalar per state.
        self.val = nn.Sequential(
            nn.Linear(hidden, hidden),
            nn.SELU(),
            nn.Linear(hidden, 1),
        )

        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        """Map a batch of observations to ``(log-probabilities, value)``."""
        features = self.head(x)
        n_rows = features.shape[0]

        logits = self.pol(features).reshape(n_rows, -1)
        state_value = self.val(features).reshape(n_rows, 1)

        return self.log_softmax(logits), state_value

In [None]:
# One float per optimizer step, appended by learn(); plotted at the end.
losses = []


def learn(net, old_net, optimizer, train_memory):
    """Run EPOCHS passes of clipped-surrogate PPO updates over ``train_memory``.

    Args:
        net: network being optimised; called as ``net(states)`` and expected
            to return ``(log_probs, values)``.
        old_net: frozen reference network with the same call signature; only
            evaluated under ``torch.no_grad()``.
        optimizer: optimizer bound to ``net``'s parameters.
        train_memory: sequence of samples, each unpacked as
            ``(state, action, next_state, return, advantage)``.

    Side effects:
        Updates ``net`` in place and appends the scalar loss of every
        minibatch to the module-level ``losses`` list.
    """
    net.train()
    old_net.train()
    dataloader = DataLoader(train_memory,
                            batch_size=BATCH_SIZE,
                            shuffle=True,
                            pin_memory=use_cuda)

    for _ in range(EPOCHS):
        # The next-state entry is part of each sample but not needed here.
        for (s, a, _s, ret, adv) in dataloader:
            s_batch = s.to(device).float()
            a_batch = a.to(device).long()
            # Flatten targets and advantages to 1-D so every per-sample
            # quantity below has the same shape.
            ret_batch = ret.to(device).float().reshape(-1)
            adv_batch = adv.to(device).float().reshape(-1)
            batch_idx = torch.arange(s_batch.shape[0], device=s_batch.device)

            with torch.no_grad():
                log_p_batch_old, v_batch_old = old_net(s_batch)
                log_p_acting_old = log_p_batch_old[batch_idx, a_batch]
                v_batch_old = v_batch_old.reshape(-1)

            log_p_batch, v_batch = net(s_batch)
            # The value output may carry a trailing singleton axis; without
            # flattening, `ret_batch - v_batch` would broadcast (B,) against
            # (B, 1) into a (B, B) matrix and corrupt the value loss.
            v_batch = v_batch.reshape(-1)
            log_p_acting = log_p_batch[batch_idx, a_batch]

            # Clipped surrogate objective (to be maximised).
            p_ratio = (log_p_acting - log_p_acting_old).exp()
            p_ratio_clip = torch.clamp(p_ratio, 1. - CLIP, 1. + CLIP)
            p_loss = torch.min(p_ratio * adv_batch,
                               p_ratio_clip * adv_batch).mean()

            # Value loss, optionally clipped around the old value estimate.
            if V_CLIP:
                v_clip = v_batch_old + torch.clamp(v_batch - v_batch_old, -CLIP, CLIP)
                v_loss1 = (ret_batch - v_clip).pow(2)
                v_loss2 = (ret_batch - v_batch).pow(2)
                v_loss = torch.max(v_loss1, v_loss2).mean()
            else:
                v_loss = (ret_batch - v_batch).pow(2).mean()

            # Mean policy entropy over the batch.
            entropy = -(log_p_batch.exp() * log_p_batch).sum(dim=1).mean()

            # loss
            loss = -(p_loss - v_loss + ENT_COEF * entropy)
            # Store a plain float: keeping the tensor would retain its
            # autograd graph and cannot be plotted directly.
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


def get_action_and_value(obs, net):
    """Sample one action from the policy and return it with the state value.

    Args:
        obs: a single observation (array-like); it is converted to float32
            and given a leading batch axis of size 1.
        net: network called as ``net(state)`` and expected to return
            ``(log_probs, value)`` for the batch.

    Returns:
        ``(action, value)`` as a Python int and a Python float.

    Side effects:
        Switches ``net`` to eval mode.
    """
    net.eval()

    # Put the input on the network's own device; fall back to the global
    # `device` only for a network without parameters.
    first_param = next(net.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    with torch.no_grad():
        # Convert through a single ndarray instead of a Python list of
        # ndarrays, which is the slow construction path for torch.tensor.
        state = torch.tensor(np.asarray(obs, dtype=np.float32),
                             device=target_device).unsqueeze(0)
        log_p, v = net(state)
        # Pass log-probabilities as logits rather than exponentiating first.
        action = Categorical(logits=log_p).sample()

    return action.item(), v.item()


def compute_adv(rewards, values, roll_memory, gamma=None):
    """Append the discounted return and ``return - value`` to every transition.

    Args:
        rewards: per-step rewards of one episode, in time order.
        values: per-step value estimates; entries beyond the length of
            ``roll_memory`` are ignored.
        roll_memory: list of per-step lists; each one is extended in place
            with ``[ret, adv]``.
        gamma: discount factor; defaults to the module-level ``GAMMA``.

    Returns:
        The same ``roll_memory`` object, after in-place extension.
    """
    if gamma is None:
        gamma = GAMMA

    # Backward recursion G_t = r_t + gamma * G_{t+1}. This is linear in the
    # episode length and avoids the multiply-by-gamma**i / divide-by-gamma**i
    # round trip, which yields 0/0 once gamma**i reaches zero.
    returns = np.zeros(len(rewards), dtype=float)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running

    for i, (roll, val) in enumerate(zip(roll_memory, values)):
        # Steps without any remaining reward have a return of zero.
        ret = returns[i] if i < len(returns) else 0.0
        roll.extend([ret, ret - val])

    return roll_memory


def compute_adv_with_gae(rewards, values, roll_memory, gamma=None, lam=None):
    """Append the discounted return and the GAE advantage to every transition.

    Args:
        rewards: per-step rewards of one episode, in time order.
        values: value estimates with one more entry than ``rewards``; the
            last entry is the value following the final step.
        roll_memory: list of per-step lists; each one gets the return and
            then the advantage appended in place.
        gamma: discount factor; defaults to the module-level ``GAMMA``.
        lam: GAE decay factor; defaults to the module-level ``LAMBDA``.

    Returns:
        The same ``roll_memory`` object, after in-place extension.

    Raises:
        ValueError: if ``values`` does not have ``len(rewards) + 1`` entries.
    """
    if gamma is None:
        gamma = GAMMA
    if lam is None:
        lam = LAMBDA

    n_steps = len(rewards)
    if n_steps and len(values) != n_steps + 1:
        raise ValueError(
            'values must hold len(rewards) + 1 entries, got {} for {} rewards'.format(
                len(values), n_steps))

    rew = np.array(rewards, 'float')
    val = np.array(values[:-1], 'float')
    _val = np.array(values[1:], 'float')
    # One-step TD residuals: r_t + gamma * V(s_{t+1}) - V(s_t).
    delta = rew + gamma * _val - val

    # Backward recursions, linear in the episode length:
    #   G_t = r_t     + gamma       * G_{t+1}
    #   A_t = delta_t + gamma * lam * A_{t+1}
    # They replace the scale-by-power / divide-by-power sums, which are
    # quadratic and give 0/0 once the power underflows to zero.
    returns = np.zeros(n_steps, dtype=float)
    advantages = np.zeros(n_steps, dtype=float)
    running_ret = 0.0
    running_adv = 0.0
    for t in reversed(range(n_steps)):
        running_ret = rew[t] + gamma * running_ret
        running_adv = delta[t] + gamma * lam * running_adv
        returns[t] = running_ret
        advantages[t] = running_adv

    for i, data in enumerate(roll_memory):
        # Entries past the end of the episode have nothing left to sum.
        in_range = i < n_steps
        data.append(returns[i] if in_range else 0.0)
        data.append(advantages[i] if in_range else 0.0)

    return roll_memory

## Main

In [None]:
# Create the environment. Exactly one `gym.make` line is active; the
# commented-out lines are alternative environments to switch to.
# env = gym.make('CartPole-v0')
# env = gym.make('CartPole-v1')
env = gym.make('MountainCar-v0')
# env = gym.make('LunarLander-v2')

# Seed the environment with the notebook-wide SEED, then read the sizes
# needed to build the network: length of the observation vector and number
# of discrete actions.
env.seed(SEED)
obs_space = env.observation_space.shape[0]
action_space = env.action_space.n

# Run-level hyperparameters.
n_episodes = 1000            # upper bound on training episodes
n_roll_ep = 5                # learn() is called once every n_roll_ep episodes
n_eval = env.spec.trials     # number of latest episodes averaged for the solved check

# Global counters and logs updated by the play loop.
steps = 0                    # environment steps taken so far
learn_steps = 0              # number of learn() calls so far
ep_rewards = []              # total reward of every finished episode
# NOTE(review): reward_eval, is_rollout and is_solved are not referenced by
# any of the cells visible in this notebook.
reward_eval = deque(maxlen=n_eval)
is_rollout = False
is_solved = False

# Buffers.
net_memory = deque(maxlen=2)  # state_dict snapshots used to refresh old_net
train_memory = []             # samples accumulated for the next learn() call
roll_memory = []              # transitions of the episode in progress
rewards = []                  # rewards of the episode in progress
values = []                   # value estimates of the episode in progress

# Build the neural networks: `net` is trained, `old_net` starts as a deep
# copy of it, and the initial weights are registered as the first snapshot.
net = ActorCriticNet(obs_space, action_space).to(device)
old_net = deepcopy(net)
net_memory.appendleft(net.state_dict())

# Build the optimizer (Adam over `net`'s parameters only).
optimizer = optim.Adam(net.parameters(), lr=LR, eps=1e-5)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [None]:
# Inspect `max_episode_steps` from the environment spec (the recorded output
# is 200, matching the 200-step episodes in the training log).
env.spec.max_episode_steps

200

In [None]:
# Inspect `trials` from the environment spec; the setup cell uses it as
# `n_eval`, the number of latest episodes averaged in the solved check.
env.spec.trials

100

In [None]:
# Inspect `reward_threshold` from the environment spec; the play loop stops
# once the mean reward of the last `n_eval` episodes reaches this value.
env.spec.reward_threshold

-110.0

In [None]:
# play
# Collect episodes with the current policy, run a PPO update every
# `n_roll_ep` episodes, and stop early once the mean reward of the last
# `n_eval` episodes reaches the environment's reward threshold.

# Directory that receives the checkpoint written when the task is solved.
save_dir = './test/saved_models'

for i in range(1, n_episodes + 1):
    obs = env.reset()
    done = False
    ep_reward = 0
    while not done:
        env.render()

        action, value = get_action_and_value(obs, net)
        _obs, reward, done, _ = env.step(action)

        # store the transition, its reward and the critic's estimate
        roll_memory.append([obs, action, _obs])
        rewards.append(reward)
        values.append(value)

        obs = _obs
        steps += 1
        ep_reward += reward

    # The loop above only exits once `done` is true, so the episode is over.
    # A trailing 0 is used as the value after the final step, which gives
    # `values` the one extra entry that compute_adv_with_gae slices on.
    values.append(0.)
    train_memory.extend(compute_adv_with_gae(rewards, values, roll_memory))
    rewards.clear()
    values.clear()
    roll_memory.clear()

    if i % n_roll_ep == 0:
        # state_dict() hands back references to the live parameter tensors,
        # so whichever snapshot is popped, old_net receives the current
        # weights of `net` - the policy that collected `train_memory`.
        net_memory.appendleft(net.state_dict())
        old_net.load_state_dict(net_memory.pop())
        learn(net, old_net, optimizer, train_memory)
        train_memory.clear()
        learn_steps += 1

    ep_rewards.append(ep_reward)
    print('{:3} Episode in {:5} steps, reward {:.2f}'.format(
        i, steps, ep_reward))

    if len(ep_rewards) >= n_eval:
        # Solved check over the most recent n_eval episodes.
        if np.mean(ep_rewards[-n_eval:]) >= env.spec.reward_threshold:
            print('\n{} is sloved! {:3} Episode in {:3} steps'.format(
                env.spec.id, i, steps))
            # Create the target directory first so a missing folder cannot
            # discard the result of a finished training run.
            os.makedirs(save_dir, exist_ok=True)
            # NOTE(review): this saves old_net, i.e. the weights loaded just
            # before the most recent learn() call, not the updated `net`.
            torch.save(old_net.state_dict(),
                       f'{save_dir}/{env.spec.id}_ep{i}_clear_model_ppo_ep.pt')
            break
env.close()

  1 Episode in   200 steps, reward -200.00
  2 Episode in   400 steps, reward -200.00
  3 Episode in   600 steps, reward -200.00
  4 Episode in   800 steps, reward -200.00
  5 Episode in  1000 steps, reward -200.00
  6 Episode in  1200 steps, reward -200.00
  7 Episode in  1400 steps, reward -200.00
  8 Episode in  1600 steps, reward -200.00
  9 Episode in  1800 steps, reward -200.00
 10 Episode in  2000 steps, reward -200.00
 11 Episode in  2200 steps, reward -200.00
 12 Episode in  2400 steps, reward -200.00
 13 Episode in  2600 steps, reward -200.00
 14 Episode in  2800 steps, reward -200.00
 15 Episode in  3000 steps, reward -200.00
 16 Episode in  3200 steps, reward -200.00
 17 Episode in  3400 steps, reward -200.00
 18 Episode in  3600 steps, reward -200.00
 19 Episode in  3800 steps, reward -200.00
 20 Episode in  4000 steps, reward -200.00
 21 Episode in  4200 steps, reward -200.00
 22 Episode in  4400 steps, reward -200.00
 23 Episode in  4600 steps, reward -200.00
 24 Episode

192 Episode in 38400 steps, reward -200.00
193 Episode in 38600 steps, reward -200.00
194 Episode in 38800 steps, reward -200.00
195 Episode in 39000 steps, reward -200.00
196 Episode in 39200 steps, reward -200.00
197 Episode in 39400 steps, reward -200.00
198 Episode in 39600 steps, reward -200.00
199 Episode in 39800 steps, reward -200.00
200 Episode in 40000 steps, reward -200.00
201 Episode in 40200 steps, reward -200.00
202 Episode in 40400 steps, reward -200.00
203 Episode in 40600 steps, reward -200.00
204 Episode in 40800 steps, reward -200.00
205 Episode in 41000 steps, reward -200.00
206 Episode in 41200 steps, reward -200.00
207 Episode in 41400 steps, reward -200.00
208 Episode in 41600 steps, reward -200.00
209 Episode in 41800 steps, reward -200.00
210 Episode in 42000 steps, reward -200.00
211 Episode in 42200 steps, reward -200.00
212 Episode in 42400 steps, reward -200.00
213 Episode in 42600 steps, reward -200.00
214 Episode in 42800 steps, reward -200.00
215 Episode

383 Episode in 76600 steps, reward -200.00
384 Episode in 76800 steps, reward -200.00
385 Episode in 77000 steps, reward -200.00
386 Episode in 77200 steps, reward -200.00
387 Episode in 77400 steps, reward -200.00
388 Episode in 77600 steps, reward -200.00
389 Episode in 77800 steps, reward -200.00
390 Episode in 78000 steps, reward -200.00
391 Episode in 78200 steps, reward -200.00
392 Episode in 78400 steps, reward -200.00
393 Episode in 78600 steps, reward -200.00
394 Episode in 78800 steps, reward -200.00
395 Episode in 79000 steps, reward -200.00
396 Episode in 79200 steps, reward -200.00
397 Episode in 79400 steps, reward -200.00
398 Episode in 79600 steps, reward -200.00
399 Episode in 79800 steps, reward -200.00
400 Episode in 80000 steps, reward -200.00
401 Episode in 80200 steps, reward -200.00
402 Episode in 80400 steps, reward -200.00
403 Episode in 80600 steps, reward -200.00
404 Episode in 80800 steps, reward -200.00
405 Episode in 81000 steps, reward -200.00
406 Episode

In [None]:
# Plot the training curves: total reward per episode and loss per update.

# learn() may have stored raw loss tensors; float() accepts both plain
# numbers and scalar tensors, so matplotlib always receives Python floats.
loss_values = [float(loss) for loss in losses]

fig_reward, ax_reward = plt.subplots(figsize=(15, 5))
ax_reward.plot(ep_rewards)
ax_reward.set(title='Reward', xlabel='episode', ylabel='total reward')

fig_loss, ax_loss = plt.subplots(figsize=(15, 5))
ax_loss.plot(loss_values)
ax_loss.set(title='Loss', xlabel='update step', ylabel='loss')

plt.show()

In [None]:
# Results log, one tuple per environment, starting with the environment id.
# NOTE(review): the remaining fields are not labelled anywhere in the
# notebook - presumably (episode at which it was solved, EPOCHS, n_roll_ep),
# with None marking environments that were not solved. TODO confirm.
[
    ('CartPole-v0', 245, 4, 5),
    ('CartPole-v1', 319, 4, 5),
    ('MountainCar-v0', None),
    ('LunarLander-v2', None)
]