In [2]:
import os
import random
from copy import deepcopy

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from IPython.display import clear_output
from torch.distributions import Categorical
from torch.utils.data import DataLoader
from transformers.optimization import (
    AdamW,
    get_constant_schedule_with_warmup,
    get_cosine_with_min_lr_schedule_with_warmup,
)

In [None]:
# ---- Run configuration: every tunable knob of the PPO run lives in this cell ----
# Seed applied below to `random`, NumPy and PyTorch.
SEED = 0
# Minibatch size used by the DataLoader inside `learn`.
BATCH_SIZE = 256
# Learning rate scaled linearly with the batch size (0.002 at a batch size of 1024).
LR = 0.002 * BATCH_SIZE / 1024
# Number of optimisation passes `learn` makes over each collected rollout.
EPOCHS = 20
# EPOCHS = 80
# Clipping range for the PPO probability ratio (and for the value delta when V_CLIP is on).
CLIP = 0.2
# Reward discount factor used when computing returns and TD errors.
GAMMA = 0.999
# GAE decay: TD errors are discounted by GAMMA * LAMBDA.
LAMBDA = 0.98
# Weights of the entropy bonus and of the value loss in the combined loss.
ENT_COEF = 0
V_COEF = 1
# Feature switches read by `learn`: clipped value loss, per-epoch linear reduction
# of lr/clip, and gradient-norm clipping (max_norm=0.5).
V_CLIP = False
LIN_REDUCE = False
GRAD_NORM = False
# Passed to ActorCriticNet as `use_obs_norm` (per-feature observation normalisation).
OBS_NORM = True
# set device
use_cuda = torch.cuda.is_available()
print("cuda:", use_cuda)
device = torch.device("cuda" if use_cuda else "cpu")
# device = torch.device('mps')

# random seed
# NOTE(review): only the Python, NumPy and PyTorch generators are seeded here;
# nothing in this cell seeds the gym environment.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if use_cuda:
    torch.cuda.manual_seed_all(SEED)

cuda: False


In [4]:
def init_weights(m):
    """Initialise one module in place; intended to be used as ``net.apply(init_weights)``.

    * ``nn.Linear``: weights receive Xavier (Glorot) uniform initialisation and
      the bias, when present, is zeroed.
    * ``nn.BatchNorm1d``: the affine scale is set to 1 and the shift to 0.
      Layers built with ``affine=False`` have no scale/shift and are left alone.
    * Any other module type is ignored.

    Args:
        m: the module handed over by ``nn.Module.apply``.
    """
    if isinstance(m, nn.Linear):
        # Apply exactly one scheme. A second init call here would silently
        # overwrite the first and leave the weights with a different distribution.
        torch.nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            m.bias.data.fill_(0.0)
    elif isinstance(m, nn.BatchNorm1d):
        # With affine=False both attributes are None, so guard before touching them.
        if m.weight is not None:
            m.weight.data.fill_(1.0)
        if m.bias is not None:
            m.bias.data.fill_(0.0)


class ActorCriticNet(nn.Module):
    """Actor-critic model with two independent one-hidden-layer MLPs.

    ``pol`` maps an observation to action logits and ``val`` maps it to a
    scalar state value. When ``use_obs_norm`` is true every observation
    feature is first passed through its own parameter-free ``BatchNorm1d``.

    Args:
        obs_space: number of features in one observation.
        action_space: number of discrete actions.
        use_obs_norm: whether to normalise each observation feature.
    """

    @staticmethod
    def _mlp(in_features, hidden, out_features):
        """Build Linear -> Tanh -> Linear, the layout shared by both heads."""
        return nn.Sequential(
            nn.Linear(in_features, hidden, bias=True),
            nn.Tanh(),
            nn.Linear(hidden, out_features, bias=True),
        )

    def __init__(self, obs_space, action_space, use_obs_norm=True):
        super().__init__()
        hidden = 64
        self.use_obs_norm = use_obs_norm
        self.obs_space = obs_space

        # One single-channel normaliser per observation feature, without learnable
        # scale/shift (affine=False). momentum=None is passed through to
        # BatchNorm1d -- presumably to get a cumulative running average; see the
        # PyTorch documentation to confirm.
        self.obs_norms = nn.ModuleList()
        for _ in range(obs_space):
            self.obs_norms.append(nn.BatchNorm1d(1, momentum=None, affine=False))

        # The policy head is created before the value head, so seeded parameter
        # initialisation draws random numbers in that order.
        self.pol = self._mlp(obs_space, hidden, action_space)
        self.val = self._mlp(obs_space, hidden, 1)
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        """Return ``(log_p, v)`` for a 2-D batch of observations.

        Args:
            x: tensor indexed as (batch, feature); it is split along dim 1
                into ``obs_space`` chunks when normalisation is enabled.

        Returns:
            log_p: log-probabilities reshaped to (batch, -1).
            v: value estimates reshaped to (batch, 1).
        """
        if self.use_obs_norm:
            columns = x.chunk(chunks=self.obs_space, dim=1)
            x = torch.concat([norm(col) for norm, col in zip(self.obs_norms, columns)], dim=1)

        batch = x.shape[0]
        log_p = self.log_softmax(self.pol(x).reshape(batch, -1))
        v = self.val(x).reshape(batch, 1)
        return log_p, v

In [5]:
# Create the environment and read the dimensions the network is built from.
# Alternative environments kept for quick switching:
# env = gym.make("CartPole-v0")
# env = gym.make("CartPole-v1")
# env = gym.make("MountainCar-v0")
env = gym.make("LunarLander-v2")

# NOTE(review): the seeding call below is commented out, so nothing in this
# cell seeds the environment.
# env.seed(SEED)
# Length of the observation vector and number of discrete actions; both are
# passed to ActorCriticNet later in the notebook.
obs_space = env.observation_space.shape[0]
action_space = env.action_space.n
print(obs_space)
print(action_space)
# The mean-reward level the training cell compares against to stop, and the
# step limit registered in the environment spec.
print(env.spec.reward_threshold)
print(env.spec.max_episode_steps)

8
4
200
1000


In [6]:
# Rollout hyperparameters
# Upper bound on the number of episodes the training cell will play.
n_episodes = 100000
# max_ep_len = env.spec.max_episode_steps
# Episode length after which the training loop flags a timeout.
max_ep_len = 1000
# roll_len = 2048
# Number of environment steps collected between two calls to `learn`.
roll_len = 2048
# roll_len = 32
# Minibatches per pass over one rollout. NOTE: despite the name this is not an
# epoch count; it is used to size the scheduler warm-up and inside `learn`.
total_epochs = roll_len // BATCH_SIZE
# Size of the moving window (in episodes) used for the reported mean reward.
n_eval = 10

# global values
# Total environment steps so far, per-episode rewards, and their moving mean.
steps = 0
ep_rewards = []
reward_eval = []

# make memories
# `roll_memory` / `rewards` / `values` hold the segment currently being collected;
# `train_memory` accumulates finished records until `learn` consumes and clears it.
train_memory = []
roll_memory = []
rewards = []
values = []

# Per-minibatch loss and its running mean over the last 200 minibatches (for plotting).
losses = []
mean_losses = []


def learn(net, old_net, optimizer, scheduler, train_memory):
    """Run ``EPOCHS`` passes of clipped-surrogate PPO updates over one rollout.

    Reads the module-level settings ``EPOCHS``, ``BATCH_SIZE``, ``CLIP``,
    ``V_CLIP``, ``V_COEF``, ``ENT_COEF``, ``LIN_REDUCE``, ``GRAD_NORM``,
    ``device`` and ``use_cuda``, and appends to the module-level ``losses`` and
    ``mean_losses`` lists.

    Args:
        net: network being optimised; switched to train mode.
        old_net: behaviour network that supplies the reference log-probabilities
            (and values when ``V_CLIP`` is on); switched to eval mode.
        optimizer: optimiser over ``net``'s parameters.
        scheduler: learning-rate scheduler, stepped once per minibatch. When
            ``LIN_REDUCE`` is on it must also provide ``get_last_lr()``.
        train_memory: list of ``[state, action, return, advantage]`` records.
            It is emptied in place before the function returns.
    """
    net.train()
    old_net.eval()
    if not train_memory:
        return

    # Advantages are normalised with statistics of the whole rollout, so compute
    # them once instead of once per minibatch.
    advs = torch.tensor([adv for _, _, _, adv in train_memory]).to(device)
    adv_mean = advs.mean()
    adv_std = advs.std() + 1e-8

    for epoch in range(EPOCHS):
        # Linear decay runs over the passes actually executed (EPOCHS), so the
        # factor stays within (0, 1] and lr/clip can never become negative.
        anneal = 1.0 - epoch / EPOCHS if LIN_REDUCE else 1.0
        clip = CLIP * anneal

        dataloader = DataLoader(train_memory, shuffle=True, batch_size=BATCH_SIZE, pin_memory=use_cuda)

        for s, a, ret, adv in dataloader:
            s_batch = s.to(device).float()
            # Column vector of action indices for gather(); this also works when
            # the last minibatch is smaller than BATCH_SIZE.
            a_batch = a.to(device).long().unsqueeze(1)
            # Shape (B, 1) to match the value head. A flat (B,) tensor would
            # broadcast against (B, 1) into a (B, B) matrix of mismatched pairs.
            ret_batch = ret.to(device).float().unsqueeze(1)
            adv_batch = (adv.to(device).float() - adv_mean) / adv_std

            if LIN_REDUCE:
                # scheduler.step() rewrites every group's lr from its own schedule,
                # so the decay factor is re-applied on top of it before each step.
                # When LIN_REDUCE is off the scheduler is left fully in charge.
                for group, scheduled_lr in zip(optimizer.param_groups, scheduler.get_last_lr()):
                    group["lr"] = scheduled_lr * anneal

            # NOTE(review): if the networks normalise observations with BatchNorm,
            # `net` (train mode) and `old_net` (eval mode) presumably see
            # differently normalised inputs, so the ratio need not be 1 even with
            # identical weights -- confirm this is intended.
            with torch.no_grad():
                log_p_batch_old, v_batch_old = old_net(s_batch)
                log_p_acting_old = log_p_batch_old.gather(1, a_batch).squeeze(1)

            log_p_batch, v_batch = net(s_batch)
            log_p_acting = log_p_batch.gather(1, a_batch).squeeze(1)
            p_ratio = (log_p_acting - log_p_acting_old).exp()
            p_ratio_clip = torch.clamp(p_ratio, 1 - clip, 1 + clip)
            p_loss = torch.min(p_ratio * adv_batch, p_ratio_clip * adv_batch).mean()

            if V_CLIP:
                v_clip = v_batch_old + torch.clamp(v_batch - v_batch_old, -clip, clip)
                v_loss1 = (ret_batch - v_clip).pow(2)
                v_loss2 = (ret_batch - v_batch).pow(2)
                v_loss = 0.5 * torch.max(v_loss1, v_loss2).mean()
            else:
                v_loss = 0.5 * (ret_batch - v_batch).pow(2).mean()

            # Entropy of the current policy, taken from the forward pass above
            # rather than running the network a second time.
            entropy = -(log_p_batch.exp() * log_p_batch).sum(dim=1).mean()

            # Maximise surrogate + entropy bonus, minimise value error.
            loss = -(p_loss - V_COEF * v_loss + ENT_COEF * entropy)
            losses.append(loss.item())
            mean_losses.append(np.mean(losses[-200:]))

            optimizer.zero_grad()
            loss.backward()
            if GRAD_NORM:
                nn.utils.clip_grad_norm_(net.parameters(), max_norm=0.5)
            optimizer.step()
            scheduler.step()
    train_memory.clear()


def get_action_and_value(obs, old_net):
    """Sample one action from ``old_net``'s policy and return it with the value estimate.

    Args:
        obs: a single observation (NumPy array or sequence of numbers).
        old_net: module that maps a ``(1, n_features)`` float32 tensor to
            ``(log_probs, value)``. It is switched to eval mode.

    Returns:
        ``(action, value)`` as a Python int and a Python float.
    """
    old_net.eval()
    # Run on the device the network lives on; the module-level `device` is only
    # needed for a network that has no parameters.
    first_param = next(old_net.parameters(), None)
    target = first_param.device if first_param is not None else device
    with torch.no_grad():
        # Convert through one contiguous float32 array: building a tensor from a
        # Python list that wraps an ndarray is the slow path.
        state = torch.as_tensor(np.asarray(obs, dtype=np.float32), device=target).unsqueeze(0)
        log_p, v = old_net(state)
        # The network already returns log-probabilities, so pass them as logits
        # instead of exponentiating and letting Categorical re-normalise.
        action = Categorical(logits=log_p).sample()

    return action.item(), v.item()


def compute_adv_with_gae(rewards, values, roll_memory, gamma=None, lam=None):
    """Append the discounted return and the GAE advantage to every rollout record.

    Both quantities are built with a single backward pass:

        return_t    = r_t     + gamma * return_{t+1}
        advantage_t = delta_t + gamma * lam * advantage_{t+1}
        delta_t     = r_t + gamma * V_{t+1} - V_t

    This avoids the quadratic slice sums and the division by ``gamma ** t``,
    which underflows for long segments.

    NOTE(review): the return is the plain discounted reward sum of the segment;
    the bootstrap value ``values[-1]`` only enters the advantage. Confirm that
    this is the intended value target for segments cut off mid-episode.

    Args:
        rewards: list of T rewards. Cleared in place.
        values: list of T + 1 value estimates, the last one being the value of
            the state that follows the segment. Cleared in place.
        roll_memory: list of T mutable records; ``return`` and ``advantage`` are
            appended to each record, in that order.
        gamma: discount factor. Defaults to the module-level ``GAMMA``.
        lam: GAE decay. Defaults to the module-level ``LAMBDA``.

    Returns:
        The same ``roll_memory`` object, with the two values appended per record.

    Raises:
        ValueError: if the three inputs do not describe the same T steps.
    """
    gamma = GAMMA if gamma is None else gamma
    lam = LAMBDA if lam is None else lam

    n_steps = len(rewards)
    if len(values) != n_steps + 1:
        raise ValueError(f"expected {n_steps + 1} values for {n_steps} rewards, got {len(values)}")
    if len(roll_memory) != n_steps:
        raise ValueError(f"expected {n_steps} rollout records, got {len(roll_memory)}")

    rew = np.asarray(rewards, dtype="float")
    val = np.asarray(values, dtype="float")
    delta = rew + gamma * val[1:] - val[:-1]

    returns = np.zeros(n_steps)
    advantages = np.zeros(n_steps)
    running_return = 0.0
    running_adv = 0.0
    for t in reversed(range(n_steps)):
        running_return = rew[t] + gamma * running_return
        running_adv = delta[t] + gamma * lam * running_adv
        returns[t] = running_return
        advantages[t] = running_adv

    for record, ret, adv in zip(roll_memory, returns.tolist(), advantages.tolist()):
        record.append(ret)
        record.append(adv)

    rewards.clear()
    values.clear()

    return roll_memory


def plot():
    """Redraw the live training dashboard from the module-level histories.

    Left panel: per-episode rewards (faded) with their moving mean on top,
    titled with the latest moving mean. Right panel: per-minibatch losses
    (faded) with their running mean on top, titled with the mean of the
    ``n_eval`` most recent losses. The previous cell output is cleared first.
    """
    clear_output(True)
    _, (reward_ax, loss_ax) = plt.subplots(1, 2, figsize=(16, 5))

    reward_ax.plot(ep_rewards, alpha=0.5)
    reward_ax.plot(reward_eval)
    reward_ax.set_title(f"Reward: {reward_eval[-1]}")

    loss_ax.plot(losses, alpha=0.5)
    loss_ax.plot(mean_losses)
    # Newest-first slice, so the mean covers the same entries in the same order.
    newest_losses = losses[::-1][:n_eval]
    loss_ax.set_title(f"Loss: {np.mean(newest_losses):.4f}")

    plt.show()

In [7]:
# Start from a clean slate so that re-running this cell (for example after an
# interrupt) never mixes half-collected transitions from a previous run into
# the new one. On a fresh kernel these are already empty / zero.
steps = 0
for _buffer in (ep_rewards, reward_eval, train_memory, roll_memory, rewards, values, losses, mean_losses):
    _buffer.clear()

# make neural networks: `net` is optimised, `old_net` is the copy that acts in the
# environment and is synchronised with `net` after every update.
net = ActorCriticNet(obs_space, action_space, OBS_NORM).to(device)
# net.apply(init_weights)
old_net = deepcopy(net)

# Weight decay is applied to every parameter except the biases.
no_decay = ["bias"]
grouped_parameters = [
    {"params": [p for n, p in net.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": 0.01},
    {"params": [p for n, p in net.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
optimizer = torch.optim.AdamW(grouped_parameters, lr=LR, eps=1e-8)
# Warm up over the optimiser steps of the first five rollouts.
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=total_epochs * EPOCHS * 5)
# scheduler = get_cosine_with_min_lr_schedule_with_warmup(
#     optimizer,
#     num_warmup_steps=total_epochs * EPOCHS * 5,
#     num_training_steps=total_epochs * EPOCHS * 2000,
#     min_lr_rate=0.1,
# )

# Directory the solved model is written to; created on demand so that a missing
# folder cannot throw away a finished run.
save_dir = "./test/saved_models"

# play!
for i in range(1, n_episodes + 1):
    # Seed the environment on the first reset only; later resets continue the
    # same random stream, which keeps the whole run reproducible.
    obs, _ = env.reset(seed=SEED) if i == 1 else env.reset()
    done = False
    timeout = False
    ep_reward = 0
    ep_steps = 0
    while not (done or timeout):
        # env.render()
        action, value = get_action_and_value(obs, old_net)
        # Step exactly once per action. A second env.step() call would advance
        # the environment again and drop a transition and its reward.
        _obs, reward, done, truncated, _ = env.step(action)

        # store
        roll_memory.append([obs, action])
        rewards.append(reward)
        values.append(value)

        obs = _obs
        steps += 1
        ep_steps += 1
        ep_reward += reward

        # Stop on the environment's own truncation signal as well as on the
        # local step cap.
        timeout = truncated or ep_steps >= min(max_ep_len, 1000)

        if done or steps % roll_len == 0 or timeout:
            # A terminal state is worth 0; any other cut-off is bootstrapped
            # from the value estimate of the next observation.
            if done:
                _value = 0.0
            else:
                _, _value = get_action_and_value(_obs, old_net)

            values.append(_value)
            train_memory.extend(compute_adv_with_gae(rewards, values, roll_memory))
            roll_memory.clear()

            if steps % roll_len == 0:
                learn(net, old_net, optimizer, scheduler, train_memory)
                old_net.load_state_dict(net.state_dict())

    # The while loop only exits once the episode has ended or timed out.
    ep_rewards.append(ep_reward)
    reward_eval.append(np.mean(ep_rewards[-n_eval:]).round(decimals=2))
    plot()

    # Some environments register no reward threshold; never stop early for those.
    threshold = env.spec.reward_threshold
    if len(ep_rewards) >= n_eval and threshold is not None and reward_eval[-1] >= threshold:
        print("\n{} is sloved! {:3} Episode in {:3} steps".format(env.spec.id, i, steps))
        os.makedirs(save_dir, exist_ok=True)
        torch.save(net.state_dict(), f"{save_dir}/{env.spec.id}_ep{i}_clear_model_ppo_st.pt")
        break
env.close()

KeyboardInterrupt: 

In [None]:
# Inspect the step limit registered for the current environment.
# NOTE(review): the recorded output (500) differs from the 1000 printed for
# LunarLander-v2 earlier in the notebook, so this output presumably comes from a
# run with a different environment -- re-run after a kernel restart to confirm.
env.spec.max_episode_steps

500

In [None]:
# Hand-kept log of earlier runs, one tuple per environment.
# NOTE(review): the field layout is not documented anywhere in the notebook.
# Presumably it is (env id, episodes to solve, roll_len, CLIP, EPOCHS, V_COEF,
# ENT_COEF, <boolean switch>, GAMMA, LAMBDA) -- TODO confirm, and consider a
# list of dicts or a DataFrame with named columns instead.
[
    ("CartPole-v0", 889, 2048, 0.2, 10, 0.5, 0.01, False, 0.999, 0.98),
    ("CartPole-v1", 801, 2048, 0.2, 10, 0.5, 0.01, False, 0.999, 0.98),
    ("MountainCar-v0", None),
    ("LunarLander-v2", 876, 2048, 0.2, 10, 1.0, 0.01, False, 0.999, 0.98),
]

[('CartPole-v0', 889, 2048, 0.2, 10, 0.5, 0.01, False, 0.999, 0.98),
 ('CartPole-v1', 801, 2048, 0.2, 10, 0.5, 0.01, False, 0.999, 0.98),
 ('MountainCar-v0', None),
 ('LunarLander-v2', 876, 2048, 0.2, 10, 1.0, 0.01, False, 0.999, 0.98)]

In [None]:
# Inspect the reward bounds reported by the environment (recorded output: unbounded).
env.reward_range

(-inf, inf)

In [None]:
# NOTE(review): this repeats an earlier inspection cell, and its recorded output
# (500) again disagrees with the 1000 printed for LunarLander-v2 -- presumably
# stale output; this cell can likely be removed.
env.spec.max_episode_steps

500