In [1]:
import random
from collections import deque
from copy import deepcopy

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

In [2]:
SEED = 1
BATCH_SIZE = 32
LR = 0.00025
UP_COEF = 0.01
GAMMA = 0.99
V_MAX = 10
V_MIN = -10
N_ATOMS = 51
DELTA_Z = (V_MAX - V_MIN) / (N_ATOMS - 1)

# set device
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

# random seed
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if use_cuda:
    torch.cuda.manual_seed_all(SEED)

In [3]:
class CategoricalDQN(nn.Module):
    def __init__(self, obs_space, action_space, n_atoms):
        super().__init__()

        self.head = nn.Sequential(
            nn.Linear(obs_space, obs_space*10),
            nn.SELU()
        )

        self.fc = nn.Sequential(
            nn.Linear(obs_space*10, 512),
            nn.SELU(),
            nn.Linear(512, 512),
            nn.SELU(),
            nn.Linear(512, action_space * n_atoms)
        )

        self.log_softmax = nn.LogSoftmax(dim=-1)

        self.register_buffer(
            'support', torch.arange(V_MIN, V_MAX + DELTA_Z, DELTA_Z))

    def forward(self, x):
        out = self.head(x)
        out = self.fc(out).reshape(out.shape[0], -1, N_ATOMS)
        log_p = self.log_softmax(out)

        return log_p

In [None]:
losses = []


def learn(net, tgt_net, optimizer, rep_memory):
    net.train()
    tgt_net.train()
    
    train_memory = random.sample(rep_memory, BATCH_SIZE)

    dataloader = DataLoader(train_memory,
                            batch_size=BATCH_SIZE,
                            pin_memory=use_cuda)

    for i, (s, a, r, _s, d) in enumerate(dataloader):
        s = s.to(device).float()
        a = a.to(device).long()
        _s = _s.to(device).float()
        r = r.to(device).float()
        d = d.to(device).float()
        
        with torch.no_grad():
            _log_p = net(_s)
            _weight = _log_p.exp() * net.support
            _q = _weight.sum(dim=2)
            _a = torch.argmax(_q, dim=1)
            _log_p_tgt = tgt_net(_s)
            _log_p_a = _log_p_tgt[range(BATCH_SIZE), _a]
            _p_a = _log_p_a.exp()
            _p_proj = projection(_p_a, r, d)
        
        log_p = net(s)
        log_p_a = log_p[range(BATCH_SIZE), a]

        # loss
        loss = -(_p_proj * log_p_a).sum(dim=1).mean()
        losses.append(loss)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


def projection(_p_a, r, d):
    _p_proj = np.zeros((BATCH_SIZE, N_ATOMS), dtype=np.float32)
    r_np = r.cpu().numpy()
    d_np = d.cpu().numpy()
    _p_a_np = _p_a.cpu().numpy()
    batch_id = range(BATCH_SIZE)
    for i in range(N_ATOMS):
        z = np.clip(r_np + GAMMA * (1 - d_np) * (V_MIN + i * DELTA_Z),
                    V_MIN, V_MAX)
        b = (z - V_MIN) / DELTA_Z
        l = np.floor(b).astype(np.int64)
        u = np.ceil(b).astype(np.int64)
        _p_proj[batch_id, l[batch_id]] += d_np + _p_a_np[batch_id, i] * (u - b)[batch_id] * (1 - d_np)
        _p_proj[batch_id, u[batch_id]] += d_np + _p_a_np[batch_id, i] * (b - l)[batch_id] * (1 - d_np)
        
    _p_proj = _p_proj / _p_proj.sum(axis=1, keepdims=1)
    
    return torch.tensor(_p_proj).to(device).float()


def select_action(obs, tgt_net):
    tgt_net.eval()
    with torch.no_grad():
        state = torch.tensor([obs]).to(device).float()
        log_p = target_net(state)
        weights = log_p.exp() * net.support
        q = weights.sum(dim=2)
        action = torch.argmax(q, dim=1)

    return action.item()

## Main

In [None]:
# make an environment
# env = gym.make('CartPole-v0')
env = gym.make('CartPole-v1')
# env = gym.make('MountainCar-v0')
# env = gym.make('LunarLander-v2')

env.seed(SEED)
obs_space = env.observation_space.shape[0]
action_space = env.action_space.n

# hyperparameter
n_episodes = 1000
learn_start = 1500
memory_size = 100000
update_frq = 1
use_eps_decay = False
epsilon = 0.001
eps_min = 0.001
decay_rate = 0.0001
n_eval = env.spec.trials

# global values
total_steps = 0
learn_steps = 0
rewards = []
reward_eval = deque(maxlen=n_eval)
is_learned = False
is_solved = False

# make a memory
rep_memory = deque(maxlen=memory_size)

# make two nerual networks
net = CategoricalDQN(obs_space, action_space, N_ATOMS).to(device)
target_net = deepcopy(net)

# make a optimizer
optimizer = optim.Adam(net.parameters(), lr=LR, eps=1e-5)
# optimizer = optim.SGD(net.parameters(), lr=LR, momentum=0.9, weight_decay=0.0001)

  result = entry_point.load(False)


In [None]:
env.spec.max_episode_steps

500

In [None]:
env.spec.trials

100

In [None]:
env.spec.reward_threshold

475.0

In [None]:
# play
for i in range(1, n_episodes + 1):
    obs = env.reset()
    done = False
    ep_reward = 0
    while not done:
#         env.render()
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = select_action(obs, target_net)

        _obs, reward, done, _ = env.step(action)

        rep_memory.append((obs, action, reward, _obs, done))

        obs = _obs
        total_steps += 1
        ep_reward += reward

        if use_eps_decay:
            epsilon -= epsilon * decay_rate
            epsilon = max(eps_min, epsilon)

        if len(rep_memory) >= learn_start:
            if len(rep_memory) == learn_start:
                print('\n============  Start Learning  ============\n')
            learn(net, target_net, optimizer, rep_memory)
            learn_steps += 1

        if learn_steps == update_frq:
            # target smoothing update
            with torch.no_grad():
                for t, n in zip(target_net.parameters(), net.parameters()):
                    t.data = UP_COEF * n.data + (1 - UP_COEF) * t.data
                learn_steps = 0

    if done:
        rewards.append(ep_reward)
        reward_eval.append(ep_reward)
        print('{:3} Episode in {:5} steps, reward {:.2f}'.format(
            i, total_steps, ep_reward))

        if len(reward_eval) >= n_eval:
            if np.mean(reward_eval) >= env.spec.reward_threshold:
                print('\n{} is sloved! {:3} Episode in {:3} steps'.format(
                    env.spec.id, i, total_steps))
                torch.save(target_net.state_dict(),
                           f'./test/saved_models/{env.spec.id}_ep{i}_clear_model_cdqn.pt')
                break
env.close()

  1 Episode in    10 steps, reward 10.00
  2 Episode in    20 steps, reward 10.00
  3 Episode in    28 steps, reward 8.00
  4 Episode in    36 steps, reward 8.00
  5 Episode in    44 steps, reward 8.00
  6 Episode in    53 steps, reward 9.00
  7 Episode in    63 steps, reward 10.00
  8 Episode in    71 steps, reward 8.00
  9 Episode in    80 steps, reward 9.00
 10 Episode in    89 steps, reward 9.00
 11 Episode in    99 steps, reward 10.00
 12 Episode in   109 steps, reward 10.00
 13 Episode in   119 steps, reward 10.00
 14 Episode in   129 steps, reward 10.00
 15 Episode in   138 steps, reward 9.00
 16 Episode in   146 steps, reward 8.00
 17 Episode in   155 steps, reward 9.00
 18 Episode in   165 steps, reward 10.00
 19 Episode in   175 steps, reward 10.00
 20 Episode in   185 steps, reward 10.00
 21 Episode in   195 steps, reward 10.00
 22 Episode in   205 steps, reward 10.00
 23 Episode in   215 steps, reward 10.00
 24 Episode in   223 steps, reward 8.00
 25 Episode in   231 steps,

203 Episode in  1907 steps, reward 10.00
204 Episode in  1918 steps, reward 11.00
205 Episode in  1927 steps, reward 9.00
206 Episode in  1937 steps, reward 10.00
207 Episode in  1947 steps, reward 10.00
208 Episode in  1955 steps, reward 8.00
209 Episode in  1965 steps, reward 10.00
210 Episode in  1975 steps, reward 10.00
211 Episode in  1984 steps, reward 9.00
212 Episode in  1995 steps, reward 11.00
213 Episode in  2003 steps, reward 8.00
214 Episode in  2013 steps, reward 10.00
215 Episode in  2023 steps, reward 10.00
216 Episode in  2032 steps, reward 9.00
217 Episode in  2041 steps, reward 9.00
218 Episode in  2051 steps, reward 10.00
219 Episode in  2061 steps, reward 10.00
220 Episode in  2070 steps, reward 9.00
221 Episode in  2081 steps, reward 11.00
222 Episode in  2091 steps, reward 10.00
223 Episode in  2101 steps, reward 10.00
224 Episode in  2109 steps, reward 8.00
225 Episode in  2117 steps, reward 8.00
226 Episode in  2126 steps, reward 9.00
227 Episode in  2135 steps

403 Episode in 15479 steps, reward 148.00
404 Episode in 15631 steps, reward 152.00
405 Episode in 15771 steps, reward 140.00
406 Episode in 15919 steps, reward 148.00
407 Episode in 16068 steps, reward 149.00
408 Episode in 16214 steps, reward 146.00
409 Episode in 16361 steps, reward 147.00
410 Episode in 16509 steps, reward 148.00
411 Episode in 16661 steps, reward 152.00
412 Episode in 16811 steps, reward 150.00
413 Episode in 16952 steps, reward 141.00
414 Episode in 17101 steps, reward 149.00
415 Episode in 17241 steps, reward 140.00
416 Episode in 17393 steps, reward 152.00
417 Episode in 17534 steps, reward 141.00
418 Episode in 17678 steps, reward 144.00
419 Episode in 17825 steps, reward 147.00
420 Episode in 17961 steps, reward 136.00
421 Episode in 18101 steps, reward 140.00
422 Episode in 18246 steps, reward 145.00
423 Episode in 18399 steps, reward 153.00
424 Episode in 18547 steps, reward 148.00
425 Episode in 18688 steps, reward 141.00
426 Episode in 18835 steps, reward

599 Episode in 44753 steps, reward 161.00
600 Episode in 44905 steps, reward 152.00
601 Episode in 45047 steps, reward 142.00
602 Episode in 45184 steps, reward 137.00
603 Episode in 45318 steps, reward 134.00
604 Episode in 45484 steps, reward 166.00
605 Episode in 45629 steps, reward 145.00
606 Episode in 45766 steps, reward 137.00
607 Episode in 45907 steps, reward 141.00
608 Episode in 46050 steps, reward 143.00
609 Episode in 46182 steps, reward 132.00
610 Episode in 46322 steps, reward 140.00
611 Episode in 46473 steps, reward 151.00
612 Episode in 46618 steps, reward 145.00
613 Episode in 46766 steps, reward 148.00
614 Episode in 46905 steps, reward 139.00
615 Episode in 47038 steps, reward 133.00
616 Episode in 47169 steps, reward 131.00
617 Episode in 47296 steps, reward 127.00
618 Episode in 47429 steps, reward 133.00
619 Episode in 47567 steps, reward 138.00
620 Episode in 47704 steps, reward 137.00
621 Episode in 47840 steps, reward 136.00
622 Episode in 47981 steps, reward

In [None]:
plt.figure(figsize=(15, 5))
plt.title('Reward')
plt.plot(rewards)
plt.figure(figsize=(15, 5))
plt.title('Loss')
plt.plot(losses)
plt.show()

In [None]:
[
    ('CartPole-v0', 215, 0.25),
    ('CartPole-v1', 291, 0.1),
    ('MountainCar-v0', None, 0.1),
    ('LunarLander-v2', None, 0.1)
]