In [1]:
# built-in
import random
from collections import deque
from copy import deepcopy

# tihrd party
import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

In [2]:
# for C51
V_MAX = 10
V_MIN = -10
N_ATOMS = 51
DELTA_Z = (V_MAX - V_MIN) / (N_ATOMS - 1)
# for learning
SEED = 1
BATCH_SIZE = 256
GAMMA = 0.99
LR = 0.03
EPS = 1e-8

In [3]:
class CategoricalDuelingDQNN(nn.Module):
    def __init__(self, obs_space, action_space, n_atoms):
        super().__init__()

        self.head = nn.Sequential(
            nn.Linear(obs_space, obs_space*10),
            nn.ReLU(),
            nn.Linear(obs_space*10, (obs_space+action_space)*5),
            nn.ReLU()
        )

        self.val = nn.Sequential(
            nn.Linear((obs_space+action_space)*5, action_space*10),
            nn.ReLU(),
            nn.Linear(action_space*10, n_atoms)
        )

        self.adv = nn.Sequential(
            nn.Linear((obs_space+action_space)*5, action_space*10),
            nn.ReLU(),
            nn.Linear(action_space*10, action_space * n_atoms)
        )

        self.log_softmax = nn.LogSoftmax(dim=-1)

        self.register_buffer(
            'support', torch.arange(V_MIN, V_MAX + DELTA_Z, DELTA_Z))

    def _make_layer(self, linear, n_hidden, n_layers):
        layers = [linear(n_hidden) for _ in range(n_layers)]

        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.head(x)
        val_out = self.val(out).view(out.size(0), 1, N_ATOMS)
        adv_out = self.adv(out).view(out.size(0), -1, N_ATOMS)
        adv_mean = adv_out.mean(dim=1, keepdim=True)
        out = val_out + adv_out - adv_mean
        out = self.log_softmax(out)
        probs = out.exp()

        return probs


class CategoricalDuelingDQN(nn.Module):
    def __init__(self, obs_space, action_space, n_atoms):
        super().__init__()

        self.head = nn.Sequential(
            nn.Linear(obs_space, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU()
        )

        self.val = nn.Sequential(
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, n_atoms)
        )

        self.adv = nn.Sequential(
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, action_space * n_atoms)
        )

        self.log_softmax = nn.LogSoftmax(dim=-1)

        self.register_buffer(
            'support', torch.arange(V_MIN, V_MAX + DELTA_Z, DELTA_Z))

    def _make_layer(self, linear, n_hidden, n_layers):
        layers = [linear(n_hidden) for _ in range(n_layers)]

        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.head(x)
        val_out = self.val(out).view(out.size(0), 1, N_ATOMS)
        adv_out = self.adv(out).view(out.size(0), -1, N_ATOMS)
        adv_mean = adv_out.mean(dim=1, keepdim=True)
        out = val_out + adv_out - adv_mean
        out = self.log_softmax(out)
        probs = out.exp()

        return probs

In [None]:
losses = []


def learn(net, tgt_net, optimizer, rep_memory):
    net.train()
    tgt_net.train()

    train_data = []
    train_data.extend(random.sample(rep_memory, BATCH_SIZE))

    dataloader = DataLoader(train_data,
                            batch_size=BATCH_SIZE,
                            pin_memory=use_cuda)
    # like a double DQN
    for i, (s, a, r, _s, d) in enumerate(dataloader):
        s_batch = s.to(device).float()
        a_batch = a.to(device).long()
        _s_batch = _s.to(device).float()
        rewards = r.detach().cpu().numpy().astype(np.longlong)
        dones = d.detach().cpu().numpy().astype(np.bool)

        _p_batch = net(_s_batch)
        _weights = _p_batch * net.support
        _q_batch = _weights.sum(dim=2)
        _q_batch_np = _q_batch.detach().cpu().numpy()[0]
        _action_batch_np = np.argmax(_q_batch_np)

        with torch.no_grad():
            tgt_p_batch = tgt_net(_s_batch)
            _p_best = tgt_p_batch[range(BATCH_SIZE), _action_batch_np]
            _p_best_np = _p_best.cpu().numpy()

        proj_p_np = projection(_p_best_np, rewards, dones)
        proj_p = torch.tensor(proj_p_np).to(device).float()

        p_batch = net(s_batch)
        p_acting = p_batch[range(BATCH_SIZE), a_batch.data]

        # loss
        eps = np.finfo(np.float32).eps.item()
        loss = -(proj_p * (p_acting + eps).log()).sum(dim=1).mean()
        losses.append(loss)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
def projection(next_p, rewards, dones):
    proj_p = np.zeros((BATCH_SIZE, N_ATOMS), dtype=np.float32)
    for atom in range(N_ATOMS):
        z = np.minimum(V_MAX, np.maximum(
            V_MIN, rewards + (V_MIN + atom * DELTA_Z) * GAMMA))
        b = (z - V_MIN) / DELTA_Z
        l = np.floor(b).astype(np.int64)
        u = np.ceil(b).astype(np.int64)

        eq_mask = u == l
        proj_p[eq_mask, l[eq_mask]] += next_p[eq_mask, atom]
        ne_mask = u != l
        proj_p[ne_mask, l[ne_mask]] += next_p[ne_mask, atom] * (u - b)[ne_mask]
        proj_p[ne_mask, u[ne_mask]] += next_p[ne_mask, atom] * (b - l)[ne_mask]

        if dones.any():
            proj_p[dones] = 0.0
            z = np.minimum(V_MAX, np.maximum(V_MIN, rewards[dones]))
            b = (z - V_MIN) / DELTA_Z
            l = np.floor(b).astype(np.int64)
            u = np.ceil(b).astype(np.int64)

            eq_mask = u == l
            eq_dones = dones.copy()
            eq_dones[dones] = eq_mask
            if eq_dones.any():
                proj_p[eq_dones, l] = 1.0

            ne_mask = u != l
            ne_dones = dones.copy()
            ne_dones[dones] = ne_mask
            if ne_dones.any():
                proj_p[ne_dones, l] = (u - b)[ne_mask]
                proj_p[ne_dones, u] = (b - l)[ne_mask]

    return proj_p

In [None]:
# set device
use_cuda = torch.cuda.is_available()
print('cuda:', use_cuda)
device = torch.device('cuda' if use_cuda else 'cpu')

# random seed
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if use_cuda:
    torch.cuda.manual_seed_all(SEED)

# make an environment
# env = gym.make('CartPole-v0')
env = gym.make('CartPole-v1')
# env = gym.make('MountainCar-v0')
# env = gym.make('LunarLander-v2')
env.seed(SEED)
obs_space = env.observation_space.shape[0]
action_space = env.action_space.n
max_steps = env.spec.max_episode_steps*env.spec.timestep_limit

# hyperparameter
learn_start = env.spec.timestep_limit*5
memory_size = learn_start*20
update_frq = int(env.spec.timestep_limit/10)
epsilon = 1.
eps_min = 0.0025
eps_decay = 1. - np.exp(np.log(eps_min)/(max_steps/2))
num_eval = 10

# global values
total_steps = 0
learn_steps = 0
rewards = []
reward_eval = deque(maxlen=num_eval)
is_learned = False
is_solved = False

# make two nerual networks
net = CategoricalDuelingDQN(obs_space, action_space, N_ATOMS).to(device)
target_net = deepcopy(net)

# make optimizer
# optimizer = optim.SGD(net.parameters(), momentum=0.9, lr=LR, weight_decay=1e-4)
optimizer = optim.Adam(net.parameters(), lr=LR, eps=EPS)

# make memory
rep_memory = deque(maxlen=memory_size)

cuda: True
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m




In [None]:
env.spec.max_episode_steps

1000

In [None]:
env.spec.reward_threshold

200

In [None]:
env.spec.timestep_limit

1000

In [None]:
# play
for i in range(1, env.spec.max_episode_steps+1):
    obs = env.reset()
    done = False
    ep_reward = 0
    while not done:
        env.render()
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            target_net.eval()
            with torch.no_grad():
                state = torch.tensor([obs]).to(device).float()
                probs = target_net(state)
                weights = probs * net.support
                q = weights.sum(dim=2)
                q_np = q.cpu().numpy()[0]
                action = np.argmax(q_np)

        _obs, reward, done, _ = env.step(action)

        rep_memory.append((obs, action, reward, _obs, done))

        obs = _obs
        total_steps += 1
        ep_reward += reward
        epsilon -= epsilon * eps_decay
        epsilon = max(eps_min, epsilon)

        if len(rep_memory) >= learn_start:
            if len(rep_memory) == learn_start:
                print('\n============  Start Learning  ============\n')
            learn(net, target_net, optimizer, rep_memory)
            learn_steps += 1

        if learn_steps == update_frq:
            target_net.load_state_dict(net.state_dict())
            learn_steps = 0
    if done:
        rewards.append(ep_reward)
        reward_eval.append(ep_reward)
        print('{:3} Episode in {:5} steps, reward {:.2f}'.format(
            i, total_steps, ep_reward))

        if len(reward_eval) >= num_eval:
            if np.mean(reward_eval) >= env.spec.reward_threshold:
                print('\n{} is sloved! {:3} Episode in {:3} steps'.format(
                    env.spec.id, i, total_steps))
                break
env.close()

  1 Episode in    60 steps, reward -126.51
  2 Episode in   131 steps, reward -182.64
  3 Episode in   203 steps, reward -106.16
  4 Episode in   267 steps, reward -151.84
  5 Episode in   337 steps, reward -156.44
  6 Episode in   397 steps, reward -170.28
  7 Episode in   489 steps, reward -199.73
  8 Episode in   579 steps, reward -206.62
  9 Episode in   645 steps, reward -155.45
 10 Episode in   717 steps, reward -174.54
 11 Episode in   791 steps, reward -168.89
 12 Episode in   908 steps, reward -203.93
 13 Episode in   985 steps, reward -281.83
 14 Episode in  1089 steps, reward -243.68
 15 Episode in  1153 steps, reward -169.49
 16 Episode in  1271 steps, reward -187.91
 17 Episode in  1370 steps, reward -266.76
 18 Episode in  1443 steps, reward -275.86
 19 Episode in  1511 steps, reward -135.63
 20 Episode in  1612 steps, reward -335.32
 21 Episode in  1741 steps, reward -155.99
 22 Episode in  1806 steps, reward -269.69
 23 Episode in  1926 steps, reward -424.74
 24 Episode

191 Episode in 18283 steps, reward -155.31
192 Episode in 18354 steps, reward -123.69
193 Episode in 18420 steps, reward -96.51
194 Episode in 18554 steps, reward -346.20
195 Episode in 18674 steps, reward -212.46
196 Episode in 18783 steps, reward -184.14
197 Episode in 18900 steps, reward -244.31
198 Episode in 19010 steps, reward -400.08
199 Episode in 19124 steps, reward -181.77
200 Episode in 19279 steps, reward -85.77
201 Episode in 19412 steps, reward -118.68
202 Episode in 19523 steps, reward -207.00
203 Episode in 19655 steps, reward -77.45
204 Episode in 19742 steps, reward -111.51
205 Episode in 19822 steps, reward -160.05
206 Episode in 19927 steps, reward -274.04
207 Episode in 20043 steps, reward -394.49
208 Episode in 20147 steps, reward -127.98
209 Episode in 20243 steps, reward -180.43
210 Episode in 20348 steps, reward -150.00
211 Episode in 20468 steps, reward -344.45
212 Episode in 20578 steps, reward -52.45
213 Episode in 20668 steps, reward -267.32
214 Episode in 

383 Episode in 41113 steps, reward -111.21
384 Episode in 41240 steps, reward -236.26


In [None]:
plt.figure(figsize=(15, 5))
plt.title('reward')
plt.plot(rewards)
plt.figure(figsize=(15, 5))
plt.title('loss')
plt.plot(losses)
plt.show()