In [1]:
import os
import random
from collections import deque
from copy import deepcopy

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

In [2]:
# --- Reproducibility ---------------------------------------------------
SEED = 1

# --- Optimisation ------------------------------------------------------
BATCH_SIZE = 256                  # transitions drawn per gradient step
LR = 3e-4                         # Adam learning rate
UP_COEF = 0.1                     # mixing weight of the soft target update
GAMMA = 0.99                      # discount applied to the value support
EPS = np.finfo(np.float32).eps    # float32 machine epsilon (log clamp / Adam eps)

# --- Categorical value distribution ------------------------------------
V_MIN, V_MAX = -10, 10
N_ATOMS = 51
DELTA_Z = (V_MAX - V_MIN) / (N_ATOMS - 1)   # spacing between adjacent atoms

# --- Device ------------------------------------------------------------
use_cuda = torch.cuda.is_available()
device = torch.device('cuda') if use_cuda else torch.device('cpu')

# --- Seed every random number generator the notebook uses --------------
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
if use_cuda:
    torch.cuda.manual_seed_all(SEED)

In [3]:
class CategoricalDQN(nn.Module):
    """Categorical DQN: predicts, per action, a distribution over value atoms.

    Args:
        obs_space: size of the flat observation vector fed to the first layer.
        action_space: number of discrete actions.
        n_atoms: number of atoms in the value support.

    Attributes:
        support: buffer of ``n_atoms`` evenly spaced values from ``V_MIN`` to
            ``V_MAX``; multiply it with the output probabilities and sum over
            the last axis to obtain expected Q-values.
    """

    def __init__(self, obs_space, action_space, n_atoms):
        super().__init__()

        # Remember the atom count so forward() does not depend on the
        # module-level N_ATOMS constant.
        self.n_atoms = n_atoms

        self.head = nn.Sequential(
            nn.Linear(obs_space, 256),
            nn.SELU(),
            nn.Linear(256, 256),
            nn.SELU()
        )

        self.fc = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, action_space * n_atoms)
        )

        self.log_softmax = nn.LogSoftmax(dim=-1)

        # linspace always yields exactly n_atoms points; arange with a
        # fractional step can produce one element too many or too few
        # because of floating-point rounding.
        self.register_buffer(
            'support', torch.linspace(V_MIN, V_MAX, n_atoms))

    def forward(self, x):
        """Return atom probabilities of shape (batch, n_actions, n_atoms)."""
        out = self.head(x)
        out = self.fc(out).reshape(out.shape[0], -1, self.n_atoms)
        # log-softmax followed by exp: a softmax over the atom axis.
        out = self.log_softmax(out)
        probs = out.exp()

        return probs

In [4]:
# Training loss of every update, stored as plain Python floats.
losses = []


def learn(net, tgt_net, optimizer, rep_memory):
    """Run one categorical-DQN gradient step on a random replay batch.

    Args:
        net: online network being optimised.
        tgt_net: target network; must expose a ``support`` buffer and return
            atom probabilities of shape (batch, n_actions, n_atoms).
        optimizer: optimizer bound to ``net``'s parameters.
        rep_memory: sequence of ``(obs, action, reward, next_obs, done)``
            transitions.

    Side effects:
        Updates ``net`` in place and appends the scalar loss to ``losses``.
    """
    net.train()
    tgt_net.train()

    dataloader = DataLoader(rep_memory,
                            batch_size=BATCH_SIZE,
                            shuffle=True,
                            pin_memory=use_cuda)

    # Only the first shuffled batch is used for this update.
    s, a, r, _s, d = next(iter(dataloader))
    s_batch = s.to(device).float()
    a_batch = a.to(device).long()
    _s_batch = _s.to(device).float()
    r_batch = r.to(device).float()
    # 1 for non-terminal transitions, 0 for terminal ones.
    not_done = 1. - d.to(device).float()
    # Index by the real batch length instead of assuming BATCH_SIZE rows.
    batch_idx = torch.arange(s_batch.shape[0], device=device)

    with torch.no_grad():
        _p_batch_tgt = tgt_net(_s_batch)
        # The bootstrap target must follow the greedy action in the NEXT
        # state (highest expected value under the target network), not the
        # action that was taken in the current state.
        _q_tgt = (_p_batch_tgt * tgt_net.support).sum(dim=2)
        _a_greedy = _q_tgt.argmax(dim=1)
        _p_acting = _p_batch_tgt[batch_idx, _a_greedy]
    _p_proj = projection(_p_acting, r_batch, not_done)

    p_batch = net(s_batch)
    p_acting = p_batch[batch_idx, a_batch]

    # Cross-entropy between the projected target and the predicted
    # distribution; the clamp keeps log() finite.
    loss = -(_p_proj * torch.clamp(p_acting, min=EPS).log()).sum(dim=1).mean()
    # Store a float, not the tensor: keeps the list plottable and avoids
    # holding on to autograd/GPU memory.
    losses.append(loss.item())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


def projection(_p_acting, r_batch, is_done):
    """Project the Bellman-shifted target distribution onto the fixed support.

    Each atom value ``z_i`` is mapped to ``r + GAMMA * z_i * mask``, clipped
    to ``[V_MIN, V_MAX]``, and its probability is split between the two
    neighbouring atoms in proportion to their distance.

    Args:
        _p_acting: (batch, n_atoms) tensor of next-state atom probabilities.
        r_batch: (batch,) tensor of rewards.
        is_done: (batch,) tensor used as a multiplicative mask on the
            bootstrap term; the caller passes ``1 - done`` (1.0 for
            non-terminal transitions, 0.0 for terminal ones).

    Returns:
        Float tensor of shape (batch, n_atoms) on ``device``; each row
        carries the same total mass as the matching row of ``_p_acting``.
    """
    _p_acting_np = _p_acting.detach().cpu().numpy()
    r_batch_np = r_batch.detach().cpu().numpy()
    is_done_np = is_done.detach().cpu().numpy()

    # Take the sizes from the input rather than from BATCH_SIZE / N_ATOMS so
    # short batches and other atom counts work too.
    batch_size, n_atoms = _p_acting_np.shape
    delta_z = (V_MAX - V_MIN) / (n_atoms - 1)

    _p_proj = np.zeros((batch_size, n_atoms), dtype=np.float32)
    batch_id = np.arange(batch_size)
    for i in range(n_atoms):
        z = np.clip(r_batch_np + GAMMA * (V_MIN + i * delta_z) * is_done_np,
                    V_MIN, V_MAX)
        # Fractional atom index; clipped to guard against rounding pushing
        # it just outside the valid index range.
        b = np.clip((z - V_MIN) / delta_z, 0, n_atoms - 1)
        l = np.floor(b).astype(np.int64)
        u = np.ceil(b).astype(np.int64)
        p_i = _p_acting_np[:, i]
        # When b is an exact integer, l == u and both linear weights are 0,
        # which would silently drop the mass (this hits every atom clipped
        # to V_MIN / V_MAX). Assign the whole mass to that atom instead.
        w_lower = np.where(l == u, 1.0, u - b)
        w_upper = b - l
        _p_proj[batch_id, l] += p_i * w_lower
        _p_proj[batch_id, u] += p_i * w_upper

    return torch.tensor(_p_proj).to(device).float()


def select_action(obs, tgt_net):
    """Return the greedy action for a single observation.

    Args:
        obs: one observation (array-like of floats).
        tgt_net: network used for acting; must return atom probabilities of
            shape (1, n_actions, n_atoms) and expose a ``support`` buffer.

    Returns:
        int: index of the action with the highest expected value.
    """
    tgt_net.eval()
    with torch.no_grad():
        # Build the (1, obs_dim) batch directly from the array; wrapping a
        # numpy array in a Python list before torch.tensor() is slow.
        state = torch.as_tensor(
            np.asarray(obs), dtype=torch.float32, device=device).unsqueeze(0)
        # Use the network that was passed in, not module-level globals.
        probs = tgt_net(state)
        weights = probs * tgt_net.support
        q = weights.sum(dim=2)
        action = torch.argmax(q, dim=1)

    return action.item()

## Main

In [5]:
# --- Environment -------------------------------------------------------
# Other ids that can be swapped in:
#   'CartPole-v0', 'MountainCar-v0', 'LunarLander-v2'
env = gym.make('CartPole-v1')
env.seed(SEED)

obs_space = env.observation_space.shape[0]
action_space = env.action_space.n

# --- Hyperparameters ---------------------------------------------------
n_episodes = 1000
learn_start = 1500        # replay size at which learning begins
memory_size = 50000       # replay buffer capacity
update_frq = 1            # learn steps between target-network updates
use_eps_decay = False
epsilon = 0.001           # probability of taking a random action
eps_min = 0.001
decay_rate = 0.0001
n_eval = env.spec.trials  # episodes averaged for the "solved" check

# --- Mutable run state -------------------------------------------------
total_steps = learn_steps = 0
rewards = []
reward_eval = deque(maxlen=n_eval)
is_learned = is_solved = False

# --- Online and target neural networks ---------------------------------
net = CategoricalDQN(obs_space, action_space, N_ATOMS).to(device)
target_net = deepcopy(net)

# --- Optimizer ---------------------------------------------------------
optimizer = optim.Adam(net.parameters(), lr=LR, eps=EPS)

# --- Replay memory -----------------------------------------------------
rep_memory = deque(maxlen=memory_size)



[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [6]:
# Display whether a CUDA device was detected (set in the config cell).
use_cuda

True

In [7]:
# Display the per-episode step limit recorded in the environment spec.
env.spec.max_episode_steps

500

In [8]:
# Display the spec's trial count; the setup cell uses it as the length of
# the evaluation window (n_eval).
env.spec.trials

100

In [9]:
# Display the spec's reward threshold; the training loop compares the mean
# evaluation reward against it to decide when the task is solved.
env.spec.reward_threshold

475.0

In [10]:
# Play episodes, learning after every environment step once the replay
# memory holds `learn_start` transitions. Stops early when the mean reward
# of the last `n_eval` episodes reaches the environment's reward threshold.
try:
    for i in range(1, n_episodes + 1):
        obs = env.reset()
        done = False
        ep_reward = 0
        while not done:
            # env.render()
            # epsilon-greedy exploration
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                action = select_action(obs, target_net)

            _obs, reward, done, _ = env.step(action)

            # NOTE(review): `done` is presumably also True when the episode
            # is cut off by the environment's step limit, so those
            # transitions are stored as terminal - confirm this is intended.
            rep_memory.append((obs, action, reward, _obs, done))

            obs = _obs
            total_steps += 1
            ep_reward += reward

            if use_eps_decay:
                epsilon -= epsilon * decay_rate
                epsilon = max(eps_min, epsilon)

            if len(rep_memory) >= learn_start:
                if len(rep_memory) == learn_start:
                    print('\n============  Start Learning  ============\n')
                learn(net, target_net, optimizer, rep_memory)
                learn_steps += 1

            if learn_steps == update_frq:
                # target smoothing update
                for t, n in zip(target_net.parameters(), net.parameters()):
                    t.data = UP_COEF * n.data + (1 - UP_COEF) * t.data
                learn_steps = 0

        # The while loop only exits once the episode has finished.
        rewards.append(ep_reward)
        reward_eval.append(ep_reward)
        print('{:3} Episode in {:5} steps, reward {:.2f}'.format(
            i, total_steps, ep_reward))

        if len(reward_eval) >= n_eval:
            if np.mean(reward_eval) >= env.spec.reward_threshold:
                print('\n{} is sloved! {:3} Episode in {:3} steps'.format(
                    env.spec.id, i, total_steps))
                save_path = f'./test/saved_models/{env.spec.id}_ep{i}_clear_model_cdddqn.pt'
                # Create the output directory first so a missing folder
                # cannot crash the run at the moment the model is solved.
                os.makedirs(os.path.dirname(save_path), exist_ok=True)
                torch.save(target_net.state_dict(), save_path)
                break
finally:
    # Release the environment even if training is interrupted.
    env.close()

  1 Episode in    48 steps, reward 48.00
  2 Episode in    89 steps, reward 41.00
  3 Episode in   164 steps, reward 75.00
  4 Episode in   203 steps, reward 39.00
  5 Episode in   278 steps, reward 75.00
  6 Episode in   356 steps, reward 78.00
  7 Episode in   462 steps, reward 106.00
  8 Episode in   531 steps, reward 69.00
  9 Episode in   616 steps, reward 85.00
 10 Episode in   694 steps, reward 78.00
 11 Episode in   733 steps, reward 39.00
 12 Episode in   773 steps, reward 40.00
 13 Episode in   877 steps, reward 104.00
 14 Episode in   930 steps, reward 53.00
 15 Episode in  1015 steps, reward 85.00
 16 Episode in  1084 steps, reward 69.00
 17 Episode in  1172 steps, reward 88.00
 18 Episode in  1210 steps, reward 38.00
 19 Episode in  1252 steps, reward 42.00
 20 Episode in  1352 steps, reward 100.00
 21 Episode in  1406 steps, reward 54.00
 22 Episode in  1444 steps, reward 38.00
 23 Episode in  1493 steps, reward 49.00


 24 Episode in  1547 steps, reward 54.00
 25 Episode

KeyboardInterrupt: 

In [None]:
# Episode rewards.
plt.figure(figsize=(15, 5))
plt.title('Reward')
plt.xlabel('Episode')
plt.ylabel('Episode reward')
plt.plot(rewards)

# Training loss. Entries of `losses` may be tensors (possibly on the GPU and
# attached to autograd), which matplotlib cannot convert; float() turns
# both tensors and plain numbers into Python floats.
plt.figure(figsize=(15, 5))
plt.title('Loss')
plt.xlabel('Learning step')
plt.ylabel('Loss')
plt.plot([float(step_loss) for step_loss in losses])
plt.show()

In [None]:
# Hand-recorded results, one tuple per environment.
# NOTE(review): the fields look like (environment id, episode at which the
# task was solved or None if it was not, UP_COEF used) - TODO confirm.
[
    ('CartPole-v0', 215, 0.25),
    ('CartPole-v1', 291, 0.1),
    ('MountainCar-v0', None, 0.1),
    ('LunarLander-v2', None, 0.1)
]