# Rainbow DQN

You have now seen all the DQN improvements mentioned in the paper *Rainbow:
Combining Improvements in Deep Reinforcement Learning*, but they were introduced
incrementally, which helped you to understand the idea and implementation
of each improvement. The main point of the paper was to combine those
improvements and check the results. In the final example, I've decided to exclude
categorical DQN and double DQN from the final system, as they haven't shown too
much improvement on our guinea pig environment. If you want, you can add them
and try using a different game.

First of all, we need to define our network architecture and the methods that have
contributed to it:
- Dueling DQN: our network will have two separate paths, one for the value of
the state and one for the advantages of the actions. On the output, both paths
will be summed together, providing the final Q-values for the actions. To force
the advantages to have a zero mean, we will subtract the mean advantage from
the advantage of every action.
- Noisy networks: our linear layers in the value and advantage paths will be
noisy variants of nn.Linear.
In addition to network architecture changes, we will use the prioritized replay buffer
to keep environment transitions and sample them proportionally to the MSE loss.
Finally, we will unroll the Bellman equation to n-steps.

In [1]:
import sys
sys.path.append("../Chapter08/")

In [2]:
import gym
import ptan
import argparse
import random

import torch
import torch.optim as optim

from ignite.engine import Engine

from lib import common, dqn_extra

# Run name; passed as the last argument to common.setup_ignite below.
NAME = "08_rainbow"
# Number of environment steps folded into one experience item (steps_count of
# the experience source); the loss discount is params.gamma ** N_STEPS.
N_STEPS = 4
# Third argument of dqn_extra.PrioReplayBuffer; presumably the prioritization
# exponent (alpha) -- TODO confirm against dqn_extra.
PRIO_REPLAY_ALPHA = 0.6

In [3]:
def calc_loss_rainbow(batch, batch_weights, net, tgt_net, gamma,
                      device="cpu", double=True):
    """Compute the weighted squared TD error for a sampled batch.

    Args:
        batch: transitions in the form accepted by ``common.unpack_batch``.
        batch_weights: per-sample weights multiplied into the squared errors.
        net: network being trained; when ``double`` is true it also selects
            the next-state actions.
        tgt_net: network used to evaluate the next-state values.
        gamma: discount applied to the bootstrapped next-state value.
        device: device every tensor is moved to.
        double: when true, the next action is the argmax of ``net`` and its
            value is read from ``tgt_net``; otherwise the maximum of
            ``tgt_net`` is used directly.

    Returns:
        A tuple ``(loss, priorities)``: the mean of the weighted squared
        errors, and a NumPy array holding each weighted squared error
        plus ``1e-5``.
    """
    states, actions, rewards, dones, next_states = \
        common.unpack_batch(batch)

    obs_t = torch.tensor(states).to(device)
    act_idx_t = torch.tensor(actions).to(device)
    reward_t = torch.tensor(rewards).to(device)
    terminal_t = torch.BoolTensor(dones).to(device)
    weight_t = torch.tensor(batch_weights).to(device)

    # Q-value of the action that was actually taken in every transition.
    act_idx_t = act_idx_t.unsqueeze(-1)
    q_taken = net(obs_t).gather(1, act_idx_t).squeeze(-1)

    # The bootstrap target is computed without tracking gradients.
    with torch.no_grad():
        next_obs_t = torch.tensor(next_states).to(device)
        if not double:
            next_q = tgt_net(next_obs_t).max(1)[0]
        else:
            best_next = net(next_obs_t).max(1)[1].unsqueeze(-1)
            next_q = tgt_net(next_obs_t).gather(1, best_next).squeeze(-1)
        # Finished episodes contribute no future value.
        next_q[terminal_t] = 0.0
        target_q = next_q * gamma + reward_t

    sq_err = (q_taken - target_q) ** 2
    sq_err *= weight_t
    return sq_err.mean(), (sq_err + 1e-5).detach().cpu().numpy()


def calc_loss_prio(batch, batch_weights, net, tgt_net, gamma, device="cpu"):
    """Compute the weighted squared TD error used with prioritized replay.

    Args:
        batch: transitions in the form accepted by ``common.unpack_batch``.
        batch_weights: per-sample weights multiplied into the squared errors.
        net: network being trained.
        tgt_net: network whose maximum output gives the next-state value.
        gamma: discount applied to the bootstrapped next-state value.
        device: device every tensor is moved to.

    Returns:
        A tuple ``(loss, priorities)``: the mean of the weighted squared
        errors, and a NumPy array holding each weighted squared error
        plus ``1e-5``.
    """
    def _on_device(array):
        # Wrap the unpacked array in a tensor and move it to the device.
        return torch.tensor(array).to(device)

    states, actions, rewards, dones, next_states = common.unpack_batch(batch)

    obs = _on_device(states)
    chosen = _on_device(actions)
    immediate = _on_device(rewards)
    is_terminal = torch.BoolTensor(dones).to(device)
    sample_weights = _on_device(batch_weights)

    # Q-value of the action that was actually taken in every transition.
    q_all = net(obs)
    q_chosen = q_all.gather(1, chosen.unsqueeze(-1)).squeeze(-1)

    # The bootstrap target is computed without tracking gradients.
    with torch.no_grad():
        next_obs = _on_device(next_states)
        bootstrap = tgt_net(next_obs).max(1)[0]
        # Finished episodes contribute no future value.
        bootstrap[is_terminal] = 0.0
        td_target = bootstrap * gamma + immediate

    squared_error = (q_chosen - td_target) ** 2
    weighted = sample_weights * squared_error
    return weighted.mean(), (weighted + 1e-5).detach().cpu().numpy()

In [None]:
# Seed Python's and PyTorch's random generators with the shared project seed.
random.seed(common.SEED)
torch.manual_seed(common.SEED)
params = common.HYPERPARAMS['pong']

# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Environment with the ptan DQN wrappers applied, seeded like the RNGs above.
env = gym.make(params.env_name)
env = ptan.common.wrappers.wrap_dqn(env)
env.seed(common.SEED)

net = dqn_extra.RainbowDQN(env.observation_space.shape,
                    env.action_space.n).to(device)

# The agent always acts greedily (argmax selector, no epsilon schedule).
tgt_net = ptan.agent.TargetNet(net)
selector = ptan.actions.ArgmaxActionSelector()
agent = ptan.agent.DQNAgent(net, selector, device=device)

# N_STEPS-step transitions are stored in the prioritized replay buffer.
exp_source = ptan.experience.ExperienceSourceFirstLast(
    env, agent, gamma=params.gamma, steps_count=N_STEPS)
buffer = dqn_extra.PrioReplayBuffer(
    exp_source, params.replay_size, PRIO_REPLAY_ALPHA)
optimizer = optim.Adam(net.parameters(),
                       lr=params.learning_rate)

def process_batch(engine, batch_data):
    """Run one optimization step on a batch drawn from the priority buffer.

    Args:
        engine: the ignite engine driving training; its iteration counter
            decides when the target network is synced and is passed to
            ``buffer.update_beta``.
        batch_data: a triple ``(batch, batch_indices, batch_weights)``.

    Returns:
        A dict with the scalar ``"loss"`` and the ``"beta"`` value returned
        by ``buffer.update_beta``.
    """
    samples, indices, weights = batch_data
    # The experience source is built with steps_count=N_STEPS, so the
    # bootstrapped value is discounted by gamma ** N_STEPS.
    discount = params.gamma ** N_STEPS

    optimizer.zero_grad()
    loss_v, new_prios = calc_loss_prio(
        samples, weights, net, tgt_net.target_model,
        gamma=discount, device=device)
    loss_v.backward()
    optimizer.step()

    # Feed the fresh per-sample losses back as the new priorities.
    buffer.update_priorities(indices, new_prios)

    step = engine.state.iteration
    if step % params.target_net_sync == 0:
        tgt_net.sync()

    metrics = {"loss": loss_v.item()}
    metrics["beta"] = buffer.update_beta(step)
    return metrics

# Wrap the training step in an ignite engine, let the project helper attach
# its handlers, then train on batches produced from the replay buffer.
engine = Engine(process_batch)
common.setup_ignite(engine, params, exp_source, NAME)
batch_stream = common.batch_generator(
    buffer, params.replay_initial, params.batch_size)
engine.run(batch_stream)

Episode 1: reward=-21, steps=762, speed=0.0 f/s, elapsed=0:00:33
Episode 2: reward=-21, steps=762, speed=0.0 f/s, elapsed=0:00:33
Episode 3: reward=-21, steps=759, speed=0.0 f/s, elapsed=0:00:33
Episode 4: reward=-21, steps=760, speed=0.0 f/s, elapsed=0:00:33
Episode 5: reward=-21, steps=759, speed=0.0 f/s, elapsed=0:00:33
Episode 6: reward=-21, steps=758, speed=0.0 f/s, elapsed=0:00:33
Episode 7: reward=-21, steps=756, speed=0.0 f/s, elapsed=0:00:33
Episode 8: reward=-21, steps=757, speed=0.0 f/s, elapsed=0:00:33
Episode 9: reward=-21, steps=756, speed=0.0 f/s, elapsed=0:00:33
Episode 10: reward=-21, steps=755, speed=0.0 f/s, elapsed=0:00:33
Episode 11: reward=-21, steps=756, speed=0.0 f/s, elapsed=0:00:33
Episode 12: reward=-21, steps=761, speed=0.0 f/s, elapsed=0:00:33
Episode 13: reward=-21, steps=760, speed=0.0 f/s, elapsed=0:00:33
Episode 14: reward=-21, steps=818, speed=52.0 f/s, elapsed=0:00:46
Episode 15: reward=-20, steps=901, speed=52.0 f/s, elapsed=0:01:03
Episode 16: rewar

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/anton/envs/reinforcement_learning/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-4-a11980732695>", line 44, in <module>
    params.batch_size))
  File "/home/anton/envs/reinforcement_learning/lib/python3.6/site-packages/ignite/engine/engine.py", line 446, in run
    self._handle_exception(e)
  File "/home/anton/envs/reinforcement_learning/lib/python3.6/site-packages/ignite/engine/engine.py", line 410, in _handle_exception
    raise e
  File "/home/anton/envs/reinforcement_learning/lib/python3.6/site-packages/ignite/engine/engine.py", line 433, in run
    hours, mins, secs = self._run_once_on_dataset()
  File "/home/anton/envs/reinforcement_learning/lib/python3.6/site-packages/ignite/engine/engine.py", line 399, in _run_once_on_dataset
    self._handle_exception(e)
  File "/home/anton/envs/reinforcement_learning/lib/python3.

TypeError: object of type 'NoneType' has no len()