# Double DQN

In [1]:
import sys
sys.path.append("../Chapter08/")

In [2]:
import gym
import ptan
import argparse
import random
import numpy as np

import torch
import torch.optim as optim
import torch.nn as nn

from ignite.engine import Engine

from lib import dqn_model, common

# Run name; interpolated into the label handed to common.setup_ignite() below.
NAME = "03_double"
# Number of transitions sampled once from the replay buffer to form the
# fixed held-out evaluation set.
STATES_TO_EVALUATE = 1000
# The "values" metric is recomputed every this many engine iterations.
EVAL_EVERY_FRAME = 100

The core implementation is very simple. What we need to do is slightly modify our
loss function. Let's go a step further and compare action values produced by the
basic DQN and double DQN. To do this, we store a random held-out set of states
and periodically calculate the mean value of the best action for every state in the
evaluation set.
The complete example is in Chapter08/03_dqn_double.py. Let's first take a look
at the loss function:

``` python
def calc_loss_double_dqn(batch, net, tgt_net, gamma,
                         device="cpu", double=True):
    states, actions, rewards, dones, next_states = \
        common.unpack_batch(batch)
```

The extra `double` argument switches the double DQN way of choosing the
next-state actions on and off.

``` python
states_v = torch.tensor(states).to(device)
actions_v = torch.tensor(actions).to(device)
rewards_v = torch.tensor(rewards).to(device)
done_mask = torch.BoolTensor(dones).to(device)
```

The preceding section is the same as before.

``` python
actions_v = actions_v.unsqueeze(-1)
state_action_vals = net(states_v).gather(1, actions_v)
state_action_vals = state_action_vals.squeeze(-1)
with torch.no_grad():
    next_states_v = torch.tensor(next_states).to(device)
    if double:
        next_state_acts = net(next_states_v).max(1)[1]
        next_state_acts = next_state_acts.unsqueeze(-1)
        next_state_vals = tgt_net(next_states_v).gather(1, next_state_acts).squeeze(-1)
    else:
        next_state_vals = tgt_net(next_states_v).max(1)[0]
    next_state_vals[done_mask] = 0.0
    exp_sa_vals = next_state_vals.detach()*gamma+rewards_v
return nn.MSELoss()(state_action_vals, exp_sa_vals)
```

Here is the difference compared to the basic DQN loss function. If double DQN is
enabled, we calculate the best action to take in the next state using our main trained
network, but values corresponding to this action come from the target network.
Of course, this part could be implemented in a faster way, by combining
`next_states_v` with `states_v` and calling our main network only once, but it
would make the code less clear.
The rest of the function is the same: we mask completed episodes and compute the
mean squared error (MSE) loss between Q-values predicted by the network and
approximated Q-values. The last function that we consider calculates the values
of our held-out state:

``` python
def calc_values_of_states(states, net, device="cpu"):
    """Return the mean best-action value of ``net`` over ``states``.

    The states are pushed through the network in up to 64 chunks so that a
    large held-out set does not have to fit into a single forward pass.

    Args:
        states: array-like of observations, indexed by state along axis 0.
        net: callable mapping a batch tensor to a 2-D tensor of action
            values (one row per state).
        device: torch device the chunks are moved to before calling ``net``.

    Returns:
        The average, over all states, of each state's largest action value.
        ``nan`` if ``states`` is empty.
    """
    total = 0.0
    count = 0
    # Pure evaluation: no autograd graph is needed for these forward passes.
    with torch.no_grad():
        for batch in np.array_split(states, 64):
            # array_split yields empty chunks when there are fewer than 64
            # states; their mean would be NaN and poison the result.
            if len(batch) == 0:
                continue
            states_v = torch.tensor(batch).to(device)
            best_action_values_v = net(states_v).max(1)[0]
            # Accumulate sum and count so unequal chunk sizes do not bias
            # the overall mean.
            total += best_action_values_v.sum().item()
            count += len(batch)
    if count == 0:
        return float("nan")
    return total / count
```

There is nothing too complicated here: we just split our held-out states array into
equal chunks and pass every chunk to the network to obtain action values. From
those values, we choose the action with the largest value (for every state) and
calculate the mean of such values. As our array with states is fixed for the whole
training process, and this array is large enough (in the code we store 1,000 states),
we can compare the dynamics of this mean value in both DQN variants.
The rest of the 03_dqn_double.py file is almost the same; the two differences are
the use of our tweaked loss function and keeping 1,000 randomly sampled states
for periodic evaluation.

In [3]:
def calc_loss_double_dqn(batch, net, tgt_net, gamma,
                         device="cpu", double=True):
    """Compute the MSE loss for a (double) DQN update on one batch.

    Args:
        batch: batch of transitions, unpacked by ``common.unpack_batch``
            into states, actions, rewards, done flags and next states.
        net: the trained network; supplies Q(s, a) and, when ``double`` is
            set, picks the next-state actions.
        tgt_net: the target network; supplies the next-state values.
        gamma: discount factor applied to the next-state values.
        device: torch device the batch tensors are moved to.
        double: if True, choose next actions with ``net`` and evaluate them
            with ``tgt_net``; otherwise take the maximum of ``tgt_net``.

    Returns:
        Scalar loss tensor between predicted and bootstrapped Q-values.
    """
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.BoolTensor(dones).to(device)

    # Q-values of the actions that were actually taken.
    q_all = net(states_v)
    q_taken = q_all.gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

    # The bootstrap target must not contribute gradients.
    with torch.no_grad():
        next_states_v = torch.tensor(next_states).to(device)
        if not double:
            next_q = tgt_net(next_states_v).max(1)[0]
        else:
            # Select with the online network, evaluate with the target one.
            greedy_acts = net(next_states_v).max(1)[1].unsqueeze(-1)
            next_q = tgt_net(next_states_v).gather(1, greedy_acts)
            next_q = next_q.squeeze(-1)
        # Finished episodes have no future value.
        next_q[done_mask] = 0.0
        target_q = next_q.detach() * gamma + rewards_v

    loss_fn = nn.MSELoss()
    return loss_fn(q_taken, target_q)

In [4]:
# Seed every RNG in scope so runs are repeatable.
random.seed(common.SEED)
# NOTE(review): ptan's action selector / replay buffer presumably draw from
# NumPy's global RNG - seeding it too is harmless if they do not; confirm.
np.random.seed(common.SEED)
torch.manual_seed(common.SEED)
params = common.HYPERPARAMS['pong']
# Fall back to the CPU instead of failing on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = gym.make(params.env_name)
env = ptan.common.wrappers.wrap_dqn(env)
env.seed(common.SEED)

# Online network that is trained, plus a target copy wrapped by ptan.
net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)

tgt_net = ptan.agent.TargetNet(net)
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params.epsilon_start)
epsilon_tracker = common.EpsilonTracker(selector, params)
agent = ptan.agent.DQNAgent(net, selector, device=device)

# Experience pipeline: environment -> transitions -> replay buffer.
exp_source = ptan.experience.ExperienceSourceFirstLast(
    env, agent, gamma=params.gamma)
buffer = ptan.experience.ExperienceReplayBuffer(
    exp_source, buffer_size=params.replay_size)
optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

def process_batch(engine, batch):
    """Run one training step and periodically refresh the "values" metric.

    Args:
        engine: the Ignite engine driving training; its ``state.iteration``
            is used for epsilon decay, target-net sync and evaluation timing.
        batch: batch of transitions passed to ``calc_loss_double_dqn``.

    Returns:
        Dict with the scalar ``"loss"`` of this step and the current
        ``"epsilon"`` of the action selector.
    """
    optimizer.zero_grad()
    loss_v = calc_loss_double_dqn(batch, net, tgt_net.target_model,
                                  gamma=params.gamma, device=device,
                                  double=True)
    loss_v.backward()
    optimizer.step()
    epsilon_tracker.frame(engine.state.iteration)
    if engine.state.iteration % params.target_net_sync == 0:
        tgt_net.sync()
    if engine.state.iteration % EVAL_EVERY_FRAME == 0:
        # The held-out states are sampled once and cached on the engine
        # state so every evaluation uses the same set.
        eval_states = getattr(engine.state, "eval_states", None)
        if eval_states is None:
            transitions = buffer.sample(STATES_TO_EVALUATE)
            # np.asarray instead of np.array(..., copy=False): the latter
            # raises ValueError on NumPy 2.x whenever a copy is required,
            # which is always the case when stacking a list of arrays.
            eval_states = np.asarray(
                [np.asarray(transition.state) for transition in transitions])
            engine.state.eval_states = eval_states
        engine.state.metrics["values"] = \
            common.calc_values_of_states(eval_states, net, device)
    return {
        "loss": loss_v.item(),
        "epsilon": selector.epsilon,
    }

# Wrap the training step in an Ignite engine and attach the project's
# handlers, asking for the extra "values" metric.
engine = Engine(process_batch)
common.setup_ignite(
    engine,
    params,
    exp_source,
    f"{NAME}={True}",
    extra_metrics=('values',),
)
# Feed the engine with batches drawn from the replay buffer.
engine.run(
    common.batch_generator(buffer, params.replay_initial, params.batch_size)
)

Episode 1: reward=-20, steps=1063, speed=0.0 f/s, elapsed=0:00:26
Episode 2: reward=-21, steps=848, speed=0.0 f/s, elapsed=0:00:26
Episode 3: reward=-21, steps=819, speed=0.0 f/s, elapsed=0:00:26
Episode 4: reward=-20, steps=928, speed=0.0 f/s, elapsed=0:00:26
Episode 5: reward=-21, steps=839, speed=0.0 f/s, elapsed=0:00:26
Episode 6: reward=-20, steps=917, speed=0.0 f/s, elapsed=0:00:26
Episode 7: reward=-20, steps=1014, speed=0.0 f/s, elapsed=0:00:26
Episode 8: reward=-19, steps=939, speed=0.0 f/s, elapsed=0:00:26
Episode 9: reward=-20, steps=1016, speed=0.0 f/s, elapsed=0:00:26
Episode 10: reward=-19, steps=932, speed=0.0 f/s, elapsed=0:00:26
Episode 11: reward=-20, steps=916, speed=53.3 f/s, elapsed=0:00:31
Episode 12: reward=-21, steps=812, speed=53.4 f/s, elapsed=0:00:44
Episode 13: reward=-20, steps=861, speed=53.5 f/s, elapsed=0:00:59
Episode 14: reward=-21, steps=820, speed=53.7 f/s, elapsed=0:01:12
Episode 15: reward=-19, steps=1053, speed=53.9 f/s, elapsed=0:01:29
Episode 16

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.


KeyboardInterrupt

