In [22]:
import sys
if "../" not in sys.path:
  sys.path.append("../")

import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from utils.gym_utils import get_env
from utils.schedule import PiecewiseSchedule, LinearSchedule
from deep_q_learning import OptimizerSpec, dqn_learing

import matplotlib
%matplotlib inline
matplotlib.style.use('ggplot')

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
# Get Atari games.
benchmark = gym.benchmark_spec('Atari40M')

# Change the index to select a different game.
task = benchmark.tasks[3]

In [24]:
# Run training
seed = 0 # Use a seed of zero (you may want to randomize the seed!)
env = get_env(task, seed)

[2017-02-28 17:35:00,948] Making new env: PongNoFrameskip-v3
[2017-02-28 17:35:01,018] Clearing 2 monitor files from previous run (because force=True was provided)


In [25]:
class DQN(nn.Module):
    def __init__(self, in_channels=4, num_actions=18):
        """
        Initialize a deep Q-learning network as described in
        https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf
        Arguments:
            input_channel: number of channel of input.
                i.e The number of most recent frames stacked together as describe in the paper
            num_actions: number of action-value to output, one-to-one correspondence to action in game.
        """
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.fc4 = nn.Linear(7 * 7 * 64, 512)
        self.fc5 = nn.Linear(512, num_actions)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.fc4(x.view(x.size(0), -1)))
        return self.fc5(x)

In [26]:
BATCH_SIZE = 32
GAMMA = 0.99
REPLAY_BUFFER_SIZE=1e6
LEARNING_STARTS=5e4
LEARNING_FREQ=4
FRAME_HISTORY_LEN=4
TARGER_UPDATE_FREQ=1e4
GRAD_NORM_CLIPPING=10
NUM_TIMESTEPS = 10

In [80]:
# This is just a rough estimate
num_iterations = float(NUM_TIMESTEPS) / 4.0

# define learning rate and exploration schedules below
lr_multiplier = 1.0
lr_schedule = PiecewiseSchedule([
    (0, 1e-4 * lr_multiplier),
    (num_iterations / 10, 1e-4 * lr_multiplier),
    (num_iterations / 2,  5e-5 * lr_multiplier),
], outside_value=5e-5 * lr_multiplier)

optimizer = OptimizerSpec(
    constructor=optim.Adam,
    kwargs=dict(eps=1e-4),
    lr_schedule=lr_schedule
)

exploration_schedule = PiecewiseSchedule([
    (0, 1.0),
    (1, 0.1),
    (num_iterations / 2, 0.01),
], outside_value=0.01)

In [28]:
def stopping_criterion(t):
    return t >= NUM_TIMESTEPS

In [88]:
dqn_learing(
    env=env,
    q_func=DQN,
    optimizer_spec=optimizer,
    exploration=exploration_schedule,
    stopping_criterion=stopping_criterion,
    replay_buffer_size=1000000,
    batch_size=4,
    gamma=0.99,
    learning_starts=0,
    learning_freq=4,
    frame_history_len=4,
    target_update_freq=10000,
    grad_norm_clipping=10
)

KeyError: <class 'torch.ByteTensor'>