In [1]:
import numpy as np
import os
import gym
import torch
from torch.autograd import Variable
import matplotlib.pyplot as plt
from matplotlib import gridspec
from sklearn.manifold import TSNE
from dqn_model import DQN
from dqn_learn import OptimiserSpec, dqn_learning
from dqn_utils.mygym import get_env
from dqn_utils.atari_wrapper import wrap_deepmind
from dqn_utils.replaybuffer import ReplayBuffer
from dqn_utils.evaluation import *
from collections import namedtuple
# Choose the float tensor type once for the whole notebook: the CUDA variant
# when torch reports an available GPU, the CPU variant otherwise.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    FloatTensor = torch.cuda.FloatTensor
else:
    FloatTensor = torch.FloatTensor

In [None]:
# sanity check
import time

# - environment: task #3 of the 'Atari40M' benchmark, built with a fixed seed
seed = 0
GAME_ID_PONG = 3
benchmark = gym.benchmark_spec('Atari40M')
task = benchmark.tasks[GAME_ID_PONG]
env = get_env(task, seed)

# - memory: the buffer is used below to store frames and encode the recent ones
frame_history_len = 4
replay_buffer_size = 1000000
replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

# - sizes derived from the environment
num_actions = env.action_space.n
img_h, img_w, img_c = env.observation_space.shape
input_arg = img_c * frame_history_len  # in_channels = channels per frame * #.frame-history

# - model: build the network and load the saved checkpoint
Q = DQN(input_arg, num_actions).type(FloatTensor)
Q.load('checkpoints/cp7390000.torchmodel')
# - policy
def select_greedy_action(model, s):
    """Return the greedy action and all action values for one observation.

    Args:
        model: callable network; it is invoked on a one-sample batch.
        s: numpy array holding the observation (converted via torch.from_numpy).

    Returns:
        A pair (greedy_action, predicted_action_values): the argmax index along
        dim 1, moved to the CPU, and the raw tensor of predicted values as
        produced by the model.
    """
    # Convert to FloatTensor, add a leading batch axis, then divide by 255.
    batch = torch.from_numpy(s).type(FloatTensor).unsqueeze(0) / 255.0
    q_values = model(Variable(batch, volatile=True)).data  # type: torch.FloatTensor
    # max(dim=1) yields (max values, argmax indices); only the indices are needed.
    _, best_index = q_values.max(dim=1)
    return best_index.cpu(), q_values

last_obs = env.reset()
t = 0

rr = 0
while t in range(1000):
    replay_buffer.store_frame(last_obs)
    recent_observations = replay_buffer.encode_recent_observation()
    action, action_values = select_greedy_action(Q, recent_observations)
    action = action[0, 0]
    action_values = action_values.cpu().numpy()
    obs, reward, done, _ = env.step(action)
    last_obs = obs
    if done:
        break
    #env.render()
    rr += reward
    print "\r {}: {:.2f}".format(t, rr),

In [3]:
# Init the game environment
seed = 0
# NOTE(review): the constant is named GAME_ID_PONG, but in the run recorded
# below this cell index 6 created SpaceInvadersNoFrameskip-v4 -- consider
# renaming once all users of the name are known.
GAME_ID_PONG = 6
benchmark = gym.benchmark_spec('Atari40M')
task = benchmark.tasks[GAME_ID_PONG]
env = get_env(task, seed)
# Alternative ways of constructing the environment, currently disabled:
#env = gym.make(task.env_id)
#env = wrap_deepmind(env)
## env = wrappers.Monitor(env, 'tmp/evaluation-monitor', force=True)

# prepare demo output
PLOT_FEATURE = True
trained_model_fname = 'checkpoints_alieninvader/cp9000000.torchmodel'
cpfname = os.path.split(os.path.splitext(trained_model_fname)[0])[1]
output_image_prefix = 'tmp/{}/{}/f'.format(task.env_id, cpfname)
output_image_dir = os.path.split(output_image_prefix)[0]
print output_image_dir
if not os.path.exists(output_image_dir):
    os.makedirs(output_image_dir)
ffmpeg_cmd = "ffmpeg -y -framerate 10 -i {}_%05d.png -c:v " \
    "libx264 -pix_fmt yuv420p {}/out.mp4".format(
        os.path.abspath(output_image_prefix),
        os.path.abspath(output_image_dir))
print ffmpeg_cmd

[2017-08-29 09:05:52,833] Making new env: SpaceInvadersNoFrameskip-v4
[2017-08-29 09:05:53,111] Finished writing results. You can upload them to the scoreboard via gym.upload('/home/junli/projects/dplay/RUNS/t0821_nature_dqn4/tmp/gym-results')


tmp/SpaceInvadersNoFrameskip-v4/cp9000000
ffmpeg -y -framerate 10 -i /home/junli/projects/dplay/RUNS/t0821_nature_dqn4/tmp/SpaceInvadersNoFrameskip-v4/cp9000000/f_%05d.png -c:v libx264 -pix_fmt yuv420p /home/junli/projects/dplay/RUNS/t0821_nature_dqn4/tmp/SpaceInvadersNoFrameskip-v4/cp9000000/out.mp4


In [None]:
print env.action_space.contains(6)

In [8]:
# ACTION meanings
import time
EXPLORE_ACTION = False
if EXPLORE_ACTION:
    actions = [0, 0, 0, 2, 2, 2, 5, 5, 5, 4, 4, 4]
    env = gym.make(task.env_id)
    env.reset()
    for A in actions:
        for i in range(100):
            env.step(A)
            env.render()
            time.sleep(0.02)
            print "\r", A,
# Pong    
# action_labels = ['NOOP', 'NOOP', 'UP', 'DOWN', 'UP', 'DOWN']
# SpaceInvader
action_labels = ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'R+F', 'L+F']
#action_labels = None

In [4]:
# Make model
# - game configurations
max_evaluation_steps = 10000
frame_history_len = 4
replay_buffer_size = 1000000 # not training, just for the frame-encoding function
replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

# - init model parameters (derived from the environment)
num_actions = env.action_space.n
img_h, img_w, img_c = env.observation_space.shape
input_arg = img_c * frame_history_len  # in_channels = channels per frame * #.frame-history

# - greedy policy
def select_greedy_action(model, s):
    """Return the greedy action and all action values for one observation.

    Args:
        model: callable network; it is invoked on a one-sample batch.
        s: numpy array holding the observation (converted via torch.from_numpy).

    Returns:
        A pair (greedy_action, predicted_action_values): the argmax index along
        dim 1, moved to the CPU, and the raw tensor of predicted values as
        produced by the model.
    """
    # Convert to FloatTensor, add a leading batch axis, then divide by 255.
    batch = torch.from_numpy(s).type(FloatTensor).unsqueeze(0) / 255.0
    q_values = model(Variable(batch, volatile=True)).data  # type: torch.FloatTensor
    # max(dim=1) yields (max values, argmax indices); only the indices are needed.
    _, best_index = q_values.max(dim=1)
    return best_index.cpu(), q_values
    
collector_hook = FeatureCollector()
Q = DQN(input_arg, num_actions).type(FloatTensor)
if os.path.exists(trained_model_fname):
    Q.load(trained_model_fname)
else:
    print "No such file!!"
if PLOT_FEATURE:
    tmp_mod_list = [m for m in Q.modules()]
    fc5 = tmp_mod_list[-1] # get the last forward layer
    fc5.register_forward_hook(collector_hook)


# Test cell, see if the first step can run
# Disabled by default. When enabled it resets the environment, stores the
# first frame in replay_buffer and runs one greedy action selection on it.
# NOTE(review): enabling this stores a frame in the shared replay_buffer and
# runs Q once (which presumably also triggers the registered forward hook)
# before the evaluation cell starts -- confirm that leftover state is harmless.
TEST_STEP = False
if TEST_STEP:
    last_obs = env.reset()
    replay_buffer.store_frame(last_obs)
    recent_observations = replay_buffer.encode_recent_observation()
    action, action_values = select_greedy_action(Q, recent_observations)

In [5]:
# Run the game and collect all we needed.
last_obs = env.reset()
t = 0
rec = EvaluationRecord(
    observations = [],
    final_features = collector_hook.data,
    predicted_action_values = [],
    actions = []
)

rr=0
for t in range(max_evaluation_steps):
    replay_buffer.store_frame(last_obs)
    recent_observations = replay_buffer.encode_recent_observation()
    action, action_values = select_greedy_action(Q, recent_observations)
    action = action[0, 0]
    action_values = action_values.cpu().numpy()
    rec.observations.append(recent_observations)
    rec.predicted_action_values.append(action_values)
    rec.actions.append(action)
    obs, reward, done, _ = env.step(action)
    last_obs = obs
    if done:
        break
    rr += reward
    print "\r {}: {:.2f}".format(t, rr),

[2017-08-29 09:06:00,329] Starting new video recorder writing to /home/junli/projects/dplay/RUNS/t0821_nature_dqn4/tmp/gym-results/openaigym.video.1.24018.video000000.mp4


 1580: 70.00                      


# Visualisation

In [6]:
# Embed the collected features with t-SNE for plotting; feat2d stays None when
# feature plotting is disabled.
if PLOT_FEATURE:
    # t-SNE is stochastic: pin its random state (reusing the notebook's `seed`)
    # so a re-run yields the same embedding.
    tsne = TSNE(random_state=seed)
    # NOTE(review): the record is created with a `final_features` field but is
    # read here as `hidden_features` -- confirm EvaluationRecord exposes both.
    feat2d = tsne.fit_transform(np.vstack(rec.hidden_features))
else:
    feat2d = None

In [9]:
# Draw the evaluation for the recorded episode. The helper receives the record,
# the 2-D features (or None), the output image prefix and the action labels;
# presumably it writes its images under output_image_prefix -- see
# dqn_utils.evaluation to confirm.
draw_state_evaluation(
    rec, feat2d, output_image_prefix, action_labels=action_labels)

T=1581                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  