In [None]:
import os
import json
import numpy as np
from dqn_utils.env_wrapper_NES import get_contra_env
from dqn_evaluate_contra import MultiDQNPolicy, evaluate_single_policy, \
    evaluate_multimodel_by_start_image
from dqn_model import DQN
import matplotlib.pyplot as plt

In [None]:
# Number of frames in the frame history fed to the network.
frame_history_len = 4

# One entry per trained model:
# (checkpoint directory, emulator state file name, id used in the checkpoint file name)
demo0 = {
    'models': [
        ('checkpoints_contra_s00', 'Contra_stage0_0.nss', 2000000),
        ('checkpoints_contra_s01', 'Contra_stage0_1.nss', 1750000),
        ('checkpoints_contra_s02', 'Contra_stage0_2.nss', 2000000),
        ('checkpoints_contra_s03', 'Contra_stage0_3.nss', 7350000),
    ],
}
resdir = os.path.abspath('dqn_utils/nes_env/res/')

# Absolute checkpoint directories, in the same order as demo0['models'].
checkpoint_dirs = [os.path.abspath(entry[0]) for entry in demo0['models']]

# Three parallel lists, index k <-> k-th entry of demo0['models'].
log_fnames = [os.path.join(cp_dir, 'latest.json') for cp_dir in checkpoint_dirs]
state_fnames = [os.path.join(resdir, entry[1]) for entry in demo0['models']]
bestmodel_fnames = [
    os.path.join(cp_dir, 'cp{}.torchmodel'.format(entry[2]))
    for cp_dir, entry in zip(checkpoint_dirs, demo0['models'])
]

In [None]:
resdir = os.path.abspath('dqn_utils/nes_env/res/')

# One start-state file and one checkpoint file per mini-stage index 0..3.
trn_start_nss = []
modelfiles = []
for stage_no, checkpoint_id in enumerate([2000000, 1750000, 2010000, 2200000]):
    trn_start_nss.append(
        os.path.join(resdir, 'Contra_stage0_{}.nss'.format(stage_no)))
    modelfiles.append(
        os.path.abspath('checkpoints_contra_s0{}/cp{}.torchmodel'.format(stage_no, checkpoint_id)))

# Per-model evaluation settings; only the last model uses a larger epsilon.
# A separate dict is built for every model so they can be edited independently.
settings = [{'epsilon': eps, 'agamma': 0.8} for eps in (0.02, 0.02, 0.02, 0.3)]

# Bundle passed to evaluate_multimodel_by_start_image.
models = dict(
    trained_models=modelfiles,
    training_start_stages=trn_start_nss,
    eval_settings=settings,
)

In [None]:
# Re-execute the evaluation script so that its current definitions are loaded
# into the notebook namespace (picks up edits made to the file since import).
%run dqn_evaluate_contra.py
# Run the multi-model evaluation with the `models` bundle, starting at
# mini-stage 0. The returned value is kept in `acts` and passed to
# show_game_play in the next cell.
# NOTE(review): 'tmp/play1' is presumably an output directory (the next cell
# reads tmp/play1/s{i}.nss) -- confirm against dqn_evaluate_contra.py.
acts = evaluate_multimodel_by_start_image('tmp/play1', models, start_mini_stage=0)

In [None]:
# Reload the evaluation script's definitions into the notebook namespace.
# NOTE(review): `show_game_play` is not among the names imported at the top of
# the notebook, so it presumably comes from this %run -- confirm.
%run dqn_evaluate_contra.py
# State files s0..s3 under tmp/play1, one per mini-stage index.
nssfiles = [os.path.abspath('tmp/play1/s{}.nss'.format(i)) for i in range(4)]
# Uses `acts` produced by the evaluate_multimodel_by_start_image cell.
show_game_play(nssfiles, acts)

In [None]:
# Evaluate several checkpoints of the fourth model (ids 2000000 .. 2400000 in
# steps of 100000), three times each with differently seeded RNGs.
# NOTE(review): the checkpoint location is an absolute, user-specific path.
S03_CHECKPOINT_TEMPLATE = ('/home/junli/projects/dplay/RUNS/t0821_nature_dqn4/'
                           'checkpoints_contra_s03/cp{}.torchmodel')

for model_id in range(2000000, 2500000, 100000):
    print(model_id)
    checkpoint_path = S03_CHECKPOINT_TEMPLATE.format(model_id)
    for RANDSEED in range(3):
        # A fresh generator per run, so run k always uses seed k.
        rng = np.random.RandomState(RANDSEED)
        # `rec` is overwritten on every run; only the last record survives the loop.
        rec = evaluate_single_policy(state_fnames[3], checkpoint_path, rng,
                                     epsilon=0.3, agamma=0.8)
        #avs = np.stack(rec.predicted_action_values).squeeze()
        #plt.plot(avs.max(axis=1))
        #plt.show()

In [None]:
# Get starting scenario image for each trained model

In [None]:
# `Contra_DQN` below needs `FloatTensor`, but the notebook only imports torch
# and defines `FloatTensor` in a later cell. Define both here (same expression
# as used later) so this cell also works on a fresh kernel.
import torch
FloatTensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

# A temporary environment is created only to read the observation/action
# geometry; it is released again before the policy is built.
env = get_contra_env()
img_h, img_w, img_c = env.observation_space.shape
input_arg = frame_history_len * img_c  # in_channels = #.frame-history * channels per frame
num_actions = env.action_space.n
def Contra_DQN():
    """Construct a new DQN sized for the Contra environment, cast to `FloatTensor`."""
    return DQN(input_arg, num_actions, img_h, img_w).type(FloatTensor)
del env
# Multi-model policy built from the network factory, the environment factory,
# the chosen checkpoint files and the corresponding start-state files.
mp = MultiDQNPolicy(Contra_DQN, get_contra_env, bestmodel_fnames, state_fnames)

In [None]:
# Get one scene image per start state: load the state into a fresh
# environment, reset it, take a single step with action 0 and keep the frame.
scene_images = []
for nss_fname in state_fnames:
    env = get_contra_env(nss_fname)
    env.reset()
    # NOTE(review): action 0 is presumably a no-op -- confirm against the env's action table.
    env.step(0)
    scene_images.append(env.frame())
    # Drop the environment before the next one is created; after the loop no
    # `env` name is left in the namespace.
    del env

In [None]:
# Get models: build one Q-network per checkpoint file and load its weights.
# Relies on `input_arg`, `num_actions`, `img_h`, `img_w` and `FloatTensor`
# defined in other cells.
dqns = []
for mfname in bestmodel_fnames:
    Q = DQN(input_arg, num_actions, img_h, img_w).type(FloatTensor)
    # Load the checkpoint of this iteration. Indexing `bestmodel_fnames` with
    # `mod_id` (a constant set in a later cell) loaded the same file into
    # every network.
    Q.load(mfname)
    dqns.append(Q)

In [None]:
# Check performance to choose the checkpoint to load: read the mean episode
# reward curve from the training log of model `mod_id`.
mod_id = 1
# The last one's log has been corrupted and cannot be parsed as JSON.
log_is_corrupted = (mod_id == 3)
with open(log_fnames[mod_id], 'r') as f:
    if log_is_corrupted:
        txt = f.read()
        # Skip the first 60 characters and parse the rest as comma-separated floats.
        # NOTE(review): presumably the 60-character prefix is the JSON key/header -- confirm.
        mean_rew = np.fromstring(txt[60:], dtype=float, sep=',')
    else:
        logd = json.load(f)
        mean_rew = logd['mean_episode_reward']

In [None]:
# Plot the mean-episode-reward curve loaded in the previous cell (y range
# limited to -10 .. 20) and print the index of its maximum.
plt.clf()
plt.plot(mean_rew)
plt.ylim(-10, 20)
plt.show()
print(np.argmax(mean_rew))

In [None]:
####
# quick dirty testing ...
from dqn_utils.replaybuffer import ReplayBuffer
import torch
from torch.autograd import Variable
from dqn_utils.evaluation import EvaluationRecord
from itertools import count

# Environment used by the rest of this cell; it is deleted at the end of the cell.
env = get_contra_env()

# Replay-buffer capacity and number of frames per encoded observation.
replay_buffer_size = 250000
frame_history_len = 4

# Use the CUDA float tensor type when a GPU is available, else the CPU type.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    FloatTensor = torch.cuda.FloatTensor
else:
    FloatTensor = torch.FloatTensor

def select_greedy_action(model, s):
    """Return the greedy action and the predicted action values for one observation.

    :param model: callable applied to a `Variable` holding a one-sample batch
    :param s: observation as a numpy array; it is converted to `FloatTensor`
        and divided by 255.0 before being fed to the model
    :return: `(greedy_action, predicted_action_values)` -- the argmax index
        along dim 1 (moved to the CPU) and the raw model output (`.data`)
    """
    # Add a leading axis so the single observation forms a one-sample batch.
    batch = torch.from_numpy(s).type(FloatTensor).unsqueeze(0)
    batch = batch / 255.0
    q_values = model(Variable(batch, volatile=True)).data  # type: torch.FloatTensor
    # max over dim=1 yields (max values, argmax indices); only the indices are needed.
    _, best_index = q_values.max(dim=1)
    return best_index.cpu(), q_values

# Network input geometry, read from the environment.
obs_shape = env.observation_space.shape
img_h, img_w, img_c = obs_shape
num_actions = env.action_space.n
input_arg = frame_history_len * img_c  # in_channels = #.frame-history * channels per frame

#Q = DQN(input_arg, num_actions, img_h, img_w).type(FloatTensor)
#Q.load(bestmodel_fnames[mod_id])

# Record filled during play: encoded observations, predicted action values
# and the chosen actions; hidden features are not collected.
rec = EvaluationRecord(
    observations=[],
    hidden_features=None,  # collector_hook.data,
    predicted_action_values=[],
    actions=[],
)

# Second value returned by mp.change_model_when_ready at every step.
scene_image_diff = []

# Mini-stage to start from. Stage 0 starts from the first training start
# state; any other stage starts from a state saved under tmp/play and
# switches the multi-model policy to that stage's model.
mini_stage = 0
if mini_stage != 0:
    next_stage_state_file = os.path.abspath(
        'tmp/play/mini_stage_{}.nss'.format(mini_stage))
    mp.set_current_model(mini_stage)
else:
    next_stage_state_file = state_fnames[0]
    
# Play mini-stages until `mini_stage` reaches 2. Each mini-stage is retried
# (inner `while`) until the multi-model policy reports a model change, which
# is treated here as "mini stage cleared".
while mini_stage < 2:
    print "Now do {}".format(next_stage_state_file)
    env.load_state(next_stage_state_file)
    mini_stage_done = False
    
    while not mini_stage_done:
        # env = get_contra_env(next_stage_state_file)
        # A new replay buffer per episode; it is only used to encode the
        # recent frame history into the network input.
        replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)
        last_obs = env.reset()
        for t in count():
            buf_idx = replay_buffer.store_frame(last_obs)
            recent_observations = replay_buffer.encode_recent_observation()

            # Ask the policy whether to switch models at this step; the second
            # return value is logged for the plots below.
            # NOTE(review): `sc_` is presumably a scene-image difference score -- confirm in MultiDQNPolicy.
            did_change_model, sc_ = mp.change_model_when_ready(env, t)
            scene_image_diff.append(sc_)
            Q = mp.get_current_model()
            if did_change_model:
                # Stage cleared: advance the stage counter and save the
                # emulator state as the start state of the next mini-stage.
                mini_stage_done = True
                print "Mini stage {} cleared".format(mini_stage)
                mini_stage += 1
                # NOTE(review): relative path here, while the setup code above
                # uses os.path.abspath for the same file pattern.
                next_stage_state_file = 'tmp/play/mini_stage_{}.nss'.format(mini_stage)
                env.save_state(next_stage_state_file)
                break
            action, action_values = select_greedy_action(Q, recent_observations)
            # Debug output for the first 10 steps of every episode.
            if t<10:
                print action_values, action 
            # The greedy action is indexed as a 2-d (one-sample batch) result.
            action = action[0, 0]
            action_values = action_values.cpu().numpy()
            rec.observations.append(recent_observations)
            rec.predicted_action_values.append(action_values)
            rec.actions.append(action)
            obs, reward, done, _ = env.step(action)
            # Complete the buffer entry of the frame stored at the top of this step.
            replay_buffer.store_effect(buf_idx, action, reward, done)
            #rr += reward
            last_obs = obs
            # Episode ended without a model change: leave the step loop; the
            # inner `while` then resets the environment and tries again.
            if done:
                break
        
        
# Draw values: the highest predicted action value at every recorded step.
avs = np.vstack(rec.predicted_action_values)
plt.clf()
#for i in range(19):
#    plt.plot(avs[:,i])
plt.plot(avs.max(axis=1))
plt.show()

# Per-step values returned by mp.change_model_when_ready, collected in
# `scene_image_diff` by the play loop in this cell. The name `scs_` that was
# plotted here is never assigned anywhere in the notebook.
plt.clf()
plt.plot(scene_image_diff)
plt.show()

# Release the environment created at the top of this cell.
del env

In [None]:
0 62.9148065476
Model id 140567545210704
1 62.5227864583
Model id 140567545210704
2 60.5958891369
Model id 140567545210704
3 63.6144903274
Model id 140567545210704


# Zoomed view (steps 100..450, values 10..30) of the per-step values collected
# in `scene_image_diff` by the mini-stage play loop. The name `scs_` that was
# plotted here is never assigned anywhere in the notebook.
plt.plot(scene_image_diff, 'b.-')
plt.ylim([10,30])
plt.xlim([100,450])
# Boolean instead of the legacy 'on' string to switch the grid on.
plt.grid(True)
plt.show()

In [None]:
# Highest predicted action value per recorded step, first 100 steps only.
avs = np.vstack(rec.predicted_action_values)
best_action_value = avs.max(axis=1)
plt.clf()
#for i in range(19):
#    plt.plot(avs[:,i])
plt.plot(best_action_value)
plt.xlim(0, 100)
plt.show()


In [None]:
# Release the environment if it is still around. The mini-stage cell already
# ends with `del env`, so on a top-to-bottom run the name no longer exists and
# an unguarded `del env` raises NameError.
try:
    del env
except NameError:
    pass

# Test Meta Policies

In [None]:
from dqn_utils.replaybuffer import ReplayBuffer
import torch
from torch.autograd import Variable
from dqn_utils.evaluation import EvaluationRecord


# Replay-buffer capacity and number of frames per encoded observation.
replay_buffer_size = 250000
frame_history_len = 4

# Use the CUDA float tensor type when a GPU is available, else the CPU type.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    FloatTensor = torch.cuda.FloatTensor
else:
    FloatTensor = torch.FloatTensor


def evaluate(env, saved_models, maxsteps, meta_policy):
    """
    Play one episode in which every saved model proposes its greedy action at
    each step and `meta_policy` decides whose proposal is executed.

    :param env: environment to play in; this function uses its
        `observation_space.shape`, `action_space.n`, `reset()` and `step(action)`
    :param saved_models: list of saved model file names
    :param maxsteps: upper bound on the number of environment steps
    :param meta_policy: how to choose acting policy; called as
        `meta_policy(t, action_value_predictions)` and expected to return the
        index (into `saved_models`) of the model whose action is executed
    :return: list with one EvaluationRecord per saved model (same order),
        holding the encoded observations and that model's predicted action
        values and greedy action for every step
    """
    # - greedy policy
    def select_greedy_action(model, s):
        s_ = torch.from_numpy(s).type(FloatTensor).unsqueeze(0) / 255.0
        # unsqueeze(0) => to make the observation a one-sample batch
        predicted_action_values = model(Variable(s_, volatile=True)).data  # type: torch.FloatTensor
        greedy_action = predicted_action_values.max(dim=1)[1].cpu()
        # the 2nd return val of max is the index of the max (argmax) in each row (since
        # we have specified dim=1 in the function call)
        return greedy_action, predicted_action_values

    img_h, img_w, img_c = env.observation_space.shape
    input_arg = frame_history_len * img_c  # in_channels = #.frame-history * channels per frame
    num_actions = env.action_space.n
    # Only used to encode the recent frame history into the network input.
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    # One network and one (initially empty) record per saved model.
    recs = []
    Qs = []
    for fn in saved_models:
        Q = DQN(input_arg, num_actions, img_h, img_w).type(FloatTensor)
        Q.load(fn)
        Qs.append(Q)
        rec = EvaluationRecord(
                observations=[],
                hidden_features=None, #collector_hook.data,
                predicted_action_values=[],
                actions=[])
        recs.append(rec)

    last_obs = env.reset()
    for t in range(maxsteps):
        buf_idx = replay_buffer.store_frame(last_obs)
        recent_observations = replay_buffer.encode_recent_observation()

        # Let every model propose an action for the same observation.
        action_candidates = []
        action_value_predictions = []
        for Q, rec in zip(Qs, recs):
            action, action_values = select_greedy_action(Q, recent_observations)
            action = action[0, 0]
            action_candidates.append(action)
            action_values = action_values.cpu().numpy()
            action_value_predictions.append(action_values)
            rec.observations.append(recent_observations)
            rec.predicted_action_values.append(action_values)
            rec.actions.append(action)

        act_model_id = meta_policy(t, action_value_predictions)
        action = action_candidates[act_model_id]
        obs, reward, done, _ = env.step(action)
        # Complete the buffer entry of the frame stored above. Without this the
        # buffer's "done" flags are never written, which (see the notes on the
        # replay memory in this notebook) corrupts the encoded observations.
        replay_buffer.store_effect(buf_idx, action, reward, done)
        #rr += reward
        last_obs = obs
        if done:
            break
    return recs
        

In [None]:
# Environment created from the first start-state file; used by the
# meta-policy evaluation in the next cell.
env = get_contra_env(state_fnames[0])

In [None]:
# NOTE(review): `MetaPolicy` is not among the names imported at the top of the
# notebook; presumably it is provided by `%run dqn_evaluate_contra.py` -- confirm.
# NOTE(review): this rebinds `mp`, which an earlier cell bound to the
# MultiDQNPolicy used by the mini-stage play loop; re-running that loop after
# this cell would pick up the MetaPolicy object instead.
mp = MetaPolicy()
# One record per checkpoint in `bestmodel_fnames`, for at most 5000 steps.
recs = evaluate(env, bestmodel_fnames, maxsteps=5000, meta_policy=mp)

In [None]:
# Release the environment created for the meta-policy run.
# NOTE(review): a later cell calls env.reset() again without re-creating `env`.
del env

In [None]:
# Plot recorded predicted action values during the game: for every model, the
# highest predicted action value at each step, one labelled curve per model.
max_expect_rewards = [
    np.vstack(rec_.predicted_action_values).max(axis=1) for rec_ in recs
]

plt.clf()
# plt.plot returns a list holding the single line drawn for each 1-d curve.
axhandles = [plt.plot(curve)[0] for curve in max_expect_rewards]
labels = ["model-{}".format(k) for k in range(len(max_expect_rewards))]
plt.legend(axhandles, labels)
plt.show()

In [None]:
# NOTE(review): relies on `env` from an earlier cell; the preceding cells
# delete it, so it has to be re-created before running this cell.
replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

last_obs = env.reset()
buf_idx = replay_buffer.store_frame(last_obs)
# Advance the game by 11 steps of action 0; only the last observation is kept.
for i in range(11):
    last_obs, reward, done, _ = env.step(0)
    if i == 0:
        # Complete the buffer entry of the frame stored above with the step
        # that followed it. Without this the frame's "done" flag is never
        # written, which (see the notes on the replay memory in this notebook)
        # corrupts the encoded observations.
        replay_buffer.store_effect(buf_idx, 0, reward, done)
# Accumulated reward of the rollout in the next cell.
rr = 0

In [None]:
# Greedy rollout of at most 10000 steps.
# NOTE(review): relies on `Q`, `rec`, `select_greedy_action`, `env`,
# `replay_buffer`, `last_obs` and `rr` left behind by earlier cells.
for t in range(10000):
    buf_idx = replay_buffer.store_frame(last_obs)
    recent_observations = replay_buffer.encode_recent_observation()
    action, action_values = select_greedy_action(Q, recent_observations)
    # The greedy action is indexed as a 2-d (one-sample batch) result.
    action = action[0, 0]
    action_values = action_values.cpu().numpy()
    rec.observations.append(recent_observations)
    rec.predicted_action_values.append(action_values)
    rec.actions.append(action)
    obs, reward, done, _ = env.step(action)
    # Complete the buffer entry of the frame stored at the top of this step.
    # Without this the buffer's "done" flags are never written, which (see the
    # notes on the replay memory in this notebook) corrupts the encoded
    # observations.
    replay_buffer.store_effect(buf_idx, action, reward, done)
    rr += reward
    last_obs = obs
    if done:
        break

In [None]:
# Stack the per-step predicted action values recorded in `rec` row-wise
# (one row per recorded step); plotted in the next cell.
avs = np.vstack(rec.predicted_action_values)

In [None]:
# Highest predicted action value at every recorded step.
plt.clf()
#for i in range(19):
#    plt.plot(avs[:,i])
plt.plot(np.max(avs, axis=1))
plt.show()

In [None]:
# rec = dqn_evaluate(env=env, q_func=DQN, trained_model_fname=bestmodel_fname, max_eval_steps=500)

# Test if we can repeat our experiment
SUCCEED 
DEBUGGED

In [None]:
import os
%run dqn_evaluate_contra.py

In [None]:
# First run of the repeatability test: third start state and its checkpoint,
# with a random generator seeded by RANDSEED.
nssfile = os.path.abspath('dqn_utils/nes_env/res/Contra_stage0_2.nss')
modelfile = os.path.abspath('checkpoints_contra_s02/cp2000000.torchmodel')
RANDSEED = 0
rng = np.random.RandomState(RANDSEED)
first_run = test_repeat_model_eval(nssfile, modelfile, rng)
# Recorded observations, actions, action values, frames, and a fifth value.
# NOTE(review): `rb1` is presumably the replay buffer of the run -- confirm.
obs_rec_1, act_rec_1, actval_rec_1, frm_rec_1, rb1 = first_run

In [None]:
# Second run with an identically seeded generator; the observations recorded
# in the first run are passed along under the key 'obs'.
# NOTE(review): presumably used inside test_repeat_model_eval to compare the
# encoded observations step by step -- confirm.
rng = np.random.RandomState(RANDSEED)
second_run = test_repeat_model_eval(nssfile, modelfile, rng, {'obs': obs_rec_1})
obs_rec_2, act_rec_2, actval_rec_2, frm_rec_2, rb2 = second_run
    

In [None]:
# Turn the per-step recordings of both runs into arrays (first axis = step).
obs_1, act_1, val_1, frm_1 = [
    np.stack(recording)
    for recording in (obs_rec_1, act_rec_1, actval_rec_1, frm_rec_1)]
obs_2, act_2, val_2, frm_2 = [
    np.stack(recording)
    for recording in (obs_rec_2, act_rec_2, actval_rec_2, frm_rec_2)]

# Number of recorded steps per run; only the steps present in both runs
# (the first `n`) are compared below.
n1, n2 = obs_1.shape[0], obs_2.shape[0]
n = min(n1, n2)

print("Observation numbers: test-1: {}; test-2: {}".format(n1, n2))
print("Observation shape {}".format(obs_1.shape[1:]))
print("Frame shape {}".format(frm_1.shape[1:]))
print("Act value shape {}".format(val_1.shape[1:]))

def fn_diff(x1, x2):
    """Per-step maximum absolute difference between two recordings.

    Compares the first `n` entries of both arrays (`n` is the module-level
    number of steps common to both runs).

    :param x1: array whose first axis is the step index
    :param x2: array of the same per-step shape as `x1`
    :return: float array of length `n`; entry k is the largest absolute
        element-wise difference at step k
    """
    # Subtract in float64: with unsigned integer inputs (e.g. 8-bit image
    # data) the difference would wrap around instead of going negative and
    # report a far too large value.
    a = np.asarray(x1[:n], dtype=np.float64)
    b = np.asarray(x2[:n], dtype=np.float64)
    return np.abs(a - b).reshape(n, -1).max(axis=1)
    
# Per-step maximum absolute difference between the two runs for observations,
# actions, action values and frames.
obs_d, act_d, val_d, frm_d = [
    fn_diff(first, second)
    for first, second in ((obs_1, obs_2), (act_1, act_2),
                          (val_1, val_2), (frm_1, frm_2))]

def str_first_nonzero(x): # x must be 1d
    """Describe the first non-zero entry of a 1-d array.

    :param x: 1-d numpy array
    :return: "[index]:value" with the value printed with two decimals, or the
        string "None" when every entry is zero
    """
    nonzero_positions = np.nonzero(x)[0]
    if nonzero_positions.size == 0:
        return "None"
    first = nonzero_positions[0]
    return "[{}]:{:.2f}".format(first, x[first])

# Report, for each recorded quantity, the first step at which the two runs
# differ (or "None" when they never do).
print("Compare observations: {}".format(str_first_nonzero(obs_d)))
print("Compare actions: {}".format(str_first_nonzero(act_d)))
print("Compare values: {}".format(str_first_nonzero(val_d)))
print("Compare frames: {}".format(str_first_nonzero(frm_d)))

The output of the above cell shows that there are problems in the replay memory:

Observation numbers: test-1: 40; test-2: 44
Observation shape (12, 112, 128)
Frame shape (112, 128, 3)
Act value shape (1, 19)
Compare observations: [33]:240.00
Compare actions: [33]:12.00
Compare values: [33]:1.88
Compare frames: [34]:252.00


The history given to the memory is exactly the same; the difference occurs in the observation (the input to the model). The difference in the frames only follows because different actions were chosen. Also, I remember that in Atari games the size of the replay memory matters in evaluation, which makes no sense.

Check replay memory, by:

- Run the experiment once, and save the output of the replay memory for the model
- In the second time of running the game, check the output for each time step

We must save the effect in the replay memory; otherwise the "done" flags are random and will seriously affect the encoded observations.

# Test saved trained models
SUCCEED

In [None]:
import os
%run dqn_evaluate_contra.py
# Run the repeatability evaluation once for every (start state, checkpoint)
# pair, using the original training start states of mini-stages 0..3.
stagefiles = []
modelfiles = []
for i, ci in enumerate([2000000, 1750000, 2000000, 3500000]):
    stagefiles.append(
        os.path.abspath('dqn_utils/nes_env/res/Contra_stage0_{}.nss'.format(i)))
    modelfiles.append(
        os.path.abspath('checkpoints_contra_s0{}/cp{}.torchmodel'.format(i, ci)))

for nssfile, mfile in zip(stagefiles, modelfiles):
    # Every pair is evaluated with an identically seeded generator.
    RANDSEED = 0
    rng = np.random.RandomState(RANDSEED)
    # The results are overwritten on every iteration; only the last pair's survive.
    last_run = test_repeat_model_eval(nssfile, mfile, rng)
    obs_rec, act_rec, actval_rec, frm_rec, rb1 = last_run

In [None]:
import os
%run dqn_evaluate_contra.py
# Same evaluation as the previous cell, but starting from the states
# tmp/play1/s0..s3 instead of the original training start states.
stagefiles = []
modelfiles = []
for i, ci in enumerate([2000000, 1750000, 2010000, 3500000]):
    stagefiles.append(os.path.abspath('tmp/play1/s{}.nss'.format(i)))
    modelfiles.append(
        os.path.abspath('checkpoints_contra_s0{}/cp{}.torchmodel'.format(i, ci)))

for nssfile, mfile in zip(stagefiles, modelfiles):
    # Every pair is evaluated with an identically seeded generator.
    RANDSEED = 0
    rng = np.random.RandomState(RANDSEED)
    # The results are overwritten on every iteration; only the last pair's survive.
    last_run = test_repeat_model_eval(nssfile, mfile, rng)
    obs_rec, act_rec, actval_rec, frm_rec, rb1 = last_run