In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the project's .py files
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
%%javascript
var nb = IPython.notebook;
var kernel = IPython.notebook.kernel;
var command = "FULL_NOTEBOOK = '" + nb.base_url + nb.notebook_path + "'";
kernel.execute(command);

In [None]:
THIS_NOTEBOOK = FULL_NOTEBOOK.split('/')[-1]
print "Current Notebook: {}".format(THIS_NOTEBOOK) 

# Prerequisites

In [None]:
import imp  # Python 2
import json
import os
import socket
import subprocess
import time
from collections import deque

import gym
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable

from dplay_utils.tensordata import to_tensor_f32
# Identify the machine so that the project folder can be configured per host.
# socket.gethostname() asks the OS directly: no shell is spawned and there is
# no child process or pipe left to clean up.
hostname = socket.gethostname()

# True when torch reports a usable CUDA device.
USE_CUDA = torch.cuda.is_available()

if hostname == 'maibu':
    REL_PROJ_PATH = 'local/projects/dplay'
else:   # configure project folder on different machines
    REL_PROJ_PATH = 'projects/dplay'

# Absolute project root: $HOME joined with the per-host relative path.
FULL_PROJ_PATH = os.path.join(os.environ['HOME'], REL_PROJ_PATH)

# Prerequisites-END

# Components

### Data manage
These components work with the Environment: they preprocess the data and handle GPU/CPU transfers.

In [None]:
from experience_managers.preprocessors import GymAtariFramePreprocessor_Stacker, GymAtariFramePreprocessor_Diff

In [None]:
from games.aigym import AtariEnvironment_Pong

In [None]:
from experience_managers.mem import ExperienceMemory

In [None]:
# Test Memory and Environment - 1, Save experience
%pylab inline
preproc = GymAtariFramePreprocessor_Stacker()
env = AtariEnvironment_Pong()
mem = ExperienceMemory(500, 0.99)
s = preproc(env.reset())
ei = 0
t = 0
while ei<10:
    print "\rt {}".format(t),
    sys.stdout.flush()
    s1, r, term, _ = env.step(1)
    s1 = preproc(s1)
    mem.add_experience(ei, s, 1, r, term)
    
    if term:
        s = preproc(env.reset())
        ei += 1
    else:
        s = s1
    env.render()
    t += 1
    
plot(mem.experience['advantages'])
plot(mem.experience['rewards'], 'r')

In [None]:
# Test Memory and Environment - 2: Sanity check.
test_states, test_actions, test_advs = mem.get_training_batch(
    episodes=[8,9])

print test_states.numpy().shape
plot(test_advs.numpy())

### Networks
The neural network takes states and produces the desired assessments. A network consists of two parts, an encoder and a decoder:
- Encoder: This part of the model is generic -- once the feature extractor has been learned, it can be adapted to other tasks with a different input format (same number of channels), and it is independent of the task-specific target. See below.
- Decoder: It takes the features and produces the outputs, e.g. in Q-learning the targets are action values; in policy gradient, the targets are next-action probabilities.

In [None]:
from networks.encoders.conv_encoders import DeepConvEncoder, DummyEncoder

In [None]:
from networks.decoders.policy_decoders import Decoder

In [None]:
from networks.nets import RLNet

In [None]:
# Sanity check
encoder_opts = {
    'input_channels': 3,
    'convs': [
        {'kernel_size':3, 'conv_kernels': 32, 'pool_factor': 2, 'relu': True},
        {'kernel_size':3, 'conv_kernels': 64, 'pool_factor': 2, 'relu': True},
    ]
}

encoder = DeepConvEncoder(encoder_opts)

nin = encoder.get_feature_num({'height':preproc.im_height, 'width':preproc.im_width})

decoder_opts = {
    'input_num': nin,
    'fc1_hidden_unit_num': 256,
    'output_num':4
}

decoder = Decoder(decoder_opts)
net = RLNet(encoder, decoder)
y = net(Variable(test_states, requires_grad=False))
yv = y.data.numpy()
print yv.shape
print yv[:20]

### Policy
Policies select an action for a state. The state is given in a preprocessed form that is ready to be taken by an RLNet object, which produces an assessment of the state. The policy then chooses an action accordingly.

In [None]:
from policies.sa_policies import Policy

In [None]:
# Sanity check of policy
po = Policy(net)
s_ = preproc(env.reset())
print po.get_action(s_)

### Trainer
A Trainer takes recent experience and adjusts the model parameters to minimise a loss. Hopefully, a smaller loss will lead to better performance.

In [None]:
from rl.train import OneStepPolicyGradientTrainer

In [None]:
# Sanity check of trainer.
opts_ = {'Optimiser': torch.optim.Adagrad, 'learning_rate':1e-6}
trainer = OneStepPolicyGradientTrainer(net, mem, opts_)
trainer.step()

The negative log-likelihood loss (NLLLoss) is defined as
$$
\sum_n - \log P_{n_i}
$$
where $n_i$ is the actual class of the $n$-th sample and $P$ is the predicted probability. Minimising the negative log-probability pushes the network to increase the probability of the classes that actually occur. I.e., when the $n_i$-th class is the true class of the $n$-th sample, you would like the model to predict a higher probability of class $n_i$ for the $n$-th sample next time.

In RL, we introduce the concept of *advantage*: instead of always increasing the likelihood of the actual action, we allow the probability to move both ways -- it is increased if the chosen action turns out to be a good one and, conversely, it is decreased for decisions that turn out to be bad.

### Keeper
A Keeper maintains information about the training, such as the number of epochs, episodes and minibatches. Its methods can be thought of as **callbacks** -- to be invoked by the learning algorithm on various occasions, such as when and how to save/load models, when to stop, and when to perform evaluation.

In [None]:
from rl.keeppg import Keeper

In [None]:
# Sanity check of Keeper
# 1. setup
preproc = GymAtariFramePreprocessor_Stacker()
env = AtariEnvironment_Pong()
mem = ExperienceMemory(500, 0.99)
encoder_opts = {
    'input_channels': 3,
    'convs': [
        {'kernel_size':3, 'conv_kernels': 32, 'pool_factor': 2, 'relu': True},
        {'kernel_size':3, 'conv_kernels': 64, 'pool_factor': 2, 'relu': True},
    ]
}

encoder = DeepConvEncoder(encoder_opts)

nin = encoder.get_feature_num({'height':preproc.im_height, 'width':preproc.im_width})

decoder_opts = {
    'input_num': nin,
    'fc1_hidden_unit_num': 256,
    'output_num':4
}

decoder = Decoder(decoder_opts)
net = RLNet(encoder, decoder)

po = Policy(net)

trainer = OneStepPolicyGradientTrainer(net, mem, 
    {'Optimiser': torch.optim.Adagrad, 'learning_rate':1e-4})

path_opts = {
    'BASE_PATH': '/Users/junli/local/projects/dplay',
    'RUN_PATH': 'RUNS',
    'experiment_id': 'TEST01_sanitychk'}

running_dir = os.path.join(path_opts['BASE_PATH'], 
                           path_opts['RUN_PATH'], 
                           path_opts['experiment_id'])

save_dir = os.path.join(running_dir, 'checkpoints')

if not os.path.exists(running_dir):
    os.mkdir(running_dir)  # NOT using makedirs, I want the 
    # users to be responsible for the parent directory (and 
    # overall structure)
if not os.path.exists(save_dir):
    os.mkdir(save_dir)

keeper = Keeper([encoder, decoder, po, mem], {
    'train_every_n_episodes': 1,
    'save_every_n_training_steps': 10,
    'draw_every_n_training_steps': -1,
    'max_training_steps': 100,
    'save_path': save_dir,
    'report': {
                'save_checkpoint': True,
                'every_n_steps': 1,
                'every_n_training': 1,
                'every_n_episodes': 1, 
                'every_n_time_records': 100}})

In [None]:
# 2. get some data for training
# Play the environment with the current policy, storing every transition,
# until the keeper signals that a training step is due.
state = preproc.process(env.reset())

while not keeper.need_train:
    action, action_prob = po.get_action(state)
    next_state, reward, is_terminal, _ = env.step(action)
    next_state = preproc.process(next_state)
    ep = keeper.records['episodes']
    # The trailing None: no stored prediction is passed with the transition.
    mem.add_experience(ep, state, action, reward, is_terminal, None)

    # Advance the current state; without this the policy would keep acting
    # on (and the memory would keep storing) the very first observation.
    if is_terminal:
        state = preproc.process(env.reset())
    else:
        state = next_state

    keeper.record_env_step(reward, is_terminal)

In [None]:
# 3. do training
# Run 10 training steps, reporting each loss to the keeper and printing it.
for i in range(10):
    loss = trainer.step()
    keeper.record_train_step(loss)
    print i, loss

In [None]:
# 4. save and load
# Ask the keeper to save.
# NOTE(review): presumably this checkpoints the objects passed to Keeper
# under its 'save_path' -- confirm in rl.keeppg.Keeper.
keeper.save()

In [None]:
# GO ABOVE, re-initialise the encoder/decoder / re-collect experience, see if
# the training starts from where it is supposed to 
# Load via the keeper, then run 10 more training steps, recording and
# printing each loss.
keeper.load()
for i in range(10):
    loss = trainer.step()
    keeper.record_train_step(loss)
    print i, loss

In [None]:
# Full interaction/training loop driven by the keeper's flags: act, store the
# transition, then train / save / draw / report whenever the keeper asks.
keeper.load()
state = preproc.process(env.reset())
while not keeper.need_stop:
    action, action_prob = po.get_action(state)
    next_state, reward, is_terminal, _ = env.step(action)
    next_state = preproc.process(next_state)
    # Tag the transition with the current episode index, as the other
    # collection loops in this notebook do.
    ep = keeper.records['episodes']
    mem.add_experience(ep, state, action, reward, is_terminal, None)
    if is_terminal:
        state = preproc.process(env.reset())
    else:
        state = next_state

    keeper.record_env_step(reward, is_terminal)

    if keeper.need_train:  # TODO train condition call back
        loss = trainer.step()
        keeper.record_train_step(loss)

    if keeper.need_save:
        keeper.save()

    if keeper.need_draw:
        env.render()

    keeper.report_step()

# Components-END

# Framework-F1

In [None]:
# Framework definition:
# **Necessary to run this cell** to create experiment package for this framework
# Maps each role in the RL pipeline to the class that implements it; the
# components are instantiated from this table at the end of the cell.
RL_components = {
    'Preprocessor': GymAtariFramePreprocessor_Stacker,
    'ExperienceMemoryManager': ExperienceMemory,
    'Encoder': DeepConvEncoder,
    'Decoder': Decoder,
    'RLNet': RLNet,
    'Policy': Policy,
    'Environment': AtariEnvironment_Pong,
    'Trainer': OneStepPolicyGradientTrainer,
    'Keeper': Keeper,
}

# Passed as keyword arguments to the experience memory constructor.
experience_opts = {
    'capacity': 1000,
    'discount': 0.99
}

encoder_opts = {
    'input_channels': 3,
    'convs': [
        {'kernel_size':3, 'conv_kernels': 32, 'pool_factor': 2, 'relu': True},
        {'kernel_size':3, 'conv_kernels': 64, 'pool_factor': 2, 'relu': True},
    ]
}

# 'input_num' is filled in below, once the encoder reports its feature count.
decoder_opts = {
    'input_num': None,
    'fc1_hidden_unit_num': 256,
    'output_num':4
}

trainer_opts = {'Optimiser': torch.optim.Adagrad, 'learning_rate':1e-4}

# Run directory layout: <BASE_PATH>/<RUN_PATH>/<experiment_id>/checkpoints
path_opts = {
    'BASE_PATH': FULL_PROJ_PATH,
    'RUN_PATH': 'RUNS',
    'experiment_id': 'TEST01a'}

running_dir = os.path.join(path_opts['BASE_PATH'], 
                           path_opts['RUN_PATH'], 
                           path_opts['experiment_id'])

save_dir = os.path.join(running_dir, 'checkpoints')

if not os.path.exists(running_dir):
    os.mkdir(running_dir)  # NOT using makedirs, I want the 
    # users to be responsible for the parent directory (and 
    # overall structure)
if not os.path.exists(save_dir):
    os.mkdir(save_dir)

# Keeper schedule and reporting options.
# NOTE(review): a value of -1 presumably disables the corresponding action
# -- confirm in rl.keeppg.Keeper.
keeper_opts = {
    'train_every_n_episodes': 1,
    'save_every_n_training_steps': 5000,
    'draw_every_n_training_steps': -1,
    'max_training_steps': 2000000,
    'save_path': save_dir,
    'report': {'save_checkpoint': True,
               'every_n_steps': -1,
               'every_n_training': 1,
               'every_n_episodes': 1,
               'every_n_time_records': 100}
}
    
# CREATE LEARNING COMPONENTS
env = RL_components['Environment']()
preproc = RL_components['Preprocessor']()
mem = RL_components['ExperienceMemoryManager'](**experience_opts)
enc = RL_components['Encoder'](encoder_opts)
# The decoder's input size is the encoder's feature count for the
# preprocessor's image size.
decoder_opts['input_num'] = enc.get_feature_num({'height':preproc.im_height, 'width':preproc.im_width})
dec = RL_components['Decoder'](decoder_opts)
rlnet = RL_components['RLNet'](enc, dec)
policy = RL_components['Policy'](rlnet)
trainer = RL_components['Trainer'](rlnet, mem, trainer_opts)
keeper = RL_components['Keeper']([enc, dec, policy, mem], keeper_opts)  # objects has "save/load" interface

In [None]:
# RUNNING: this part does the actual work. 
# NOT necessary to run this cell to create experiment package for this framework
# Main loop: act, store the transition, then train / save / draw / report
# whenever the keeper's flags ask for it. The set_timer()/record_time(label)
# calls surround each stage with a named timing record.
# NOTE(review): exact timer semantics live in rl.keeppg.Keeper -- confirm.
keeper.load()
state = preproc.process(env.reset())


while not keeper.need_stop:
    keeper.set_timer()
    action, action_prob = policy.get_action(state)
    keeper.record_time('policy.get_action')
    next_state, reward, is_terminal, _ = env.step(action)
    keeper.record_time('env.step')
    # NOTE(review): this calls preproc(...) while the resets use
    # preproc.process(...) -- presumably equivalent; confirm.
    next_state = preproc(next_state)
    ep = keeper.records['episodes']
    keeper.set_timer()
    mem.add_experience(ep, state, action, reward, is_terminal, None)
    keeper.record_time('mem.add_experience')
    # None: We don't use last prediction (will predict in traing step)
    
    # Start a new episode after a terminal step, otherwise move on to the
    # next observation.
    if is_terminal:
        state = preproc.process(env.reset())
    else:
        state = next_state
        
    keeper.set_timer()
    keeper.record_env_step(reward, is_terminal)
    keeper.record_time('record_env_step')
    
    if keeper.need_train:  # TODO train condition call back
        loss = trainer.step()
        keeper.record_time('trainer.step')
        keeper.record_train_step(loss)
        keeper.record_time('record_train_step')

    if keeper.need_save:
        keeper.save()

    if keeper.need_draw:
        env.render()
    
    keeper.set_timer()
    keeper.report_step()
    keeper.record_time('report_step')

# Framework-F1-END

# Framework-DiffPreproc1
Uses the difference between two consecutive frames as input -- easy, and it worked.
Uses a simple encoder design.

In [None]:
# Framework definition:
# **Necessary to run this cell** to create experiment package for this framework
from rl.keeppg import RLAlgorithm
# Maps each role in the RL pipeline to the class that implements it. This
# framework uses the frame-difference preprocessor and the DummyEncoder.
RL_components = {
    'Preprocessor': GymAtariFramePreprocessor_Diff,
    'ExperienceMemoryManager': ExperienceMemory,
    'Encoder': DummyEncoder,
    'Decoder': Decoder,
    'RLNet': RLNet,
    'Policy': Policy,
    'Environment': AtariEnvironment_Pong,
    'Trainer': OneStepPolicyGradientTrainer,
    'Keeper': Keeper,
}

# Passed as keyword arguments to the experience memory constructor.
experience_opts = {
    'capacity': 1000,
    'discount': 0.99
}

encoder_opts = {
    'input_channels': 1,
}

# 'input_num' is filled in below, once the encoder reports its feature count.
decoder_opts = {
    'input_num': None,
    'fc1_hidden_unit_num': 256,
    'output_num':4
}

trainer_opts = {'Optimiser': torch.optim.Adagrad, 'learning_rate':1e-4}

# Run directory layout: <BASE_PATH>/<RUN_PATH>/<experiment_id>/checkpoints
path_opts = {
    'BASE_PATH': FULL_PROJ_PATH,
    'RUN_PATH': 'RUNS',
    'experiment_id': 'DiffPreproc1'}

running_dir = os.path.join(path_opts['BASE_PATH'], 
                           path_opts['RUN_PATH'], 
                           path_opts['experiment_id'])

save_dir = os.path.join(running_dir, 'checkpoints')

if not os.path.exists(running_dir):
    os.mkdir(running_dir)  # NOT using makedirs, I want the 
    # users to be responsible for the parent directory (and 
    # overall structure)
if not os.path.exists(save_dir):
    os.mkdir(save_dir)

# Keeper schedule and reporting options.
# NOTE(review): a value of -1 presumably disables the corresponding action
# -- confirm in rl.keeppg.Keeper.
keeper_opts = {
    'train_every_n_episodes': 1,
    'save_every_n_training_steps': 5000,
    'draw_every_n_training_steps': -1,
    'max_training_steps': 2000000,
    'save_path': save_dir,
    'report': {'save_checkpoint': True,
               'every_n_steps': 10000,
               'every_n_training': 1,
               'every_n_episodes': 1,
               'every_n_time_records': 100}
}
    
# CREATE LEARNING COMPONENTS
env = RL_components['Environment']()
preproc = RL_components['Preprocessor']()
mem = RL_components['ExperienceMemoryManager'](**experience_opts)
enc = RL_components['Encoder'](encoder_opts)
# The decoder's input size is the encoder's feature count for the
# preprocessor's image size.
decoder_opts['input_num'] = enc.get_feature_num({'height':preproc.im_height, 'width':preproc.im_width})
dec = RL_components['Decoder'](decoder_opts)
rlnet = RL_components['RLNet'](enc, dec)
policy = RL_components['Policy'](rlnet)
trainer = RL_components['Trainer'](rlnet, mem, trainer_opts)
keeper = RL_components['Keeper']([enc, dec, policy, mem], keeper_opts)  # objects has "save/load" interface
# Bundle all components into a single algorithm object; its run() method is
# invoked in the next cell.
alg = RLAlgorithm(keeper, env, preproc, mem, policy, trainer)

In [None]:
# Start the algorithm assembled in the framework definition cell.
# NOTE(review): presumably runs until the keeper's stop condition
# ('max_training_steps') is reached -- confirm in rl.keeppg.RLAlgorithm.
alg.run()

# Framework-DiffPreproc1-END

In [19]:
# Deploy the 'DiffPreproc1' framework from this notebook.
# NOTE(review): presumably xd.deploy extracts the cells between the
# "# Framework-DiffPreproc1" / "# Framework-DiffPreproc1-END" markers into
# the target directory -- confirm in dplay_utils.xdeploy. The target is
# given relative to the notebook's working directory.
import dplay_utils.xdeploy as xd
xd.deploy(THIS_NOTEBOOK, 'DiffPreproc1', RL_components, '../../RUNS/DiffPreproc1')