In [1]:
import sys
sys.path.append("/home/tt/imitation")
import argparse
import json
import h5py
import numpy as np
import yaml
import os, os.path, shutil
from policyopt import util


In [2]:
def gen_taskname2outfile(spec, assert_not_exists=False):
    '''
    Build the mapping from task name to the HDF5 file that holds the task's
    trajectories. Phase 0 (sampling) writes these files, phase 1 (training)
    reads them back.

    Args:
        spec: pipeline spec dict. Reads spec['options']['storagedir'],
            spec['options']['traj_subdir'] and the 'name' of every entry in
            spec['tasks'].
        assert_not_exists: when True, fail if a destination file is already
            present on disk.

    Returns:
        dict mapping task name -> path of the form
        <storagedir>/<traj_subdir>/trajs_<name>.h5

    Raises:
        AssertionError: on a duplicated task name, or (only when
            assert_not_exists is set) on an already existing destination.

    Side effects:
        Calls util.mkdir_p on the trajectory directory.
    '''
    options = spec['options']
    trajdir = os.path.join(options['storagedir'], options['traj_subdir'])
    util.mkdir_p(trajdir)

    taskname2outfile = {}
    for task in spec['tasks']:
        name = task['name']
        # Names are the dictionary keys, so each one may appear only once
        assert name not in taskname2outfile
        fname = os.path.join(trajdir, 'trajs_{}.h5'.format(name))
        if assert_not_exists:
            assert not os.path.exists(fname), 'Traj destination {} already exists'.format(fname)
        taskname2outfile[name] = fname
    return taskname2outfile

In [10]:
# Parse the pipeline specification into plain Python dicts/lists.
# yaml.safe_load is used instead of a bare yaml.load(f): calling yaml.load
# without a Loader is deprecated and is rejected by newer PyYAML releases,
# and safe_load never constructs arbitrary Python objects from the file.
with open('./pipelines/im_pipeline.yaml', 'r') as f:
    spec = yaml.safe_load(f)

In [11]:
# Display the parsed pipeline spec for inspection.
spec

{'options': {'checkpt_subdir': 'checkpoints_all',
  'eval_num_trajs': 50,
  'pbs': {'jobname': 'im_modern', 'ppn': 12, 'queue': 'atlas'},
  'results_filename': 'results.h5',
  'storagedir': 'imitation_runs/modern_stochastic',
  'traj_subdir': 'trajs'},
 'tasks': [{'cuts_off_on_success': False,
   'data_subsamp_freq': 20,
   'env': 'Hopper-v1',
   'name': 'hopper',
   'policy': 'expert_policies/modern/log_Hopper-v0_3.h5/snapshots/iter0000500'},
  {'cuts_off_on_success': False,
   'data_subsamp_freq': 20,
   'env': 'Walker2d-v1',
   'name': 'walker',
   'policy': 'expert_policies/modern/walker_eb5b2e_1.h5/snapshots/iter0000480'},
  {'cuts_off_on_success': False,
   'data_subsamp_freq': 20,
   'env': 'Ant-v1',
   'name': 'ant',
   'policy': 'expert_policies/modern/log_Ant-v1_0.h5/snapshots/iter0000500'},
  {'cuts_off_on_success': False,
   'data_subsamp_freq': 20,
   'env': 'HalfCheetah-v1',
   'name': 'halfcheetah',
   'policy': 'expert_policies/modern/log_HalfCheetah-v0_2.h5/snapshots/i

In [12]:
# Map every task name in the spec to the path of its trajectory file.
taskname2dset = gen_taskname2outfile(spec)

In [13]:
# Directory that receives the per-run output files of training
# (<storagedir>/<checkpt_subdir> from the spec options).
checkptdir = os.path.join(spec['options']['storagedir'], spec['options']['checkpt_subdir'])
util.mkdir_p(checkptdir)
# Make sure checkpoint dir is empty
# (aborts with AssertionError as soon as the directory has any entry)
assert not os.listdir(checkptdir), 'Checkpoint directory {} is not empty!'.format(checkptdir)


In [16]:
# Expand the training grid: one job per (algorithm, task, dataset size, run).
# The three lists are parallel -- index i describes the same job in each.
cmd_templates, outputfilenames, argdicts = [], [], []
for alg in spec['training']['algorithms']:
    for task in spec['tasks']:
        for num_trajs in spec['training']['dataset_num_trajs']:
            # A job may not request more trajectories than the declared full dataset size
            assert num_trajs <= spec['training']['full_dataset_num_trajs']
            for run in range(spec['training']['runs']):
                # A string identifier. Used in filenames for this run
                strid = 'alg={},task={},num_trajs={},run={}'.format(alg['name'], task['name'], num_trajs, run)
                # Flatten the command template onto a single line
                cmd_templates.append(alg['cmd'].replace('\n', ' ').strip())
                outputfilenames.append(strid + '.txt')
                # Values for the {placeholders} of the command template
                argdicts.append({
                    'env': task['env'],
                    'dataset': taskname2dset[task['name']],
                    'num_trajs': num_trajs,
                    # Booleans from the spec are passed on as 0/1
                    'cuts_off_on_success': int(task['cuts_off_on_success']),
                    'data_subsamp_freq': task['data_subsamp_freq'],
                    'out': os.path.join(checkptdir, strid + '.h5'),
                })


In [17]:
# Render every command template with its argument dict and print the
# resulting command line (nothing is executed here).
for cmd, arg in zip(cmd_templates, argdicts):
    print(cmd.format(**arg))


python scripts/imitate_mj.py --mode bclone --env Hopper-v1 --data imitation_runs/modern_stochastic/trajs/trajs_hopper.h5 --limit_trajs 4 --data_subsamp_freq 20 --max_iter 20001 --log imitation_runs/modern_stochastic/checkpoints_all/alg=bclone,task=hopper,num_trajs=4,run=0.h5
python scripts/imitate_mj.py --mode bclone --env Hopper-v1 --data imitation_runs/modern_stochastic/trajs/trajs_hopper.h5 --limit_trajs 11 --data_subsamp_freq 20 --max_iter 20001 --log imitation_runs/modern_stochastic/checkpoints_all/alg=bclone,task=hopper,num_trajs=11,run=0.h5
python scripts/imitate_mj.py --mode bclone --env Hopper-v1 --data imitation_runs/modern_stochastic/trajs/trajs_hopper.h5 --limit_trajs 18 --data_subsamp_freq 20 --max_iter 20001 --log imitation_runs/modern_stochastic/checkpoints_all/alg=bclone,task=hopper,num_trajs=18,run=0.h5
python scripts/imitate_mj.py --mode bclone --env Hopper-v1 --data imitation_runs/modern_stochastic/trajs/trajs_hopper.h5 --limit_trajs 25 --data_subsamp_freq 20 --max_i

In [18]:
#!python scripts/imitate_mj.py --mode ga --env Hopper-v1 --data imitation_runs/modern_stochastic/trajs/trajs_hopper.h5 --limit_trajs 25 --data_subsamp_freq 20 --favor_zero_expert_reward 0 --min_total_sa 50000 --max_iter 501 --reward_include_time 0 --reward_lr .01 --log imitation_runs/modern_stochastic/checkpoints_all/alg=ga,task=hopper,num_trajs=25,run=0.h5


In [52]:
import pandas as pd

taskname2dset = gen_taskname2outfile(spec)

# This is where model logs are stored.
# We will also store the evaluation here.
checkptdir = os.path.join(spec['options']['storagedir'], spec['options']['checkpt_subdir'])
# print(...) with a single argument behaves the same on Python 2 and Python 3
print('Evaluating results in {}'.format(checkptdir))

results_full_path = os.path.join(checkptdir, spec['options']['results_filename'])
print('Will store results in {}'.format(results_full_path))
# Never overwrite an existing results file
if os.path.exists(results_full_path):
    raise RuntimeError('Results file {} already exists'.format(results_full_path))

# First, pre-determine which evaluations we have to do
evals_to_do = []
nonexistent_checkptfiles = []
for task in spec['tasks']:
    # See how well the algorithms did...
    for alg in spec['training']['algorithms']:
        # ...on various dataset sizes
        for num_trajs in spec['training']['dataset_num_trajs']:
            # for each rerun, for mean / error bars later
            for run in range(spec['training']['runs']):
                # Make sure the checkpoint file exists (maybe PBS dropped some jobs)
                strid = 'alg={},task={},num_trajs={},run={}'.format(alg['name'], task['name'], num_trajs, run)
                checkptfile = os.path.join(checkptdir, strid + '.h5')
                # Missing files are only recorded here; the hard failure below is disabled
                if not os.path.exists(checkptfile):
                    nonexistent_checkptfiles.append(checkptfile)
                # NOTE(review): hard-coded filter -- only this single checkpoint is
                # scheduled for evaluation, every other grid entry is skipped.
                if checkptfile == 'imitation_runs/modern_stochastic/checkpoints_all/alg=ga,task=hopper,num_trajs=25,run=0.h5':
                    evals_to_do.append((task, alg, num_trajs, run, checkptfile))

# Disabled check: would abort when any expected checkpoint file is missing.
#if nonexistent_checkptfiles:
#    print 'Cannot find checkpoint files:\n', '\n'.join(nonexistent_checkptfiles)
#    raise RuntimeError

Evaluating results in imitation_runs/modern_stochastic/checkpoints_all
Will store results in imitation_runs/modern_stochastic/checkpoints_all/results.h5


In [53]:
# Show the (task, alg, num_trajs, run, checkptfile) tuples selected for evaluation.
evals_to_do

[({'cuts_off_on_success': False,
   'data_subsamp_freq': 20,
   'env': 'Hopper-v1',
   'name': 'hopper',
   'policy': 'expert_policies/modern/log_Hopper-v0_3.h5/snapshots/iter0000500'},
  {'cmd': 'python scripts/imitate_mj.py --mode ga --env {env} --data {dataset} --limit_trajs {num_trajs} --data_subsamp_freq {data_subsamp_freq} --favor_zero_expert_reward {cuts_off_on_success} --min_total_sa 50000 --max_iter 501 --reward_include_time 0 --reward_lr .01 --log {out}\n',
   'name': 'ga'},
  25,
  0,
  'imitation_runs/modern_stochastic/checkpoints_all/alg=ga,task=hopper,num_trajs=25,run=0.h5')]

In [54]:
# Accumulator for evaluation records (filled by a later cell).
collected_results = []
# NOTE: the loop body only prints a header. Once it finishes, task, alg,
# num_trajs, run and checkptfile stay bound to the last entry of evals_to_do,
# and the cells that follow operate on those leftover values.
for i_eval, (task, alg, num_trajs, run, checkptfile) in enumerate(evals_to_do):
    util.header('Evaluating run {}/{}: alg={},task={},num_trajs={},run={}'.format(
        i_eval+1, len(evals_to_do), alg['name'], task['name'], num_trajs, run))

[95mEvaluating run 1/1: alg=ga,task=hopper,num_trajs=25,run=0[0m


In [55]:
# Inspect the task entry left bound by the evaluation loop.
task

{'cuts_off_on_success': False,
 'data_subsamp_freq': 20,
 'env': 'Hopper-v1',
 'name': 'hopper',
 'policy': 'expert_policies/modern/log_Hopper-v0_3.h5/snapshots/iter0000500'}

In [56]:
# Trajectory file registered for the current task.
taskname2dset[task['name']]

'imitation_runs/modern_stochastic/trajs/trajs_hopper.h5'

In [57]:
# Explicitly set the dataset path for the current task.
# NOTE(review): this mutates taskname2dset in place with a hard-coded path;
# it appears to match the value already stored for this key -- confirm it is still needed.
taskname2dset[task['name']] = 'imitation_runs/modern_stochastic/trajs/trajs_hopper.h5'

In [58]:
# Hard-coded checkpoint to evaluate; replaces whatever value the evaluation
# loop left in checkptfile.
checkptfile = 'imitation_runs/modern_stochastic/checkpoints_all/alg=ga,task=hopper,num_trajs=25,run=0.h5'

In [59]:
# Read the expert statistics for the current task from its trajectory file.
with h5py.File(taskname2dset[task['name']], 'r') as trajf:
    # Expert's true return: rewards summed along axis 1
    # (presumably laid out as trajectory x timestep -- confirm against the sampling code)
    ex_traj_returns = np.sum(trajf['r_B_T'][...], axis=1)
    # Expert's trajectory lengths, loaded fully into memory
    ex_traj_lengths = trajf['len_B'][...]


In [60]:
# Summed rewards of the expert trajectories (one value per row of 'r_B_T').
ex_traj_returns

array([ 3600.85445404,  3593.59654845,  3597.21851935,  3608.1891969 ,
        3608.3514956 ,  3600.76942597,  3569.24886298,  3592.20780256,
        3603.65330353,  3565.55785808,  3610.86397149,  3588.87158289,
        3611.97644071,  3602.31506325,  2708.99340777,  3611.81577382,
        3596.76356634,  3571.606662  ,  3560.50301102,  3603.85814728,
        3587.46706745,  3591.32415659,  3596.71503242,  3589.61325206,
        3602.19630788,  3624.32164267,  3578.48811349,  3585.01101376,
        3598.42751121,  3618.6025666 ,  3571.13502982,  3576.87113557,
        3605.17074857,  3583.68594682,  3604.49595658,  3608.20571546,
        3623.89661819,  3603.75170096,  3583.24676243,  3600.82721793,
        3573.94252232,  3598.8362137 ,  3599.59836232,  3606.74182361,
        3599.07646239,  3603.15528772,  3605.62355385,  3631.95912756,
        3590.98805243,  3609.8550191 ,  3599.61206318])

In [61]:
# Contents of the 'len_B' dataset of the expert trajectory file.
ex_traj_lengths

array([1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,
       1000, 1000, 1000,  744, 1000, 1000, 1000, 1000, 1000, 1000, 1000,
       1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,
       1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,
       1000, 1000, 1000, 1000, 1000, 1000, 1000], dtype=int32)

In [62]:
# Inspect the algorithm entry left bound by the evaluation loop.
alg

{'cmd': 'python scripts/imitate_mj.py --mode ga --env {env} --data {dataset} --limit_trajs {num_trajs} --data_subsamp_freq {data_subsamp_freq} --favor_zero_expert_reward {cuts_off_on_success} --min_total_sa 50000 --max_iter 501 --reward_include_time 0 --reward_lr .01 --log {out}\n',
 'name': 'ga'}

In [69]:
def load_trained_policy_and_mdp(env_name, policy_state_str):
    '''
    Rebuild a trained policy and its environment from a saved snapshot.

    Args:
        env_name: environment id handed to rlgymenv.RLGymMDP.
        policy_state_str: snapshot locator; util.split_h5_name splits it into
            the HDF5 file and the key of the snapshot inside that file.

    Returns:
        (mdp, policy, train_args): the environment wrapper, the policy with
        its parameters loaded from the snapshot, and the training arguments
        decoded from the JSON stored in the file's 'args' attribute.
    '''
    import gym
    import policyopt
    from policyopt import nn, rl
    from environments import rlgymenv

    # Load the saved state
    policy_file, policy_key = util.split_h5_name(policy_state_str)
    # Parenthesized single-argument prints are valid on Python 2 and Python 3
    print('Loading policy parameters from %s in %s' % (policy_key, policy_file))
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])

    # Initialize the MDP
    print('Loading environment %s' % (env_name,))
    mdp = rlgymenv.RLGymMDP(env_name)
    print('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

    # Initialize the policy
    nn.reset_global_scope()
    # The flag is stored either directly as 'enable_obsnorm' or, when that key
    # is absent, indirectly through 'obsnorm_mode' (anything but 'none' enables it).
    if 'enable_obsnorm' in train_args:
        enable_obsnorm = bool(train_args['enable_obsnorm'])
    else:
        enable_obsnorm = train_args['obsnorm_mode'] != 'none'

    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        # Continuous actions -> Gaussian policy
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        # Any other action space -> Gibbs policy
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')

    # Load the policy parameters
    policy.load_h5(policy_file, policy_key)

    return mdp, policy, train_args



In [70]:
def exec_saved_policy(env_name, policystr, num_trajs, deterministic, max_traj_len=None):
    '''
    Load a saved policy and sample trajectories from it in its environment.

    Args:
        env_name: environment id forwarded to load_trained_policy_and_mdp.
        policystr: snapshot locator forwarded to load_trained_policy_and_mdp.
        num_trajs: passed to the simulator as min_num_trajs.
        deterministic: forwarded to policy.sample_actions.
        max_traj_len: optional cap on trajectory length; the effective cap
            never exceeds the environment's timestep_limit, and None means
            "use the environment's limit".

    Returns:
        (trajbatch, policy, mdp): the sampled trajectory batch together with
        the loaded policy and environment.
    '''
    import policyopt
    from policyopt import SimConfig, rl, util, nn, tqdm
    from environments import rlgymenv
    import gym

    # Load MDP and policy
    mdp, policy, _ = load_trained_policy_and_mdp(env_name, policystr)

    # Clamp the requested length to the environment's own limit
    timestep_limit = mdp.env_spec.timestep_limit
    if max_traj_len is None:
        max_traj_len = timestep_limit
    else:
        max_traj_len = min(timestep_limit, max_traj_len)

    # Parenthesized single-argument print is valid on Python 2 and Python 3
    print('Sampling {} trajs (max len {}) from policy {} in {}'.format(num_trajs, max_traj_len, policystr, env_name))

    # Sample trajs
    trajbatch = mdp.sim_mp(
        policy_fn=lambda obs_B_Do: policy.sample_actions(obs_B_Do, deterministic),
        # Observations are fed to the policy unchanged
        obsfeat_fn=lambda obs:obs,
        cfg=policyopt.SimConfig(
            min_num_trajs=num_trajs,
            min_total_sa=-1,
            batch_size=None,
            max_traj_len=max_traj_len))

    return trajbatch, policy, mdp


In [71]:
def eval_snapshot(env_name, checkptfile, snapshot_idx, num_trajs, deterministic):
    '''
    Roll out one saved snapshot of a checkpoint file and summarize the result.

    Args:
        env_name: environment id to run the policy in.
        checkptfile: path of the checkpoint file containing the snapshots.
        snapshot_idx: iteration number of the snapshot; formatted as a
            zero-padded 7-digit suffix of the snapshot key.
        num_trajs: number of trajectories requested from the simulator.
        deterministic: forwarded to exec_saved_policy.

    Returns:
        (returns, lengths): summed rewards per trajectory and the length of
        each trajectory as a numpy array.
    '''
    snapshot_path = '{}/snapshots/iter{:07d}'.format(checkptfile, snapshot_idx)
    rollouts, _policy, _mdp = exec_saved_policy(
        env_name, snapshot_path, num_trajs,
        deterministic=deterministic, max_traj_len=None)

    # Zero fill keeps the padding from contributing to the per-row sums
    padded_rewards = rollouts.r.padded(fill=0.)
    returns = padded_rewards.sum(axis=1)
    lengths = np.array([len(one_traj) for one_traj in rollouts])

    mean_return = returns.mean()
    std_return = returns.std()
    util.header('{} gets return {} +/- {}'.format(snapshot_path, mean_return, std_return))
    return returns, lengths


In [72]:

# Load the checkpoint file
with pd.HDFStore(checkptfile, 'r') as f:
    # Index the training log by iteration so that row labels are iteration numbers
    log_df = f['log'].set_index('iter')

    # Evaluate true return for the learned policy
    if alg['name'] == 'bclone':
        # Pick the policy with the best validation accuracy.
        # idxmax() returns the index *label* (the iteration number) on every
        # pandas version; argmax() returns an integer position on current pandas,
        # which would select the wrong snapshot.
        best_snapshot_idx = log_df['valacc'].idxmax()
        alg_traj_returns, alg_traj_lengths = eval_snapshot(
            task['env'], checkptfile, best_snapshot_idx,
            spec['options']['eval_num_trajs'], deterministic=True)

    elif alg['name'].startswith(('ga', 'fem', 'simplex')):
        # Evaluate the last saved snapshot
        snapshot_names = f.root.snapshots._v_children.keys()
        assert all(name.startswith('iter') for name in snapshot_names)
        # Snapshot keys look like 'iter<number>'; the largest number is the latest
        snapshot_inds = sorted([int(name[len('iter'):]) for name in snapshot_names])
        best_snapshot_idx = snapshot_inds[-1]
        alg_traj_returns, alg_traj_lengths = eval_snapshot(
            task['env'], checkptfile, best_snapshot_idx,
            spec['options']['eval_num_trajs'], deterministic=True)

    else:
        # Fail loudly rather than leaving alg_traj_returns / alg_traj_lengths
        # undefined or holding values from a previous evaluation
        raise NotImplementedError('Analysis not implemented for {}'.format(alg['name']))

Loading policy parameters from /snapshots/iter0000500 in imitation_runs/modern_stochastic/checkpoints_all/alg=ga,task=hopper,num_trajs=25,run=0.h5
Loading environment Hopper-v1
Gym version: 0.9.1
MDP observation space, action space sizes: 11, 3

[95mLoading feedforward net specification[0m
[
  {
    "type": "fc",
    "n": 100
  },
  {
    "type": "nonlin",
    "func": "tanh"
  },
  {
    "type": "fc",
    "n": 100
  },
  {
    "type": "nonlin",
    "func": "tanh"
  }
]
[95mAffine(in=11, out=100)[0m
[95mNonlinearity(func=tanh)[0m
[95mAffine(in=100, out=100)[0m
[95mNonlinearity(func=tanh)[0m
[95mAffine(in=100, out=3)[0m
Reading GaussianPolicy/logstdevs_1_Da
Reading GaussianPolicy/obsnorm/Standardizer/count
Reading GaussianPolicy/obsnorm/Standardizer/mean_1_D
Reading GaussianPolicy/obsnorm/Standardizer/meansq_1_D
Reading GaussianPolicy/hidden/FeedforwardNet/layer_0/AffineLayer/W
Reading GaussianPolicy/hidden/FeedforwardNet/layer_0/AffineLayer/b
Reading GaussianPolicy/hidden/Fe

In [73]:
# Record one row for the trial that was just evaluated.
# NOTE: re-running this cell appends the same row again.
collected_results.append(dict(
    # Which trial this row describes
    alg=alg['name'],
    task=task['name'],
    num_trajs=num_trajs,
    run=run,
    # How the expert performed
    ex_traj_returns=ex_traj_returns,
    ex_traj_lengths=ex_traj_lengths,
    # How the learned policy performed
    alg_traj_returns=alg_traj_returns,
    alg_traj_lengths=alg_traj_lengths,
))

In [74]:
# Turn the list of per-trial records into a table (one row per record).
# NOTE(review): this rebinds collected_results from a list to a DataFrame, so
# the cell that appends records cannot simply be re-run afterwards.
collected_results = pd.DataFrame(collected_results)

In [75]:
# Display the results table.
collected_results

Unnamed: 0,alg,alg_traj_lengths,alg_traj_returns,ex_traj_lengths,ex_traj_returns,num_trajs,run,task
0,ga,"[1000, 1000, 1000, 1000, 1000, 1000, 1000, 100...","[3584.7193234, 3576.74629321, 3579.24360482, 3...","[1000, 1000, 1000, 1000, 1000, 1000, 1000, 100...","[3600.85445404, 3593.59654845, 3597.21851935, ...",25,0,hopper


In [90]:
# Show learned-policy returns next to expert returns.
collected_results[['alg_traj_returns', 'ex_traj_returns']]

Unnamed: 0,alg_traj_returns,ex_traj_returns
0,"[3584.7193234, 3576.74629321, 3579.24360482, 3...","[3600.85445404, 3593.59654845, 3597.21851935, ..."
