In [1]:
import argparse
import json
import h5py
import numpy as np
import yaml
import os, os.path, shutil

from policyopt import util

[93mMKL runtime not found. Will not attempt to disable multithreaded MKL for parallel rollouts.[0m


In [19]:
def load_trained_policy_and_mdp(env_name, policy_state_str):
    '''
    Rebuild a trained policy, together with its environment, from a saved snapshot.

    Args:
        env_name: environment name handed to rlgymenv.RLGymMDP.
        policy_state_str: snapshot locator; util.split_h5_name splits it into the
            HDF5 file path and the key inside that file.

    Returns:
        (mdp, policy, train_args), where train_args is the JSON-decoded 'args'
        attribute stored on the HDF5 file.

    Raises:
        ValueError: if the policy class cannot be inferred from the checkpoint
            file path for the environment's action space.
    '''
    print(policy_state_str)
    import gym
    import policyopt
    from policyopt import nn, rl
    from environments import rlgymenv

    # Load the saved state
    policy_file, policy_key = util.split_h5_name(policy_state_str)
    print('Loading policy parameters from %s in %s' % (policy_key, policy_file))
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])

    # Initialize the MDP
    print('Loading environment %s' % (env_name,))
    mdp = rlgymenv.RLGymMDP(env_name)
    print('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

    # Initialize the policy
    nn.reset_global_scope()
    # Prefer an explicit 'enable_obsnorm' flag; otherwise fall back to 'obsnorm_mode'.
    if 'enable_obsnorm' in train_args:
        enable_obsnorm = bool(train_args['enable_obsnorm'])
    else:
        enable_obsnorm = train_args['obsnorm_mode'] != 'none'

    # The policy class is inferred from substrings of the checkpoint path.
    # NOTE(review): the tests run against the whole path, directories included,
    # so a directory name containing e.g. 'ga' also matches -- confirm intended.
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        # Order matters: 'sparse_mixture' also contains 'mixture'.
        if 'sparse_mixture' in policy_file:
            policy_cfg = rl.SparseMixtureGaussianPolicyConfig(
                hidden_spec=train_args['policy_hidden_spec'],
                min_stdev=0.,
                init_logstdev=0.,
                enable_obsnorm=enable_obsnorm)
            policy = rl.SparseMixtureGaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianSparseMixturePolicy')
        elif 'mixture' in policy_file:
            policy_cfg = rl.MixtureGaussianPolicyConfig(
                hidden_spec=train_args['policy_hidden_spec'],
                min_stdev=0.,
                init_logstdev=0.,
                enable_obsnorm=enable_obsnorm)
            policy = rl.MixtureGaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianMixturePolicy')
        elif 'ga' in policy_file:
            policy_cfg = rl.GaussianPolicyConfig(
                hidden_spec=train_args['policy_hidden_spec'],
                min_stdev=0.,
                init_logstdev=0.,
                enable_obsnorm=enable_obsnorm)
            policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
        else:
            # Previously fell through and failed later with an unbound `policy`.
            raise ValueError(
                "Cannot infer a continuous-action policy type from checkpoint path %r "
                "(expected it to contain 'sparse_mixture', 'mixture' or 'ga')" % (policy_file,))
    else:
        if 'gibbs' in policy_file:
            policy_cfg = rl.GibbsPolicyConfig(
                hidden_spec=train_args['policy_hidden_spec'],
                enable_obsnorm=enable_obsnorm)
            policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')
        elif 'sparse' in policy_file:
            policy_cfg = rl.SparsePolicyConfig(
                hidden_spec=train_args['policy_hidden_spec'],
                enable_obsnorm=enable_obsnorm)
            policy = rl.SparsePolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'SparsePolicy')
        else:
            # Previously fell through and failed later with an unbound `policy`.
            raise ValueError(
                "Cannot infer a non-continuous-action policy type from checkpoint path %r "
                "(expected it to contain 'gibbs' or 'sparse')" % (policy_file,))

    # Load the policy parameters
    policy.load_h5(policy_file, policy_key)

    return mdp, policy, train_args


def gen_taskname2outfile(spec, assert_not_exists=False):
    '''
    Map every task name in the spec to the path of its trajectory dataset.

    The trajectory directory (storagedir/traj_subdir from spec['options']) is
    created via util.mkdir_p. Phase 0 (sampling) writes to the returned files,
    phase 1 (training) reads from them.

    Args:
        spec: pipeline specification with 'options' and 'tasks' entries.
        assert_not_exists: when true, assert that no destination file exists yet.

    Returns:
        dict from task name to 'trajs_<name>.h5' path inside the trajectory directory.
    '''
    options = spec['options']
    trajdir = os.path.join(options['storagedir'], options['traj_subdir'])
    util.mkdir_p(trajdir)

    outfile_by_task = {}
    for task in spec['tasks']:
        name = task['name']
        # Task names must be unique, otherwise two tasks would share a file.
        assert name not in outfile_by_task
        destination = os.path.join(trajdir, 'trajs_{}.h5'.format(name))
        if assert_not_exists:
            assert not os.path.exists(destination), 'Traj destination {} already exists'.format(destination)
        outfile_by_task[name] = destination
    return outfile_by_task



def exec_saved_policy(env_name, policystr, num_trajs, deterministic, max_traj_len=None):
    '''
    Load a saved policy and sample trajectories from it in its environment.

    Args:
        env_name: environment name forwarded to load_trained_policy_and_mdp.
        policystr: snapshot locator forwarded to load_trained_policy_and_mdp.
        num_trajs: passed to the simulator config as min_num_trajs.
        deterministic: forwarded to policy.sample_actions.
        max_traj_len: optional cap on trajectory length; it is clipped to the
            environment's timestep_limit, which is also used when this is None.

    Returns:
        (trajbatch, policy, mdp)
    '''
    import policyopt
    from policyopt import SimConfig, rl, util, nn, tqdm
    from environments import rlgymenv
    import gym

    # Load MDP and policy
    mdp, policy, _ = load_trained_policy_and_mdp(env_name, policystr)

    # Never roll out longer than the environment allows.
    env_limit = mdp.env_spec.timestep_limit
    if max_traj_len is None:
        max_traj_len = env_limit
    else:
        max_traj_len = min(env_limit, max_traj_len)

    print('Sampling {} trajs (max len {}) from policy {} in {}'.format(num_trajs, max_traj_len, policystr, env_name))

    def act(obs_B_Do):
        return policy.sample_actions(obs_B_Do, deterministic)

    def identity_obsfeat(obs):
        return obs

    sim_cfg = policyopt.SimConfig(
        min_num_trajs=num_trajs,
        min_total_sa=-1,
        batch_size=None,
        max_traj_len=max_traj_len)

    # Sample trajs
    trajbatch = mdp.sim_mp(policy_fn=act, obsfeat_fn=identity_obsfeat, cfg=sim_cfg)

    return trajbatch, policy, mdp


def eval_snapshot(env_name, checkptfile, snapshot_idx, num_trajs, deterministic):
    '''
    Roll out one snapshot of a checkpoint file and report its returns.

    The snapshot is addressed as '<checkptfile>/snapshots/iter<7-digit index>'
    and executed through exec_saved_policy with no extra trajectory-length cap.

    Returns:
        (returns, lengths): the zero-padded reward array summed along axis 1,
        and an array holding len() of every trajectory in the sampled batch.
    '''
    snapshot_locator = '{}/snapshots/iter{:07d}'.format(checkptfile, snapshot_idx)
    trajbatch, _policy, _mdp = exec_saved_policy(
        env_name, snapshot_locator, num_trajs,
        deterministic=deterministic, max_traj_len=None)

    padded_rewards = trajbatch.r.padded(fill=0.)
    returns = padded_rewards.sum(axis=1)
    lengths = np.array(list(map(len, trajbatch)))

    summary = '{} gets return {} +/- {}'.format(snapshot_locator, returns.mean(), returns.std())
    util.header(summary)
    return returns, lengths


In [20]:
# Phase 2 of the pipeline: evaluate every trained checkpoint listed in the
# pipeline spec and store expert / learned-policy returns in one results file.
import pandas as pd

# Alternative pipeline: './pipelines/im_classic_pipeline_kj.yaml'
pipeline_path = './pipelines/im_test_gail_pipeline.yaml'
with open(pipeline_path, 'r') as f:
    # NOTE(review): this was yaml.load(f). safe_load refuses arbitrary-object
    # tags and also works on PyYAML versions that require an explicit Loader.
    # Revert only if the pipeline files rely on Python-specific YAML tags.
    spec = yaml.safe_load(f)

util.header('=== Phase 2: evaluating trained models ===')

taskname2dset = gen_taskname2outfile(spec)

# This is where model logs are stored.
# We will also store the evaluation here.
checkptdir = os.path.join(spec['options']['storagedir'], spec['options']['checkpt_subdir'])
print('Evaluating results in {}'.format(checkptdir))

results_full_path = os.path.join(checkptdir, spec['options']['results_filename'])
print('Will store results in {}'.format(results_full_path))
# if os.path.exists(results_full_path):
#     raise RuntimeError('Results file {} already exists'.format(results_full_path))

# Algorithm-name prefixes that decide how the snapshot to evaluate is chosen.
bclone_prefixes = ('bclone_gibbs', 'bclone_sparse', 'bclone_mixture', 'bclone_gauss')
last_snapshot_prefixes = ('ga_mixture', 'ga', 'fem', 'simplex')

# First, pre-determine which evaluations we have to do
evals_to_do = []
nonexistent_checkptfiles = []
for task in spec['tasks']:
    # See how well the algorithms did...
    for alg in spec['training']['algorithms']:
        # ...on various dataset sizes
        for num_trajs in spec['training']['dataset_num_trajs']:
            # for each rerun, for mean / error bars later
            for run in range(spec['training']['runs']):
                # Make sure the checkpoint file exists (maybe PBS dropped some jobs)
                strid = 'alg={},task={},num_trajs={},run={}'.format(alg['name'], task['name'], num_trajs, run)
                checkptfile = os.path.join(checkptdir, strid + '.h5')

                if not os.path.exists(checkptfile):
                    nonexistent_checkptfiles.append(checkptfile)
                evals_to_do.append((task, alg, num_trajs, run, checkptfile))

if nonexistent_checkptfiles:
    # Fail before any (expensive) evaluation, and say which files are missing.
    raise RuntimeError('Cannot find checkpoint files:\n' + '\n'.join(nonexistent_checkptfiles))

# Walk through all saved checkpoints
collected_results = []
for i_eval, (task, alg, num_trajs, run, checkptfile) in enumerate(evals_to_do):
    alg_name = alg['name']
    util.header('Evaluating run {}/{}: alg={},task={},num_trajs={},run={}'.format(
        i_eval+1, len(evals_to_do), alg_name, task['name'], num_trajs, run))

    # Load the task's traj dataset to see how well the expert does
    with h5py.File(taskname2dset[task['name']], 'r') as trajf:
        # Expert's true return and traj lengths
        ex_traj_returns = trajf['r_B_T'][...].sum(axis=1)
        ex_traj_lengths = trajf['len_B'][...]

    # Decide which snapshot to evaluate. The store is closed again before the
    # evaluation itself, because eval_snapshot reopens the same file via h5py.
    with pd.HDFStore(checkptfile, 'r') as f:
        if alg_name.startswith(bclone_prefixes):
            # Pick the policy with the best validation accuracy.
            # idxmax returns the index *label* (the iteration number), whereas
            # argmax returns a position on current pandas versions.
            log_df = f['log'].set_index('iter')
            best_snapshot_idx = log_df['valacc'].idxmax()

        elif alg_name.startswith(last_snapshot_prefixes):
            # Evaluate the last saved snapshot
            snapshot_names = list(f.root.snapshots._v_children.keys())
            assert snapshot_names, 'No snapshots found in {}'.format(checkptfile)
            assert all(name.startswith('iter') for name in snapshot_names)
            best_snapshot_idx = max(int(name[len('iter'):]) for name in snapshot_names)

        else:
            raise NotImplementedError('Analysis not implemented for {}'.format(alg_name))

    # Evaluate true return for the learned policy
    alg_traj_returns, alg_traj_lengths = eval_snapshot(
        task['env'], checkptfile, best_snapshot_idx,
        spec['options']['eval_num_trajs'], deterministic=True)

    collected_results.append({
        # Trial info
        'alg': alg_name,
        'task': task['name'],
        'num_trajs': num_trajs,
        'run': run,
        # Expert performance
        'ex_traj_returns': ex_traj_returns,
        'ex_traj_lengths': ex_traj_lengths,
        # Learned policy performance
        'alg_traj_returns': alg_traj_returns,
        'alg_traj_lengths': alg_traj_lengths,
    })

collected_results = pd.DataFrame(collected_results)
with pd.HDFStore(results_full_path, 'w') as outf:
    outf['results'] = collected_results


[95m=== Phase 2: evaluating trained models ===[0m
Evaluating results in imitation_runs/test/checkpoints_all
Will store results in imitation_runs/test/checkpoints_all/results.h5
[95mEvaluating run 1/12: alg=ga_mixture_0,task=hopper,num_trajs=4,run=0[0m
imitation_runs/test/checkpoints_all/alg=ga_mixture_0,task=hopper,num_trajs=4,run=0.h5/snapshots/iter0000200
Loading policy parameters from /snapshots/iter0000200 in imitation_runs/test/checkpoints_all/alg=ga_mixture_0,task=hopper,num_trajs=4,run=0.h5
Loading environment Hopper-v1
Gym version: 0.9.3
MDP observation space, action space sizes: 11, 3

[95mLoading feedforward net specification[0m
[
  {
    "type": "fc",
    "n": 64
  },
  {
    "type": "nonlin",
    "func": "tanh"
  },
  {
    "type": "fc",
    "n": 64
  },
  {
    "type": "nonlin",
    "func": "tanh"
  }
]
[95mAffine(in=11, out=64)[0m
[95mNonlinearity(func=tanh)[0m
[95mAffine(in=64, out=64)[0m
[95mNonlinearity(func=tanh)[0m
[95mAffine(in=64, out=12)[0m
Reading

[95mimitation_runs/test/checkpoints_all/alg=ga_mixture_1,task=hopper,num_trajs=4,run=0.h5/snapshots/iter0000200 gets return 185.140935122 +/- 111.055037755[0m
[95mEvaluating run 6/12: alg=ga_mixture_1,task=hopper,num_trajs=11,run=0[0m
imitation_runs/test/checkpoints_all/alg=ga_mixture_1,task=hopper,num_trajs=11,run=0.h5/snapshots/iter0000200
Loading policy parameters from /snapshots/iter0000200 in imitation_runs/test/checkpoints_all/alg=ga_mixture_1,task=hopper,num_trajs=11,run=0.h5
Loading environment Hopper-v1
Gym version: 0.9.3
MDP observation space, action space sizes: 11, 3

[95mLoading feedforward net specification[0m
[
  {
    "type": "fc",
    "n": 64
  },
  {
    "type": "nonlin",
    "func": "tanh"
  },
  {
    "type": "fc",
    "n": 64
  },
  {
    "type": "nonlin",
    "func": "tanh"
  }
]
[95mAffine(in=11, out=64)[0m
[95mNonlinearity(func=tanh)[0m
[95mAffine(in=64, out=64)[0m
[95mNonlinearity(func=tanh)[0m
[95mAffine(in=64, out=12)[0m
Reading GaussianMixtur

[95mimitation_runs/test/checkpoints_all/alg=ga,task=hopper,num_trajs=11,run=0.h5/snapshots/iter0000200 gets return 3638.76535412 +/- 10.4435862227[0m
[95mEvaluating run 11/12: alg=ga,task=hopper,num_trajs=18,run=0[0m
imitation_runs/test/checkpoints_all/alg=ga,task=hopper,num_trajs=18,run=0.h5/snapshots/iter0000200
Loading policy parameters from /snapshots/iter0000200 in imitation_runs/test/checkpoints_all/alg=ga,task=hopper,num_trajs=18,run=0.h5
Loading environment Hopper-v1
Gym version: 0.9.3
MDP observation space, action space sizes: 11, 3

[95mLoading feedforward net specification[0m
[
  {
    "type": "fc",
    "n": 64
  },
  {
    "type": "nonlin",
    "func": "tanh"
  },
  {
    "type": "fc",
    "n": 64
  },
  {
    "type": "nonlin",
    "func": "tanh"
  }
]
[95mAffine(in=11, out=64)[0m
[95mNonlinearity(func=tanh)[0m
[95mAffine(in=64, out=64)[0m
[95mNonlinearity(func=tanh)[0m
[95mAffine(in=64, out=3)[0m
Reading GaussianPolicy/logstdevs_1_Da
Reading GaussianPolicy/

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['alg', 'alg_traj_lengths', 'alg_traj_returns', 'ex_traj_lengths', 'ex_traj_returns', 'task']]

  exec(code_obj, self.user_global_ns, self.user_ns)
