In [None]:
import os
import shutil
import time
import numpy as np
import torch

import deeprl.infrastructure.pytorch_util as ptu

from deeprl.infrastructure.rl_trainer import RL_Trainer
from deeprl.infrastructure.trainers import BC_Trainer
from deeprl.agents.bc_agent import BCAgent
from deeprl.policies.loaded_gaussian_policy import LoadedGaussianPolicy
import deeprl.policies.MLP_policy as MLP_policy

%load_ext autoreload
%autoreload 2

def rel_error(x, y):
    """Return the largest element-wise relative error between ``x`` and ``y``.

    Each difference ``|x - y|`` is divided by ``|x| + |y|``, with the
    denominator floored at 1e-8 so that entries where both inputs are zero
    do not divide by zero.
    """
    abs_diff = np.abs(x - y)
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(abs_diff / scale)

def remove_folder(path):
    """Delete ``path`` and everything under it if it exists.

    Prints a message either way so the notebook output shows whether old
    results were cleared.
    """
    if not os.path.exists(path):
        print("Folder {} does not exist yet. No old results to delete".format(path))
        return
    print("Clearing old results at {}".format(path))
    shutil.rmtree(path)

In [None]:
bc_base_args_dict = dict(
    expert_policy_file = 'deeprl/policies/experts/Hopper.pkl',
    expert_data = 'deeprl/expert_data/expert_data_Hopper-v2.pkl',
    env_name = 'Hopper-v2',
    exp_name = 'test_bc',
    do_dagger = True,
    ep_len = 1000,
    save_params = False,

    num_agent_train_steps_per_iter = 1000,
    n_iter = 1,

    batch_size = 1000,
    eval_batch_size = 1000,
    train_batch_size = 100,
    max_replay_buffer_size = 1000000,

    n_layers = 2,
    size = 64,
    learning_rate = 5e-3,

    video_log_freq = -1,
    scalar_log_freq = 1,

    no_gpu = False,
    which_gpu = 0,
    seed = 2,
    logdir = 'test',
)

In [None]:
# Sanity check for the supervised policy update: run a few update steps on a
# tiny policy with fixed seeds and compare the final loss and the change in the
# first mean_net parameter against hard-coded reference values.
torch.manual_seed(0)
ac_dim = 2
ob_dim = 3
batch_size = 5

# The notebook imports the module as `MLP_policy`, so the class must be
# reached through it; the bare name `MLPPolicySL` is never defined.
policy = MLP_policy.MLPPolicySL(
    ac_dim=ac_dim,
    ob_dim=ob_dim,
    n_layers=1,
    size=2,
    learning_rate=0.25)

np.random.seed(0)
obs = np.random.normal(size=(batch_size, ob_dim))
actions = np.random.normal(size=(batch_size, ac_dim))

# Snapshot via np.array(...) so the "before" values are a separate copy
# (presumably ptu.to_numpy may share memory with the parameter -- confirm).
first_weight_before = np.array(ptu.to_numpy(next(policy.mean_net.parameters())))
print("Weight before update", first_weight_before)

# Five update steps on the same batch; only the last loss is checked.
for _ in range(5):
  loss = policy.update(obs, actions)['Training Loss']

print(loss)
expected_loss = 2.628419
loss_error = rel_error(loss, expected_loss)
print("Loss Error", loss_error, "should be on the order of 1e-6 or lower")

first_weight_after = ptu.to_numpy(next(policy.mean_net.parameters()))
print("Weight after update", first_weight_after)

weight_change = first_weight_after - first_weight_before
print("Change in weights", weight_change)

expected_change = np.array([[ 0.04385546, -0.4614172,  -1.0613215 ],
                            [ 0.20986436, -1.2060736,  -1.0026767 ]])
updated_weight_error = rel_error(weight_change, expected_change)
print("Weight Update Error", updated_weight_error, "should be on the order of 1e-6 or lower")

In [None]:
# Behavior cloning on HalfCheetah: copy the base arguments, point them at the
# HalfCheetah expert/data/environment, and train once per seed.
bc_args = dict(bc_base_args_dict)

env_str = 'HalfCheetah'
bc_args['expert_policy_file'] = 'deeprl/policies/experts/{}.pkl'.format(env_str)
bc_args['expert_data'] = 'deeprl/expert_data/expert_data_{}-v2.pkl'.format(env_str)
bc_args['env_name'] = '{}-v2'.format(env_str)

# Logs live under logs/behavior_cloning/<env>, the directory the TensorBoard
# cell for this experiment reads from.
remove_folder('logs/behavior_cloning/{}'.format(env_str))

for seed in range(3):
  print("Running behavior cloning experiment with seed", seed)
  bc_args['seed'] = seed
  bc_args['logdir'] = 'logs/behavior_cloning/{}/seed_{}'.format(env_str, seed)
  bc_trainer = BC_Trainer(bc_args)
  bc_trainer.run_training_loop()

In [None]:
# Load the TensorBoard notebook extension and open it on logs/behavior_cloning/HalfCheetah.
%load_ext tensorboard
%tensorboard --logdir logs/behavior_cloning/HalfCheetah

In [None]:
# Behavior cloning on Hopper: start from the base arguments, swap in the
# Hopper expert/data/environment, and train once per seed.
env_str = 'Hopper'

bc_args = dict(bc_base_args_dict)
bc_args.update(
    expert_policy_file='deeprl/policies/experts/{}.pkl'.format(env_str),
    expert_data='deeprl/expert_data/expert_data_{}-v2.pkl'.format(env_str),
    env_name='{}-v2'.format(env_str),
)

# Clear results from earlier runs before writing new ones.
remove_folder('logs/behavior_cloning/{}'.format(env_str))

for seed in range(3):
    print("Running behavior cloning experiment on Hopper with seed", seed)
    bc_args.update(
        seed=seed,
        logdir='logs/behavior_cloning/{}/seed{}'.format(env_str, seed),
    )
    bctrainer = BC_Trainer(bc_args)
    bctrainer.run_training_loop()

In [None]:
%load_ext tensorboard
%tensorboard -- logdir logs/behavior_cloning/Hopper

In [None]:
# Sanity check for expert relabeling: build a few random paths, relabel them
# with the loaded expert policy, and compare the resulting actions against
# hard-coded reference values.
bc_args = dict(bc_base_args_dict)

env_str = 'Hopper'
bc_args['expert_policy_file'] = 'deeprl/policies/experts/{}.pkl'.format(env_str)
bc_args['expert_data'] = 'deeprl/expert_data/expert_data_{}-v2.pkl'.format(env_str)
bc_args['env_name'] = '{}-v2'.format(env_str)
bctrainer = BC_Trainer(bc_args)

np.random.seed(0)
T = 2
ob_dim = 11
ac_dim = 3

paths = []
for _ in range(3):
  obs = np.random.normal(size=(T, ob_dim))
  acs = np.random.normal(size=(T, ac_dim))
  # Key must be 'observation' (singular): the check at the bottom of this cell
  # indexes path['observation'] and would raise KeyError otherwise.
  paths.append(dict(observation=obs,
                    action=acs))

rl_trainer = bctrainer.rl_trainer
relabeled_paths = rl_trainer.do_relabel_with_expert(bctrainer.loaded_expert_policy, paths)

expert_actions = np.array([[[-1.7814021, -0.11137983,  1.763353  ],
                            [-2.589222,   -5.463195,    2.4301376 ]],
                           [[-2.8287444, -5.298558,   3.0320463],
                            [ 3.9611065,  2.626403,  -2.8639293]],
                           [[-0.3055225,  -0.9865407,   0.80830705],
                            [ 2.8788857,   3.5550566,  -0.92875874]]])

# Relabeling should leave observations untouched and replace only the actions.
for i, (path, relabeled_path) in enumerate(zip(paths, relabeled_paths)):
  assert np.all(path['observation'] == relabeled_path['observation'])
  print("Path {} expert action error".format(i), rel_error(expert_actions[i], relabeled_path['action']))


In [None]:
# DAgger configuration for Hopper: copy the base arguments, enable DAgger with
# ten iterations, and point at the Hopper expert/data/environment.
env_str = 'Hopper'

dagger_args = dict(bc_base_args_dict)
dagger_args.update(
    do_dagger=True,
    n_iter=10,
    expert_policy_file='deeprl/policies/experts/{}.pkl'.format(env_str),
    expert_data='deeprl/expert_data/expert_data_{}-v2.pkl'.format(env_str),
    env_name='{}-v2'.format(env_str),
)

In [None]:
# Run DAgger once per seed, after clearing results from earlier runs.
remove_folder('logs/dagger/{}'.format(env_str))

for seed in range(3):
    print("Running Dagger experiment with seed", seed)
    dagger_args.update(
        seed=seed,
        logdir='logs/dagger/{}/seed_{}'.format(env_str, seed),
    )
    bctrainer = BC_Trainer(dagger_args)
    bctrainer.run_training_loop()

In [None]:
# Load the TensorBoard notebook extension and open it on logs/dagger/Hopper.
%load_ext tensorboard
%tensorboard --logdir logs/dagger/Hopper