In [None]:
import os
import shutil
import time
import numpy as np
import torch

import deeprl.infrastructure.pytorch_util as ptu
from deeprl.infrastructure.rl_trainer import RL_Trainer
from deeprl.infrastructure.trainers import AC_Trainer

from deeprl.policies.MLP_policy import MLPPolicyAC

%load_ext autoreload
%autoreload 2

def rel_error(x, y):
    """Return the largest element-wise relative error between ``x`` and ``y``.

    Each element's error is ``|x - y| / max(1e-8, |x| + |y|)``; the ``1e-8``
    floor keeps the division defined when both entries are zero.
    """
    abs_diff = np.abs(x - y)
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(abs_diff / scale)

def remove_folder(path):
    """Delete the directory tree at ``path`` if it exists.

    A message is printed in both cases; a missing ``path`` is reported and
    otherwise ignored.
    """
    if not os.path.exists(path):
        print("folder {} does not exist yet. no old results to delete".format(path))
        return

    print("clearing old results at {}".format(path))
    shutil.rmtree(path)


# Shared default arguments for AC_Trainer. The cells below copy this dict
# (``dict(ac_base_args_dict)``) and override individual entries, so the
# defaults themselves are never mutated.
ac_base_args_dict = dict(
    env_name='hopper-v2',
    exp_name='test_ac',  # the comma here was missing, which made the literal a SyntaxError
    save_params=False,

    ep_len=200,
    discount=0.99,

    num_agent_train_steps_per_iter=1000,
    n_iter=100,

    batch_size=1000,
    eval_batch_size=1000,
    train_batch_size=256,
    max_replay_buffer_size=1000000,

    n_layers=2,
    size=256,
    entropy_weight=0,
    learning_rate=3e-4,

    critic_n_layers=2,
    critic_size=256,
    target_update_rate=5e-3,

    video_log_freq=-1,
    scalar_log_freq=1,

    no_gpu=False,
    which_gpu=0,
    seed=2,
    logdir='test',
)

# Sizes of the synthetic batch used by the critic checks below.
ac_dim = 3
ob_dim = 11
N = 5

# Seed NumPy so the randomly drawn inputs are the same on every run; the
# draw order below must not change or the expected values will not match.
np.random.seed(0)
obs = np.random.normal(size=(N, ob_dim))
acts = np.random.choice(ac_dim, size=(N,))
next_obs = np.random.normal(size=(N, ob_dim))
rewards = np.random.normal(size=N)
# ``np.zeros`` is the NumPy constructor; ``np.zero`` does not exist and
# raised AttributeError.
terminals = np.zeros(N)
terminals[0] = 1  # only the first transition is flagged as terminal

# Work on a copy of the shared defaults so the overrides stay local to this check.
ac_args = dict(ac_base_args_dict)

# Environment ids have the form "<Name>-v<N>" with no whitespace; the previous
# 'Hooper' / '{}- v2' combination produced the invalid id 'Hooper- v2'.
env_str = 'Hopper'
ac_args['env_name'] = '{}-v2'.format(env_str)
ac_args['entropy_weight'] = 0.1
actrainer = AC_Trainer(ac_args)
critic = actrainer.rl_trainer.agent.critic

class DummyDist:
    """Stand-in action distribution whose sample is always all ones."""

    def sample(self):
        """Return ``ptu.from_numpy`` applied to an ``(N, ac_dim)`` array of ones."""
        return ptu.from_numpy(1 + np.zeros(shape=(N, ac_dim)))


def dummy_actor(next_obs):
    """Stand-in actor: ignore ``next_obs`` and return a ``DummyDist``.

    This must be a module-level function named ``dummy_actor`` because that is
    the name passed to ``critic.compute_target_value`` in the following check;
    it was previously nested inside ``DummyDist`` under the name
    ``dummy_actors``, so the call raised NameError.
    """
    return DummyDist()

# Evaluate the critic's target values on the synthetic batch, using the dummy
# actor, and convert the result back to NumPy for comparison.
target_vals = ptu.to_numpy(
    critic.compute_target_value(
        ptu.from_numpy(next_obs),
        ptu.from_numpy(rewards),
        ptu.from_numpy(terminals),
        dummy_actor,
    )
)
expected_targets = np.array(
    [-0.9167948, -0.11123351, -0.36787638, -2.1131861, -0.13868617]
)

# Report how far the computed targets are from the reference values.
target_error = rel_error(target_vals, expected_targets)
print("target value error", target_error, "should be on the order of 1e-6 or lower")


# Fresh trainer for the target-network update check, built from a copy of the
# shared defaults.
ac_args = dict(ac_base_args_dict)
env_str = 'Hopper'
# Environment ids have the form "<Name>-v<N>"; the previous '{}- v2' format
# inserted a space and produced an invalid id.
ac_args['env_name'] = '{}-v2'.format(env_str)
ac_args['entropy_weight'] = 0.1
actrainer = AC_Trainer(ac_args)
critic = actrainer.rl_trainer.agent.critic

critic.target_update_rate = 0.5

# Shift every critic parameter by exactly 1 so the expected gap to the target
# network after one update at rate 0.5 is 0.5.
for p in critic.critic_network.parameters():
    p.data += 1.

critic.update_target_network_ema()

for p, target_p in zip(critic.critic_network.parameters(),
                       critic.target_network.parameters()):
    # Compare with a tolerance: the gap is computed in floating point, so an
    # exact ``== 0.5`` can fail on rounding even for a correct update.
    assert np.allclose(ptu.to_numpy(p - target_p), 0.5)

# Seed both torch and NumPy so the policy initialisation and the sampled
# observations are reproducible.
torch.manual_seed(0)
ac_dim = 2
ob_dim = 3
batch_size = 5

np.random.seed(0)
# Size the batch with ``batch_size`` defined in this cell; it was previously
# unused while ``obs`` depended on ``N`` left over from an earlier cell.
obs = np.random.normal(size=(batch_size, ob_dim))

policy = MLPPolicyAC(
    ac_dim=ac_dim,
    ob_dim=ob_dim,
    n_layers=1,
    size=2,
    learning_rate=0.25,
    entropy_weight=0.,
)


def dummy_critic(obs, acts):
    """Stand-in critic: return the sum of ``acts + 1`` plus the sum of ``obs``."""
    return torch.sum(acts + 1) + torch.sum(obs)


initial_loss = policy.update(obs, dummy_critic)['Actor Training Loss']
expected_initial_loss = -17.083496

print("Initial loss error", rel_error(expected_initial_loss, initial_loss), "should be on the order of 1e-6 or less.")

# Take five more update steps, printing each loss; ``loss`` keeps the last one.
for _ in range(5):
    loss = policy.update(obs, dummy_critic)['Actor Training Loss']
    print(loss)

expected_final_loss = -30.103575

print("Final loss error", rel_error(expected_final_loss, loss), "should be on the order of 1e-6 or less.")

# Full training runs on HalfCheetah, starting from a copy of the shared defaults.
ac_args = dict(ac_base_args_dict)
env_str = 'HalfCheetah'
# Environment ids have the form "<Name>-v<N>"; the previous '{}- v2' format
# inserted a space and produced an invalid id.
ac_args['env_name'] = '{}-v2'.format(env_str)
ac_args['n_iter'] = 50

# Clear results from earlier runs so the log directory holds only this run.
remove_folder('logs/actor_critic/{}'.format(env_str))

# One run per seed, each logged to its own sub-directory.
for seed in range(3):
    print("Running actor critic experiment with seed", seed)
    ac_args['seed'] = seed
    ac_args['logdir'] = 'logs/actor_critic/{}/seed{}'.format(env_str, seed)
    actrainer = AC_Trainer(ac_args)
    actrainer.run_training_loop()

%load_ext tensorboard
%tensorboard --logdir logs/actor_critic/HalfCheetah
