## Set things up

In [1]:
import numpy as np
import tensorflow as tf

from nn_policy import FeedForwardCritic
from nn_policy import FeedForwardPolicy
from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv
from rllab.exploration_strategies.ou_strategy import OUStrategy
from sandbox.rocky.tf.algos.ddpg import DDPG as ShaneDDPG
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.rocky.tf.policies.deterministic_mlp_policy import \
    DeterministicMLPPolicy
from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import \
    ContinuousMLPQFunction

from ddpg import DDPG as MyDDPG
from testing_utils import are_np_arrays_equal

In [2]:
env = TfEnv(HalfCheetahEnv())
action_dim = env.action_dim
obs_dim = env.observation_space.low.shape[0]

batch_size = 2
rewards = np.random.rand(batch_size)
terminals = (np.random.rand(batch_size) > 0.5).astype(np.int)
obs = np.random.rand(batch_size, obs_dim)
actions = np.random.rand(batch_size, action_dim)
next_obs = np.random.rand(batch_size, obs_dim)

ddpg_params = dict(
    batch_size=64,
    n_epochs=0,
    epoch_length=0,
    eval_samples=0,
    discount=0.99,
    qf_learning_rate=1e-3,
    policy_learning_rate=1e-4,
    soft_target_tau=0.001,
    replay_pool_size=1000000,
    min_pool_size=1000,
    scale_reward=0.1,
)
discount = ddpg_params['discount']

In [3]:
print(rewards)
print(terminals)
print(obs)
print(actions)
print(next_obs)

[ 0.15005835  0.81457649]
[0 1]
[[ 0.43511439  0.21486068  0.43619294  0.66923761  0.20440605  0.82207058
   0.83291033  0.72373561  0.89668103  0.67410786  0.80799981  0.64763201
   0.01083204  0.4382325   0.93362274  0.55795521  0.63737658  0.7260999
   0.9175968   0.17842764]
 [ 0.41534872  0.5935848   0.63982088  0.23709139  0.9229585   0.80080515
   0.99038569  0.92861875  0.28002253  0.97068026  0.24973167  0.93388785
   0.99066874  0.4360376   0.57956691  0.67015587  0.19678966  0.18611555
   0.22873158  0.39150123]]
[[ 0.04384032  0.64044176  0.06986806  0.99731914  0.78400959  0.12711896]
 [ 0.90925847  0.96190726  0.1259375   0.01973137  0.47221903  0.60472708]]
[[ 0.29052842  0.92648082  0.00907505  0.4897972   0.45359199  0.36603501
   0.26034967  0.76724245  0.64317068  0.36499064  0.72187408  0.24276138
   0.22878558  0.8248953   0.64472811  0.08181222  0.31025709  0.35683179
   0.68326028  0.1779539 ]
 [ 0.93819824  0.93290809  0.15855846  0.27508406  0.55827918  0.51646

## Create my stuff

In [4]:
sess_me = tf.Session()
with sess_me.as_default():
    es = OUStrategy(env_spec=env.spec)
    ddpg_params['Q_weight_decay'] = 0.
    qf_params = dict(
        embedded_hidden_sizes=(100, ),
        observation_hidden_sizes=(100, ),
        hidden_nonlinearity=tf.nn.relu,
    )
    policy_params = dict(
        observation_hidden_sizes=(100, 100),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh,
    )
    qf = FeedForwardCritic(
        "critic",
        env.observation_space.flat_dim,
        env.action_space.flat_dim,
        **qf_params
    )
    policy = FeedForwardPolicy(
        "actor",
        env.observation_space.flat_dim,
        env.action_space.flat_dim,
        **policy_params
    )
    my_algo = MyDDPG(
        env,
        es,
        policy,
        qf,
        **ddpg_params
    )
    my_policy = my_algo.actor
    my_qf = my_algo.critic
    my_target_policy = my_algo.target_actor
    my_target_qf = my_algo.target_critic

## Set up Shane

In [5]:
sess_shane = tf.Session()
with sess_shane.as_default():
    es = OUStrategy(env_spec=env.spec)
    policy = DeterministicMLPPolicy(
        name="init_policy",
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh,
    )
    qf = ContinuousMLPQFunction(
        name="qf",
        env_spec=env.spec,
        hidden_sizes=(100, 100),
    )
    ddpg_params.pop('Q_weight_decay')
    shane_algo = ShaneDDPG(
        env,
        policy,
        qf,
        es,
        **ddpg_params
    )
    sess_shane.run(tf.initialize_all_variables())
    shane_algo.init_opt()
#     This initializes the optimizer parameters
    sess_shane.run(tf.initialize_all_variables())
    f_train_policy = shane_algo.opt_info['f_train_policy']
    f_train_qf = shane_algo.opt_info['f_train_qf']
    shane_target_qf = shane_algo.opt_info["target_qf"]
    shane_target_policy = shane_algo.opt_info["target_policy"]
    shane_policy = shane_algo.policy
    shane_qf = shane_algo.qf

## Measure stuff from Shane's algo

In [6]:
with sess_shane.as_default():
    shane_policy_param_values = shane_policy.flat_to_params(
        shane_policy.get_param_values()
    )
    shane_qf_param_values = shane_qf.flat_to_params(
        shane_qf.get_param_values()
    )
    # TODO(vpong): why are these two necessary?
    shane_target_policy.set_param_values(shane_policy.get_param_values())
    shane_target_qf.set_param_values(shane_qf.get_param_values())

    shane_actions, _ = shane_policy.get_actions(obs)
    shane_qf_out = shane_qf.get_qval(obs, actions)
    shane_next_actions, _ = shane_target_policy.get_actions(next_obs)
    shane_next_target_qf_values = shane_target_qf.get_qval(next_obs, shane_next_actions)
    shane_ys = rewards + (1. - terminals) * discount * shane_next_target_qf_values

## Copy things to my algo

In [7]:
with sess_me.as_default():
    my_policy.set_param_values(shane_policy_param_values)
    my_target_policy.set_param_values(shane_policy_param_values)
    my_qf.set_param_values(shane_qf_param_values)
    my_target_qf.set_param_values(shane_qf_param_values)

## Measure stuff from my algo

In [8]:
feed_dict = my_algo._update_feed_dict(rewards, terminals, obs,
                                      actions, next_obs)


my_actions = sess_me.run(
        my_policy.output,
        feed_dict=feed_dict
    )
my_qf_out = sess_me.run(
        my_qf.output,
        feed_dict=feed_dict
    ).flatten()
my_next_actions = sess_me.run(
        my_target_policy.output,
        feed_dict=feed_dict
    )
my_next_target_qf_values = sess_me.run(
    my_algo.target_critic.output,
    feed_dict=feed_dict).flatten()
my_ys = sess_me.run(my_algo.ys, feed_dict=feed_dict).flatten()

my_policy_loss = sess_me.run(
    my_algo.actor_surrogate_loss,
    feed_dict=feed_dict)
my_qf_loss = sess_me.run(
    my_algo.critic_loss,
    feed_dict=feed_dict)

## Check that Shane and my params stayed the same

In [9]:
shane_policy = shane_algo.policy
shane_qf = shane_algo.qf
with sess_shane.as_default():
    shane_policy_param_values_new = shane_policy.flat_to_params(
        shane_policy.get_param_values()
    )
    shane_qf_param_values_new = shane_qf.flat_to_params(
        shane_qf.get_param_values()
    )
    shane_target_policy_param_values_new = shane_target_policy.flat_to_params(
        shane_target_policy.get_param_values()
    )
    shane_target_qf_param_values_new = shane_target_qf.flat_to_params(
        shane_target_qf.get_param_values()
    )
my_policy_params_values_new = my_algo.actor.get_param_values()
my_qf_params_values_new = my_algo.critic.get_param_values()
my_target_policy_params_values_new = my_algo.target_actor.get_param_values()
my_target_qf_params_values_new = my_algo.target_critic.get_param_values()
print(all((a==b).all() for a, b in zip(shane_policy_param_values, shane_policy_param_values_new)))
print(all((a==b).all() for a, b in zip(shane_policy_param_values, my_policy_params_values_new)))
print(all((a==b).all() for a, b in zip(shane_policy_param_values, shane_target_policy_param_values_new)))
print(all((a==b).all() for a, b in zip(shane_policy_param_values, my_target_policy_params_values_new)))
print(all((a==b).all() for a, b in zip(shane_qf_param_values, shane_qf_param_values_new)))
print(all((a==b).all() for a, b in zip(shane_qf_param_values, my_qf_params_values_new)))
print(all((a==b).all() for a, b in zip(shane_qf_param_values, shane_target_qf_param_values_new)))
print(all((a==b).all() for a, b in zip(shane_qf_param_values, my_target_qf_params_values_new)))

True
True
True
True
True
True
True
True


## Check critic outputs are the same

In [10]:
W1, b1, W2, b2, W3, b3 = shane_qf_param_values
output = np.matmul(obs, W1) + b1
output = np.maximum(output, 0)
output = np.hstack((output, actions))
output = np.matmul(output, W2) + b2
output = np.maximum(output, 0)
output = np.matmul(output, W3) + b3
expected_qf_out = output.flatten()

print(my_qf_out)
print(shane_qf_out)
print(expected_qf_out)

[-0.07917806  0.00283957]
[-0.07917806  0.00283957]
[-0.07917813  0.00283952]


## Check actor outputs are the same

In [11]:
W1, b1, W2, b2, W3, b3 = shane_policy_param_values
output = np.matmul(obs, W1) + b1
output = np.maximum(output, 0)
output = np.matmul(output, W2) + b2
output = np.maximum(output, 0)
output = np.matmul(output, W3) + b3
expected_action = output

print(my_actions)
print(shane_actions)
print(expected_action)

[[-0.20947778  0.04484395  0.08546824  0.01056851  0.00029767  0.0958475 ]
 [ 0.01458523 -0.0430692   0.10159081 -0.15388419 -0.06008253  0.18279688]]
[[-0.20947778  0.04484395  0.08546824  0.01056851  0.00029767  0.0958475 ]
 [ 0.01458523 -0.0430692   0.10159081 -0.15388419 -0.06008253  0.18279688]]
[[-0.21262505  0.04487398  0.0856773   0.01056885  0.00029774  0.09614267]
 [ 0.01458626 -0.04309584  0.10194247 -0.15511645 -0.06015505  0.18487474]]


## Check that next action outputs are the same

In [12]:
W1, b1, W2, b2, W3, b3 = shane_policy_param_values
output = np.matmul(next_obs, W1) + b1
output = np.maximum(output, 0)
output = np.matmul(output, W2) + b2
output = np.maximum(output, 0)
output = np.matmul(output, W3) + b3
expected_next_action = output

print(my_next_actions)
print(shane_next_actions)
print(expected_next_action)

[[-0.086945   -0.01997953  0.02840678  0.09882895  0.02658396  0.11652762]
 [ 0.01991368 -0.0152898   0.01624201  0.11547601 -0.00939338  0.18017189]]
[[-0.086945   -0.01997953  0.02840678  0.09882895  0.02658396  0.11652762]
 [ 0.01991368 -0.0152898   0.01624201  0.11547601 -0.00939338  0.18017189]]
[[-0.08716509 -0.01998221  0.02841444  0.09915265  0.02659021  0.11705939]
 [ 0.0199163  -0.015291    0.01624345  0.1159935  -0.00939367  0.18216033]]


## Check next critic outputs are the same

In [13]:
W1, b1, W2, b2, W3, b3 = shane_qf_param_values
output = np.matmul(next_obs, W1) + b1
output = np.maximum(output, 0)
output = np.hstack((output, expected_next_action))
output = np.matmul(output, W2) + b2
output = np.maximum(output, 0)
output = np.matmul(output, W3) + b3
expected_target_qf_values = output.flatten()

print(shane_next_target_qf_values)
print(my_next_target_qf_values)
print(expected_target_qf_values)

[-0.03675101  0.01799645]
[-0.03675101  0.01799645]
[-0.03672561  0.01806539]


In [14]:
my_expected_ys = rewards + (1. - terminals) * discount * my_next_target_qf_values
shane_expected_ys = rewards + (1. - terminals) * discount * shane_next_target_qf_values
expected_ys = rewards + (1. - terminals) * discount * expected_target_qf_values
print(shane_ys)
print(shane_expected_ys)
print(my_ys)
print(my_expected_ys)
print(expected_ys)

[ 0.11367485  0.81457649]
[ 0.11367485  0.81457649]
[ 0.11367485  0.81457651]
[ 0.11367485  0.81457649]
[ 0.11369999  0.81457649]


## Check losses are the same
Only do this once since it changes the params!

In [15]:
with sess_shane.as_default():
    shane_policy_loss, _ = f_train_policy(obs)
    shane_qf_loss, qval, _ = f_train_qf(shane_ys, obs, actions)

In [16]:
print(my_policy_loss)
print(shane_policy_loss)

0.0512864
0.0512864


In [17]:
print(shane_qf_loss)
print(my_qf_loss)

0.348055
0.348055


In [None]:
sess.close()