In [1]:
import numpy as np
import tensorflow as tf
import gym
import time

In [None]:
def placeholder(dim=None):
    """Create a float32 placeholder whose leading (batch) dimension is unspecified.

    Args:
        dim: size of the trailing dimension. Any falsy value (None, 0) yields
            a rank-1 placeholder of shape (None,).

    Returns:
        A `tf.placeholder` of shape (None, dim), or (None,) when `dim` is falsy.
    """
    if dim:
        shape = (None, dim)
    else:
        shape = (None,)
    return tf.placeholder(dtype=tf.float32, shape=shape)

def placeholders(*args):
    """Build one placeholder per argument, in order.

    Each positional argument is passed to `placeholder` as its `dim`.

    Returns:
        A list of placeholders, one for each entry of `args`.
    """
    return list(map(placeholder, args))

def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None, kernel_initializer=tf.orthogonal_initializer(), output_kernel_initializer=tf.orthogonal_initializer()):
    """Stack dense layers on top of `x`.

    Args:
        x: input tensor.
        hidden_sizes: layer widths; every entry but the last is a hidden
            layer, the last entry is the width of the output layer.
        activation: activation applied to every hidden layer.
        output_activation: activation applied to the output layer (None = linear).
        kernel_initializer: kernel initializer for the hidden layers.
        output_kernel_initializer: kernel initializer for the output layer.

    Returns:
        The output tensor of the final dense layer.
    """
    hidden_units, output_units = hidden_sizes[:-1], hidden_sizes[-1]
    net = x
    for width in hidden_units:
        net = tf.layers.dense(
            net,
            units=width,
            activation=activation,
            kernel_initializer=kernel_initializer,
        )
    return tf.layers.dense(
        net,
        units=output_units,
        activation=output_activation,
        kernel_initializer=output_kernel_initializer,
    )

def mlp_disc(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None, kernel_initializer=tf.orthogonal_initializer(), output_kernel_initializer=tf.orthogonal_initializer()):
    """Dense network used for the discriminator / learned-reward head.

    This used to be a line-for-line copy of `mlp`. It now delegates to `mlp`
    so the two builders cannot silently diverge; the layers created (and
    therefore the variables) are the same as before.

    Args:
        x: input tensor.
        hidden_sizes: layer widths; the last entry is the output width.
        activation: activation applied to every hidden layer.
        output_activation: activation applied to the output layer (None = linear).
        kernel_initializer: kernel initializer for the hidden layers.
        output_kernel_initializer: kernel initializer for the output layer.

    Returns:
        The output tensor of the final dense layer.
    """
    return mlp(
        x,
        hidden_sizes=hidden_sizes,
        activation=activation,
        output_activation=output_activation,
        kernel_initializer=kernel_initializer,
        output_kernel_initializer=output_kernel_initializer,
    )

def get_vars(scope):
    """Return every global TF variable whose name contains `scope`.

    The match is a plain substring test on the variable name, so e.g.
    'main/q' selects both 'main/q1/...' and 'main/q2/...' variables.
    """
    matches = []
    for var in tf.global_variables():
        if scope in var.name:
            matches.append(var)
    return matches

def count_vars(scope):
    """Total number of scalar parameters across the variables matched by `get_vars(scope)`."""
    return sum(np.prod(variable.shape.as_list()) for variable in get_vars(scope))

"""
Actor-Critics
"""
def mlp_actor_critic(x, a, x2, x_exp, a_exp, batch_size, hidden_sizes=(400,300), activation=tf.nn.relu, 
                     output_activation=tf.tanh, action_space=None):
    """Build policy, twin Q-functions and a learned reward ('th') network in the current variable scope.

    Args:
        x: observation tensor fed to the policy, both Q-functions and the reward network.
        a: action tensor; its last static dimension defines the policy output width.
        x2: not used inside this function.
        x_exp: observation tensor passed through the reward network together with `a_exp`
            (the caller feeds expert-demonstration batches here).
        a_exp: action tensor paired with `x_exp`.
        batch_size: number of rows of the random interpolation coefficients; the
            inputs must broadcast against a (batch_size, 1) tensor.
        hidden_sizes: hidden layer widths for 'pi', 'q1' and 'q2'. The 'th'
            network ignores this and always uses [256, 256, 1].
        activation: hidden activation for 'pi', 'q1' and 'q2' ('th' uses tanh).
        output_activation: output activation of the policy only.
        action_space: object exposing `.high`; only `high[0]` is read. Despite the
            default of None, a real action space is required.

    Returns:
        Tuple (pi, q1, q2, q1_pi, r, r_exp, gp_loss):
            pi      - policy output scaled by the action limit,
            q1, q2  - Q-values of (x, a) from two separately parameterised networks,
            q1_pi   - q1 evaluated at (x, pi), sharing q1's variables,
            r       - 'th' network output for (x, a),
            r_exp   - 'th' network output for (x_exp, a_exp),
            gp_loss - scalar gradient penalty on interpolated inputs.
    """
    act_dim = a.shape.as_list()[-1]
    # Only the first upper bound is used as the scale for every action dimension.
    act_limit = action_space.high[0]
    with tf.variable_scope('pi'):
        # Output layer uses an orthogonal initializer with gain 0.01.
        pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation, output_kernel_initializer=tf.orthogonal_initializer(0.01))
    with tf.variable_scope('q1'):
        q1 = tf.squeeze(mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1)
    with tf.variable_scope('q2'):
        q2 = tf.squeeze(mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1)
    with tf.variable_scope('q1', reuse=True):
        # Same q1 weights, evaluated on the policy's own action.
        q1_pi = tf.squeeze(mlp(tf.concat([x,pi], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1)
    
    # Reward / discriminator head: fixed 256-256-1 tanh network, squeezed to one value per row.
    df_mlp = lambda x : tf.squeeze(mlp_disc(x, [256,256,1], tf.tanh, None), axis=1)
    with tf.variable_scope('th'):
        r = df_mlp(tf.concat([x,a], axis=-1))
    with tf.variable_scope('th', reuse=True):
        r_exp = df_mlp(tf.concat([x_exp,a_exp], axis=-1))
    with tf.variable_scope('th', reuse=True):
        # Per-row random mixing coefficient in [0, 1) between the (x_exp, a_exp) and (x, a) samples.
        eps = tf.random_uniform((batch_size,1), 0.0, 1.0)
        x_eps = eps * x_exp + (1. - eps) * x
        a_eps = eps * a_exp + (1. - eps) * a
        
        d_input = tf.concat([x_eps,a_eps], axis=-1) 
        r_eps = df_mlp(d_input)
        # Gradient of the head's output w.r.t. its interpolated input.
        gradients = tf.gradients(r_eps, d_input)[0]
        # Per-row L2 norm of that gradient.
        grad_l2 = tf.sqrt(tf.reduce_sum(tf.square(gradients),axis=-1))
        # Penalise deviation of the gradient norm from 1.
        gp_loss = tf.reduce_mean(tf.square(grad_l2 - 1.0)) 
        
    return pi, q1, q2, q1_pi, r, r_exp, gp_loss

In [None]:
class ReplayBuffer:
    """
    Fixed-capacity FIFO (ring) buffer of transitions.

    When a transition is stored with `done` set, its next observation is
    replaced by an all-zero "absorbing" observation. The stored done flag is
    always 0 in both cases.
    """

    def __init__(self, obs_dim, act_dim, size):
        """Allocate float32 storage for `size` transitions."""
        f32 = np.float32
        self.obs1_buf = np.zeros([size, obs_dim], dtype=f32)
        self.obs2_buf = np.zeros([size, obs_dim], dtype=f32)
        self.acts_buf = np.zeros([size, act_dim], dtype=f32)
        self.rews_buf = np.zeros(size, dtype=f32)
        self.done_buf = np.zeros(size, dtype=f32)

        # All-zero observation / action used to represent the absorbing state.
        self.absorbing_state = np.zeros(obs_dim, dtype=f32)
        self.zero_action = np.zeros(act_dim, dtype=f32)

        self.ptr = 0          # next slot to write
        self.size = 0         # number of valid entries
        self.max_size = size  # capacity

    def store(self, obs, act, rew, next_obs, done):
        """Write one transition at the write pointer, overwriting the oldest entry when full."""
        slot = self.ptr

        self.obs1_buf[slot] = obs
        # A terminal transition leads into the absorbing state instead of `next_obs`.
        self.obs2_buf[slot] = self.absorbing_state if done else next_obs
        self.acts_buf[slot] = act
        self.rews_buf[slot] = rew
        # The done flag is stored as False regardless of `done`.
        self.done_buf[slot] = False

        self.ptr = (slot + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def add_absorbing(self):
        """Store an absorbing -> absorbing transition with zero action and zero reward."""
        self.store(self.absorbing_state, self.zero_action, 0, self.absorbing_state, False)

    def sample_batch(self, batch_size=32):
        """Sample `batch_size` stored transitions uniformly with replacement.

        Returns:
            dict with keys 'obs1', 'obs2', 'acts', 'rews', 'done'.
        """
        picks = np.random.randint(0, self.size, size=batch_size)
        fields = (
            ('obs1', self.obs1_buf),
            ('obs2', self.obs2_buf),
            ('acts', self.acts_buf),
            ('rews', self.rews_buf),
            ('done', self.done_buf),
        )
        return {key: storage[picks] for key, storage in fields}

# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------

# Environment and model
env_name = 'Hopper-v2'


def env_fn():
    """Create a fresh instance of the configured gym environment."""
    return gym.make(env_name)


actor_critic = mlp_actor_critic
ac_kwargs = {'hidden_sizes': [400, 300]}  # extra keyword arguments for actor_critic
seed = 0                                  # seed for TensorFlow and NumPy

# Run length
steps_per_epoch = 5000   # environment steps between evaluations
epochs = 200             # total env steps = steps_per_epoch * epochs
max_ep_len = 1000        # episodes are cut off after this many steps

# Buffers
replay_size = 10 ** 6          # capacity of the agent's replay buffer
total_exp_data_size = 10000    # only referenced from commented-out code below
exp_data_size = 5000           # capacity of the expert replay buffer
batch_size = 100               # rows sampled from each buffer per update

# Value / target-network settings
gamma = 0.8      # discount applied to the target Q-value in the backup
polyak = 0.995   # weight on the old target value in the target update

# Learning rates
th_lr = 0.001             # initial learning rate of the 'th' (reward) network
th_lr_decay = 0.5         # factor applied to th_lr ...
th_lr_schedule = 100000   # ... every this many environment steps
pi_lr = 0.001
q_lr = 0.001

# Exploration and target smoothing
start_steps = 10000   # only referenced from commented-out code below
act_noise = 0.1       # only referenced from commented-out code below
target_noise = 0.2    # stddev of the noise added to target actions
noise_clip = 0.5      # clip range of that noise

# Update cadence
policy_delay = 2   # policy/target update every policy_delay-th Q update
disc_delay = 1     # 'th' update every disc_delay-th iteration
save_freq = 1      # not referenced in the visible code

# Seed TensorFlow's graph-level RNG and NumPy's global RNG.
# NOTE(review): no env.seed(...) / action_space.seed(...) call is visible here, so the
# environments and env.action_space.sample() are presumably unseeded — confirm.
tf.set_random_seed(seed)
np.random.seed(seed)

# Two independent environment instances: one for data collection, one for evaluation.
env, test_env = env_fn(), env_fn()
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]

# Action limit for clamping: critically, assumes all dimensions share the same bound!
act_limit = env.action_space.high[0]

# Share information about action space with policy architecture
ac_kwargs['action_space'] = env.action_space

# Inputs to computation graph
# x/a: agent observation and action, x2: next observation, x_exp/a_exp: expert
# observation and action. r_ph and d_ph are rank-1 placeholders; r_ph is not
# referenced again in the visible code and d_ph is fed but used by no active op.
x_ph, a_ph, x2_ph, x_exp_ph, a_exp_ph, r_ph, d_ph = placeholders(obs_dim, act_dim, obs_dim, obs_dim, act_dim, None, None)
# Scalar learning rate for the 'th' optimizer, fed at run time so it can be decayed.
th_lr_ph = tf.placeholder(dtype=tf.float32, shape=())

# Main outputs from computation graph
with tf.variable_scope('main'):
    pi, q1, q2, q1_pi, th, th_exp, gp_loss = actor_critic(x_ph, a_ph, x2_ph, x_exp_ph, a_exp_ph, batch_size, **ac_kwargs)

# Target policy network
# This call creates the 'target/...' variables; only the policy output is kept.
# Note the policy is evaluated on the next observation (x2_ph).
with tf.variable_scope('target'):
    pi_targ, _, _, _, _, _, _  = actor_critic(x2_ph, a_ph, x2_ph, x_exp_ph, a_exp_ph, batch_size, **ac_kwargs)

# Target Q networks
# reuse=True: the second call below shares the variables created just above.
with tf.variable_scope('target', reuse=True):

    # Target policy smoothing, by adding clipped noise to target actions
    epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise)
    epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
    a2 = pi_targ + epsilon
    a2 = tf.clip_by_value(a2, -act_limit, act_limit)

    # Target Q-values, using action from target policy
    _, q1_targ, q2_targ, _, _, _, _ = actor_critic(x2_ph, a2, x2_ph, x_exp_ph, a_exp_ph, batch_size, **ac_kwargs)

# Experience buffer
# replay_buffer holds the agent's own transitions; exp_replay_buffer holds expert demonstrations.
replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)
exp_replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=exp_data_size)

# Load expert demonstrations into exp_replay_buffer.
#
# Each entry of the file is indexed by the keys 'obs', 'act', 'rew' and 'done'.
# Loading stops as soon as the expert buffer is full.
#
# Fix: the stop condition used to be `exp_data_size < exp_replay_buffer.size`.
# ReplayBuffer.size is capped at its capacity (== exp_data_size), so that test
# could never be true; the loop always consumed every demo and wrapped around,
# overwriting earlier entries. `>=` makes the early exit reachable.
#
# NOTE(review): if the .npy file stores pickled Python objects, recent NumPy
# versions need np.load(..., allow_pickle=True) — confirm against the file format.
# NOTE(review): demo['obs'][t+1] assumes 'obs' has one more entry than 'rew' — confirm.
# NOTE(review): on a done step only an absorbing transition is stored; the
# expert's (o, a) pair for that step is dropped — confirm this is intended.
exp_data_delay = 1  # keep every exp_data_delay-th step; was int(total_exp_data_size/exp_data_size)
demo_list = np.load('exp_'+env_name+'.npy')
for demo in demo_list:
    for t in range(0, len(demo['rew']), exp_data_delay):
        o = demo['obs'][t]
        a = demo['act'][t]
        r = demo['rew'][t]
        d = demo['done'][t]
        o2 = demo['obs'][t+1]
        if d:
            exp_replay_buffer.add_absorbing()
        else:
            exp_replay_buffer.store(o,a,r,o2,d)
        if exp_replay_buffer.size >= exp_data_size:
            break
    if exp_replay_buffer.size >= exp_data_size:
        break
        
# Count variables
# Done before any optimizer is created, so only network variables are matched.
var_counts = tuple(count_vars(scope) for scope in ['main/th', 'main/pi', 'main/q1', 'main/q2', 'main'])
print('\nNumber of parameters: \t th: %d, \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n'%var_counts)

# Bellman backup for Q functions, using Clipped Double-Q targets

min_q_targ = tf.minimum(q1_targ, q2_targ)
# The reward in the backup is the 'th' network's output for (x, a), not an environment reward.
r_backup = th
# backup = tf.stop_gradient(r_backup + gamma*(1-d_ph)*min_q_targ)
# Active form has no (1 - done) mask: the bootstrap term is always included.
backup = tf.stop_gradient(r_backup + gamma*min_q_targ)

# TD3 losses
# 'th' loss: mean output on agent samples minus mean output on expert samples,
# plus the gradient penalty weighted by 10.
th_loss = tf.reduce_mean(th) - tf.reduce_mean(th_exp) + 10.*gp_loss
# Mean sigmoid of the 'th' output on agent / expert batches; monitoring only, not part of any loss.
gen_acc = tf.reduce_mean(tf.sigmoid(th))
exp_acc = tf.reduce_mean(tf.sigmoid(th_exp))

pi_loss = -tf.reduce_mean(q1_pi)
q1_loss = tf.reduce_mean((q1-backup)**2)
q2_loss = tf.reduce_mean((q2-backup)**2)
q_loss = q1_loss + q2_loss

# The 'th' optimizer takes its learning rate from a placeholder so it can be decayed during training.
th_optimizer = tf.train.AdamOptimizer(learning_rate=th_lr_ph)
train_th_op = th_optimizer.minimize(th_loss, var_list=get_vars('main/th'))

# Separate train ops for pi, q
pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
# 'main/q' is a substring match, so this selects both q1 and q2 variables.
train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))
    
# Polyak averaging for target variables
# NOTE(review): get_vars matches by substring over all global variables and runs
# after minimize() above. If optimizer slot variables carry 'main' in their names,
# the first list is longer than the second; zip() then pairs only up to the shorter
# list, presumably relying on creation order to line main and target weights up — confirm.
target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main)
                          for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

# Fetch lists for the three kinds of update step run in the training loop.
q_step_ops = [q_loss, q1, q2, train_q_op]
pi_step_ops = [pi_loss, train_pi_op, target_update]
th_step_ops = [tf.reduce_mean(r_backup), th_loss, gen_acc, exp_acc, train_th_op]

# Initializing targets to match main variables
target_init = tf.group([tf.assign(v_targ, v_main)
                          for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())
sess.run(target_init)

def get_action(o, noise_scale):
    """Evaluate the main policy on a single observation.

    Args:
        o: one observation; reshaped to a batch of size 1 before being fed.
        noise_scale: currently unused; the exploration-noise line is disabled.

    Returns:
        The policy output for that batch, clipped to [-act_limit, act_limit].
        The leading batch dimension of size 1 is kept.
    """
    obs_batch = o.reshape(1, -1)
    action = sess.run(pi, feed_dict={x_ph: obs_batch})
    return np.clip(action, -act_limit, act_limit)

def test_agent(n=10):
    """Run `n` evaluation episodes on `test_env` with the current policy.

    Each episode ends when the environment reports done or after
    `max_ep_len` steps, whichever comes first.

    Returns:
        List with the undiscounted return of each episode.
    """
    returns = []
    for _ in range(n):
        obs = test_env.reset()
        done, episode_return, steps = False, 0, 0
        while not done and steps != max_ep_len:
            # Take deterministic actions at test time (noise_scale=0)
            obs, reward, done, _ = test_env.step(get_action(obs, 0))
            episode_return += reward
            steps += 1
        returns.append(episode_return)
    return returns

# start_time is recorded but not read again in the visible code.
start_time = time.time()
o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
total_steps = steps_per_epoch * epochs

# Mean test return per epoch, appended at each epoch wrap-up and plotted afterwards.
ep_ret_list = []

# Main loop: collect experience in env and update/log each epoch
for t in range(total_steps):
    # NOTE(review): the policy branch below is commented out, so every environment
    # step uses an action sampled from the action space; the learned policy is only
    # run in test_agent(). Confirm this is intended.
#     if t > start_steps:
#         a = get_action(o, act_noise)
#     else:
    a = env.action_space.sample()

    # Step the env
    o2, r, d, _ = env.step(a)
    ep_ret += r
    ep_len += 1

    # Ignore the "done" signal if it comes from hitting the time
    # horizon (that is, when it's an artificial terminal signal
    # that isn't based on the agent's state)
    d = False if ep_len==max_ep_len else d

    
    # NOTE(review): on a true termination only an absorbing -> absorbing transition
    # (zero action, zero reward) is stored; the terminal step (o, a, r, o2) itself
    # never reaches the buffer. Confirm this is intended.
    if d:
        replay_buffer.add_absorbing()
    else:
        replay_buffer.store(o, a, r, o2, d)

    # Super critical, easy to overlook step: make sure to update 
    # most recent observation!
    o = o2

    # All learning happens at episode end: ep_len reward-network ('th') updates
    # first, then ep_len Q updates with delayed policy/target updates.
    if d or (ep_len == max_ep_len):
        for j in range(ep_len):
            batch_exp = exp_replay_buffer.sample_batch(batch_size)
            batch = replay_buffer.sample_batch(batch_size)
            feed_dict = {x_ph: batch['obs1'],
                         x2_ph: batch['obs2'],
                         a_ph: batch['acts'],
                         x_exp_ph: batch_exp['obs1'],
                         a_exp_ph: batch_exp['acts'],
                         d_ph: batch['done'], 
                         th_lr_ph: th_lr
                        }
            if j % disc_delay == 0:
                # th_outs = [mean reward, th_loss, gen_acc, exp_acc, <train op result>]
                th_outs = sess.run(th_step_ops, feed_dict)
                
        for j in range(ep_len):
            # Fresh batches are drawn for the Q/policy phase; same feed layout as above.
            batch_exp = exp_replay_buffer.sample_batch(batch_size)
            batch = replay_buffer.sample_batch(batch_size)
            feed_dict = {x_ph: batch['obs1'],
                         x2_ph: batch['obs2'],
                         a_ph: batch['acts'],
                         x_exp_ph: batch_exp['obs1'],
                         a_exp_ph: batch_exp['acts'],
                         d_ph: batch['done'], 
                         th_lr_ph: th_lr
                        }
            outs = sess.run(q_step_ops, feed_dict)

            if j % policy_delay == 0:
                # Delayed policy update
                # pi_step_ops also contains target_update, so targets move only on these steps.
                outs = sess.run(pi_step_ops, feed_dict)

        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    
    # Step decay of the 'th' learning rate every th_lr_schedule environment steps.
    if t > 0 and t % th_lr_schedule == 0:
        th_lr *= th_lr_decay
    
    # End of epoch wrap-up
    if t > 0 and t % steps_per_epoch == 0:
        epoch = t // steps_per_epoch
        # Test the performance of the deterministic version of the agent.
        test_ep_ret = test_agent()
        ep_ret_list.append(np.mean(test_ep_ret))
        print('Test Ret:%f'%np.mean(test_ep_ret))
        # Values from the most recent 'th' update: mean reward, th_loss, gen_acc, exp_acc.
        # NOTE(review): th_outs is only bound once an episode has finished; if none
        # ends within the first steps_per_epoch steps this line raises NameError.
        print(th_outs[0],th_outs[1],th_outs[2],th_outs[3])


Number of parameters: 	 th: 69889, 	 pi: 126003, 	 q1: 126601, 	 q2: 126601, 	 total: 449094

Test Ret:2.905045
-9.416693 -7.1582 0.00021290638 0.43628243
Test Ret:-0.793679
-8.065256 -6.7616076 0.0006740146 0.4675132
Test Ret:112.672675
-7.565964 -7.366626 0.0013355899 0.6056822
Test Ret:36.824106
-7.8366313 -6.3153634 0.0023149333 0.43670315
Test Ret:96.868555
-7.6556077 -7.9196305 0.0007522656 0.6101448
Test Ret:45.345858
-7.8434343 -7.375608 0.001734449 0.52630734
Test Ret:40.180334
-7.8471346 -7.165455 0.0008275561 0.52538896
Test Ret:61.114138
-8.303458 -6.971796 0.00040117846 0.47681358
Test Ret:44.699716
-7.9858465 -7.5913343 0.00065875775 0.5495264
Test Ret:47.530134
-7.5837865 -6.1215568 0.0010890146 0.42488706
Test Ret:69.987013
-7.358836 -6.5364265 0.0012357379 0.48407504
Test Ret:58.500110
-7.340701 -7.074217 0.0009851581 0.5574119
Test Ret:-1.580682
-7.575751 -7.2487245 0.001046163 0.5182341
Test Ret:-1.351147
-7.45604 -7.4401116 0.0010948793 0.542935
Test Ret:55.955081


In [None]:
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Learning curve: ep_ret_list holds one value per completed epoch (the mean return
# over the evaluation episodes), the first of which is recorded at epoch 1.
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(ep_ret_list) + 1), ep_ret_list)
ax.set(title='Mean test return per epoch (%s)' % env_name,
       xlabel='Epoch',
       ylabel='Mean test return')
plt.show()