In [1]:
import numpy as np
import tensorflow as tf
import gym
import time

In [None]:
# Small constant added to exp(log_std) in gaussian_likelihood so that the
# division by the standard deviation never hits an exact zero.
EPS = 1e-8

def placeholder(dim=None):
    """Create a float32 placeholder with an unspecified leading (batch) axis.

    A truthy `dim` yields shape (None, dim); a falsy `dim` (None or 0) yields
    the rank-1 shape (None,).
    """
    if dim:
        batch_shape = (None, dim)
    else:
        batch_shape = (None,)
    return tf.placeholder(dtype=tf.float32, shape=batch_shape)

def placeholders(*args):
    """Build one placeholder per requested dimension, in argument order."""
    return list(map(placeholder, args))

def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None):
    """Feed `x` through a stack of dense layers.

    Every entry of `hidden_sizes` except the last becomes a dense layer with
    `activation`; the last entry is the width of the final dense layer, which
    uses `output_activation` instead.
    """
    inner_widths, final_width = hidden_sizes[:-1], hidden_sizes[-1]
    out = x
    for width in inner_widths:
        out = tf.layers.dense(out, units=width, activation=activation)
    return tf.layers.dense(out, units=final_width, activation=output_activation)

def get_vars(scope):
    """Return the global variables whose name contains `scope` as a substring.

    The match is a plain substring test, so e.g. 'main/q' selects both
    'main/q1/...' and 'main/q2/...'.
    """
    return list(filter(lambda var: scope in var.name, tf.global_variables()))

def count_vars(scope):
    """Total number of scalar entries over all variables matched by get_vars(scope)."""
    total = 0
    for var in get_vars(scope):
        total += np.prod(var.shape.as_list())
    return total

def gaussian_likelihood(x, mu, log_std):
    """Log-density of `x` under a diagonal Gaussian, summed over axis 1.

    The standard deviation is exp(log_std) with EPS added before dividing.
    """
    # Standardised residual; EPS guards the division.
    z = (x - mu) / (tf.exp(log_std) + EPS)
    per_dim = -0.5 * (z**2 + 2*log_std + np.log(2*np.pi))
    return tf.reduce_sum(per_dim, axis=1)

def clip_but_pass_gradient(x, l=-1., u=1.):
    """Clip `x` to [l, u] in the forward pass without blocking its gradient.

    The clipping correction is wrapped in stop_gradient, so the returned
    tensor differentiates like `x` itself.
    """
    above = tf.cast(x > u, tf.float32)
    below = tf.cast(x < l, tf.float32)
    # Offset that moves out-of-range entries onto the violated bound.
    correction = (u - x)*above + (l - x)*below
    return x + tf.stop_gradient(correction)


"""
Policies
"""

# Bounds for the policy's log standard deviation: mlp_gaussian_policy rescales
# a tanh output into the interval [LOG_STD_MIN, LOG_STD_MAX].
LOG_STD_MAX = 2
LOG_STD_MIN = -20

def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation):
    """Gaussian policy whose mean and log-std are both network outputs.

    Args:
        x: observation tensor fed to the shared MLP trunk.
        a: action tensor; only its last static dimension is read, to size the
            output heads.
        hidden_sizes: widths of the trunk layers (all use `activation`).
        activation: activation for every trunk layer.
        output_activation: activation applied to the mean head.

    Returns:
        (mu, pi, logp_pi): the mean, a sample mu + noise * std, and the
        log-likelihood of that sample under the (unsquashed) Gaussian.
    """
    n_actions = a.shape.as_list()[-1]
    trunk = mlp(x, list(hidden_sizes), activation, activation)
    mu = tf.layers.dense(trunk, n_actions, activation=output_activation)

    # Because the algorithm maximizes a trade-off of reward and entropy, the
    # entropy must be specific to each state, so log_std has to be a network
    # output rather than a learnable vector shared across states. For deep
    # ReLU (and other) nets, a plain activationless dense head would be risky:
    # at the start of training a randomly initialized net could emit extremely
    # large log_std values, making some actions either entirely deterministic
    # or too random to recover from. Either case introduces numerical
    # instability that could break the algorithm. To protect against that, the
    # head is squashed with tanh and rescaled into [LOG_STD_MIN, LOG_STD_MAX].
    # This differs slightly from the original SAC authors, who used
    # tf.clip_by_value; squashing keeps gradients flowing through log_std
    # where clipping would not, though it is unclear how much that matters.
    squashed = tf.layers.dense(trunk, n_actions, activation=tf.tanh)
    half_span = 0.5 * (LOG_STD_MAX - LOG_STD_MIN)
    log_std = LOG_STD_MIN + half_span * (squashed + 1)

    std = tf.exp(log_std)
    noise = tf.random_normal(tf.shape(mu))
    pi = mu + noise * std
    logp_pi = gaussian_likelihood(pi, mu, log_std)
    return mu, pi, logp_pi

def apply_squashing_func(mu, pi, logp_pi):
    """Squash mean and sample through tanh and adjust the log-likelihood.

    Subtracts sum over axis 1 of log(1 - tanh(pi)**2 + 1e-6) from `logp_pi`.
    """
    mu = tf.tanh(mu)
    pi = tf.tanh(pi)
    # Machine precision can push 1 - pi**2 slightly outside [0, 1]; clip it
    # back into range (gradient still passes) before taking the log.
    one_minus_sq = clip_but_pass_gradient(1 - pi**2, l=0, u=1)
    correction = tf.reduce_sum(tf.log(one_minus_sq + 1e-6), axis=1)
    logp_pi -= correction
    return mu, pi, logp_pi


"""
Actor-Critics
"""
def mlp_actor_critic(x, a, hidden_sizes=(400,300), activation=tf.nn.relu,
                     output_activation=None, policy=mlp_gaussian_policy, action_space=None):
    """Build the policy, two Q-functions and a state-value function.

    Args:
        x: observation tensor.
        a: action tensor (input to q1/q2; also sizes the policy output).
        hidden_sizes: hidden-layer widths shared by every network.
        activation: hidden activation for every network.
        output_activation: activation for the policy mean head.
        policy: callable returning (mu, pi, logp_pi) before squashing.
        action_space: object exposing `.high`; its first entry scales the
            squashed actions, so all dimensions are treated as sharing that
            bound. Must not be None, since `.high[0]` is read unconditionally.

    Returns:
        (mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v)
    """
    def value_head(inputs):
        # MLP ending in a single unit, with that unit axis squeezed away.
        return tf.squeeze(mlp(inputs, list(hidden_sizes)+[1], activation, None), axis=1)

    # Policy: raw Gaussian outputs followed by tanh squashing.
    with tf.variable_scope('pi'):
        raw_outputs = policy(x, a, hidden_sizes, activation, output_activation)
        mu, pi, logp_pi = apply_squashing_func(*raw_outputs)

    # Rescale the tanh-squashed actions to the environment's action range.
    action_scale = action_space.high[0]
    mu = mu * action_scale
    pi = pi * action_scale

    # Value functions. The reuse=True scopes evaluate the same Q weights on the
    # freshly sampled policy action instead of the action placeholder.
    with tf.variable_scope('q1'):
        q1 = value_head(tf.concat([x, a], axis=-1))
    with tf.variable_scope('q1', reuse=True):
        q1_pi = value_head(tf.concat([x, pi], axis=-1))
    with tf.variable_scope('q2'):
        q2 = value_head(tf.concat([x, a], axis=-1))
    with tf.variable_scope('q2', reuse=True):
        q2_pi = value_head(tf.concat([x, pi], axis=-1))
    with tf.variable_scope('v'):
        v = value_head(x)
    return mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v

In [None]:
class ReplayBuffer:
    """
    Fixed-capacity FIFO store of SAC transitions.

    Once `size` transitions have been stored, new ones overwrite the oldest.
    Batches are drawn uniformly (with replacement) from the filled part.
    """

    def __init__(self, obs_dim, act_dim, size):
        """Preallocate float32 storage for `size` transitions."""
        def zeros(*shape):
            return np.zeros(shape, dtype=np.float32)

        self.obs1_buf = zeros(size, obs_dim)
        self.obs2_buf = zeros(size, obs_dim)
        self.acts_buf = zeros(size, act_dim)
        self.rews_buf = zeros(size)
        self.done_buf = zeros(size)
        # ptr: next slot to write; size: number of valid rows; max_size: capacity.
        self.ptr = 0
        self.size = 0
        self.max_size = size

    def store(self, obs, act, rew, next_obs, done):
        """Write one transition at the cursor and advance it, wrapping at capacity."""
        slot = self.ptr
        self.obs1_buf[slot] = obs
        self.obs2_buf[slot] = next_obs
        self.acts_buf[slot] = act
        self.rews_buf[slot] = rew
        self.done_buf[slot] = done
        self.ptr = (slot + 1) % self.max_size
        if self.size < self.max_size:
            self.size += 1

    def sample_batch(self, batch_size=32):
        """Return a dict of `batch_size` randomly chosen stored transitions."""
        picks = np.random.randint(0, self.size, size=batch_size)
        fields = (('obs1', self.obs1_buf),
                  ('obs2', self.obs2_buf),
                  ('acts', self.acts_buf),
                  ('rews', self.rews_buf),
                  ('done', self.done_buf))
        return {key: storage[picks] for key, storage in fields}
    
# ---- Experiment configuration ----
env_name = 'Humanoid-v2'


def env_fn():
    """Create a fresh instance of the configured gym environment."""
    return gym.make(env_name)


actor_critic = mlp_actor_critic
ac_kwargs = {'hidden_sizes': [400, 300]}
seed = 0
steps_per_epoch = 5000      # environment steps between evaluations
epochs = 2000
replay_size = int(1e6)      # replay buffer capacity, in transitions
gamma = 0.99                # discount factor in the Q backup
polyak = 0.995              # weight kept on the old target variables each update
lr = 1e-3                   # learning rate for both Adam optimizers
alpha = 0.2                 # weight on the log-probability (entropy) term
batch_size = 100
start_steps = 10000         # uniform-random actions are used up to this step
max_ep_len = 1000           # episodes are cut off at this many steps
save_freq = 1               # not referenced elsewhere in this notebook

# ---- Seeding ----
tf.set_random_seed(seed)
np.random.seed(seed)

# ---- Environments: one for data collection, one for evaluation ----
env = env_fn()
test_env = env_fn()
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]

# Action limit for clamping: critically, assumes all dimensions share the same bound!
act_limit = env.action_space.high[0]

# Share information about action space with policy architecture
ac_kwargs['action_space'] = env.action_space

# Inputs to computation graph: observation, action, next observation,
# reward and done flag (the last two are rank-1, one entry per batch row).
x_ph, a_ph, x2_ph, r_ph, d_ph = placeholders(obs_dim, act_dim, obs_dim, None, None)

# Main outputs from computation graph
with tf.variable_scope('main'):
    mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

# Target value network.
# This builds a complete second copy of the actor-critic (policy and Q heads
# included) under 'target', fed with the next observation; only its
# state-value output is kept.
with tf.variable_scope('target'):
    _, _, _, _, _, _, _, v_targ  = actor_critic(x2_ph, a_ph, **ac_kwargs)

# Experience buffer
replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

# Count variables (done before any optimizer is created, so only the network
# variables exist at this point)
var_counts = tuple(count_vars(scope) for scope in 
                   ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
print(('\nNumber of parameters: \t pi: %d, \t' + \
       'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts)

# Min Double-Q: elementwise minimum of the two Q estimates at the sampled action
min_q_pi = tf.minimum(q1_pi, q2_pi)

# Targets for Q and V regression.
# stop_gradient turns both into constants as far as the optimizers are concerned.
#   q_backup = r + gamma * (1 - done) * V_target(next_obs)
#   v_backup = min(Q1, Q2)(obs, pi) - alpha * log pi
q_backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*v_targ)
v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi)

# Soft actor-critic losses.
# Note the policy loss uses q1_pi only, not the min over both Q-functions.
pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2)
value_loss = q1_loss + q2_loss + v_loss

# Policy train op
# (has to be separate from value train op, because q1_pi appears in pi_loss)
pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

# Value train op
# (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
# 'main/q' is a substring match, so it selects both main/q1 and main/q2.
value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
value_params = get_vars('main/q') + get_vars('main/v')
with tf.control_dependencies([train_pi_op]):
    train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

# Polyak averaging for target variables
# (control flow because sess.run otherwise evaluates in nondeterministic order)
# NOTE(review): get_vars filters tf.global_variables() by substring and runs
# here after both minimize() calls. If the optimizers registered extra
# variables whose names contain 'main', get_vars('main') is longer than
# get_vars('target'), and the pairing relies on zip stopping at the shorter
# list with the network variables listed first in the same order -- confirm.
with tf.control_dependencies([train_value_op]):
    target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main)
                              for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

# All ops to call during one training step: eight fetched tensors followed by
# the three update ops (policy step, value step, target update).
step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, 
            train_pi_op, train_value_op, target_update]

# Initializing targets to match main variables
# (same main/target pairing as target_update above)
target_init = tf.group([tf.assign(v_targ, v_main)
                          for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

# Session setup: let TensorFlow grow its GPU memory use as needed, initialise
# every variable, then copy the main weights into the target network.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())
sess.run(target_init)

def get_action(o, deterministic=False):
    """Query the policy for a single observation.

    Args:
        o: one observation; it is flattened into a batch of one before being
            fed to the graph.
        deterministic: if True use the squashed mean `mu`, otherwise a
            stochastic sample `pi`.

    Returns:
        The action as a 1-D array (the batch axis added for the feed is
        stripped again, so the result has the shape the environment's action
        space expects instead of (1, act_dim)).
    """
    act_op = mu if deterministic else pi
    batch_of_one = np.asarray(o).reshape(1, -1)
    # sess.run returns one row per fed observation; take the only row.
    return sess.run(act_op, feed_dict={x_ph: batch_of_one})[0]

def test_agent(n=10):
    """Run `n` evaluation episodes on `test_env` with deterministic actions.

    Each episode ends when the environment reports done or after
    `max_ep_len` steps, whichever comes first.

    Returns:
        A list with the summed reward of each episode.
    """
    # No `global` declaration is needed: the module-level names used here
    # (test_env, get_action, max_ep_len) are only read, never rebound.
    ep_ret_list = []
    for _ in range(n):
        o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
        while not (d or ep_len == max_ep_len):
            # Take deterministic actions at test time
            o, r, d, _ = test_env.step(get_action(o, True))
            ep_ret += r
            ep_len += 1
        ep_ret_list.append(ep_ret)
    return ep_ret_list

start_time = time.time()
o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
total_steps = steps_per_epoch * epochs

# Main loop: collect experience in env and update/log each epoch
for t in range(total_steps):

    # Until start_steps have elapsed, randomly sample actions from a uniform
    # distribution for better exploration. Afterwards, use the learned policy.
    if t > start_steps:
        a = get_action(o)
    else:
        a = env.action_space.sample()

    # Step the env
    o2, r, d, _ = env.step(a)
    ep_ret += r
    ep_len += 1

    # Ignore the "done" signal if it comes from hitting the time
    # horizon (that is, when it's an artificial terminal signal
    # that isn't based on the agent's state)
    d = False if ep_len == max_ep_len else d

    # Store experience to replay buffer
    replay_buffer.store(o, a, r, o2, d)

    # Super critical, easy to overlook step: make sure to update
    # most recent observation!
    o = o2

    if d or (ep_len == max_ep_len):
        # Perform all SAC updates at the end of the trajectory (one gradient
        # step per environment step just taken). This is a slight difference
        # from the SAC specified in the original paper.
        for _ in range(ep_len):
            batch = replay_buffer.sample_batch(batch_size)
            feed_dict = {x_ph: batch['obs1'],
                         x2_ph: batch['obs2'],
                         a_ph: batch['acts'],
                         r_ph: batch['rews'],
                         d_ph: batch['done'],
                        }
            sess.run(step_ops, feed_dict)
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # End of epoch wrap-up.
    # t is zero-based, so an epoch is complete once (t + 1) is a multiple of
    # steps_per_epoch. Checking `t % steps_per_epoch == 0` instead would
    # evaluate one step late and never evaluate the final epoch.
    if (t + 1) % steps_per_epoch == 0:
        # Test the performance of the deterministic version of the agent.
        ep_ret_list = test_agent()
        print('Test Ret:%f'%np.mean(ep_ret_list))


Number of parameters: 	 pi: 281334, 	q1: 278201, 	 q2: 278201, 	 v: 271401, 	 total: 1109137

Test Ret:283.702815
Test Ret:498.413462
Test Ret:265.442737
Test Ret:478.326412
Test Ret:393.147350
Test Ret:626.067483
Test Ret:486.475988
Test Ret:456.820201
Test Ret:411.577806
Test Ret:359.563490
Test Ret:313.102030
Test Ret:318.860593
Test Ret:462.067695
Test Ret:462.588447
Test Ret:622.569235
Test Ret:451.159121
Test Ret:452.473440
Test Ret:434.464177
Test Ret:434.982440
Test Ret:439.034469
Test Ret:429.438684
Test Ret:418.636664
Test Ret:398.420719
Test Ret:449.783412
Test Ret:455.992983
Test Ret:434.682448
Test Ret:453.871258
Test Ret:407.876822
Test Ret:347.345777
Test Ret:387.467781
Test Ret:329.500110
Test Ret:384.300975
Test Ret:425.756405
Test Ret:399.106759
Test Ret:465.872194
Test Ret:438.196273
Test Ret:548.312754
Test Ret:581.241803
Test Ret:415.184591
Test Ret:425.003182
Test Ret:519.546145
Test Ret:533.682242
Test Ret:548.018380
Test Ret:504.877966
Test Ret:449.583914
Test 

In [None]:
# Roll out the deterministic policy for a fixed number of environment steps
# and record every trajectory as a dict of per-step lists.
total_steps = 10000
demo_list = []


def _new_trajectory(first_obs):
    """Return an empty trajectory record seeded with the reset observation."""
    return {'obs': [first_obs], 'act': [], 'rew': [], 'done': []}


o, ep_len = env.reset(), 0
traj = _new_trajectory(o)

for t in range(total_steps):

    a = get_action(o, True)
    o, r, d, _ = env.step(a)

    # 'obs' holds one more entry than the other lists: it also contains the
    # observation the episode started from.
    traj['act'].append(a)
    traj['rew'].append(r)
    traj['done'].append(d)
    traj['obs'].append(o)

    ep_len += 1

    if d or (ep_len == max_ep_len):
        print(ep_len)
        demo_list.append(traj)

        o, ep_len = env.reset(), 0
        traj = _new_trajectory(o)

# Keep the trailing, unfinished trajectory if the step budget ran out mid-episode.
if traj['act']:
    demo_list.append(traj)

In [None]:
# Write the collected demonstrations next to the notebook, named after the env.
demo_path = 'exp_' + env_name + '.npy'
np.save(demo_path, demo_list)