In [1]:
import os.path, gym
import numpy as np
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import roboschool
import pdb

def apply_clipped_optimizer(opt_fcn,
                            loss,
                            clip_norm=.1,
                            clip_single=.03,
                            clip_global_norm=False,
                            var_list=None):
    """Build an update op that applies clipped gradients of ``loss``.

    Args:
        opt_fcn: optimizer object providing ``compute_gradients`` and
            ``apply_gradients``.
        loss: tensor whose gradients are computed.
        clip_norm: norm threshold. Passed to ``tf.clip_by_global_norm`` when
            ``clip_global_norm`` is true, otherwise to a per-gradient
            ``tf.clip_by_norm``.
        clip_single: element-wise bound; each gradient entry is clipped to
            ``[-clip_single, clip_single]`` (per-gradient mode only).
        clip_global_norm: selects joint clipping of all gradients instead of
            element-wise clipping followed by per-gradient norm clipping.
        var_list: optional variables to differentiate with respect to. When
            ``None`` the argument is not forwarded to ``compute_gradients``.

    Returns:
        ``(update_op, grad_norm_total)``: the op returned by
        ``apply_gradients`` and the norm of the gradients before clipping.
    """
    grad_kwargs = {} if var_list is None else {'var_list': var_list}
    gvs = opt_fcn.compute_gradients(loss, **grad_kwargs)

    # Variables that received no gradient are dropped in both modes.
    usable = [(grad, var) for grad, var in gvs if grad is not None]

    if clip_global_norm:
        gs, vs = zip(*usable)
        capped_gs, grad_norm_total = tf.clip_by_global_norm(list(gs), clip_norm)
        capped_gvs = list(zip(capped_gs, vs))
    else:
        # Norm over every gradient entry, measured before any clipping.
        squared_sums = [tf.reduce_sum(tf.square(grad)) for grad, _ in usable]
        grad_norm_total = tf.sqrt(tf.reduce_sum(squared_sums))

        # First bound each entry, then bound each gradient's own norm.
        value_clipped = [
            (tf.clip_by_value(grad, -1 * clip_single, clip_single), var)
            for grad, var in usable]
        capped_gvs = [
            (tf.clip_by_norm(grad, clip_norm), var)
            for grad, var in value_clipped]

    optimizer = opt_fcn.apply_gradients(capped_gvs)
    return optimizer, grad_norm_total

def MLP(x, lshapes, output_units, name_fcn):
    """Stack dense layers with additive skip connections on top of ``x``.

    One leaky-ReLU dense layer of width ``lshapes[0]`` comes first, followed by
    one block per entry of ``lshapes`` that adds a dense layer's output to its
    own input before the leaky ReLU, and finally a linear dense layer with
    ``output_units`` units.

    Args:
        x: input tensor.
        lshapes: sequence of layer widths.
        output_units: width of the final linear layer.
        name_fcn: zero-argument callable returning a fresh layer name; it is
            called once per dense layer, in construction order.

    Returns:
        ``(h, output)``: ``h`` is the list of activations beginning with ``x``;
        ``output`` is the final layer, with its last axis squeezed away when
        ``output_units == 1``.
    """
    def dense(inputs, units):
        return tf.layers.dense(inputs, units, name=name_fcn())

    h = [x, tf.nn.leaky_relu(dense(x, lshapes[0]))]
    for size in lshapes:
        previous = h[-1]
        h.append(tf.nn.leaky_relu(previous + dense(previous, size)))

    output = dense(h[-1], output_units)
    return h, (tf.squeeze(output, -1) if output_units == 1 else output)
# Number of observation frames stacked into one network input.
NUM_HISTORY = 6
# Flattened input width: 44 values per frame times NUM_HISTORY frames.
INPUT_UNITS = 44 * NUM_HISTORY
class PolicyLearner(object):
    """Actor plus two critic heads built as a TensorFlow 1.x graph.

    Networks (all built with ``MLP``):
      * actor, layers named ``a_<n>``: maps observations to a 17-dimensional
        action ``self.pi`` squashed to the open interval (-5, 5);
      * state-value estimate, layers named ``c_<n>``;
      * advantage estimator, layers named ``c_<n>``, fed the observations
        concatenated with ``self.pi``.

    Placeholders are shaped ``(batch, time, ...)``. ``self.copt`` updates the
    ``c_`` variables on ``self.critic_loss``; ``self.aopt`` updates the ``a_``
    variables on ``self.actor_loss``.
    """

    def __init__(self, ob_space, ac_space, take_weights_here=None, 
                 lshapes = [128]*4, config = None):
        """Create the session and build the graph.

        Args:
            ob_space: accepted for interface compatibility; not read here.
            ac_space: accepted for interface compatibility; not read here.
            take_weights_here: accepted for interface compatibility; not read
                here.
            lshapes: hidden layer widths forwarded to ``MLP`` (never mutated).
            config: optional ``tf.ConfigProto`` for the session.
        """
        # Counters behind a_name()/c_name(); they make every layer name unique.
        self.a_idx = 0
        self.c_idx = 0
        self.sess = tf.InteractiveSession(config=config)
        self.obs = tf.placeholder(tf.float32, (None, None, INPUT_UNITS))
        self.metaobs = tf.placeholder(tf.float32, (None, None, 1))
        self.returns = tf.placeholder(tf.float32, (None, None))
        self.mask = tf.placeholder(tf.float32, (None, None))
        self.lr = tf.placeholder_with_default(1e-5, (None))

        # Hooks read by load_weights(). Nothing in this class registers any
        # assignment, so they start empty instead of being left undefined.
        self.assigns = []
        self.weight_assignment_placeholders = []
        self.weight_assignment_nodes = []

        self.actions_input = tf.concat((self.obs, self.metaobs), axis=-1)
        
        # Actor: raw output `pi` is squashed so actions stay within (-5, 5).
        self.h, pi = MLP(self.actions_input, lshapes, 17, self.a_name)
        self.pi = tf.nn.tanh(pi/20) * 5

        self.hs, self.state_value_estimate = MLP(self.actions_input, lshapes, 1, self.c_name)
        
        self.critic_input = tf.concat((self.actions_input, self.pi), -1)
        
        # One-step target: V(t+1) + returns(t) - V(t), so it is one step
        # shorter along the time axis than the value estimate.
        self.advantage = ((
            self.state_value_estimate[:,1:] + self.returns) -
            self.state_value_estimate[:,:-1])
        
        self.hae, self.advantage_estimator = MLP(self.critic_input, lshapes, 1, self.c_name)
        
        # Split trainable variables by the layer-name prefix chosen above.
        self.t_vars = tf.trainable_variables()
        self.c_vars = [var for var in self.t_vars if 'c_' in var.name]
        self.a_vars = [var for var in self.t_vars if 'a_' in var.name]
        
        self.critic_loss = tf.reduce_mean(tf.square(
            self.advantage_estimator[:,:-1] - self.advantage) * self.mask)
        # Maximise the estimated advantage, with a small penalty on the
        # pre-squash actor output.
        self.actor_loss = -tf.reduce_mean(
            self.advantage_estimator[:,:-1] * self.mask) + tf.reduce_mean(
            tf.square(pi[:,:-1,:]) * tf.expand_dims(self.mask, -1))/100
        # Not consumed by either optimizer below.
        self.total_loss = self.critic_loss + self.actor_loss/10
        self.critic_opt = tf.train.AdamOptimizer(self.lr)
        self.actor_opt = tf.train.AdamOptimizer(self.lr)
        self.copt, self.c_norm = apply_clipped_optimizer(
            self.critic_opt, self.critic_loss, var_list = self.c_vars)
        self.aopt, self.a_norm = apply_clipped_optimizer(
            self.actor_opt, self.actor_loss, var_list = self.a_vars)

    def a_name(self):
        """Return the next unused actor layer name (``a_1``, ``a_2``, ...)."""
        self.a_idx += 1
        return 'a_' + str(self.a_idx)
    
    def c_name(self):
        """Return the next unused critic layer name (``c_1``, ``c_2``, ...)."""
        self.c_idx += 1
        return 'c_' + str(self.c_idx)
    
    def load_weights(self):
        """Run the registered weight-assignment ops.

        Each ``(var, w)`` pair in ``self.assigns`` is matched positionally with
        a placeholder in ``self.weight_assignment_placeholders``; ``w`` is fed
        to that placeholder while ``self.weight_assignment_nodes`` is run.

        Raises:
            RuntimeError: if no assignment nodes have been registered.
        """
        nodes = self.weight_assignment_nodes
        if nodes is None or (isinstance(nodes, (list, tuple)) and not nodes):
            raise RuntimeError(
                "load_weights() called but no weight assignment nodes are "
                "registered on this PolicyLearner")
        feed_dict = {}
        for (var, w), ph in zip(self.assigns, self.weight_assignment_placeholders):
            feed_dict[ph] = w
        self.sess.run(self.weight_assignment_nodes, feed_dict=feed_dict)

    def act(self, obs, metaobs, cx):
        """Return the policy action for the first observation supplied.

        Args:
            obs: flattened observation history, reshaped to
                ``(-1, 1, INPUT_UNITS)``.
            metaobs: extra scalar input, reshaped to ``(-1, 1, 1)``.
            cx: unused.

        Returns:
            The action at batch index 0, time index 0.
        """
        # The graph expects (batch, time, features), so a single observation
        # becomes a batch of one sequence of length one.
        a = self.sess.run(
            self.pi, feed_dict={self.obs:np.reshape(obs, (-1, 1, INPUT_UNITS)),
                                self.metaobs:np.reshape(metaobs, (-1, 1, 1))
            })
        return a[0][0]  # first batch entry, first time step

    
    

# Session configuration handed to PolicyLearner; device_count asks for zero
# GPU devices.
config = tf.ConfigProto(
    inter_op_parallelism_threads=0,
    intra_op_parallelism_threads=0,
    device_count = { "GPU": 0 } )
# Start from an empty default graph so re-running this cell does not pile up
# duplicate variables.
tf.reset_default_graph()

env = gym.make("RoboschoolHumanoidFlagrun-v1")
pi = PolicyLearner(env.observation_space, env.action_space, config = config)
sess = pi.sess
# Module-level alias: later cells address the learner's tensors as `self.<name>`.
self = pi
sess.run(tf.global_variables_initializer())
# Empty history buffers (actions, states, meta-observations, returns). The
# training loop replaces them with real arrays on its first episode.
ah, sh = [np.zeros((0, 0, i)) for i in [17, INPUT_UNITS]]
mh, rh = [np.zeros((0, 0)) for i in [1, 1]]
# globalframes: mean episode length per training round;
# localframes: episode lengths collected since the last training round.
globalframes = []
localframes = []

  return f(*args, **kwds)


In [2]:

def get_updated_h(h, n, third_dim):
    """Append one episode ``n`` to the history stack ``h``.

    ``h`` is indexed ``(episode, time[, feature])`` and ``n`` is
    ``(time[, feature])``. Whichever has the shorter time axis is padded with
    zeros so the two line up before ``n`` is added as a new last episode.

    Args:
        h: history array.
        n: new episode array.
        third_dim: truthy when the arrays carry a trailing feature axis.

    Returns:
        A new array with one more episode than ``h``.
    """
    hshape = h.shape[1]
    nshape = n.shape[0]
    if third_dim:
        if hshape > nshape:
            n = np.concatenate((n, np.zeros((hshape - nshape, n.shape[-1]))), 0)
        if nshape > hshape:
            h = np.concatenate((h, np.zeros((
                h.shape[0], nshape - hshape, h.shape[-1]))), 1)
    else:
        if hshape > nshape:
            n = np.concatenate((n, np.zeros((hshape - nshape))), 0)
        if nshape > hshape:
            h = np.concatenate((h, np.zeros((h.shape[0], nshape - hshape))), 1)
    return np.concatenate((h, np.expand_dims(n, 0)), 0)


for ep in range(1000):
    # ---- Roll out one episode -------------------------------------------
    # an: actions, sn: stacked observations, mn: meta-observations,
    # rn: rewards. sn and mn hold one more entry than an and rn because the
    # initial state is recorded before any action is taken.
    an, sn = [np.zeros((0, i)) for i in [17, INPUT_UNITS]]
    mn, rn, maskn = [], [], []
    frame = 0
    score = 0
    obs = env.reset()
    # Newest frame first; the remaining history slots start as zeros.
    obs_mat = np.concatenate((obs[None,:],np.zeros((NUM_HISTORY-1, 44))), 0)
    metaobs = .000
    mn.append(metaobs)
    sn = np.concatenate((sn, obs_mat.reshape(1, -1)), 0)
    while 1:
        a = pi.act(obs_mat.flatten(), metaobs, env)
        # Gaussian exploration noise with standard deviation 0.5.
        a = a + np.random.randn(*a.shape)/2
        an = np.concatenate((an, a[None,:]), 0)

        obs, r, done, _ = env.step(a)
        r = r + 2
        # Push the new frame on top, drop the oldest, and scale the older
        # frames down by 1.3 each time they age one step.
        obs_mat = np.concatenate((obs[None,:], obs_mat[:-1,:]/1.3), 0)
        metaobs = metaobs + .001
        mn.append(metaobs)
        rn.append(r)
        sn = np.concatenate((sn, obs_mat.reshape(1, -1)), 0)
        score += r
        frame += 1
        still_open = env.render("human")
        if done:
            if ep % 1000 == 0:
                print('score', score, ' frames', frame)
            break
        if still_open==False:
            # The render window was closed: stop the run with a clear error.
            raise RuntimeError("render window was closed; aborting training")

    # ---- Shape the episode's rewards ------------------------------------
    localframes.append(frame)
    rn = np.array(rn)
    # Spread a total penalty of 100 over the episode, weighted by t**1.2 so
    # most of it lands on the final steps before termination.
    subtract_fail = np.power(np.arange(len(rn)), 1.2)
    weight_total = subtract_fail.sum()
    if weight_total > 0:
        subtract_fail = subtract_fail / weight_total
    # else: a one-step episode has all-zero weights; dividing would turn the
    # returns into NaN, so the penalty is left at zero.
    rn = 3 + rn - subtract_fail * 100
    mn = np.array(mn)
    # Mask is 1 on real steps; padding added by get_updated_h is 0.
    maskn = np.ones_like(rn)

    # ---- Add the episode to the history buffers -------------------------
    if ep == 0:
        ah, sh, mh, rh, maskh = [np.expand_dims(v, 0) for v in [an, sn, mn, rn, maskn]]
    else:
        ah, sh = [get_updated_h(h, n, 1) for  h, n in zip([ah, sh], [an, sn])]
        mh, rh, maskh = [
            get_updated_h(h, n, 0) for h, n in zip([mh, rh, maskh], [mn, rn, maskn])]

    # ---- Train every second episode once a few episodes exist -----------
    if ep % 2 == 0 and ep > 10:
        ah, sh, mh, rh, maskh = [v[-100000:] for v in [ah, sh, mh, rh, maskh]]
        globalframes.append(np.mean(localframes))
        localframes = []
        print(globalframes[-20:])
        batch_size = 64
        if ep < batch_size:
            batch_size = ep
        num_hist = ah.shape[0]
        for itr in range(5):
            if num_hist >  batch_size:
                # Always include the most recent `forced_hist` episodes and
                # fill the rest of the batch from older ones.
                forced_hist = 10
                samples = np.concatenate((
                    np.random.choice(
                        num_hist - forced_hist, batch_size - forced_hist, replace=False),
                    np.arange(
                        num_hist - forced_hist, num_hist)))
            else:
                # Too little history for a forced tail: sample what exists.
                samples = np.random.choice(
                    num_hist, min(batch_size, num_hist), replace=False)
            actions, states, meta, returns, mask = [
                v[samples] for v in [ah, sh, mh, rh, maskh]]
            # Actor step: the policy's own actions flow into the advantage
            # estimator.
            feed_dict={
                        self.obs:states,
                        self.metaobs:meta[:,:,None],
                        self.returns:returns,
                        self.mask:mask}
            _, aloss = sess.run(
                [self.aopt, self.actor_loss],
                feed_dict = feed_dict
                    )
            # Critic step: override self.pi with the actions actually taken.
            # There is one action fewer than states, so every other input
            # loses its last time step to match.
            feed_dict[self.pi] = actions
            feed_dict[self.obs] = states[:,:-1,:]
            feed_dict[self.metaobs] = meta[:,:-1,None]
            feed_dict[self.returns] = returns[:,:-1]
            feed_dict[self.mask] = mask[:,:-1]
            _, closs = sess.run(
                [self.copt, self.critic_loss],
                    feed_dict=feed_dict)
        print('aloss', aloss, 'closs', closs)
        print('abs action',np.abs(ah)[-1,0,:].shape, np.abs(ah)[-1,0,:].mean())
        


score 32.995879736364486  frames 18
[19.307692307692307]
aloss -2.0195224 closs 22.383434
abs action (17,) 0.4686826697561865
[19.307692307692307, 19.5]
aloss -1.9347465 closs 21.221773
abs action (17,) 0.36874489033892965
[19.307692307692307, 19.5, 19.0]
aloss -1.8922404 closs 22.409315
abs action (17,) 0.3915043950649687
[19.307692307692307, 19.5, 19.0, 20.0]
aloss -1.8276502 closs 21.136396
abs action (17,) 0.3188281528690732
[19.307692307692307, 19.5, 19.0, 20.0, 19.0]
aloss -1.7287114 closs 19.90932
abs action (17,) 0.3587886738225638
[19.307692307692307, 19.5, 19.0, 20.0, 19.0, 18.5]
aloss -1.6484339 closs 19.784727
abs action (17,) 0.46917826190012923
[19.307692307692307, 19.5, 19.0, 20.0, 19.0, 18.5, 19.5]
aloss -1.5703189 closs 19.080061
abs action (17,) 0.4607102719414178
[19.307692307692307, 19.5, 19.0, 20.0, 19.0, 18.5, 19.5, 19.0]
aloss -1.5105034 closs 19.17028
abs action (17,) 0.41348595093902935


KeyboardInterrupt: 

In [None]:
rh[-1]

In [None]:
rh[0]

In [None]:
rh[-1]

In [None]:
ah[0]

In [None]:
ah[-1]

In [None]:

# Actor-only updates on the last sampled batch at a higher learning rate.
for itr in range(ep // 10):
    #samples = np.random.choice(num_hist, batch_size, replace=False)
    actions, states, meta, returns, mask = [v[samples] for v in [ah, sh, mh, rh, maskh]]
    feed_dict={
                self.obs:states,
                self.metaobs:meta[:,:,None],
                self.returns:returns,
                self.mask:mask}
    # The learning-rate override has to be in the dict that is actually fed.
    # It goes into a copy so `feed_dict` itself keeps the graph's default
    # rate for the cells that reuse it afterwards.
    actor_feed = dict(feed_dict)
    actor_feed[self.lr] = 1e-4
    _, aloss = sess.run(
        [self.aopt, self.actor_loss],
        feed_dict = actor_feed
            )
#     feed_dict[self.lr] = 1e-5
    # Leave `feed_dict` in critic form (taken actions fed for self.pi, last
    # time step dropped) for the inspection cells that follow.
    feed_dict[self.pi] = actions
    feed_dict[self.obs] = states[:,:-1,:]
    feed_dict[self.metaobs] = meta[:,:-1,None]
    feed_dict[self.returns] = returns[:,:-1]
    feed_dict[self.mask] = mask[:,:-1]
#     _, closs = sess.run(
#         [self.copt, self.critic_loss],
#             feed_dict=feed_dict)
    print(aloss)

In [None]:
actions.shape, states.shape

In [None]:
_, aloss = sess.run(
                [self.aopt, self.actor_loss],
                feed_dict = feed_dict
                    )

In [None]:
print('2',[v.shape for v in sess.run([self.advantage], feed_dict)])

In [None]:
np.abs(ah).mean(-1).mean(-1)

In [None]:
# Ten manual actor + critic updates on freshly sampled batches.
for itr in range(10):
    samples = np.random.choice(num_hist, min(batch_size, num_hist), replace=False)
    actions, states, meta, returns, mask = [
        v[samples] for v in [ah, sh, mh, rh, maskh]]
    # Both losses multiply by self.mask, so it has to be fed.
    feed_dict={
                self.obs:states,
                self.metaobs:meta[:,:,None],
                self.returns:returns,
                self.mask:mask}
    _, aloss = sess.run(
        [self.aopt, self.actor_loss],
        feed_dict = feed_dict
            )
    # Critic step: feed the actions actually taken for self.pi. There is one
    # action fewer than states, so every other input loses its last step.
    feed_dict[self.pi] = actions
    feed_dict[self.obs] = states[:,:-1,:]
    feed_dict[self.metaobs] = meta[:,:-1,None]
    feed_dict[self.returns] = returns[:,:-1]
    feed_dict[self.mask] = mask[:,:-1]
    _, closs = sess.run(
        [self.copt, self.critic_loss],
            feed_dict=feed_dict)
    print('aloss', aloss)
    print('closs', closs)
        


In [None]:
[
    v.shape for v in sess.run([self.state_value_estimate[:,:-1],
                               self.state_value_estimate[:,1:], self.returns], feed_dict)]

In [None]:
sess.run(tf.concat((self.actions_input, self.pi), -1), feed_dict).shape

In [None]:
[
    v.shape for v in sess.run([self.actions_input, self.pi], feed_dict)]

In [None]:
[
    v.shape for v in sess.run([self.advantage_estimator, self.advantage], feed_dict)]

In [None]:
#feed_dict[self.pi] = actions
_, closs = sess.run(
    [self.copt, self.critic_loss],
        feed_dict=feed_dict)


In [None]:
sess.run(self.critic_input, feed_dict).shape

In [None]:
sess.run(self.actions_input, feed_dict).shape

In [None]:
[v.shape for v in sess.run([self.advantage, self.advantage_estimator, self.mask], feed_dict)]

In [None]:
sess.run(self.advantage_estimator, feed_dict).shape, sess.run(self.mask, feed_dict).shape

In [None]:
np.array(rn)

In [None]:
obs_mat.flatten().shape

In [None]:
a

In [None]:
obs_mat.flatten().shape

In [None]:
# Regroup the sampled episodes by field: each of a, r, d, m, s becomes a list
# holding one entry per episode.
# NOTE(review): `history_sampled` is first assigned in a cell further down the
# notebook, so this cell depends on out-of-order execution.
a, r, d, m, s = [
    [h[v] for h in history_sampled]
    for v  in ['a','r', 'd', 'm', 's']]

batch = {}
# Work on the 'a' field in the scratch cells that follow.
vec = a

In [None]:
v = vec[0]
longest = max([len(v) for v in vec])

In [None]:
[np.array(v).shape, np.zeros((longest - len(v), len(v[0]))).shape]

In [None]:
[len(history_sampled[i]['s']) for i in range(len(history_sampled))]

In [None]:
np.array(v).shape

In [None]:
np.stack(
        ([np.stack(np.array(v), 0).shape, np.zeros((longest - len(v), len(v[-1]))).shape]), 0)


In [None]:

# Scratch attempt at padding every episode to `longest` and stacking them.
# NOTE(review): this version concatenates and stacks along axis 1, while the
# later cells and build_batch use axis 0; `vname` is also not assigned in any
# cell above this one.
batch[vname] = np.stack(
    ([np.concatenate([np.stack(np.array(v), 0), np.zeros((longest - len(v), len(v[-1])))], 1) for v in vec]), 1)


In [None]:
# Inline version of build_batch: regroup the sampled episodes by field, then
# zero-pad every episode to the longest one and stack into batch arrays.
a, r, d, m, s = [
    [h[v] for h in history_sampled]
    for v  in ['a','r', 'd', 'm', 's']]

batch = {}

# Fields whose per-step entries are vectors: pad with rows of zeros whose width
# is taken from the episode's last entry.
for vname, vec in zip(['a', 's'], [a, s]):
    longest = max([len(v) for v in vec])
    batch[vname] = np.stack(
        ([np.concatenate([np.stack(np.array(v), 0), np.zeros((longest - len(v), len(v[-1])))], 0) for v in vec]), 0)


# Fields with one value per step: pad with a flat run of zeros.
for vname, vec in zip(['r', 'd', 'm'], [r, d, m]):
    longest = max([len(v) for v in vec])
    batch[vname] = np.stack(
        ([np.concatenate([np.array(v), np.zeros((longest - len(v)))], 0) for v in vec]), 0)


In [None]:
[(i, b.shape) for i, b in batch.items()]

In [None]:
def build_batch(history_sampled):
    """Zero-pad a list of episodes to equal length and stack them per field.

    Args:
        history_sampled: sequence of episode dicts with keys ``'a'``, ``'r'``,
            ``'d'``, ``'m'`` and ``'s'``, each mapping to a per-step sequence.
            ``'a'`` and ``'s'`` hold one vector per step; ``'r'``, ``'d'`` and
            ``'m'`` hold one value per step.

    Returns:
        dict mapping each key to an array indexed ``(episode, step, ...)``.
        Episodes shorter than the longest one are padded with zeros at the
        end; an episode with no steps becomes all zeros.

    Raises:
        ValueError: if ``history_sampled`` is empty.
    """
    def _pad_and_stack(sequences):
        # Pad each episode along the step axis, then stack along a new
        # leading episode axis.
        arrays = [np.array(seq) for seq in sequences]
        longest = max([len(arr) for arr in arrays])
        # Per-step shape comes from the first episode that has any steps, so
        # an empty episode no longer needs a first element to measure.
        trailing = next((arr.shape[1:] for arr in arrays if len(arr)), ())
        padded = []
        for arr in arrays:
            if len(arr) == 0:
                arr = np.zeros((0,) + trailing)
            filler = np.zeros((longest - len(arr),) + trailing)
            padded.append(np.concatenate([arr, filler], 0))
        return np.stack(padded, 0)

    batch = {}
    for vname in ['a', 's', 'r', 'd', 'm']:
        batch[vname] = _pad_and_stack([h[vname] for h in history_sampled])
    return batch

In [None]:
history_sampled = [history[s] for s in states_sampled]

In [None]:
[[len(h[blah]) for h in history_sampled] for blah in ['a','r', 's', 'd', 'm']]

In [None]:
[[h[blah][0].shape for h in history_sampled] for blah in ['a', 's']]

In [None]:
# Inline copy of the body of build_batch, run at module level so the
# intermediate names (a, r, d, m, s, batch) can be inspected afterwards.
a, r, d, m, s = [
    [h[v] for h in history_sampled]
    for v  in ['a','r', 'd', 'm', 's']]

batch = {}

# Fields whose per-step entries are vectors: pad with rows of zeros whose width
# is taken from the episode's first entry.
for vname, vec in zip(['a', 's'], [a, s]):
    longest = max([len(v) for v in vec])
    batch[vname] = np.stack(
        ([np.concatenate([np.array(v), np.zeros((longest - len(v), len(v[0])))], 0) for v in vec]), 0)
    

# Fields with one value per step: pad with a flat run of zeros.
for vname, vec in zip(['r', 'd', 'm'], [r, d, m]):
    longest = max([len(v) for v in vec])
    batch[vname] = np.stack(
        ([np.concatenate([np.array(v), np.zeros((longest - len(v)))], 0) for v in vec]), 0)
    

In [None]:
[b.shape for b in batch.values()]

In [None]:
x[0][0].shape

In [None]:
[len(x) for x in x]

In [None]:
actions = 

In [None]:
history_sampled

In [None]:
num_hist = len(history)
states_sampled = np.random.choice(num_hist, batch_size, replace=False)

In [None]:
states_sampled

In [None]:
history

In [None]:
obs.shape

In [None]:
def train(pi, history, n_steps = 100, batch_size = 5):
    """Unfinished stub for a training routine.

    The definition was left without a body, which made the cell a syntax
    error. It now parses and fails loudly if called.

    Args:
        pi: policy object (unused).
        history: collected episodes (unused).
        n_steps: intended number of training steps (unused).
        batch_size: intended batch size (unused).

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("train() was left as an unfinished stub")

In [None]:
obs.shape, obs_mat.shape

In [None]:
[h['r'] for h in history]

In [None]:
a

In [None]:
history

In [None]:
obs.shape

In [None]:
a

In [None]:
a.shape