In [None]:
import os.path, gym
import numpy as np
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import roboschool
import pdb



class ZooPolicyTensorflow(object):
    """Fixed feed-forward policy whose weights come from a weights file.

    The graph is a 44 -> 256 -> 128 -> 17 MLP: ReLU after the two hidden
    layers, linear output. Building the object also runs the weight
    assignment ops, so a default TensorFlow session must be active at
    construction time (``load_weights`` uses ``tf.get_default_session()``).
    """

    # File executed to obtain the ``weights_*`` arrays when the caller does
    # not pass them in.
    WEIGHTS_FILE = "RoboschoolHumanoidFlagrun_v1_2017jul.weights"

    def __init__(self, name, ob_space, ac_space, take_weights_here=None):
        """Build the graph under variable scope ``name`` and load the weights.

        Args:
            name: variable scope that holds the network variables.
            ob_space: accepted for interface compatibility; not used here.
            ac_space: accepted for interface compatibility; not used here.
            take_weights_here: optional mapping that already contains the
                ``weights_dense1_w`` ... ``weights_final_b`` arrays. When
                omitted, ``WEIGHTS_FILE`` is executed to populate it.
        """
        self.name = name

        with tf.variable_scope(name):
            obs_tuple = [
                tf.placeholder(tf.float32,         (None, 1), name="obs0"),
                tf.placeholder(tf.float32,        (None, 44), name="obs1"),
            ]
            self.obs_tuple = obs_tuple

            # Only obs1 reaches the network; obs0 is declared (and fed by
            # act) but takes no part in the computation.
            actions_input = [obs_tuple[1]]

            x = tf.concat( actions_input, axis=1 )
            dense1_w = tf.get_variable("dense1_w", [44,256])
            dense1_b = tf.get_variable("dense1_b", [256])
            x = tf.nn.relu(tf.matmul(x, dense1_w) + dense1_b)
            dense2_w = tf.get_variable("dense2_w", [256,128])
            dense2_b = tf.get_variable("dense2_b", [128])
            x = tf.nn.relu(tf.matmul(x, dense2_w) + dense2_b)
            final_w = tf.get_variable("final_w", [128,17])
            final_b = tf.get_variable("final_b", [17])
            self.pi = tf.matmul(x, final_w) + final_b

        if take_weights_here is None:
            take_weights_here = {}
            # Close the file deterministically instead of leaking the handle.
            with open(self.WEIGHTS_FILE) as weights_file:
                weights_source = weights_file.read()
            # SECURITY: the weights file is executed as Python code. Only use
            # a weights file from a trusted source.
            exec(weights_source, take_weights_here)

        # (variable, value) pairs copied into the graph by load_weights().
        self.assigns = [
            (  dense1_w, take_weights_here["weights_dense1_w"]),
            (  dense1_b, take_weights_here["weights_dense1_b"]),
            (  dense2_w, take_weights_here["weights_dense2_w"]),
            (  dense2_b, take_weights_here["weights_dense2_b"]),
            (   final_w, take_weights_here["weights_final_w"]),
            (   final_b, take_weights_here["weights_final_b"]),
        ]

        # One placeholder + assign op per variable, so the values are fed at
        # run time rather than embedded in the graph.
        self.weight_assignment_placeholders = []
        self.weight_assignment_nodes = []
        for var, w in self.assigns:
            ph = tf.placeholder(tf.float32, w.shape)
            self.weight_assignment_placeholders.append(ph)
            self.weight_assignment_nodes.append( tf.assign(var, ph) )

        self.load_weights()

    def load_weights(self):
        """Run every assign op in the default session, feeding the stored values."""
        feed_dict = {
            ph: w
            for (_, w), ph in zip(self.assigns, self.weight_assignment_placeholders)
        }
        tf.get_default_session().run(self.weight_assignment_nodes, feed_dict=feed_dict)

    def act(self, obs_data, cx):
        """Return the network output for a single observation.

        Args:
            obs_data: one observation vector (fed to the 44-wide ``obs1``).
            cx: accepted for interface compatibility; not used here.

        Returns:
            The first (only) row of the batched network output.
        """
        feeds = [np.ones((1,)), np.asarray(obs_data)]
        # Because we need batch dimension, data[None] changes shape from [A] to [1,A]
        a = tf.get_default_session().run(
            self.pi,
            feed_dict={ph: data[None] for ph, data in zip(self.obs_tuple, feeds)})
        return a[0]  # return first in batch




def apply_clipped_optimizer(opt_fcn,
                            loss,
                            clip_norm=.1,
                            clip_single=.03,
                            clip_global_norm=False,
                            var_list=None):
    """Build a training op that applies clipped gradients of ``loss``.

    Args:
        opt_fcn: optimizer exposing ``compute_gradients`` / ``apply_gradients``.
        loss: tensor to differentiate.
        clip_norm: norm bound (global bound when ``clip_global_norm`` is set,
            per-gradient bound otherwise).
        clip_single: element-wise bound applied before the per-gradient norm
            clip; only used when ``clip_global_norm`` is false.
        clip_global_norm: choose joint global-norm clipping instead of the
            element-wise + per-gradient clipping.
        var_list: optional variables to differentiate with respect to; when
            None the optimizer's own default is used.

    Returns:
        ``(train_op, grad_norm_total)`` where ``grad_norm_total`` is the
        norm over all non-None gradients, measured before clipping.
    """
    grad_kwargs = {} if var_list is None else {'var_list': var_list}
    grads_and_vars = opt_fcn.compute_gradients(loss, **grad_kwargs)

    # Variables the loss does not depend on come back with a None gradient.
    live_pairs = [(g, v) for g, v in grads_and_vars if g is not None]

    if clip_global_norm:
        grads, variables = zip(*live_pairs)
        clipped, grad_norm_total = tf.clip_by_global_norm(list(grads), clip_norm)
        capped_gvs = list(zip(clipped, variables))
    else:
        squared_sums = [tf.reduce_sum(tf.square(g)) for g, _ in live_pairs]
        grad_norm_total = tf.sqrt(tf.reduce_sum(squared_sums))
        capped_gvs = []
        for g, v in live_pairs:
            bounded = tf.clip_by_value(g, -1 * clip_single, clip_single)
            capped_gvs.append((tf.clip_by_norm(bounded, clip_norm), v))

    optimizer = opt_fcn.apply_gradients(capped_gvs)
    return optimizer, grad_norm_total

def MLP(x, lshapes, output_units, name_fcn):
    """Stack of leaky-ReLU dense layers with residual adds and input re-injection.

    A first dense layer of width ``lshapes[0]`` is followed by one residual
    layer per entry of ``lshapes``: each adds a dense projection to the
    previous activation. From the second residual layer on (and for the
    output layer) the projection input is the previous activation
    concatenated with the raw input ``x``.

    Args:
        x: input tensor.
        lshapes: layer widths; must be non-empty. The residual addition
            implies every width has to be compatible with ``lshapes[0]``.
        output_units: width of the final linear layer; a width of 1 is
            squeezed away from the last axis.
        name_fcn: zero-argument callable returning a fresh layer name on
            each call.

    Returns:
        ``(h, output)``: the list of activations (starting with ``x``) and
        the output tensor.
    """
    stem = tf.layers.dense(x, lshapes[0], name=name_fcn())
    h = [x, tf.nn.leaky_relu(stem)]

    # The first residual layer projects the stem activation alone; later
    # ones see the activation with x concatenated back on.
    h2 = h[-1]
    for size in lshapes:
        branch = tf.layers.dense(h2, size, name=name_fcn())
        h.append(tf.nn.leaky_relu(h[-1] + branch))
        h2 = tf.concat((h[-1], x), -1)

    output = tf.layers.dense(h2, output_units, name=name_fcn())
    if output_units == 1:
        output = tf.squeeze(output, -1)
    return h, output
# Number of most-recent frames stacked into a single policy input.
NUM_HISTORY = 6
# Per-frame widths. 44 matches the `obs1` placeholder of ZooPolicyTensorflow
# and 17 matches the action arrays used throughout this notebook
# (NOTE(review): presumably the env's observation / action sizes — confirm).
OBS_UNITS = 44
ACTION_UNITS = 17
# Width of the flattened frame history fed to PolicyLearner.obs.
INPUT_UNITS = (OBS_UNITS + ACTION_UNITS) * NUM_HISTORY
class PolicyLearner(object):
    """Actor, advantage-critic and state-value networks with clipped Adam updates.

    Everything is built in the default TF1 graph and the object owns its own
    ``tf.InteractiveSession`` (``self.sess``). Three MLPs are created; their
    layers are named ``a_<k>``, ``c_<k>`` and ``v_<k>`` so that each
    optimizer can pick out its own variables by name.
    """

    def __init__(self, ob_space, ac_space, take_weights_here=None,
                 lshapes = [256] * 8, config = None, lshapes_small = [128] * 8):
        """Create placeholders, networks, losses and training ops.

        Args:
            ob_space: accepted for interface compatibility; not used here.
            ac_space: accepted for interface compatibility; not used here.
            take_weights_here: accepted for interface compatibility; not
                used here (no weights are staged for ``load_weights``).
            lshapes: layer widths of the actor MLP (read, never mutated).
            config: optional ``tf.ConfigProto`` passed to the session.
            lshapes_small: layer widths of the value and critic MLPs
                (read, never mutated).
        """
        # Per-network layer counters behind a_name / c_name / v_name.
        self.a_idx = 0
        self.c_idx = 0
        self.v_idx = 0

        # Nothing is staged for load_weights() by default; keeping these as
        # empty lists makes load_weights() a safe no-op instead of an
        # AttributeError.
        self.assigns = []
        self.weight_assignment_placeholders = []
        self.weight_assignment_nodes = []

        self.sess = tf.InteractiveSession(config=config)
        self.obs = tf.placeholder(tf.float32, (None, None, INPUT_UNITS))
        self.training_expert = tf.placeholder(tf.float32, (None, None, 17))
        self.metaobs = tf.placeholder(tf.float32, (None, None, 1))
        self.returns = tf.placeholder(tf.float32, (None, None))
        self.returnsdecayed = tf.placeholder(tf.float32, (None, None))
        self.mask = tf.placeholder(tf.float32, (None, None))
        # `(None)` is plain None, i.e. the shape is left unspecified.
        self.lr = tf.placeholder_with_default(1e-3, (None))

        self.actions_input = tf.concat((self.obs, self.metaobs), axis=-1)

        # Actor: raw output squashed by 3 * tanh(x / 20).
        self.h, pi = MLP(self.actions_input, lshapes, 17, self.a_name)
        self.pi = tf.nn.tanh(pi/20) * 3

        self.hs, self.state_value_estimate = MLP(
            self.actions_input, lshapes_small, 1, self.v_name)

        # The critic sees the network input together with the action.
        self.critic_input = tf.concat((self.actions_input, self.pi), -1)

        # Regression target for the critic: value of the next step plus the
        # fed returns, minus the value of the current step.
        self.advantage = ((
            self.state_value_estimate[:,1:] + self.returns) -
            self.state_value_estimate[:,:-1])

        self.hae, self.advantage_estimator = MLP(
            self.critic_input, lshapes_small, 1, self.c_name)

        # Variables are split per network by the layer-name prefixes handed
        # out by a_name / c_name / v_name (substring match on the full name).
        self.t_vars = tf.trainable_variables()
        self.c_vars = [var for var in self.t_vars if 'c_' in var.name]
        self.a_vars = [var for var in self.t_vars if 'a_' in var.name]
        self.v_vars = [var for var in self.t_vars if 'v_' in var.name]

        # Small L2-style penalty: mean of per-variable mean squares, * 1e-5.
        self.creg, self.areg, self.vreg = [
            tf.reduce_mean([tf.reduce_mean(tf.square(v)) for v in optvars]) * 1e-5
            for optvars in [self.c_vars, self.a_vars, self.v_vars]]

        self.v_loss = tf.reduce_mean(tf.square(
            self.returnsdecayed - self.state_value_estimate[:,1:]) * self.mask) + self.vreg
        self.critic_loss = tf.reduce_mean(tf.square(
            self.advantage_estimator[:,:-1] - self.advantage) * self.mask) + self.creg

        # Actor: raise the critic's estimate, with a penalty on the size of
        # the pre-squash output.
        self.actor_loss = -tf.reduce_mean(
            self.advantage_estimator[:,:-1] * self.mask) + tf.reduce_mean(
            tf.square(pi[:,:-1,:]) * tf.expand_dims(self.mask, -1))/100 + self.areg
        self.total_loss = self.critic_loss + self.actor_loss/10
        self.critic_opt = tf.train.AdamOptimizer(self.lr)
        self.value_opt = tf.train.AdamOptimizer(self.lr)
        self.actor_opt = tf.train.AdamOptimizer(self.lr)
        self.copt, self.c_norm = apply_clipped_optimizer(
            self.critic_opt, self.critic_loss, var_list = self.c_vars)
        self.vopt, self.v_norm = apply_clipped_optimizer(
            self.value_opt, self.v_loss, var_list = self.v_vars)
        self.aopt, self.a_norm = apply_clipped_optimizer(
            self.actor_opt, self.actor_loss, var_list = self.a_vars)

    def a_name(self):
        """Return the next actor layer name: 'a_1', 'a_2', ..."""
        self.a_idx += 1
        return 'a_' + str(self.a_idx)

    def c_name(self):
        """Return the next critic layer name: 'c_1', 'c_2', ..."""
        self.c_idx += 1
        return 'c_' + str(self.c_idx)

    def v_name(self):
        """Return the next value-network layer name: 'v_1', 'v_2', ..."""
        self.v_idx += 1
        return 'v_' + str(self.v_idx)

    def load_weights(self):
        """Run the staged weight-assignment ops, if any.

        ``self.assigns`` holds ``(variable, value)`` pairs aligned with
        ``self.weight_assignment_placeholders`` and
        ``self.weight_assignment_nodes``. With nothing staged (the default)
        this returns without touching the session.
        """
        if not self.weight_assignment_nodes:
            return
        feed_dict = {
            ph: w
            for (_, w), ph in zip(self.assigns, self.weight_assignment_placeholders)
        }
        self.sess.run(self.weight_assignment_nodes, feed_dict=feed_dict)

    def act(self, obs, metaobs, cx):
        """Return the actor output for one flattened frame history.

        Args:
            obs: values reshaped to ``(1, 1, INPUT_UNITS)``.
            metaobs: scalar reshaped to ``(1, 1, 1)``.
            cx: accepted for interface compatibility; not used here.

        Returns:
            The action vector for the single batch entry and time step.
        """
        a = self.sess.run(
            self.pi, feed_dict={
                self.obs:np.reshape(obs, (1, 1, INPUT_UNITS)),
                self.metaobs:np.reshape(metaobs, (1, 1, 1))
            })
        return a[0][0]  # return first in batch

    
    

import pickle

# Session configuration: GPU device count forced to 0; thread-pool sizes are
# passed as 0.
config = tf.ConfigProto(
    inter_op_parallelism_threads=0,
    intra_op_parallelism_threads=0,
    device_count = { "GPU": 0 } )
tf.reset_default_graph()

env = gym.make("RoboschoolHumanoidFlagrun-v1")
pi = PolicyLearner(env.observation_space, env.action_space, config = config)

sess = pi.sess
# Alias kept because later cells refer to the learner's tensors as `self.<name>`.
self = pi
sess.run(tf.global_variables_initializer())
#trainer = ZooPolicyTensorflow("mymodel1", env.observation_space, env.action_space)
saver = tf.train.Saver()

# Empty episode-history buffers, indexed (episode, step[, feature]).
# `th` and `maskh` are created here too: the training cell pickles all seven
# buffers at episode 0, before it assigns any of them itself.
ah, th, sh = [np.zeros((0, 0, width)) for width in [17, 17, INPUT_UNITS]]
mh, rh, rdecayedh, maskh = [np.zeros((0, 0)) for _ in range(4)]
globalframes = []
localframes = []
ep = 0
trained = 1
obj_fname = 'saveobjs.pkl'

In [None]:
# Checkpoint path handed to saver.restore / saver.save by the cells below.
tffile = "tmp/unguided_trained.ckpt"

In [None]:
# env.flag_timeout

# blah = env.reset()

# env.step(np.zeros_like(a))
# env.render("human")
# print(env.flag_timeout)

# while env.flag_timeout > 1:
#     env.step(np.zeros_like(a))
#     env.render("human")

# env.flag_timeout

In [None]:
# Set to True to roll out the current policy with rendering and no training.
inference = False
if inference:
    try:
        saver.restore(sess, tffile)
        with open(obj_fname, "rb") as f:
            # NOTE(security): pickle.load can execute arbitrary code; only
            # load history files written by this notebook.
            # The save cells dump eight items, so eight are unpacked here.
            ah, th, sh, mh, rh, rdecayedh, maskh, ep = pickle.load(f)
        print('restored from save file')
    except Exception as exc:
        # Best-effort restore: carry on with the in-memory state, but show
        # the reason instead of hiding it (Ctrl-C still propagates).
        print('no save file detected')
        print('restore skipped because:', repr(exc))
    env = gym.make("RoboschoolHumanoidFlagrun-v1")

    for ep in range(ep, 10000000):
        frame = 0
        score = 0
        an, tn, sn = [np.zeros((0, i)) for i in [17, 17, INPUT_UNITS]]
        mn, rn, maskn = [], [], []
        obs = env.reset()
        # No action has been taken yet, so the 17-wide tail of the first
        # history row is zero.
        obs = np.concatenate((obs, np.zeros((17))))
        obs_mat = np.concatenate((
            obs[None,:],np.zeros((NUM_HISTORY-1, 44 + 17))), 0)
        metaobs = .000
        mn.append(metaobs)
        sn = np.concatenate((sn, obs_mat.reshape(1, -1)), 0)
        while 1:
            # always act on your own
            a = pi.act(obs_mat.flatten(), metaobs, env)
            a = a + np.random.randn(*a.shape)/np.sqrt(ep + 10)
            # No expert policy is loaded (the `trainer` lines are disabled),
            # so the executed action doubles as the expert record.
            t = a
            an = np.concatenate((an, a[None,:]), 0)
            tn = np.concatenate((tn, t[None,:]), 0)

            obs, r, done, _ = env.step(a)
            r = r + 2
            # History rows are 44 + 17 wide but the env observation is not,
            # so it has to be padded before stacking.
            # NOTE(review): the executed action is used as the 17-wide tail;
            # confirm this matches how the policy was trained.
            obs = np.concatenate((obs, a))
            obs_mat = np.concatenate((obs[None,:], obs_mat[:-1,:]/1.5), 0)
            metaobs = metaobs + .001
            mn.append(metaobs)
            rn.append(r)
            sn = np.concatenate((sn, obs_mat.reshape(1, -1)), 0)
            score += r
            frame += 1
            still_open = env.render("human")
            if done:
                if ep % 2000 == 0:
                    print('score', score, ' frames', frame)
                break
            # Only an explicit False aborts; other return values are ignored.
            if still_open==False:
                raise RuntimeError('render window was closed; stopping inference')

In [None]:

# save_path = saver.save(sess, "tmp/pep_trained.ckpt")
# print('saved at epoch', ep)
# with open(obj_fname,"wb") as f:
#     pickle.dump(
#         [ah[-100:], th[-100:], sh[-100:], mh[-100:],
#          rh[-100:], rdecayedh[-100:], maskh[-100:], ep
#         ], f)

In [None]:
# Truthy -> the training cell first tries to restore the checkpoint and the
# pickled history, and is allowed to write periodic saves.
trained=1

In [None]:
if trained:
    try:
        saver.restore(sess, tffile)
        with open(obj_fname, "rb") as f:
            # NOTE(security): pickle.load can execute arbitrary code; only
            # load history files written by this notebook.
            ah, th, sh, mh, rh, rdecayedh, maskh, ep = pickle.load(f)
        print('restored from save file')
    except Exception as exc:
        # Best-effort restore: carry on with the in-memory state, but show
        # the reason instead of hiding it (Ctrl-C still propagates).
        print('no save file detected')
        print('restore skipped because:', repr(exc))

MAX_SEQ_LEN = 5000
# Width of one row of the frame history (INPUT_UNITS is NUM_HISTORY rows).
FRAME_WIDTH = INPUT_UNITS // NUM_HISTORY
# Subtracted from the discounted returns of the last five steps of an
# episode, in time order (the final step gets the largest penalty).
TERMINAL_PENALTIES = np.array([10., 20., 40., 70., 100.])


def make_frame(obs, action=None):
    """Pad an observation up to FRAME_WIDTH so it can be stacked into obs_mat.

    The missing tail is filled with ``action`` when it has exactly the
    missing width, otherwise with zeros. An observation that is already
    FRAME_WIDTH wide is returned unchanged.
    NOTE(review): using the executed action as the tail is inferred from the
    (44 + 17)-wide history rows — confirm against how the policy was trained.
    """
    obs = np.asarray(obs)
    missing = FRAME_WIDTH - obs.shape[-1]
    if missing <= 0:
        return obs
    if action is None or np.shape(action)[-1] != missing:
        action = np.zeros(missing)
    return np.concatenate((obs, action))


def explore(a, ep):
    """Randomly perturb action ``a``; each perturbation is tried with p = 0.1.

    The alternatives are tried in order and the first one that fires wins:
    Gaussian noise shrinking with ``ep``, a one-sided +/-2 kick on ~30% of
    the joints, a +/-1 kick on every joint, scaling by 0.7, scaling by 1.5.
    """
    if np.random.rand() > .9:
        return a + np.random.randn(*a.shape)/np.sqrt(np.sqrt(ep + 2))*4
    if np.random.rand() > .9:
        impact = np.random.binomial(size=a.shape, n=1, p= 0.3) * 2
        return a + impact if np.random.rand() > .5 else a - impact
    if np.random.rand() > .9:
        impact = np.random.binomial(size=a.shape, n=1, p= 0.5) * 2 - 1
        return a + impact
    if np.random.rand() > .9:
        return a * .7
    if np.random.rand() > .9:
        return a * 1.5
    return a


def get_updated_h(h, n, third_dim=None):
    """Append episode ``n`` to history ``h``, zero-padding the shorter one.

    ``h`` is (episodes, steps[, features]) and ``n`` is (steps[, features]).
    Whichever has fewer steps is padded with zeros along the step axis.
    ``third_dim`` is accepted for backward compatibility and ignored; the
    trailing shape is taken from the arrays themselves.
    """
    hshape = h.shape[1]
    nshape = n.shape[0]
    if hshape > nshape:
        n = np.concatenate((n, np.zeros((hshape - nshape,) + n.shape[1:])), 0)
    elif nshape > hshape:
        h = np.concatenate(
            (h, np.zeros((h.shape[0], nshape - hshape) + h.shape[2:])), 1)
    return np.concatenate((h, np.expand_dims(n, 0)), 0)


# The history buffers may be missing or empty on a fresh kernel without a
# save file; in that case the first finished episode (re)creates them.
try:
    history_ready = all(
        len(h) > 0 for h in (ah, th, sh, mh, rh, rdecayedh, maskh))
except NameError:
    history_ready = False

for ep in range(ep, 10000000):
    if ep % 100 == 0 and trained and history_ready:
        save_path = saver.save(sess, tffile)
        print('saved at epoch', ep)
        with open(obj_fname,"wb") as f:
            pickle.dump(
                [ah[-1000:], th[-1000:], sh[-1000:], mh[-1000:],
                 rh[-1000:], rdecayedh[-1000:], maskh[-1000:], ep
                ], f)
    trained = 1

    # ---- roll out one episode -------------------------------------------
    # Per-step records go into lists and are converted once at the end
    # (instead of re-allocating arrays with np.concatenate on every step).
    actions, frames, mn, rn = [], [], [], []
    frame = 0
    score = 0
    obs = env.reset()
    obs_mat = np.concatenate((
        make_frame(obs)[None,:],
        np.zeros((NUM_HISTORY-1, FRAME_WIDTH))), 0)
    metaobs = .000
    mn.append(metaobs)
    frames.append(obs_mat.reshape(-1))
    while 1:
        # always act on your own
        a = explore(pi.act(obs_mat.flatten(), metaobs, env), ep)
        actions.append(a)

        obs, r, done, _ = env.step(a)
        r = r + 2
        # Newest frame on top; older frames are attenuated by 1.3 per step.
        obs_mat = np.concatenate((
            make_frame(obs, a)[None,:], obs_mat[:-1,:]/1.3), 0)
        metaobs = metaobs + .001
        mn.append(metaobs)
        rn.append(r)
        frames.append(obs_mat.reshape(-1))
        score += r
        frame += 1
        still_open = env.render("human")
        if done:
            if ep % MAX_SEQ_LEN == 0:
                print('score', score, ' frames', frame)
            break
        # Only an explicit False aborts; other return values are ignored.
        if still_open==False:
            raise RuntimeError('render window was closed; stopping training')
    localframes.append(frame)

    an = np.array(actions, dtype=np.float64)
    # No expert policy is loaded (the `trainer` lines are disabled), so the
    # executed actions double as the expert record.
    tn = an.copy()
    sn = np.array(frames, dtype=np.float64)
    mn = np.array(mn)
    rn = np.array(rn)

    # ---- reward shaping --------------------------------------------------
    # A total of 100 is subtracted across the episode, weighted by step
    # index squared (so mostly from the final steps).
    subtract_fail = np.power(np.arange(len(rn)), 2)
    weight_total = subtract_fail.sum()
    if weight_total > 0:  # a one-step episode has all-zero weights
        subtract_fail = subtract_fail / weight_total
    rn = 1 + rn - subtract_fail * 100

    # Discounted return (factor 0.9) from every step to the episode end.
    rewards = [0]
    for ir in rn[::-1]:
        rewards.append(rewards[-1] * .9 + ir)
    rdecayedn = np.array(rewards, dtype=np.float64)[:0:-1].copy()
    # Extra penalty on the last steps; shortened for episodes under 5 steps.
    tail = min(len(rdecayedn), len(TERMINAL_PENALTIES))
    rdecayedn[-tail:] -= TERMINAL_PENALTIES[-tail:]
    maskn = np.ones_like(rn)

    # ---- append the episode to the history buffers -----------------------
    if ep == 0 or not history_ready:
        ah, th, sh, mh, rh, rdecayedh, maskh = [
            np.expand_dims(v, 0) for v in [an, tn, sn, mn, rn,rdecayedn, maskn]]
        history_ready = True
    else:
        ah, th, sh, mh, rh, rdecayedh, maskh = [
            get_updated_h(h, n) for h, n in zip(
                [ah, th, sh, mh, rh, rdecayedh, maskh],
                [an, tn, sn, mn, rn, rdecayedn, maskn])]

    # ---- two update rounds per episode once ep > 10 ----------------------
    if ep > 10:
        ah, th, sh, mh, rh,rdecayedh, maskh = [
            v[-400:] for v in [ah, th, sh, mh, rh,rdecayedh, maskh]]
        globalframes.append(np.mean(localframes))
        localframes = []
        batch_size = min(16, ep)
        num_hist = ah.shape[0]
        for itr in range(2):
            if num_hist >  batch_size:
                # The two newest episodes are always in the batch; the rest
                # are drawn without replacement, weighted by sqrt(index)
                # towards recent episodes.
                forced_hist = 2
                probability = np.sqrt(np.arange(num_hist - forced_hist))
                probability = probability / probability.sum()
                samples = np.concatenate((
                    np.random.choice(
                        num_hist - forced_hist, batch_size - forced_hist,
                        replace=False, p=probability),
                    np.arange(
                        num_hist - forced_hist, num_hist)))
            else:
                samples = np.random.choice(num_hist, num_hist, replace=False)
            actions, training_expert, states, meta, returns, returnsdecayed, mask = [
                v[samples] for v in [ah, th, sh, mh, rh,rdecayedh, maskh]]
            feed_dict={
                        pi.obs:states,
                        pi.metaobs:meta[:,:,None],
                        pi.returns:returns,
                        pi.returnsdecayed:returnsdecayed,
                        pi.training_expert:training_expert,
                        pi.mask:mask}
            # Actor step: the policy output is computed by the graph.
            _, aloss = sess.run(
                [pi.aopt, pi.actor_loss],
                feed_dict = feed_dict
                    )
            # Critic / value step: the recorded actions are fed in place of
            # the policy output, and every sequence drops its last step.
            feed_dict[pi.pi] = actions
            feed_dict[pi.obs] = states[:,:-1,:]
            feed_dict[pi.metaobs] = meta[:,:-1,None]
            feed_dict[pi.returns] = returns[:,:-1]
            feed_dict[pi.returnsdecayed] = returnsdecayed[:,:-1]
            feed_dict[pi.mask] = mask[:,:-1]
            _, _, closs = sess.run(
                [pi.copt, pi.vopt, pi.critic_loss],
                    feed_dict=feed_dict)
        if ep % 10 == 0:
            print('aloss', aloss, 'closs', closs, ' ep, ', ep)
            print('abs action',np.abs(ah)[-1,0,:].shape, np.abs(ah)[-1,0,:].mean())
            print(globalframes[-20:])
        


In [None]:
# sess.run(self.state_value_estimate,feed_dict)[1][np.where(mask[1])].shape

# sess.run(self.state_value_estimate,feed_dict)[1][np.where(mask[1])]

In [24]:

# Manual save: write the model checkpoint, then pickle the newest (up to)
# 1000 episodes of every history buffer followed by the episode counter.
save_path = saver.save(sess, tffile)
print('saved at epoch', ep)
with open(obj_fname, "wb") as f:
    pickle.dump(
        [buf[-1000:] for buf in (ah, th, sh, mh, rh, rdecayedh, maskh)] + [ep],
        f)
trained = 1

saved at epoch 801


In [None]:
# Peek at the action history: first five rows of the second-to-last stored
# episode, rounded to two decimals.
ah[-2][:5].round(2)