In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.patches as mpatches
import pickle
import gym
from sklearn.model_selection import train_test_split

In [2]:
def tf_reset():
    """Close any existing global session, clear the default graph, and open a new session.

    Looks up a module-level ``sess`` and tries to close it. If ``sess`` does
    not exist yet (first call) or closing it fails, the error is ignored so
    the reset can still proceed.

    Returns:
        A new ``tf.Session`` created after ``tf.reset_default_graph()``.
    """
    try:
        sess.close()
    except Exception:
        # Best-effort cleanup: covers NameError on the first call as well as
        # failures inside close(). Unlike a bare ``except:``, this lets
        # KeyboardInterrupt / SystemExit propagate to the user.
        pass
    tf.reset_default_graph()
    return tf.Session()

In [3]:
envname = 'Hopper-v2'
# Build the environment once just to read its space shapes, then release it.
# (Previously gym.make was called twice and neither instance was closed.)
_probe_env = gym.make(envname)
obsshape = _probe_env.observation_space.shape
actionshape = _probe_env.action_space.shape
_probe_env.close()

In [4]:
# Load the expert demonstrations recorded for `envname`.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
filename = 'expert_data/{}.pkl'.format(envname)
with open(filename, 'rb') as f:
    data = pickle.load(f)

# Observations are the network inputs; actions are reshaped to
# (number of observations,) + actionshape to serve as regression targets.
inputs = data['observations']
numobs = len(inputs)
outputs = data['actions'].reshape(numobs, *actionshape)
print(inputs.shape, outputs.shape, sep='\n')

(20000, 111)
(20000, 8)


In [7]:
# Reset the default graph and open a fresh session so this cell can be re-run
# without colliding with variables created by a previous execution.
sess = tf_reset()

def create_model(hidden_units=20):
    """Build a two-hidden-layer ReLU network mapping observations to actions.

    Reads the module-level ``obsshape`` and ``actionshape`` tuples to size the
    placeholders and the first/last layers.

    Args:
        hidden_units: Width of each of the two hidden layers (default 20).

    Returns:
        A tuple ``(input_ph, output_ph, output_pred)``: the observation
        placeholder, the target-action placeholder, and the network's
        predicted action tensor (linear output, no activation).
    """
    # create inputs
    input_ph = tf.placeholder(dtype=tf.float32, shape=(None,) + obsshape)
    output_ph = tf.placeholder(dtype=tf.float32, shape=(None,) + actionshape)

    # create variables
    xavier = tf.contrib.layers.xavier_initializer
    W0 = tf.get_variable(name='W0', shape=obsshape + (hidden_units,), initializer=xavier())
    W1 = tf.get_variable(name='W1', shape=[hidden_units, hidden_units], initializer=xavier())
    W2 = tf.get_variable(name='W2', shape=(hidden_units,) + actionshape, initializer=xavier())

    b0 = tf.get_variable(name='b0', shape=[hidden_units], initializer=tf.constant_initializer(0.))
    b1 = tf.get_variable(name='b1', shape=[hidden_units], initializer=tf.constant_initializer(0.))
    # One bias per action dimension. The previous shape of [1] broadcast a
    # single shared scalar across every output, so the output layer could not
    # learn independent offsets. Note: checkpoints saved with the old [1]
    # shape will not restore into this variable.
    b2 = tf.get_variable(name='b2', shape=actionshape, initializer=tf.constant_initializer(0.))

    weights = [W0, W1, W2]
    biases = [b0, b1, b2]
    activations = [tf.nn.relu, tf.nn.relu, None]

    # create computation graph
    layer = input_ph
    for W, b, activation in zip(weights, biases, activations):
        layer = tf.matmul(layer, W) + b
        if activation is not None:
            layer = activation(layer)
    output_pred = layer

    return input_ph, output_ph, output_pred
    
# Training configuration. Previously these were inline magic numbers and no
# random seed was set, so the weight initialisation, the train/test split and
# the minibatch sampling all differed from run to run.
SEED = 0
NUM_TRAINING_STEPS = 100000
LOG_EVERY = 1000
CHECKPOINT_PATH = '/tmp/model.ckpt'

# The graph-level seed has to be set before the variables are created below.
# NOTE(review): some ops may still be non-deterministic on GPU -- confirm if
# bit-exact reproducibility matters.
tf.set_random_seed(SEED)
batch_rng = np.random.RandomState(SEED)

input_ph, output_ph, output_pred = create_model()
print(output_ph.shape)

# create loss: half mean squared error between predicted and expert actions
mse = tf.reduce_mean(0.5 * tf.square(output_pred - output_ph))

# create optimizer
opt = tf.train.AdamOptimizer().minimize(mse)

# initialize variables (run after the optimizer is built so its slot
# variables are initialised too)
sess.run(tf.global_variables_initializer())
# create saver to save model variables
saver = tf.train.Saver()

# split into training and test sets (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    inputs, outputs, test_size=0.3, random_state=SEED)

# run training
batch_size = 32
for training_step in range(NUM_TRAINING_STEPS):
    # get a random subset of the training data (sampled with replacement)
    indices = batch_rng.randint(low=0, high=len(X_train), size=batch_size)
    input_batch = X_train[indices]
    output_batch = y_train[indices]

    # run the optimizer and get the mse on this minibatch
    _, mse_run = sess.run([opt, mse], feed_dict={input_ph: input_batch, output_ph: output_batch})

    # periodically report minibatch/test error and checkpoint the variables
    if training_step % LOG_EVERY == 0:
        mse_test = sess.run(mse, feed_dict={input_ph: X_test, output_ph: y_test})
        print('{0:04d} mse train: {1:.3f}'.format(training_step, mse_run))
        print('{0:04d} mse test: {1:.3f}'.format(training_step, mse_test))
        saver.save(sess, CHECKPOINT_PATH)

(?, 8)
0000 mse train: 0.388
0000 mse test: 0.350
1000 mse train: 0.006
1000 mse test: 0.006
2000 mse train: 0.004
2000 mse test: 0.004
3000 mse train: 0.003
3000 mse test: 0.003
4000 mse train: 0.003
4000 mse test: 0.002
5000 mse train: 0.002
5000 mse test: 0.002
6000 mse train: 0.002
6000 mse test: 0.002
7000 mse train: 0.002
7000 mse test: 0.001
8000 mse train: 0.001
8000 mse test: 0.001
9000 mse train: 0.001
9000 mse test: 0.001
10000 mse train: 0.001
10000 mse test: 0.001
11000 mse train: 0.001
11000 mse test: 0.001
12000 mse train: 0.001
12000 mse test: 0.001
13000 mse train: 0.001
13000 mse test: 0.001
14000 mse train: 0.001
14000 mse test: 0.001
15000 mse train: 0.001
15000 mse test: 0.001
16000 mse train: 0.001
16000 mse test: 0.001
17000 mse train: 0.001
17000 mse test: 0.001
18000 mse train: 0.001
18000 mse test: 0.001
19000 mse train: 0.000
19000 mse test: 0.001
20000 mse train: 0.000
20000 mse test: 0.001
21000 mse train: 0.000
21000 mse test: 0.001
22000 mse train: 0.000


In [6]:
# Roll out the trained policy in the environment and report episode returns.
env = gym.make(envname)

# `spec.timestep_limit` is a deprecated alias that newer gym releases removed;
# `max_episode_steps` is the supported attribute.
max_steps = env.spec.max_episode_steps
num_rollouts = 20
render = False

returns = []
observations = []
actions = []
for i in range(num_rollouts):
    print('iter', i)
    obs = env.reset()
    done = False
    totalr = 0.
    steps = 0
    while not done:
        # The network expects a leading batch dimension, so feed a batch of one.
        action = sess.run(output_pred, feed_dict={input_ph: obs.reshape((1,) + obsshape)})
        # Record the visited state and the (batched) action the policy chose.
        observations.append(obs)
        actions.append(action)
        obs, r, done, _ = env.step(action)
        totalr += r
        steps += 1
        if render:
            env.render()
        if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
        # Guard against specs without a step limit (None), where the
        # comparison would raise on Python 3.
        if max_steps is not None and steps >= max_steps:
            break
    returns.append(totalr)

print('returns', returns)
print('mean return', np.mean(returns))
print('std of return', np.std(returns))

iter 0
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
iter 1
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
iter 2
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
iter 3
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
iter 4
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
iter 5
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
iter 6
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
iter 7
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
iter 8
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
iter 9
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
iter 10
100/1000
200