In [1]:
"""
Code to load an expert policy and generate roll-out data for behavioral cloning.
Example usage:
    python run_expert.py experts/Humanoid-v1.pkl Humanoid-v1 --render \
            --num_rollouts 20

Author of this script and included expert policies: Jonathan Ho (hoj@openai.com)
"""

import pickle
import tensorflow as tf
import numpy as np
import tf_util
import gym
import load_policy

In [2]:
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('expert_policy_file', type=str)
parser.add_argument('envname', type=str)
parser.add_argument('--render', action='store_true')
parser.add_argument("--max_timesteps", type=int)
parser.add_argument('--num_rollouts', type=int, default=20,
                    help='Number of expert roll outs')
#args = parser.parse_args(["experts/Humanoid-v1.pkl", "Humanoid-v1", "--render", "--num_rollouts", "200"])
args = parser.parse_args(["experts/Humanoid-v1.pkl", "Humanoid-v1", "--num_rollouts", "20"])
#args = parser.parse_args(["experts/Reacher-v1.pkl", "Reacher-v1", "--render", "--num_rollouts", "20"])
#args = parser.parse_args(["experts/Reacher-v1.pkl", "Reacher-v1", "--num_rollouts", "20000"])
#args = parser.parse_args(["experts/Hopper-v1.pkl", "Hopper-v1", "--render", "--num_rollouts", "20"])
#args = parser.parse_args(["experts/Hopper-v1.pkl", "Hopper-v1", "--num_rollouts", "200"])
#args = parser.parse_args(["experts/Walker2d-v1.pkl", "Walker2d-v1", "--render", "--num_rollouts", "20"])
#args = parser.parse_args(["experts/Walker2d-v1.pkl", "Walker2d-v1", "--num_rollouts", "200"])



In [3]:
print('loading and building expert policy')
policy_fn = load_policy.load_policy(args.expert_policy_file)

loading and building expert policy
('obs', (1, 376), (1, 376))


**Create Initial Training Data for Clone**

In [8]:
print('loaded and built')

with tf.Session():
    tf_util.initialize()

    import gym
    env = gym.make(args.envname)  
    print("The observation space for this environment:", env.observation_space)
    print("The action space for this environment:", env.action_space)
    max_steps = args.max_timesteps or env.spec.timestep_limit 

    returns = []
    observations = []
    actions = []
    for i in range(50):
        print('iter', i)
        obs = env.reset()
        done = False
        totalr = 0.
        steps = 0
        while not done:
            action = policy_fn(obs[None,:])
            observations.append(obs)
            actions.append(action)
            obs, r, done, _ = env.step(action) 
            #env.render()
            totalr += r
            steps += 1
            if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
            if steps >= max_steps:
                break
        returns.append(totalr)

    print('returns', returns)
    print('mean return', np.mean(returns))
    print('std of return', np.std(returns))

    expert_data = {'observations': np.array(observations), 
                   'actions': np.array(actions)} 


loaded and built


[2017-03-27 23:24:32,651] Making new env: Humanoid-v1


('The observation space for this environment:', Box(376,))
('The action space for this environment:', Box(17,))
('iter', 0)
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
('iter', 1)
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
('iter', 2)
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
('iter', 3)
100/1000
200/1000


KeyboardInterrupt: 

**Perform Initial Fit of Neural Network With Data From Expert Policy**

In [5]:
import keras
from keras.layers import Dense
from keras.models import Sequential, load_model

Using TensorFlow backend.


In [6]:
width_dense = 1000

In [7]:
model = Sequential()
model.add(Dense(width_dense, input_shape=env.observation_space.shape, activation="relu"))
model.add(Dense(width_dense, activation="relu"))
model.add(Dense(np.prod(env.action_space.shape)))

In [8]:
model.compile(loss='mse', optimizer='adam')

In [9]:
model.fit(x = expert_data["observations"], y = np.squeeze(expert_data["actions"]), nb_epoch=50, verbose=2)

Epoch 1/50
55s - loss: 7.1993
Epoch 2/50
54s - loss: 0.1904
Epoch 3/50
54s - loss: 0.1305
Epoch 4/50
54s - loss: 0.1011
Epoch 5/50
54s - loss: 0.0942
Epoch 6/50
54s - loss: 0.0828
Epoch 7/50
57s - loss: 0.0735
Epoch 8/50
58s - loss: 0.0712
Epoch 9/50
61s - loss: 0.0654
Epoch 10/50
67s - loss: 0.0601
Epoch 11/50
61s - loss: 0.0551
Epoch 12/50
57s - loss: 0.0522
Epoch 13/50
75s - loss: 0.0498
Epoch 14/50
66s - loss: 0.0469
Epoch 15/50
69s - loss: 0.0458
Epoch 16/50
61s - loss: 0.0433
Epoch 17/50
65s - loss: 0.0421
Epoch 18/50
71s - loss: 0.0414
Epoch 19/50
60s - loss: 0.0398
Epoch 20/50
62s - loss: 0.0396
Epoch 21/50
70s - loss: 0.0378
Epoch 22/50
84s - loss: 0.0386
Epoch 23/50
83s - loss: 0.0371
Epoch 24/50
62s - loss: 0.0363
Epoch 25/50
68s - loss: 0.0360
Epoch 26/50
78s - loss: 0.0353
Epoch 27/50
61s - loss: 0.0346
Epoch 28/50
64s - loss: 0.0340
Epoch 29/50
70s - loss: 0.0335
Epoch 30/50
81s - loss: 0.0335
Epoch 31/50
69s - loss: 0.0334
Epoch 32/50
60s - loss: 0.0332
Epoch 33/50
77s -

<keras.callbacks.History at 0x120f9cf50>

**Check the Behaviour of the Clone**

In [10]:
returns = []
observations = []
actions = []
for i in range(10):
    print('iter', i)
    obs = env.reset()
    done = False
    totalr = 0.
    steps = 0
    #while True:
    while not done:
        action = model.predict(obs[None,:]) 
        observations.append(obs)
        actions.append(action)
        obs, r, done, _ = env.step(action)
        totalr += r
        steps += 1
        env.render()
        if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
        if steps >= max_steps:
            break
    returns.append(totalr)
    

print('returns', returns)
print('mean return', np.mean(returns))
print('std of return', np.std(returns))

('iter', 0)
100/1000
('iter', 1)
100/1000
200/1000
('iter', 2)
100/1000
200/1000
('iter', 3)
100/1000
200/1000
('iter', 4)
100/1000
('iter', 5)
100/1000
('iter', 6)
('iter', 7)
100/1000
200/1000
300/1000
('iter', 8)
100/1000
200/1000
300/1000
('iter', 9)
100/1000
200/1000
('returns', [829.57438906331629, 1621.0355389772174, 1943.0625146717084, 1807.0657725372907, 836.75772165303476, 829.80933578189911, 549.74161722477004, 3738.062344596513, 3425.3628627787934, 2311.8072435051022])
('mean return', 1789.2279340789646)
('std of return', 1052.8627145099747)


In [12]:
model.save("Test.h5")

**Use DAGGER to Generate Data and Re-Training**

In [None]:
returns = []
observations = []
actions = []
# generate 10 roll-out of new training data and aggregate to the old training data
for i in range(1):
    print('iter', i)
    obs = env.reset()
    done = False
    steps = 0
    while not done:
        action = model.predict(obs[None,:])
        observations.append(obs)
        with tf.Session():
            actions.append(policy_fn(obs[None,:])) # use the expert policy to add the action as y
        actions.append(action) # use the clone policy to perform the action
        obs, r, done, _ = env.step(action) 
        steps += 1
        if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
        if steps >= max_steps:
            break
expert_data["observations"] = np.concatenate((expert_data["observations"], np.array(observations)))
expert_data["actions"] = np.concatenate((expert_data["actions"], np.array(actions)))
model.fit(x = expert_data["observations"], y = np.squeeze(expert_data["actions"]), nb_epoch=5, verbose=10) # perform 5 epochs of training

In [13]:
for ind_cycle in range(30):
    print("Number of cycle: %d/%d" %(ind_cycle, 30))
    returns = []
    observations = []
    actions = []
    # generate 10 roll-out of new training data and aggregate to the old training data
    for i in range(20):
        print('iter', i)
        obs = env.reset()
        done = False
        steps = 0
        while not done:
            action = model.predict(obs[None,:])
            observations.append(obs)
            with tf.Session():
                actions.append(policy_fn(obs[None,:])) # use the expert policy to add the action as y
            obs, r, done, _ = env.step(action) # use the clone policy to perform the action
            steps += 1
            if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
            if steps >= max_steps:
                break
    expert_data["observations"] = np.concatenate((expert_data["observations"], np.array(observations)))
    expert_data["actions"] = np.concatenate((expert_data["actions"], np.array(actions)))
    model.fit(x = expert_data["observations"], y = np.squeeze(expert_data["actions"]), nb_epoch=10, verbose=2) # perform training on updated data

Number of cycle: 0/30
('iter', 0)
('iter', 1)
('iter', 2)
100/1000
('iter', 3)
100/1000
('iter', 4)
100/1000
200/1000
('iter', 5)
('iter', 6)
('iter', 7)
100/1000
('iter', 8)
100/1000
200/1000
('iter', 9)
100/1000
('iter', 10)
100/1000
200/1000
300/1000
('iter', 11)
100/1000
('iter', 12)
100/1000
200/1000
('iter', 13)
('iter', 14)
('iter', 15)
('iter', 16)
100/1000
200/1000
300/1000
('iter', 17)
100/1000
200/1000
('iter', 18)
100/1000
200/1000
300/1000
('iter', 19)
100/1000
Epoch 1/10
65s - loss: 0.0428
Epoch 2/10
68s - loss: 0.0387
Epoch 3/10
67s - loss: 0.0380
Epoch 4/10
63s - loss: 0.0374
Epoch 5/10
59s - loss: 0.0361
Epoch 6/10
56s - loss: 0.0355
Epoch 7/10
56s - loss: 0.0352
Epoch 8/10
57s - loss: 0.0346
Epoch 9/10
56s - loss: 0.0346
Epoch 10/10
56s - loss: 0.0340
Number of cycle: 1/30
('iter', 0)
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
('iter', 1)
100/1000
('iter', 2)
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
('iter', 

KeyboardInterrupt: 

In [14]:
model.save("Humanoid_DAGGER.h5")

**Model Performance After DAGGER**

In [4]:
from keras.models import load_model

In [5]:
model = load_model("Humanoid_DAGGER.h5")

In [9]:
returns = []
observations = []
actions = []
for i in range(10):
    print('iter', i)
    env = gym.make(args.envname)  
    obs = env.reset()
    done = False
    totalr = 0.
    steps = 0
    #while True:
    while not done:
        action = model.predict(obs[None,:]) 
        observations.append(obs)
        actions.append(action)
        obs, r, done, _ = env.step(action)
        totalr += r
        steps += 1
        env.render()
        if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
        if steps >= max_steps:
            break
    returns.append(totalr)
    

print('returns', returns)
print('mean return', np.mean(returns))
print('std of return', np.std(returns))

[2017-03-27 23:24:57,461] Making new env: Humanoid-v1


('iter', 0)
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000


[2017-03-27 23:25:08,755] Making new env: Humanoid-v1


1000/1000
('iter', 1)
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000


[2017-03-27 23:25:15,741] Making new env: Humanoid-v1


1000/1000
('iter', 2)
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000


[2017-03-27 23:25:24,325] Making new env: Humanoid-v1


1000/1000
('iter', 3)
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000


[2017-03-27 23:25:33,313] Making new env: Humanoid-v1


1000/1000
('iter', 4)
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000


KeyboardInterrupt: 