In [78]:
"""
Code to load an expert policy and generate roll-out data for behavioral cloning.
Example usage:
    python run_expert.py experts/Humanoid-v1.pkl Humanoid-v1 --render \
            --num_rollouts 20

Author of this script and included expert policies: Jonathan Ho (hoj@openai.com)
"""

import pickle
import tensorflow as tf
import numpy as np
import tf_util
import gym
import load_policy

In [82]:
import argparse

# Script-style configuration: the options are declared as for a command-line
# tool, then parsed from an explicit argv list so the notebook can be driven
# without a real command line.
parser = argparse.ArgumentParser()
parser.add_argument("expert_policy_file", type=str)
parser.add_argument("envname", type=str)
parser.add_argument("--render", action="store_true")
parser.add_argument("--max_timesteps", type=int)
parser.add_argument(
    "--num_rollouts",
    type=int,
    default=20,
    help="Number of expert roll outs",
)

# Active configuration: Humanoid-v1 expert, rendered, 20 rollouts.
args = parser.parse_args([
    "experts/Humanoid-v1.pkl",
    "Humanoid-v1",
    "--render",
    "--num_rollouts", "20",
])

# Alternative configurations (swap one in for the call above):
#args = parser.parse_args(["experts/Humanoid-v1.pkl", "Humanoid-v1", "--render", "--num_rollouts", "200"])
#args = parser.parse_args(["experts/Reacher-v1.pkl", "Reacher-v1", "--render", "--num_rollouts", "20"])
#args = parser.parse_args(["experts/Reacher-v1.pkl", "Reacher-v1", "--num_rollouts", "20000"])
#args = parser.parse_args(["experts/Hopper-v1.pkl", "Hopper-v1", "--render", "--num_rollouts", "20"])
#args = parser.parse_args(["experts/Hopper-v1.pkl", "Hopper-v1", "--num_rollouts", "200"])
#args = parser.parse_args(["experts/Walker2d-v1.pkl", "Walker2d-v1", "--render", "--num_rollouts", "20"])
#args = parser.parse_args(["experts/Walker2d-v1.pkl", "Walker2d-v1", "--num_rollouts", "200"])



In [83]:
# Build the expert policy from the file named by args.expert_policy_file.
print('loading and building expert policy')
# policy_fn is called later as policy_fn(obs[None, :]): it is given a batch
# holding a single observation and returns the expert's action for it.
# NOTE(review): load_policy is a project module and this file imports pickle,
# so the .pkl is presumably unpickled -- only load expert files you trust.
policy_fn = load_policy.load_policy(args.expert_policy_file)
# The tuple printed while loading (('obs', (1, 376), (1, 376)) for Humanoid-v1)
# presumably gives the shapes of the observation mean and std used to
# normalize observations -- TODO confirm against load_policy.

loading and building expert policy
('obs', (1, 376), (1, 376))


In [84]:
print('loaded and built')

# Roll out the expert policy in the simulator and record every
# (observation, action) pair it produces, for use as supervised training data.
with tf.Session():
    tf_util.initialize()

    import gym
    env = gym.make(args.envname)  # build the simulation environment
    # The observation space describes the state vector the environment emits
    # (a 376-dimensional real vector for Humanoid-v1).
    print("The observation space for this environment:", env.observation_space)
    # The action space describes the control vector the environment accepts
    # (a 17-dimensional real vector for Humanoid-v1).
    print("The action space for this environment:", env.action_space)
    # Per-rollout step cap: args.max_timesteps when it is set (and non-zero),
    # otherwise env.spec.timestep_limit (1000 for Humanoid-v1).
    max_steps = args.max_timesteps or env.spec.timestep_limit

    returns, observations, actions = [], [], []
    for i in range(args.num_rollouts):
        print('iter', i)
        obs = env.reset()  # obs is now the initial observation of the episode
        totalr = 0.  # reward accumulated over this rollout
        steps = 0
        done = False
        while True:
            # obs[None, :] is equivalent to obs.reshape(1, -1): a batch of one.
            action = policy_fn(obs[None, :])
            # Record the observation together with the action chosen for it.
            observations.append(obs)
            actions.append(action)
            # env.step returns, in order:
            #   obs  - the next observation
            #   r    - the reward obtained for this action
            #   done - whether it is time to reset the environment
            #   _    - diagnostic info, unused here
            obs, r, done, _ = env.step(action)
            totalr += r
            steps += 1
            if args.render:
                env.render()
            if steps % 100 == 0:
                print("%i/%i" % (steps, max_steps))
            # The rollout ends when the episode terminates or the cap is hit.
            if done or steps >= max_steps:
                break
        returns.append(totalr)

    print('returns', returns)  # total reward of each rollout
    print('mean return', np.mean(returns))  # mean over rollouts
    print('std of return', np.std(returns))  # standard deviation over rollouts

    # One entry per recorded step across all rollouts (at most
    # num_rollouts * max_steps). Each action entry keeps whatever shape
    # policy_fn returned for its batch of one observation.
    expert_data = {
        'observations': np.array(observations),
        'actions': np.array(actions),
    }




loaded and built


[2017-03-26 21:26:38,162] Making new env: Humanoid-v1


('The observation space for this environment:', Box(376,))
('The action space for this environment:', Box(17,))
('iter', 0)
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000


KeyboardInterrupt: 

In [68]:
import keras
from keras.layers import Dense
from keras.models import Sequential

In [69]:
# Number of units in each of the two hidden Dense layers of the cloned policy.
width_dense = 50

In [70]:
# Behavioral-cloning network: observation vector -> two ReLU hidden layers of
# width_dense units -> linear output with one unit per action component.
model = Sequential([
    Dense(width_dense, input_shape=env.observation_space.shape, activation="relu"),
    Dense(width_dense, activation="relu"),
    # No activation: the actions are regressed directly.
    Dense(np.prod(env.action_space.shape)),
])

In [71]:
# Regress actions with mean-squared error, optimized with Adam.
model.compile(optimizer="adam", loss="mse")

In [72]:
# Fit the cloned policy to the recorded (observation, action) pairs.
#
# Every recorded action is flattened to a 1-D vector so the targets have one
# row per sample, matching the flat output layer of the model. An axis-less
# np.squeeze would also drop the sample axis when only one step was recorded,
# and the action axis when the action space is one-dimensional.
# NOTE(review): nb_epoch is the Keras 1 spelling of this argument; newer Keras
# releases call it `epochs` -- confirm against the installed version.
model.fit(x=expert_data["observations"],
          y=expert_data["actions"].reshape(len(expert_data["actions"]), -1),
          nb_epoch=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1261ff990>

In [73]:
# Save under a name derived from the environment the model was trained on
# ("Humanoid-v1" -> "BehaviorCloning_Humanoid.h5"), so the file name cannot
# drift out of sync with the active configuration in args.
model.save("BehaviorCloning_%s.h5" % args.envname.rsplit("-", 1)[0])

In [85]:
# load_model is not imported anywhere else in the notebook, so bring it into
# scope here; without this the cell fails with a NameError on a fresh kernel.
from keras.models import load_model

# Load the cloned policy saved for the configured environment
# ("Humanoid-v1" -> "BehaviorCloning_Humanoid.h5").
model = load_model("BehaviorCloning_%s.h5" % args.envname.rsplit("-", 1)[0])

In [88]:
import argparse

# Configuration for evaluating the cloned policy. This repeats the option
# declarations used for the expert rollouts so the cell can be run on its own.
parser = argparse.ArgumentParser()
parser.add_argument("expert_policy_file", type=str)
parser.add_argument("envname", type=str)
parser.add_argument("--render", action="store_true")
parser.add_argument("--max_timesteps", type=int)
parser.add_argument(
    "--num_rollouts",
    type=int,
    default=20,
    help="Number of expert roll outs",
)

# Active configuration: Humanoid-v1, rendered, 20 rollouts.
args = parser.parse_args([
    "experts/Humanoid-v1.pkl",
    "Humanoid-v1",
    "--render",
    "--num_rollouts", "20",
])

# Alternative configurations (swap one in for the call above):
#args = parser.parse_args(["experts/Humanoid-v1.pkl", "Humanoid-v1", "--render", "--num_rollouts", "200"])
#args = parser.parse_args(["experts/Reacher-v1.pkl", "Reacher-v1", "--render", "--num_rollouts", "20"])
#args = parser.parse_args(["experts/Reacher-v1.pkl", "Reacher-v1", "--num_rollouts", "20000"])
#args = parser.parse_args(["experts/Hopper-v1.pkl", "Hopper-v1", "--render", "--num_rollouts", "20"])
#args = parser.parse_args(["experts/Walker2d-v1.pkl", "Walker2d-v1", "--num_rollouts", "20"])


In [89]:
print('loaded and built')

# Evaluate the cloned policy (the Keras `model`) in the simulator, recording
# the observations it visits, the actions it takes and the return of each
# rollout.
import gym
env = gym.make(args.envname)  # build the simulation environment
# The observation space describes the state vector the environment emits
# (a 376-dimensional real vector for Humanoid-v1).
print("The observation space for this environment:", env.observation_space)
# The action space describes the control vector the environment accepts
# (a 17-dimensional real vector for Humanoid-v1).
print("The action space for this environment:", env.action_space)
# Per-rollout step cap: args.max_timesteps when it is set (and non-zero),
# otherwise env.spec.timestep_limit (1000 for Humanoid-v1).
max_steps = args.max_timesteps or env.spec.timestep_limit

returns = []
observations = []
actions = []
for i in range(args.num_rollouts):
    print('iter', i)
    obs = env.reset()  # obs is now the initial observation of the episode
    done = False
    totalr = 0.  # reward accumulated over this rollout
    steps = 0
    # Stop as soon as the environment reports the episode is over. Stepping a
    # finished episode gives undefined results, and rewards collected after
    # termination would make these returns incomparable with the expert's,
    # whose rollout loop also stops on `done`.
    while not done:
        # obs[None, :] is equivalent to obs.reshape(1, -1): a batch of one.
        action = model.predict(obs[None,:])  # action chosen by the cloned policy
        observations.append(obs)
        actions.append(action)
        # env.step returns, in order:
        #   obs  - the next observation
        #   r    - the reward obtained for this action
        #   done - whether it is time to reset the environment
        #   _    - diagnostic info, unused here
        obs, r, done, _ = env.step(action)
        totalr += r
        steps += 1
        if args.render:
            env.render()
        if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
        if steps >= max_steps:
            break
    returns.append(totalr)

print('returns', returns)  # total reward of each rollout
print('mean return', np.mean(returns))  # mean over rollouts
print('std of return', np.std(returns))  # standard deviation over rollouts

# NOTE(review): despite the name, this now holds the cloned policy's own
# rollouts and overwrites the expert dataset collected earlier. The name is
# kept because other cells may read it; re-running the training cell after
# this one would fit the model to its own behaviour.
# One entry per recorded step across all rollouts; each action entry is the
# (1, action_dim) array returned by model.predict.
expert_data = {'observations': np.array(observations),
               'actions': np.array(actions)}




[2017-03-26 21:27:35,155] Making new env: Humanoid-v1


loaded and built
('The observation space for this environment:', Box(376,))
('The action space for this environment:', Box(17,))
('iter', 0)
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
('iter', 1)
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000


KeyboardInterrupt: 