In [2]:
import gym

from collections import deque

import numpy as np
import scipy.signal

import tensorflow as tf
from sklearn.utils import shuffle

# Single seed for the notebook: applied to NumPy's global RNG here and passed
# to the environment's seed() call in a later cell.
seed = 0
np.random.seed(seed)

In [20]:
def run_episode(env):
    """Run one episode with actions sampled from ``env.action_space``.

    No learned policy is involved: every action comes from
    ``env.action_space.sample()``. The episode runs until ``env.step``
    reports ``done``.

    Args:
        env: Gym-style environment exposing ``reset()``, ``step(action)``
            (returning ``(obs, reward, done, info)``) and
            ``action_space.sample()``.

    Returns:
        Tuple ``(observes, actions, rewards, infos)`` where
        ``observes`` is a float32 array with one flattened observation per
        row, ``actions`` is a float32 array with one flattened action per
        row, ``rewards`` is a 1-D float32 array, and ``infos`` is the list of
        ``info`` objects returned by ``env.step``. All four have one entry
        per step taken.
    """
    obs = env.reset()
    observes, actions, rewards, infos = [], [], [], []
    done = False
    while not done:
        # Flatten to a single row so that the per-step arrays can be stacked
        # with np.concatenate at the end.
        obs = obs.astype(np.float32).reshape((1, -1))
        observes.append(obs)
        action = env.action_space.sample().reshape((1, -1)).astype(np.float32)
        actions.append(action)
        obs, reward, done, info = env.step(action)
        if not isinstance(reward, float):
            # np.asscalar was deprecated in NumPy 1.16 and removed in 1.23.
            # ndarray.item() is the documented replacement; going through
            # np.asarray also accepts plain Python numbers.
            reward = np.asarray(reward).item()
        rewards.append(reward)
        infos.append(info)

    return (np.concatenate(observes),
            np.concatenate(actions),
            np.array(rewards, dtype=np.float32),
            infos)

def collect_negative_demos(env, episodes, traj_length=10):
    """Collect the tail of several episodes produced by ``run_episode``.

    For each episode only the last ``traj_length`` steps are kept; episodes
    shorter than ``traj_length`` are kept whole.

    Args:
        env: Environment forwarded unchanged to ``run_episode``.
        episodes: Number of episodes to run.
        traj_length: Maximum number of trailing steps to keep per episode.
            ``0`` yields empty trajectories.

    Returns:
        A list with one dict per episode, with keys ``'observes'``,
        ``'actions'``, ``'rewards'`` and ``'infos'`` holding the kept steps.

    Raises:
        ValueError: If ``traj_length`` is negative.
    """
    if traj_length < 0:
        raise ValueError(
            "traj_length must be non-negative, got {}".format(traj_length))

    trajectories = []
    for _ in range(episodes):
        observes, actions, rewards, infos = run_episode(env)
        # Use an explicit start index rather than a negative slice:
        # ``x[-0:]`` would return the whole episode when traj_length is 0.
        start = max(len(rewards) - traj_length, 0)
        trajectories.append({'observes': observes[start:, :],
                             'actions': actions[start:, :],
                             'rewards': rewards[start:],
                             'infos': infos[start:]})
    return trajectories

In [21]:
# Environment id; also used later in the notebook to build the output file name.
envname = 'Hopper-v1'
env = gym.make(envname)
# Seed the environment with the notebook-wide seed. As the last expression in
# the cell, the value returned by env.seed() is what the notebook displays.
env.seed(seed=seed)

[2018-07-04 15:21:58,544] Making new env: Hopper-v1


[0]

In [23]:
import pickle

# Collect before opening the output file, so that a failure during the
# rollouts does not leave an empty pickle on disk.
trajectories = collect_negative_demos(env, episodes=1000)
print("{}".format(np.mean([len(t['rewards']) for t in trajectories])))
print("{}".format(np.mean([np.sum(t['rewards']) for t in trajectories])))

# The context manager closes the file even if pickling raises. The payload
# stays wrapped in a one-element list to preserve the existing on-disk format.
with open('./'+envname+'-negative-demo.pkl', 'wb') as output:
    pickle.dump([trajectories], output)

9.967
6.955898284912109
