In [1]:
import gym
import os
import os.path as osp
import time
import numpy as np

from stable_baselines import HER, TD3
from stable_baselines.common.atari_wrappers import FrameStack
from rrc_iprl_package.envs import custom_env, rrc_utils, env_wrappers
from gym.wrappers import FilterObservation

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [2]:
def make_reorient_env_p1():
    info_keys = ['is_success', 'is_success_ori_dist', 'dist', 'final_dist', 'final_score',
                 'final_ori_dist']

    wrappers = [gym.wrappers.ClipAction,
                {'cls': custom_env.LogInfoWrapper,
                 'kwargs': dict(info_keys=info_keys)},
                {'cls': custom_env.CubeRewardWrapper,
                 'kwargs': dict(pos_coef=1., ori_coef=1.,
                                ac_norm_pen=0.2, rew_fn='exp',
                                goal_env=True)},
                {'cls': custom_env.ReorientWrapper,
                 'kwargs': dict(goal_env=True)},
                {'cls': gym.wrappers.TimeLimit,
                 'kwargs': dict(max_episode_steps=rrc_utils.EPLEN)},
                custom_env.FlattenGoalWrapper]
    initializer = custom_env.ReorientInitializer(1, 0.1)
    env_fn = rrc_utils.make_env_fn('real_robot_challenge_phase_1-v1', wrapper_params=wrappers,
                                   action_type=rrc_utils.action_type,
                                   initializer=initializer,
                                   frameskip=rrc_utils.FRAMESKIP,
                                   visualization=False)
    env = env_fn()
    return env


def make_curr_env_p1():
    info_keys = ['is_success_ori_dist', 'dist', 'final_dist', 'final_score',
                 'final_ori_dist', 'goal_sample_radius',
                 'init_sample_radius']

    wrappers = [gym.wrappers.ClipAction, 
                {'cls': custom_env.LogInfoWrapper,
                 'kwargs': dict(info_keys=info_keys)},
                {'cls': gym.wrappers.TimeLimit,
                 'kwargs': dict(max_episode_steps=rrc_utils.EPLEN)},
                custom_env.FlattenGoalWrapper,]

    env_fn = rrc_utils.make_env_fn('real_robot_challenge_phase_1-v4', wrapper_params=wrappers,
                                   action_type=rrc_utils.action_type,
                                   initializer=rrc_utils.push_curr_initializer,
                                   frameskip=rrc_utils.FRAMESKIP,
                                   visualization=False)
    env = env_fn()
    return env


def make_curr_env_p2():
    env_fn = rrc_utils.build_env_fn(goal_env=True)
    env = env_fn()
    return env

In [3]:
env = make_curr_env_p2()
env = FilterObservation(env, ['achieved_goal', 'desired_goal', 'observation'])



In [5]:
exp_dir = './data/HER-SAC_sparse_push/2020-09-18_12-28-22/'
load_dir = osp.join(exp_dir, '2e6-steps.zip') # './data/HER-SAC_push_reorient/2020-09-20_15-58-02/1e6-steps.zip'
model = HER('MlpPolicy', env, TD3, n_sampled_goal=4,
            tensorboard_log=exp_dir,
            goal_selection_strategy='future',
            verbose=1, buffer_size=int(1e6),
            learning_rate=3e-4,
            gamma=0.95, batch_size=100,
            policy_kwargs=dict(layers=[256, 256]))
model.load(load_dir)

Loading a model without an environment, this model cannot be trained until it has a valid environment.


<stable_baselines.her.her.HER at 0x7f291421d9e8>

In [6]:
n_eps = 10
final_infos = []

for _ in range(n_eps):
    d = False
    obs = env.reset()

    r_total = 0
    while not d:
        obs, r, d, i = env.step(model.predict(obs)[0])
        r_total += r
    i['total_rew'] = r_total
    final_infos.append(i)
    
print('total_rew:', np.mean([i['total_rew'] for i in final_infos]),
      'final_dist min:', np.min([i['final_dist'] for i in final_infos]), 
      'final_dist mean:', np.mean([i['final_dist'] for i in final_infos]), 
      'final_dist std:', np.std([i['final_dist'] for i in final_infos]))

0

In [None]:
exp_root = './data'
hms_time = time.strftime("%Y-%m-%d_%H-%M-%S")
exp_name = 'HER-TD3_push_reorient'
exp_dir = osp.join(exp_root, exp_name, hms_time)
os.makedirs(exp_dir)

model = HER('MlpPolicy', env, TD3, n_sampled_goal=4,
            tensorboard_log=exp_dir,
            goal_selection_strategy='future',
            verbose=1, buffer_size=int(1e6),
            learning_rate=3e-4,
            gamma=0.95, batch_size=100,
            policy_kwargs=dict(layers=[256, 256]))

# Train for 1e6 steps
model.learn(int(1e6),)
# Save the trained agent
model.save(osp.join(exp_dir, '1e6-steps'))





Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use keras.layers.Dense instead.





Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where






  "Cameras are not enabled, so images in the camera observation"



