In [2]:
import gym
import os
import os.path as osp
import time
import numpy as np

from stable_baselines import HER, SAC
from stable_baselines.common.atari_wrappers import FrameStack
from rrc_simulation.gym_wrapper.envs import custom_env, cube_env
from rrc_simulation.tasks import move_cube
from spinup.utils import rrc_utils
import functools

In [2]:
def make_reorient_env():
    info_keys = ['is_success', 'is_success_ori_dist', 'dist', 'final_dist', 'final_score',
                 'final_ori_dist']

    wrappers = [gym.wrappers.ClipAction,
                functools.partial(custom_env.LogInfoWrapper, info_keys=info_keys),
                functools.partial(custom_env.CubeRewardWrapper, pos_coef=1., ori_coef=1., fingertip_coef=1.),
                                ac_norm_pen=0.2, rew_fn='exp',
                                goal_env=True)},
                {'cls': custom_env.ReorientWrapper,
                 'kwargs': dict(goal_env=True)},
                {'cls': gym.wrappers.TimeLimit,
                 'kwargs': dict(max_episode_steps=rrc_utils.EPLEN)},
                functools.partial(custom_env.ScaledActionWrapper,
                    goal_env=False, relative=True),
                functools.partial(wrappers.TimeLimit, max_episode_steps=EPLEN_SHORT*2),
                    reorient_log_info_wrapper,
                wrappers.FlattenObservation]

    initializer = custom_env.ReorientInitializer(1, 0.1)
    env_fn = rrc_utils.make_env_fn('real_robot_challenge_phase_1-v1', wrapper_params=wrappers,
                                   action_type=rrc_utils.action_type,
                                   initializer=initializer,
                                   frameskip=rrc_utils.FRAMESKIP,
                                   visualization=False)
    env = env_fn()
    return env


def make_curr_env():
    info_keys = ['is_success_ori_dist', 'dist', 'final_dist', 'final_score',
                 'final_ori_dist', 'goal_sample_radius',
                 'init_sample_radius']

    wrappers = [gym.wrappers.ClipAction, 
                {'cls': custom_env.LogInfoWrapper,
                 'kwargs': dict(info_keys=info_keys)},
                {'cls': gym.wrappers.TimeLimit,
                 'kwargs': dict(max_episode_steps=rrc_utils.EPLEN)},
                custom_env.FlattenGoalWrapper,]

    env_fn = rrc_utils.make_env_fn('real_robot_challenge_phase_1-v4', wrapper_params=wrappers,
                                   action_type=rrc_utils.action_type,
                                   initializer=rrc_utils.push_curr_initializer,
                                   frameskip=rrc_utils.FRAMESKIP,
                                   visualization=False)
    env = env_fn()
    return env

In [None]:
env = make_reorient_env()

In [25]:
load_dir = './data/HER-SAC_sparse_push/2020-09-18_12-28-22/2e6-steps.zip' # './data/HER-SAC_push_reorient/2020-09-20_15-58-02/1e6-steps.zip'
model.load(load_dir)

Loading a model without an environment, this model cannot be trained until it has a valid environment.


<stable_baselines.her.her.HER at 0x7fd9d5b30be0>

In [44]:
model.predict(obs)[0]

array([-0.8761622 ,  1.4965066 , -0.0684433 , -0.01645833,  0.9974102 ,
       -0.5804248 , -0.24998152,  1.4394349 , -2.669657  ], dtype=float32)

In [36]:
n_eps = 10
final_infos = []

initial_pose = move_cube.Pose(np.array([0.02176933,0.11905757,0.0325]),
                              np.array([0,0,0.47478757,0.88010043]))
goal_pose =  move_cube.Pose(position=np.array([0,0,0.0825]), orientation=np.array([0,0,0,1]))

initializer = cube_env.FixedInitializer(
    2, initial_pose, goal_pose
)
env.unwrapped.initializer = initializer

for _ in range(n_eps):
    d = False
    obs = env.reset()

    r_total = 0
    while not d and not i.get('is_success'):
        obs, r, d, i = env.step(model.predict(obs)[0])
        r_total += r
    i['total_rew'] = r_total
    final_infos.append(i)
    
print(np.mean([i['total_rew'] for i in final_infos]))

937.5


In [37]:
np.min([i['final_dist'] for i in final_infos]), np.mean([i['final_dist'] for i in final_infos]), np.std([i['final_dist'] for i in final_infos])

(0.12102928082417459, 0.12102928082417459, 0.0)

In [None]:
exp_root = './data'
hms_time = time.strftime("%Y-%m-%d_%H-%M-%S")
exp_name = 'HER-SAC_push_reorient'
exp_dir = osp.join(exp_root, exp_name, hms_time)
os.makedirs(exp_dir)

model = HER('MlpPolicy', env, SAC, n_sampled_goal=4,
            tensorboard_log=exp_dir,
            goal_selection_strategy='future',
            verbose=1, buffer_size=int(1e6),
            learning_rate=1e-4,
            gamma=0.95, batch_size=256,
            policy_kwargs=dict(layers=[256, 256]))

# Train for 1e6 steps
model.learn(int(1e6),)
# Save the trained agent
model.save(osp.join(exp_dir, '1e6-steps'))

-----------------------------------------
| current_lr              | 0.0001      |
| ent_coef                | 0.027137475 |
| ent_coef_loss           | -23.953201  |
| entropy                 | 8.92744     |
| episodes                | 100         |
| fps                     | 40          |
| mean 100 episode reward | 343         |
| n_updates               | 36751       |
| policy_loss             | -43.370445  |
| qf1_loss                | 2.311096    |
| qf2_loss                | 2.086415    |
| success rate            | 0           |
| time_elapsed            | 918         |
| total timesteps         | 37125       |
| value_loss              | 0.2303352   |
-----------------------------------------
-----------------------------------------
| current_lr              | 0.0001      |
| ent_coef                | 0.007934947 |
| ent_coef_loss           | 1.1313438   |
| entropy                 | 7.422938    |
| episodes                | 200         |
| fps                     | 36    

In [None]:
# Train for 1e6 steps
model.learn(int(2e6),reset_num_timesteps=False)
# Save the trained agent
model.save(osp.join(exp_dir, '2e6-steps'))

-----------------------------------------
| current_lr              | 0.0001      |
| ent_coef                | 0.023150394 |
| ent_coef_loss           | -2.192941   |
| entropy                 | 7.891837    |
| episodes                | 100         |
| fps                     | 27          |
| mean 100 episode reward | 152         |
| n_updates               | 37125       |
| policy_loss             | -21.213112  |
| qf1_loss                | 0.28706837  |
| qf2_loss                | 0.23305702  |
| success rate            | 0.0404      |
| time_elapsed            | 1331        |
| total timesteps         | 1037125     |
| value_loss              | 0.23443252  |
-----------------------------------------
-----------------------------------------
| current_lr              | 0.0001      |
| ent_coef                | 0.014548837 |
| ent_coef_loss           | 0.875576    |
| entropy                 | 7.7432957   |
| episodes                | 400         |
| fps                     | 26    

In [19]:
from stable_baselines import logger

In [16]:
!ls $exp_dir

1e6-steps.zip  HER_1
