In [1]:
import numpy as np
import os.path as osp
from rrc.envs import cube_env, initializers
from gym.wrappers import FlattenObservation, FilterObservation

from spinup.utils import test_policy
from spinup.user_config import DEFAULT_DATA_DIR
from xvfbwrapper import Xvfb
from rrc.envs.wrappers import PyBulletClearGUIWrapper



In [3]:
# Load a saved (environment, policy) pair from a spinup run directory.
# The commented-out call below points at an alternative run kept for reference.
# NOTE(review): both paths are absolute and machine-specific.
# env, pol = test_policy.load_policy_and_env('/scr-ssd/ksrini/spinningup/data/2021-06-03_ppo-rrc-acclip/2021-06-03_10-59-54-ppo-rrc-acclip_s0')
env, pol = test_policy.load_policy_and_env('/scr-ssd/ksrini/spinningup/data/2021-06-03_ppo-rrc-diff4_irandom/2021-06-03_12-03-09-ppo-rrc-diff4_irandom_s0')



Loading from /scr-ssd/ksrini/spinningup/data/2021-06-03_ppo-rrc-diff4_irandom/2021-06-03_12-03-09-ppo-rrc-diff4_irandom_s0/pyt_save/model499.pt.




In [4]:
# Roll out the loaded policy for one episode inside a virtual X display,
# recording an mp4 into ./videos.
xvfb = Xvfb()
xvfb.start()
try:
    goal_pose = None # {'position': np.array([0,0,0.0325]), 'orientation': np.array([0,0,0,1])}
    # Reuse the initializer of the environment loaded together with the policy.
    env = cube_env.CubeEnv(goal_pose, 1, initializer=env.initializer, episode_length=500,
                           visualization=True, save_mp4=True, save_dir='./videos')
    env = PyBulletClearGUIWrapper(env)
    env = FlattenObservation(FilterObservation(env, filter_keys=['observation', 'desired_goal']))
    obs = env.reset()
    d = False
    while not d:
        obs, r, d, i = env.step(pol(obs))
    # NOTE(review): presumably the extra reset finalizes the recording — confirm.
    obs = env.reset()
finally:
    # Always tear the virtual display down, even if the rollout raised;
    # otherwise the Xvfb process is leaked.
    xvfb.stop()



In [9]:
# Advance the environment by one step using the policy's action for the
# most recent observation; keeps observation, reward, done flag and info.
obs, r, d, i = env.step(pol(obs))

(array([ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  1.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  3.2499999e-02, -9.8478017e-07,
        -1.3081907e-05, -5.7593757e-01,  8.1749368e-01,  6.4295933e-02,
         7.0091151e-02,  1.3454532e-02, -6.0875551e-05,  4.8068559e-05,
         4.1382122e-01, -1.4876003e-03, -2.2157433e-03, -1.1878055e-05],
       dtype=float32),
 -0.21716879579162512,
 False,
 {'difficulty': 1})

In [None]:
# Roll out the policy for one episode on a fixed goal pose with the centered
# initializer, recording an mp4 into ./videos.
xvfb = Xvfb()
xvfb.start()
try:
    goal_pose = {'position': np.array([0,0,0.0325]), 'orientation': np.array([0,0,0,1])}
    env = cube_env.CubeEnv(goal_pose, 1, initializer=initializers.CenteredInitializer(1), episode_length=500,
                           visualization=True, save_mp4=True, save_dir='./videos')
    env = FlattenObservation(FilterObservation(env, filter_keys=['observation', 'desired_goal']))
    obs = env.reset()
    d = False
    while not d:
        obs, r, d, i = env.step(pol(obs))
finally:
    # The virtual display was previously started but never stopped, leaking
    # an Xvfb process each time this cell ran.
    xvfb.stop()



In [3]:
# Build a non-rendering environment (no visualization / video arguments) with
# a fixed goal pose and a 1000-step episode, then expose only the
# 'observation' and 'desired_goal' entries as one flat vector.
goal_pose = {
    'position': np.array([0, 0, 0.0325]),
    'orientation': np.array([0, 0, 0, 1]),
}
env = cube_env.CubeEnv(
    goal_pose,
    1,
    initializer=initializers.CenteredInitializer(1),
    episode_length=1000,
)
env = FilterObservation(env, filter_keys=['observation', 'desired_goal'])
env = FlattenObservation(env)



## Train PPO

In [1]:
import gym
import functools
import torch.nn as nn
import numpy as np
import time
import os
import os.path as osp

from gym import ObservationWrapper
from gym.spaces import flatten_space
from gym.wrappers import Monitor as GymMonitor
from gym.wrappers import FilterObservation
from rrc.env import initializers, cube_env
from rrc.env.reward_fns import *
from rrc.env.wrappers import MonitorPyBulletWrapper
from rrc.env.termination_fns import stay_close_to_goal
from stable_baselines3.common.logger import configure
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.torch_layers import CombinedExtractor
from stable_baselines3.common.preprocessing import get_flattened_obs_dim, is_image_space
from stable_baselines3 import HerReplayBuffer, SAC, TD3, PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.env_util import make_vec_env


class FlattenGoalObs(ObservationWrapper):
    """Observation wrapper that keeps selected top-level keys and flattens each.

    The resulting observation space is a ``gym.spaces.Dict`` holding, for every
    key in ``observation_keys``, the flattened version of the wrapped
    environment's sub-space for that key.

    Args:
        env: environment whose observation space can be indexed by key.
        observation_keys: top-level observation keys to keep.
    """

    def __init__(self, env, observation_keys):
        super().__init__(env)
        source_space = self.env.observation_space
        self.observation_space = gym.spaces.Dict(
            {key: flatten_space(source_space[key]) for key in observation_keys}
        )

    def observation(self, obs):
        """Return a dict with the kept keys; nested dict entries are concatenated.

        Nested entries are concatenated in the iteration order of the wrapped
        environment's sub-space for that key; non-dict entries pass through
        unchanged.
        """
        flat_obs = {}
        for key in self.observation_space.spaces:
            entry = obs[key]
            if not isinstance(entry, dict):
                flat_obs[key] = entry
                continue
            parts = [entry[sub_key] for sub_key in self.env.observation_space[key]]
            flat_obs[key] = np.concatenate(parts)
        return flat_obs


class ResidualPDWrapper(gym.Wrapper):
    """Adds a PD feedback term to every action (residual control).

    The incoming action is scaled by the unwrapped environment's
    ``force_factor`` (first three entries) and ``torque_factor`` (remaining
    entries), then summed with a PD term computed from
    ``obs['observation']['position']``. The PD term only fills the first three
    entries; the last three are zero.

    Args:
        env: environment returning dict observations with an 'observation'
            entry that is itself a dict containing 'position'.
        Kp: proportional gain, applied as ``-Kp @ position``.
        Kd: derivative gain, applied to the finite difference of 'position'
            divided by ``env.time_step_s``.
        include_ac: if True, the PD term is also written into the observation
            under ``obs['observation']['pd_action']`` and the observation
            space is extended accordingly.
    """

    def __init__(self, env, Kp=np.eye(3)*np.array([100,100,1]), Kd=1, include_ac=False):
        super(ResidualPDWrapper, self).__init__(env)
        self.Kp = Kp
        self.Kd = Kd
        self._obs = self._prev_obs = None
        self.include_ac = include_ac
        if self.include_ac:
            from collections import OrderedDict

            env_spaces = self.env.observation_space.spaces
            inner_spaces = OrderedDict(env_spaces['observation'].spaces.items())
            # pd_action() returns a 6-vector (3 PD terms + 3 zeros), so the
            # declared space must have 6 entries as well.
            # NOTE(review): the +/-1 bounds are kept from the original; the PD
            # output is not clipped to that range.
            inner_spaces['pd_action'] = gym.spaces.Box(low=-np.ones(6), high=np.ones(6))
            # Build a fresh Dict space instead of mutating the wrapped env's
            # space in place, and wrap the nested mapping in gym.spaces.Dict so
            # downstream space utilities (e.g. flatten_space) accept it.
            outer_spaces = OrderedDict(env_spaces.items())
            outer_spaces['observation'] = gym.spaces.Dict(inner_spaces)
            self.observation_space = gym.spaces.Dict(outer_spaces)

    def reset(self):
        """Reset the wrapped env and clear the stored previous observation."""
        obs = super(ResidualPDWrapper, self).reset()
        self._prev_obs = None
        self._obs = obs
        if self.include_ac:
            obs['observation']['pd_action'] = self.pd_action(self._obs, self._prev_obs)
        return obs

    def step(self, action):
        """Scale ``action``, add the PD term and step the wrapped env."""
        # Work on a float copy: scaling in place would silently modify the
        # caller's array (e.g. the policy output stored in a rollout buffer).
        action = np.array(action, dtype=float)
        action[:3] *= self.env.unwrapped.force_factor
        action[3:] *= self.env.unwrapped.torque_factor
        pd_action = self.pd_action(self._obs, self._prev_obs)
        ac = action + pd_action
        self._prev_obs = self._obs
        obs, r, d, i = self.env.step(ac)
        self._obs = obs
        if self.include_ac:
            obs['observation']['pd_action'] = self.pd_action(self._obs, self._prev_obs)
        return obs, r, d, i

    def pd_action(self, observation, prev_observation):
        """Return the 6-dim PD term for ``observation``.

        Returns zeros when no observation is available yet, and the cached
        value when the observation already carries a 'pd_action' entry. The
        derivative part is skipped when there is no previous observation.
        """
        if observation is None:
            return np.zeros(6)
        if observation['observation'].get('pd_action') is not None:
            return observation['observation']['pd_action']
        # NOTE(review): 'position' is used directly as the error signal —
        # presumably it is goal-relative; confirm against the env.
        err = observation['observation']['position']
        u = -self.Kp @ err
        if prev_observation is None:
            return np.concatenate([u, np.zeros(3)], axis=-1)
        err_diff = observation['observation']['position'] - prev_observation['observation']['position']
        u -= self.Kd * err_diff / self.env.time_step_s
        return np.concatenate([u, np.zeros(3)], axis=-1)

    
class HERCombinedExtractor(CombinedExtractor):
    """
    Combined feature extractor restricted to a pre-specified list of observation keys.

    Only the keys in ``observation_keys`` get an extractor (a plain ``nn.Flatten``), while the
    remaining keys stay available at the environment level so that they may still be stored in
    the replay buffer.

    :param observation_space: Dict observation space of the environment
    :param cnn_output_dim: accepted for signature compatibility; not used by this constructor
    :param observation_keys: keys of ``observation_space`` to feed into the policy
    """

    def __init__(self, observation_space: gym.spaces.Dict, cnn_output_dim: int = 256, observation_keys: list = []):
        # Calls the initializer *above* CombinedExtractor in the MRO (CombinedExtractor.__init__ itself
        # is skipped). The feature dimension is not known yet, so a placeholder of 1 is passed.
        super(CombinedExtractor, self).__init__(observation_space, features_dim=1)

        # Every selected entry is treated as a vector: its contribution is its flattened size.
        flat_sizes = [get_flattened_obs_dim(observation_space.spaces[name]) for name in observation_keys]

        self.extractors = nn.ModuleDict({name: nn.Flatten() for name in observation_keys})

        # Replace the placeholder with the real concatenated feature size.
        self._features_dim = sum(flat_sizes)


def make_sac_model(ep_len, lr, exp_dir=None, env=None, use_goal=True,
               use_sde=False):
    """Build a SAC model with a HER replay buffer for dict observations.

    Args:
        ep_len: maximum episode length handed to the HER replay buffer.
        lr: learning rate.
        exp_dir: currently unused (tensorboard logging is commented out).
        env: environment (or vectorized environment) to train on.
        use_goal: if True the policy sees 'desired_goal' and 'observation',
            otherwise only 'observation'.
        use_sde: enable state-dependent exploration (also during warmup,
            resampled every 64 steps).

    Returns:
        The constructed ``SAC`` model.
    """
    policy_inputs = ['desired_goal', 'observation'] if use_goal else ['observation']

    exploration_kwargs = {}
    if use_sde:
        exploration_kwargs.update(
            use_sde=True,
            use_sde_at_warmup=True,
            sde_sample_freq=64,
        )

    policy_kwargs = {
        'log_std_init': -3,
        'features_extractor_class': HERCombinedExtractor,
        'features_extractor_kwargs': {'observation_keys': policy_inputs},
    }

    # Parameters for HER
    her_kwargs = {
        'n_sampled_goal': 4,
        'goal_selection_strategy': 'future',
        'online_sampling': False,
        'max_episode_length': ep_len,
    }

    return SAC(
        'MultiInputPolicy',
        env,
        # tensorboard_log=exp_dir,
        replay_buffer_class=HerReplayBuffer,
        replay_buffer_kwargs=her_kwargs,
        policy_kwargs=policy_kwargs,
        verbose=1,
        buffer_size=int(1e6),
        learning_starts=1500,
        learning_rate=lr,
        gamma=0.99,
        batch_size=256,
        **exploration_kwargs
    )


def make_ppo_model(ep_len, lr, exp_dir=None, env=None, use_goal=True,
                   use_sde=True, dry_run=False):
    """Build a PPO model for dict observations using HERCombinedExtractor.

    Args:
        ep_len: unused; kept so the signature parallels ``make_sac_model``.
        lr: learning rate.
        exp_dir: directory for tensorboard logs (ignored when ``dry_run``).
        env: environment (or vectorized environment) to train on.
        use_goal: if True the policy sees 'desired_goal' and 'observation',
            otherwise only 'observation'.
        use_sde: enable state-dependent exploration, resampled every 4 steps.
        dry_run: if True, no tensorboard log directory is configured.

    Returns:
        The constructed ``PPO`` model.
    """
    if use_goal:
        obs_keys = ['desired_goal', 'observation']
    else:
        obs_keys = ['observation']

    policy_kwargs = dict(
                    log_std_init=-3,
                    features_extractor_class=HERCombinedExtractor,
                    features_extractor_kwargs=dict(observation_keys=obs_keys))
    if use_sde:
        sde_kwargs = dict(
                use_sde=True,
                sde_sample_freq=4)
    else:
        sde_kwargs = {}
    # The original condition was inverted (it logged *only* on dry runs).
    # NOTE(review): intent inferred from the flag name — confirm.
    tensorboard_log = None if dry_run else exp_dir
    # The feature extractor indexes the observation space by key, so the
    # observation space is a Dict; Stable-Baselines3 expects
    # 'MultiInputPolicy' for Dict observations (recent releases reject
    # 'MlpPolicy' outright). The network itself is unchanged because the
    # extractor class is overridden through policy_kwargs.
    model = PPO('MultiInputPolicy', env,
                tensorboard_log=tensorboard_log,
                policy_kwargs=policy_kwargs,
                verbose=1,
                learning_rate=lr,
                n_steps=1000,
                gamma=0.99, batch_size=250, **sde_kwargs)
    return model


def env_fn_generator(diff=3, episode_length=500, relative_goal=True, reward_fn=None,
                     termination_fn=False, save_mp4=False, save_dir='',
                     save_freq=10, initializer=None, **env_kwargs):
    """Return a zero-argument factory that builds a monitored CubeEnv.

    Args:
        diff: goal difficulty passed to ``CubeEnv``.
        episode_length: episode length passed to ``CubeEnv``.
        relative_goal: forwarded to ``CubeEnv``.
        reward_fn: None (uses ``training_reward4``), one of 'train1'..'train4'
            / 'competition', or any other value, which is passed through
            unchanged.
        termination_fn: truthy to enable a "stay close to goal" termination
            function chosen by difficulty; otherwise no termination function.
        save_mp4: wrap the env in ``MonitorPyBulletWrapper`` when True.
        save_dir: directory handed to ``MonitorPyBulletWrapper``.
        save_freq: frequency handed to ``MonitorPyBulletWrapper``.
        initializer: None or 'center', 'train', 'fixed' (reads ``goal.json``
            from the working directory), or any other value, which is passed
            through unchanged.
        **env_kwargs: extra keyword arguments for ``CubeEnv``.
            NOTE(review): ``force_factor`` / ``torque_factor`` are already
            passed explicitly below, so supplying them here raises TypeError.

    Returns:
        A callable creating the wrapped environment.
    """
    if reward_fn is None:
        reward_fn = training_reward4
    else:
        if reward_fn == 'train1':
            reward_fn = training_reward1
        elif reward_fn == 'train2':
            reward_fn = training_reward2
        elif reward_fn == 'train3':
            reward_fn = training_reward3
        elif reward_fn == 'train4':
            reward_fn = training_reward4
        elif reward_fn == 'competition':
            reward_fn = competition_reward
    goal = None
    if initializer is None:
        initializer = initializers.centered_init
    elif initializer =='center':
        initializer = initializers.centered_init
    elif initializer == 'train':
        initializer = initializers.training_init
    elif initializer == 'fixed':
        from trifinger_simulation.tasks.move_cube import Pose
        import json
        # Use a context manager so the goal file is closed deterministically
        # (the original left the handle open).
        with open('goal.json', 'r') as goal_file:
            goal = Pose.from_json(json.load(goal_file)).to_dict()
        initializer = initializers.fixed_g_init(diff, goal)
    if termination_fn:
        # NOTE(review): stay_close_to_goal_level_4 is not explicitly imported
        # in this file — presumably provided by a star import; confirm.
        termination_fn = stay_close_to_goal if diff<4 else stay_close_to_goal_level_4
    else:
        termination_fn = None

    def env_fn():
        env = cube_env.CubeEnv(goal, diff,
                initializer=initializer,
                episode_length=episode_length,
                relative_goal=relative_goal,
                reward_fn=reward_fn,
                force_factor=1,
                torque_factor=.1,
                termination_fn=termination_fn,
                **env_kwargs)
        if save_mp4:
            env = MonitorPyBulletWrapper(env, save_dir, save_freq)
        env = FlattenGoalObs(env, ['desired_goal', 'achieved_goal', 'observation'])
        return Monitor(env, info_keywords=('ori_err', 'pos_err'))
    return env_fn


def make_env_cls(diff=3, initializer='train',
                     episode_length=500, relative_goal=True, reward_fn=None,
                     termination_fn=False, **env_kwargs):
    """Return a ``functools.partial`` of ``CubeEnv`` with the given settings.

    Args:
        diff: goal difficulty passed to ``CubeEnv``.
        initializer: None or 'center', 'train', 'fixed' (reads ``goal.json``
            from the working directory), or any other value, which is passed
            through unchanged.
        episode_length: episode length passed to ``CubeEnv``.
        relative_goal: forwarded to ``CubeEnv``.
        reward_fn: None (uses ``training_reward4``), one of 'train1'..'train4'
            / 'competition', or any other value, which is passed through
            unchanged.
        termination_fn: truthy to enable a "stay close to goal" termination
            function chosen by difficulty; otherwise no termination function.
        **env_kwargs: extra keyword arguments bound into the partial.

    Returns:
        A callable that constructs the configured ``CubeEnv``.
    """
    if reward_fn is None:
        reward_fn = training_reward4
    else:
        if reward_fn == 'train1':
            reward_fn = training_reward1
        elif reward_fn == 'train2':
            reward_fn = training_reward2
        elif reward_fn == 'train3':
            reward_fn = training_reward3
        elif reward_fn == 'train4':
            reward_fn = training_reward4
        elif reward_fn == 'competition':
            reward_fn = competition_reward
    if termination_fn:
        # NOTE(review): stay_close_to_goal_level_4 is not explicitly imported
        # in this file — presumably provided by a star import; confirm.
        termination_fn = stay_close_to_goal if diff<4 else stay_close_to_goal_level_4
    else:
        termination_fn = None

    if initializer is None:
        initializer = initializers.centered_init
    elif initializer =='center':
        initializer = initializers.centered_init
    elif initializer == 'train':
        initializer = initializers.training_init
    elif initializer == 'fixed':
        from trifinger_simulation.tasks.move_cube import Pose
        import json
        # Use a context manager so the goal file is closed deterministically
        # (the original left the handle open).
        with open('goal.json', 'r') as goal_file:
            goal = Pose.from_json(json.load(goal_file)).to_dict()
        # The loaded goal only reaches the env through the initializer;
        # cube_goal_pose below stays None.
        initializer = initializers.fixed_g_init(diff, goal)

    env_cls = functools.partial(cube_env.CubeEnv, cube_goal_pose=None,
            goal_difficulty=diff,
            initializer=initializer,
            episode_length=episode_length,
            relative_goal=relative_goal,
            reward_fn=reward_fn,
            termination_fn=termination_fn,
            **env_kwargs)
    return env_cls


def train_save_model(model, eval_env, exp_dir, n_steps=1e5, reset_num_timesteps=False):
    """Train ``model`` with periodic evaluation, then save it under ``exp_dir``.

    Args:
        model: object exposing ``learn``, ``save`` and ``num_timesteps``.
        eval_env: environment used for the periodic evaluation.
        exp_dir: directory for evaluation logs and the saved model.
        n_steps: number of training timesteps (floats such as ``1e5`` are
            truncated to int).
        reset_num_timesteps: forwarded to ``model.learn``. Previously this
            argument was accepted but ignored (``False`` was hard-coded).

    Returns:
        The same ``model`` after training.
    """
    model.learn(int(n_steps), eval_env=eval_env, n_eval_episodes=5,
                eval_freq=10000, reset_num_timesteps=reset_num_timesteps,
                eval_log_path=exp_dir)
    # Save the trained agent; the file name encodes the total timesteps,
    # e.g. '1e+06-steps'.
    model.save(osp.join(exp_dir, '{:.0e}-steps'.format(model.num_timesteps)))
    return model


# Root directory under which the experiment directories of runs live.
# NOTE(review): absolute, machine-specific path.
wandb_root = '/scr-ssd/ksrini/spinningup/notebooks'


def get_save_path(run):
    """Map a run's ``config['exp_dir']`` onto ``wandb_root``.

    The first '/'-separated component of ``exp_dir`` (e.g. the leading '.')
    is dropped and the remainder is appended to ``wandb_root``.
    """
    return '/'.join([wandb_root] + run.config['exp_dir'].split('/')[1:])


def display_video(path=None, run=None):
    """Return an embedded IPython ``Video`` for ``path`` or for a run.

    Args:
        path: location of the video; ignored when ``run`` is given.
        run: object with ``config['exp_dir']``; when truthy, the path is
            derived via ``get_save_path``.
    """
    # `Video` was referenced without ever being imported in this file.
    from IPython.display import Video
    if run:
        # get_save_path expects the run object itself (it reads
        # run.config['exp_dir']); the original passed the string instead.
        path = get_save_path(run)
    return Video(path, embed=True, width=640)

In [2]:
# Residual-PPO setup: difficulty-1 environments with the centered initializer,
# wrapped with the PD residual controller and flattened per observation key.
env_cls = make_env_cls(
    diff=1,
    initializer='center',
    episode_length=500,
    reward_fn='train2',
    termination_fn=True,
    torque_factor=.1,
    force_factor=.1,
)


def wrapper(env):
    """Apply the residual PD controller, then flatten the three goal-env keys."""
    residual_env = ResidualPDWrapper(env)
    return FlattenGoalObs(
        residual_env,
        observation_keys=['desired_goal', 'achieved_goal', 'observation'],
    )


# Ten parallel environments; the monitor additionally records the
# 'ori_err' and 'pos_err' info entries.
env = make_vec_env(
    env_cls,
    n_envs=10,
    wrapper_class=wrapper,
    monitor_kwargs={'info_keywords': ('ori_err', 'pos_err')},
)

model = make_ppo_model(ep_len=500, lr=3e-4, exp_dir=None, env=env, use_goal=True)



Using cuda device


In [4]:
# NOTE(review): imported mid-notebook; consider moving to the import cell.
import wandb

# Start a Weights & Biases run in project 'cvxrl' for this experiment.
wandb.init(project='cvxrl', name='residual-ppo-diff1')

[34m[1mwandb[0m: Currently logged in as: [33mkrshna[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.32 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [5]:
# Create a fresh, timestamped experiment directory:
#   ./data/ResidualPPO_rrc-diff1/<YYYY-mm-dd_HH-MM-SS>
exp_root = './data'
hms_time = time.strftime("%Y-%m-%d_%H-%M-%S")
exp_name = 'ResidualPPO_rrc-diff{}'.format(1)
exp_dir = osp.join(exp_root, exp_name, hms_time)
# exist_ok keeps the cell re-runnable: the timestamp has one-second
# resolution, so an immediate re-run used to raise FileExistsError.
os.makedirs(exp_dir, exist_ok=True)

In [None]:
# Route training logs to stdout and W&B, writing into the experiment directory.
# NOTE(review): 'wandb' does not appear to be a stock Stable-Baselines3 logger
# format — presumably a patched install is in use; confirm.
logger = configure(exp_dir, ['stdout', 'wandb'])
model.set_logger(logger)
# Single evaluation environment built with the same class and wrapper as the
# training environments.
eval_env = make_vec_env(env_cls, n_envs=1, wrapper_class=wrapper,
        monitor_kwargs=dict(info_keywords=('ori_err', 'pos_err')))
# Train for 1e6 timesteps and save the model under exp_dir.
model = train_save_model(model, eval_env, exp_dir, 1e6)

Logging to ./data/ResidualPPO_rrc-diff1/2021-06-23_08-19-10




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 755      |
|    ep_rew_mean     | 89.5     |
| time/              |          |
|    fps             | 611      |
|    iterations      | 1        |
|    time_elapsed    | 16       |
|    total_timesteps | 10000    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 830         |
|    ep_rew_mean          | 113         |
| time/                   |             |
|    fps                  | 566         |
|    iterations           | 2           |
|    time_elapsed         | 35          |
|    total_timesteps      | 20000       |
| train/                  |             |
|    approx_kl            | 0.089975215 |
|    clip_fraction        | 0.397       |
|    clip_range           | 0.2         |
|    entropy_loss         | 4.03        |
|    explained_variance   | -0.00634    |
|    learning_rate        | 0.