In [None]:
import gymnasium as gym
import gymnasium_robotics

# PyTorch
import torch
import os
import numpy as np
import tqdm
from stable_baselines3 import SAC
# from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.buffers import ReplayBuffer

In [2]:
# Make the gymnasium_robotics environments known to gymnasium before any
# gym.make() call on the ids below.
gym.register_envs(gymnasium_robotics)

# Environment id handed to gym.make() and the single kitchen task being trained on.
env_id = "FrankaKitchen-v1"
task = "kettle"

In [3]:
# Run on the first CUDA GPU when one is visible to torch; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [4]:
# Not referenced by any other cell visible in this notebook.
# NOTE(review): presumably the action dimensionality (the action space printed
# in the last cell has shape (9,)) — confirm or remove.
n_actions = 9

In [64]:
def flatten_observation(observation):
    """Turn a dict observation into a single float32 vector.

    For a dict input, the result is the ``achieved_goal`` entry of the
    module-level ``task`` followed by the ``observation`` entry, both cast to
    float32 and concatenated. Anything that is not a dict is returned unchanged.
    """
    if isinstance(observation, dict):
        pieces = (
            observation['achieved_goal'][task].astype(np.float32),
            observation['observation'].astype(np.float32),
        )
        return np.concatenate(pieces, dtype=np.float32)
    return observation

def custom_reward(observation):
    """Distance-based shaped reward for the module-level ``task``.

    Compares the first four entries of the task's achieved and desired goal
    vectors and returns ``1.0 - ||achieved - desired||``. The value is 1.0 when
    the compared entries match and is not clamped, so it becomes negative once
    the distance exceeds 1.
    """
    compared = slice(0, 4)
    current = observation['achieved_goal'][task][compared]
    target = observation['desired_goal'][task][compared]
    distance = np.linalg.norm(current - target)
    return 1.0 - distance

In [6]:
# Length of the flattened observation vector.
# NOTE(review): presumably 59 'observation' entries plus 7 achieved-goal
# entries for the task — confirm against the environment.
flat_dim = 59 + 7

# Very loose symmetric float32 bounds for every entry of the flattened observation.
obs_low, obs_high = (
    np.full((flat_dim,), bound, dtype=np.float32) for bound in (-1e10, 1e10)
)

class FlattenDictWrapper(gym.Wrapper):
    """Present a dict-observation env as a flat ``Box`` env with a shaped reward.

    Observations are converted with ``flatten_observation`` and the advertised
    observation space is a float32 ``Box`` of length ``flat_dim``. Whenever the
    wrapped env reports a reward of exactly 0.0, the reward is replaced by
    ``custom_reward`` computed on the raw (dict) observation.
    """

    def __init__(self, env):
        """Wrap ``env``, whose observation space must be a dict space."""
        super().__init__(env)
        # Names of the wrapped env's observation entries; kept as a public
        # attribute although nothing in this class reads it.
        self.keys = env.observation_space.spaces.keys()
        self.observation_space = gym.spaces.Box(low=obs_low, high=obs_high, shape=(flat_dim,), dtype=np.float32)

    def step(self, action):
        """Step the wrapped env and return the usual 5-tuple with a flat observation.

        The reward is the env's own reward unless that is exactly 0.0, in which
        case the distance-based ``custom_reward`` is used instead.
        """
        obs, reward, terminated, truncated, info = self.env.step(action)
        if reward == 0.0:
            # Shaping must use the dict observation, so compute it before flattening.
            reward = custom_reward(obs)
        obs = flatten_observation(obs)
        return obs, reward, terminated, truncated, info

    def reset(self, **kwargs):
        """Reset the wrapped env and return ``(flat_observation, info)``.

        The wrapped env's ``reset`` returns an ``(observation, info)`` pair.
        The pair must be unpacked before flattening: passing the tuple itself
        to ``flatten_observation`` returns it unchanged, leaving the dict
        observation unflattened.
        """
        obs, info = self.env.reset(**kwargs)
        return flatten_observation(obs), info

In [83]:
def make_env():
    """Create one non-rendering ``env_id`` environment limited to ``task``.

    The environment is returned wrapped in ``FlattenDictWrapper``.
    """
    base_env = gym.make(env_id, render_mode=None, tasks_to_complete=[task])
    return FlattenDictWrapper(base_env)

# Number of environment copies used for training.
n_training_envs = 1

# Training env: one factory per copy. Evaluation env: a single separate copy.
env = DummyVecEnv([make_env for _ in range(n_training_envs)])
eval_env = DummyVecEnv([make_env])

In [73]:
# Load two saved collections of action sequences ("combos"); each is later
# replayed step by step in initialize_buffer() to pre-fill the replay buffer.
# NOTE(review): torch.load unpickles the file contents, so only load files
# that come from a trusted source.
combos1 = torch.load("combos/combos-lr001-hs0-just-success-10000")
combos2 = torch.load("combos/combos-lr001-hs0-just-success-10000-5000")

In [84]:
def initialize_buffer(replay_buffer, combos):
    """Replay recorded action sequences in ``eval_env`` and store every transition.

    Args:
        replay_buffer: Buffer whose ``add(obs, next_obs, action, reward, done,
            infos)`` method receives each transition.
        combos: Iterable of action sequences. Each sequence is played from a
            fresh ``eval_env.reset()``, one action per step.

    The module-level single-env ``eval_env`` is used for the rollouts, which is
    why index 0 is taken from the batched observations and infos.
    """
    for combo in tqdm.tqdm(combos):
        state = eval_env.reset()
        for combo_step in combo:
            action = np.asarray([combo_step])
            obs, reward, done, info = eval_env.step(action)
            # A vectorized env resets itself when an episode ends, so on a
            # `done` step `obs` already belongs to the next episode. The true
            # final observation is then carried in info["terminal_observation"].
            terminal_obs = info[0].get("terminal_observation") if done[0] else None
            next_obs = obs[0] if terminal_obs is None else terminal_obs
            replay_buffer.add(state[0], next_obs, action, reward, done, info)
            state = obs

In [85]:
# Create the SAC learner on the training env, then seed its replay buffer with
# the transitions produced by replaying both sets of recorded action sequences.
model = SAC("MlpPolicy", env, device=device, learning_rate=0.001)
replay_buffer = model.replay_buffer
for demo_combos in (combos1, combos2):
    initialize_buffer(replay_buffer, demo_combos)

100%|██████████| 1209/1209 [23:39<00:00,  1.17s/it]
100%|██████████| 636/636 [12:38<00:00,  1.19s/it]


In [86]:
# Training budget and the run name derived from it; the name is reused for the
# evaluation log directory.
max_timesteps = 50_000
run_name = f"sac_{max_timesteps}_reward_shaping_" + task
eval_log_dir = os.path.join("eval_logs", run_name)

# Periodic deterministic evaluation (5 episodes each time, no rendering).
# Logs and the best model so far are both written under eval_log_dir.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=eval_log_dir,
    log_path=eval_log_dir,
    eval_freq=max(500 // n_training_envs, 1),
    n_eval_episodes=5,
    deterministic=True,
    render=False,
)

In [None]:
# Train for max_timesteps steps; eval_callback runs the periodic evaluations
# whose results are printed below.
model.learn(total_timesteps=max_timesteps, callback=eval_callback)



Eval num_timesteps=500, episode_reward=167.52 +/- 0.00
Episode length: 280.00 +/- 0.00
New best mean reward!
Eval num_timesteps=1000, episode_reward=167.52 +/- 0.00
Episode length: 280.00 +/- 0.00
Eval num_timesteps=1500, episode_reward=167.52 +/- 0.00
Episode length: 280.00 +/- 0.00
Eval num_timesteps=2000, episode_reward=167.52 +/- 0.00
Episode length: 280.00 +/- 0.00
Eval num_timesteps=2500, episode_reward=167.52 +/- 0.00
Episode length: 280.00 +/- 0.00
Eval num_timesteps=3000, episode_reward=167.52 +/- 0.00
Episode length: 280.00 +/- 0.00
Eval num_timesteps=3500, episode_reward=167.52 +/- 0.00
Episode length: 280.00 +/- 0.00
Eval num_timesteps=4000, episode_reward=167.52 +/- 0.00
Episode length: 280.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=167.52 +/- 0.00
Episode length: 280.00 +/- 0.00
Eval num_timesteps=5000, episode_reward=167.52 +/- 0.00
Episode length: 280.00 +/- 0.00
Eval num_timesteps=5500, episode_reward=47.77 +/- 12.32
Episode length: 280.00 +/- 0.00
Eval num_ti

In [11]:
# Save the trained model under run_name (the same name used for the eval log directory).
model.save(run_name)

In [13]:
# Roll out ten deterministic episodes with the trained model and print each
# episode's total reward.
# The environment is built once and reset per episode, rather than constructing
# a new simulator instance per episode and never closing any of them.
# It is left open because a later cell still inspects env_eval.action_space.
env_eval = make_env()
for episode in range(10):
    obs, _ = env_eval.reset()
    done = False
    ep_reward = 0

    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, _ = env_eval.step(action)
        # An episode ends on either a terminal state or a truncation.
        done = terminated or truncated
        ep_reward += reward
    print(f"Episode reward: {ep_reward}")

Episode reward: 166.28406681999147
Episode reward: 166.28406681999152
Episode reward: 166.28406681999144
Episode reward: 166.28406681999152
Episode reward: 166.28406681999144
Episode reward: 166.28406681999144
Episode reward: 166.28406681999144
Episode reward: 166.28406681999144
Episode reward: 166.28406681999144
Episode reward: 166.2840668199915


In [None]:
# Sanity check: the evaluation env's action space should match the action
# space the model was built with.
print("Action space:", env_eval.action_space)
print("Model action space:", model.action_space)

Action space: Box(-1.0, 1.0, (9,), float64)
Model action space: Box(-1.0, 1.0, (9,), float64)
