In [1]:
import gymnasium as gym
import panda_gym
import sys
from pathlib import Path
import numpy as np
import torch as th
from stable_baselines3.ppo import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import SubprocVecEnv, VecNormalize
from gymnasium.wrappers import FlattenObservation

# The project root is taken to be the parent of the current working directory.
PROJECT_ROOT_DIR = Path.cwd().parent
print(PROJECT_ROOT_DIR)

# Put the project root on the import path exactly once. It is appended (not
# prepended), so entries already present on sys.path are searched first.
project_root = str(PROJECT_ROOT_DIR)
if project_root not in sys.path:
    sys.path.append(project_root)
print(sys.path)

from utils.sb3_env_utils import make_env
from utils.load_data import load_data
from utils.sb3_env_wrappers import ScaledObservationWrapper
from models.sb3_model import PPOWithBCLoss
from utils.sb3_evaluate_kl import evaluate_policy_with_kl
from configs.load_config import load_config
from utils.register_env import register_my_env

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  from distutils.dep_util import newer, newer_group
  from distutils.dep_util import newer, newer_group
  from pkg_resources import resource_stream, resource_exists
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)


/home/hs/dev_ai/codes/IRPO/exp_on_panda
['/home/hs/anaconda3/envs/o2o/lib/python38.zip', '/home/hs/anaconda3/envs/o2o/lib/python3.8', '/home/hs/anaconda3/envs/o2o/lib/python3.8/lib-dynload', '', '/home/hs/anaconda3/envs/o2o/lib/python3.8/site-packages', '/home/hs/anaconda3/envs/o2o/lib/python3.8/site-packages/setuptools/_vendor', '/home/hs/dev_ai/codes/IRPO/exp_on_panda']


In [2]:
# Configuration: choose the experiment config this notebook analyses.
# Other iterations, kept for quick switching:
#   "configs/iter_1/seed1/reacher_annealing.json"
#   "configs/iter_2/seed1/reacher_annealing.json"
CONFIG_FILE_NAME = "configs/iter_4/seed1/reacher_annealing.json"

custom_config = load_config(CONFIG_FILE_NAME)

# Pull out the settings the rest of the notebook relies on.
ENV_NAME = custom_config["env"]["name"]
bc_config = custom_config["bc"]
BC_EXPERIMENT_NAME = bc_config["experiment_name"]
RL_EXPERIMENT_NAME = custom_config["rl_bc"]["experiment_name"]
BC_EXPERT_DATA_DIR = bc_config["data_cache_dir"]
# The checkpoint file name is optional in the config; fall back to "bc_checkpoint".
BC_POLICY_FILE_NAME = bc_config.get("policy_file_save_name", "bc_checkpoint")

# Echo the resolved settings, one per line.
print(
    f"Env: {ENV_NAME}",
    f"BC Exp: {BC_EXPERIMENT_NAME}",
    f"RL Exp: {RL_EXPERIMENT_NAME}",
    f"Data: {BC_EXPERT_DATA_DIR}",
    sep="\n",
)

Env: my-reach
BC Exp: iter_4/reacher_10epochs_loss_1_annealing
RL Exp: iter_4/reacher_1e7steps_8envs_loss_1_annealing
Data: rollout/cache/myreach_from_iter_3_rl_bc_1.csv


In [3]:
# Register the custom environment with the parameters used for this evaluation.
GOAL_RANGE = 0.3
DISTANCE_THRESHOLD = 0.01
MAX_EPISODE_STEPS = 50
register_my_env(
    goal_range=GOAL_RANGE,
    distance_threshold=DISTANCE_THRESHOLD,
    max_episode_steps=MAX_EPISODE_STEPS,
)

# Load the cached data file; only the sixth returned value (the observation
# scaler) is needed here, the first five are discarded.
data_file: Path = PROJECT_ROOT_DIR / BC_EXPERT_DATA_DIR
print(f"load data from {str(data_file.absolute())}")
_, _, _, _, _, obs_scaler = load_data(data_file)

# Build the evaluation environment: flatten the observation first, then apply
# the scaler obtained from load_data on top of the flattened observation.
base_env = gym.make(ENV_NAME)
env = ScaledObservationWrapper(
    env=FlattenObservation(base_env),
    scaler=obs_scaler,
)

load data from /home/hs/dev_ai/codes/IRPO/exp_on_panda/rollout/cache/myreach_from_iter_3_rl_bc_1.csv
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886


pybullet build time: Jan 29 2025 23:19:57


In [4]:
# Load the two policies that are compared in the cells below.
checkpoints_dir = PROJECT_ROOT_DIR / "checkpoints"

# Behaviour-cloning policy: loaded without an environment attached.
bc_policy_save_dir = checkpoints_dir / "bc" / BC_EXPERIMENT_NAME
bc_checkpoint_path = (bc_policy_save_dir / BC_POLICY_FILE_NAME).absolute()
bc_ppo = PPOWithBCLoss.load(str(bc_checkpoint_path))
print(f"Loaded BC model from {bc_policy_save_dir / BC_POLICY_FILE_NAME}")

# RL policy: the "best_model" checkpoint, loaded with the wrapped env attached.
rl_bc_policy_save_dir = checkpoints_dir / "rl" / RL_EXPERIMENT_NAME
rl_checkpoint_path = (rl_bc_policy_save_dir / "best_model").absolute()
rl_bc_ppo = PPOWithBCLoss.load(str(rl_checkpoint_path), env=env)
print(f"Loaded RL model from {rl_bc_policy_save_dir / 'best_model'}")

verbose:  0
Loaded BC model from /home/hs/dev_ai/codes/IRPO/exp_on_panda/checkpoints/bc/iter_4/reacher_10epochs_loss_1_annealing/bc_checkpoint
verbose:  0
Loaded RL model from /home/hs/dev_ai/codes/IRPO/exp_on_panda/checkpoints/rl/iter_4/reacher_1e7steps_8envs_loss_1_annealing/best_model


In [5]:
# KL evaluation with the BC policy as teacher, the RL policy as student, and
# the BC policy passed as sample_model, over 10 evaluation episodes.
print("Evaluating KL (Teacher=BC, Student=RL, Sample=BC)...")
# The call stays the cell's final expression so the returned tuple is displayed.
# NOTE(review): the saved output of this cell shows a third tuple element of
# roughly 6e23; if that element is a mean KL, check it for numerical blow-up.
evaluate_policy_with_kl(
    model_teacher=bc_ppo,
    model_student=rl_bc_ppo,
    sample_model=bc_ppo,
    env=env,
    n_eval_episodes=10,
)

Evaluating KL (Teacher=BC, Student=RL, Sample=BC)...




(-4.5, 1.3601470508735443, 6.0165632206744515e+23, 2.3404524488882585)

In [6]:
# KL evaluation with the roles swapped: the RL policy is the teacher, the BC
# policy is the student, and the RL policy is passed as sample_model
# (10 evaluation episodes).
print("Evaluating KL (Teacher=RL, Student=BC, Sample=RL)...")
# The call stays the cell's final expression so the returned tuple is displayed.
evaluate_policy_with_kl(
    model_teacher=rl_bc_ppo,
    model_student=bc_ppo,
    sample_model=rl_bc_ppo,
    env=env,
    n_eval_episodes=10,
)

Evaluating KL (Teacher=RL, Student=BC, Sample=RL)...




(-4.0, 1.0, 742415141400.2032, 1.7063520585745573)

In [7]:
# KL evaluation with the BC policy as teacher, the RL policy as student, and
# the RL policy passed as sample_model, over 100 episodes, followed by a
# labelled summary of the four returned statistics.
print("Evaluating KL (Teacher=BC, Student=RL, Sample=RL)...")
kl_eval_kwargs = dict(
    model_teacher=bc_ppo,
    model_student=rl_bc_ppo,
    sample_model=rl_bc_ppo,
    env=env,
    n_eval_episodes=100,
)
mean_reward, std_reward, mean_kl, mean_act_dist_kl = evaluate_policy_with_kl(**kl_eval_kwargs)

# Print the summary framed by two 50-character rules, one item per line.
separator = "-" * 50
print(
    separator,
    f"Mean Reward: {mean_reward:.2f} +/- {std_reward:.2f}",
    f"KL (Teacher->Student): {mean_kl:.4f}",
    f"Action Dist KL: {mean_act_dist_kl:.4f}",
    separator,
    sep="\n",
)

Evaluating KL (Teacher=BC, Student=RL, Sample=RL)...




--------------------------------------------------
Mean Reward: -3.97 +/- 1.49
KL (Teacher->Student): 17907037.8099
Action Dist KL: 1.5006
--------------------------------------------------
