In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to local packages are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
import gymnasium
import os

from torch.optim.lr_scheduler import CyclicLR

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.callbacks import (
    BaseCallback,
    EvalCallback,
    CheckpointCallback,
)

from rl_course.common import show_model, evaluate_model, submit_to_hg_hub, load_dotenv
from rich import print as rprint

# Load environment variables from ../.env; the recorded output of this cell
# reports HG_TOKEN as the variable that was loaded.
load_dotenv("../.env")

[32;20m[INFO][common.py@55]: Loaded environment variables: ['HG_TOKEN'][0m


In [2]:
# Environment id passed to make_vec_env and evaluate_model.
env_name = "LunarLander-v3"
# Run identifier: names the output directory and prefixes saved model files.
model_name = "ppo-LunarLander-v6"
# Root folder for this run's artefacts (best_model/ and checkpoints/ are created under it).
model_dir = f"models/{model_name}"
# Device handed to PPO when the training model is constructed.
device = "cpu"
# Directory used both as PPO's tensorboard_log and as EvalCallback's log_path.
logs = "logs"
# NOTE(review): not referenced by the visible cells, which hard-code
# "../experiments" in model_path instead — confirm whether this is still needed.
exp_dir = "../experiments"

In [22]:
class LRSchedulerCallback(BaseCallback):
    """Drive the model's learning rate from an external LR scheduler.

    On every callback step the wrapped scheduler is advanced, and its latest
    learning rate is pushed into the model and recorded under
    ``train/learning_rate_clbk``.

    Args:
        scheduler: Object exposing ``step()`` and ``get_last_lr()`` (for
            example a ``torch.optim.lr_scheduler`` instance built on
            ``model.policy.optimizer``).
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, scheduler, verbose: int = 0):
        super().__init__(verbose)
        self.scheduler = scheduler
        # Not read anywhere in this class; kept so existing attribute access
        # on the callback keeps working.
        self.current_progress = 0

    def _on_step(self) -> bool:
        """Advance the scheduler and publish its learning rate to the model.

        Returns:
            Always ``True`` so training continues.
        """
        self.scheduler.step()
        # Plain float keeps the value loggable and serialisable with the model.
        new_lr = float(self.scheduler.get_last_lr()[0])
        self.model.learning_rate = new_lr
        # Stable-Baselines3 reads the optimizer learning rate from
        # ``lr_schedule(progress_remaining)`` at the start of each train()
        # call, which would overwrite whatever the scheduler wrote into the
        # optimizer. Overriding the schedule makes the scheduled value the one
        # that is actually applied.
        self.model.lr_schedule = lambda _progress_remaining: new_lr
        self.logger.record("train/learning_rate_clbk", new_lr)
        return True


In [None]:
# Train PPO on 16 parallel environments, with periodic evaluation,
# checkpointing and a cyclic learning-rate schedule.
train_env = make_vec_env(env_name, n_envs=16, vec_env_cls=SubprocVecEnv)
model = PPO(
    "MlpPolicy",
    train_env,
    n_steps=2048,
    batch_size=128,
    learning_rate=0.0003,
    n_epochs=5,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
    device=device,
    verbose=0,
    tensorboard_log=logs,
)
# The scheduler is bound to the policy optimizer; LRSchedulerCallback calls
# scheduler.step() once per callback step, so step_size_up / step_size_down
# are counted in callback steps.
scheduler_callback = LRSchedulerCallback(
    CyclicLR(
        model.policy.optimizer,
        base_lr=1e-5,
        max_lr=3e-3,
        step_size_up=1000,
        step_size_down=1_000_000,
        mode="triangular2",
        cycle_momentum=False,
    )
)

# Separate single-environment instance used only for periodic evaluation.
# eval_freq (and save_freq below) are counted in callback calls, i.e. one per
# vectorised step of the 16 training environments.
eval_env = make_vec_env(env_name, n_envs=1, vec_env_cls=SubprocVecEnv)
best_model_dir = os.path.join(model_dir, "best_model")
os.makedirs(best_model_dir, exist_ok=True)
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=best_model_dir,
    log_path=logs,
    eval_freq=10000,
    deterministic=True,
    render=False,
    verbose=0,
)

ckpt_dir = os.path.join(model_dir, "checkpoints")
os.makedirs(ckpt_dir, exist_ok=True)
checkpoint_callback = CheckpointCallback(
    save_freq=50000,
    save_path=ckpt_dir,
    name_prefix=model_name,
    save_replay_buffer=True,
    save_vecnormalize=True,
    verbose=1,
)

try:
    model.learn(
        total_timesteps=2_500_000,
        # checkpoint_callback must be listed here, otherwise no intermediate
        # checkpoints are ever written.
        callback=[eval_callback, checkpoint_callback, scheduler_callback],
        progress_bar=True,
    )
    # Final snapshot goes next to the periodic checkpoints.
    model.save(os.path.join(ckpt_dir, f"{model_name}-final"))
finally:
    # SubprocVecEnv spawns worker processes; close them even if training is
    # interrupted so they do not pile up in the kernel session.
    train_env.close()
    eval_env.close()
del model

In [None]:
# Checkpoint to evaluate; the commented path is an alternative run to score.
# model_path = "models/ppo-LunarLander-v1/checkpoints/ppo-LunarLander-v2-final.zip"
model_path = "../experiments/ppo-LunarLander-v5/best_model/best_model.zip"

# Load the policy on CPU, print whatever evaluate_model returns for
# 10 episodes, then drop the references created by this cell.
eval_model = PPO.load(model_path, device="cpu")
eval_report = evaluate_model(env_name, eval_model, num_episodes=10)
rprint(eval_report)
del eval_model, eval_report

In [None]:
## See the model performance
# Open a human-rendered environment and pass it, together with the policy
# loaded from ``model_path``, to the project's show_model helper.
env = gymnasium.make("LunarLander-v3", render_mode="human")
try:
    show_model(env, PPO.load(model_path))
finally:
    # Always release the render window, even if playback is interrupted.
    env.close()

In [None]:
# Reload the checkpoint on CPU and hand it to the project's upload helper.
# NOTE(review): the positional strings presumably are model name, environment
# id, algorithm label, hub repo id and a closing free-form string — confirm
# against submit_to_hg_hub in rl_course.common.
hub_model = PPO.load(model_path, device="cpu")
submit_to_hg_hub(
    hub_model,
    "ppo-LunarLander-v5",
    "LunarLander-v3",
    "PPO",
    "hadhoryth/ppo-LunarLander-v5",
    "Batman",
)
del hub_model