In [1]:
# Virtual display
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7f877a66efe0>

In [18]:
import json
import os
import time

import gymnasium as gym
import numpy as np
import optuna
import torch
import torch.nn as nn
import tqdm

from math import pi
from typing import Any, Callable, Dict, Optional, Type

from huggingface_sb3 import load_from_hub, package_to_hub, push_to_hub
from huggingface_hub import notebook_login
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history, plot_param_importances
from sb3_contrib import QRDQN, TQC
from stable_baselines3 import A2C, DQN, PPO, SAC, TD3
from stable_baselines3.common.base_class import BaseAlgorithm
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv

# Part 1: The Importance of Tuned Hyperparameters

In [None]:
env_id = 'Pendulum-v1'
# Env used only for evaluation
eval_envs = make_vec_env(env_id, n_envs=10)
# 4000 training timesteps
budget_pendulum = 4000

### PPO

In [None]:
t0 = time.time()
ppo_model = PPO('MlpPolicy', env_id, seed=0, verbose=0).learn(budget_pendulum)
t1 = time.time()
print(f'Trained in {t1-t0:.2f} s ({budget_pendulum/(t1-t0):.1f} steps/s)')

In [None]:
mean_reward, std_reward = evaluate_policy(ppo_model, eval_envs, n_eval_episodes=100, deterministic=True)

print(f'Episode reward: {mean_reward:.2f} +/- {std_reward:.2f}')

### A2C

In [None]:
t0 = time.time()
a2c_model = A2C('MlpPolicy', env_id, seed=0, verbose=0).learn(budget_pendulum)
t1 = time.time()
print(f'Trained in {t1-t0:.2f} s ({budget_pendulum/(t1-t0):.1f} steps/s)')

In [None]:
mean_reward, std_reward = evaluate_policy(a2c_model, eval_envs, n_eval_episodes=100, deterministic=True)

print(f'Episode reward: {mean_reward:.2f} +/- {std_reward:.2f}')

### Training longer PPO

In [None]:
new_budget = 10 * budget_pendulum

t0 = time.time()
ppo_model = PPO('MlpPolicy', env_id, seed=0, verbose=0).learn(new_budget)
t1 = time.time()
print(f'Trained in {t1-t0:.2f} s ({new_budget/(t1-t0):.1f} steps/s)')

In [None]:
mean_reward, std_reward = evaluate_policy(ppo_model, eval_envs, n_eval_episodes=100, deterministic=True)

print(f'Episode reward: {mean_reward:.2f} +/- {std_reward:.2f}')

### Tuning PPO hyperparameters

In [None]:
tuned_params = {
    'gamma': 0.9,
    'use_sde': True,
    'sde_sample_freq': 4,
    'learning_rate': 1e-3,
}
budget = 50000

t0 = time.time()
ppo_tuned_model = PPO(
    'MlpPolicy', 
    env_id, 
    seed=0, 
    verbose=1, 
    **tuned_params).learn(budget, log_interval=5)
t1 = time.time()
print(f'Trained in {t1-t0:.2f} s ({budget/(t1-t0):.1f} steps/s)')

In [None]:
mean_reward, std_reward = evaluate_policy(ppo_tuned_model, eval_envs, n_eval_episodes=100, deterministic=True)

print(f"Tuned PPO Mean episode reward: {mean_reward:.2f} +/- {std_reward:.2f}")

## Part III: Automatic Hyperparameter Tuning

**Hyperparameter sampling functions**

In [3]:
def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]:
    """Sampler for A2C hyperparameters.
    
    Args:
        trial (optuna.Trial): Optuna trial.
        
    Returns:
        (Dict[str, Any]): Sampled hyperparameters for the given trial.
    
    """
    gamma = 1.0 - trial.suggest_float('gamma', 0.0001, 0.1, log=True)
    max_grad_norm = trial.suggest_float('max_grad_norm', 0.3, 5.0, log=True)
    n_steps = 2 ** trial.suggest_int('exponent_n_steps', 3, 10)
    
    learning_rate = trial.suggest_float('learning_rate', 0.00001, 1, log=True)
    net_arch = trial.suggest_categorical('net_arch', ['tiny', 'small'])
    activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu'])
    
    trial.set_user_attr('gamma_', gamma)
    trial.set_user_attr('n_steps', n_steps)
    
    net_arch = [
        {'pi': [64], 'vf': [64]}
        if net_arch == 'tiny'
        else {"pi": [64, 64], "vf": [64, 64]}
    ]
    
    activation_fn = {'tanh': nn.Tanh, 'relu': nn.ReLU}[activation_fn]
    
    return {
        'n_steps': n_steps,
        'gamma': gamma,
        'learning_rate': learning_rate,
        'max_grad_norm': max_grad_norm,
        'policy_kwargs': {
            'net_arch': net_arch,
            'activation_fn': activation_fn,
        },
    }


In [4]:
def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
    """Sampler for PPO hyperparameters.
    
    Args:
        trial (optuna.Trial): Optuna trial.
        
    Returns:
        (Dict[str, Any]): Sampled hyperparameters for the given trial.
    
    """
    gamma = 1.0 - trial.suggest_float('gamma', 0.0001, 0.1, log=True)
    max_grad_norm = trial.suggest_float('max_grad_norm', 0.3, 5.0, log=True)
    n_steps = 2 ** trial.suggest_int('exponent_n_steps', 3, 10)
    
    gae_lambda = 1 - trial.suggest_float('gae_lambda', 0.0001, 0.2, log=True)    
    learning_rate = trial.suggest_float('learning_rate', 0.00001, 1, log=True)
    net_arch = trial.suggest_categorical('net_arch', ['tiny', 'small'])
    activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu'])
    
    trial.set_user_attr('gamma_', gamma)
    trial.set_user_attr('n_steps', n_steps)
    
    net_arch = [
        {'pi': [64], 'vf': [64]}
        if net_arch == 'tiny'
        else {"pi": [64, 64], "vf": [64, 64]}
    ]
    
    activation_fn = {'tanh': nn.Tanh, 'relu': nn.ReLU}[activation_fn]
    
    return {
        'n_steps': n_steps,
        'gamma': gamma,
        'gae_lambda': gae_lambda,
        'learning_rate': learning_rate,
        'max_grad_norm': max_grad_norm,
        'policy_kwargs': {
            'net_arch': net_arch,
            'activation_fn': activation_fn,
        },
    }


def init_ppo_params() -> Dict[str, Any]:
    """Return default parameter values for an initial trial
    
    """
    return {
        'n_steps': 128,
        'gamma': 0.99,
        'gae_lambda': 0.95,
        'learning_rate': 2.5e-4,
        'max_grad_norm': 0.5,
        'policy_kwargs': {
            'net_arch': {'pi': [64], 'vf': [64]},
            'activation_fn': nn.ReLU},
    }


**Optuna eval callback**

In [5]:
class TrialEvalCallback(EvalCallback):
    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):

        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            # Evaluate policy (done in the parent class)
            super()._on_step()
            self.eval_idx += 1
            # Send report to Optuna
            self.trial.report(self.last_mean_reward, self.eval_idx)
            # Prune trial if need
            if self.trial.should_prune():
                self.is_pruned = True
                return False
        return True

**Training objective function**

In [6]:
def train(
        model_alg: Type[BaseAlgorithm],
        hyperparameters: Dict[str, Any],
        n_timesteps: int,
        save_file: Optional[str] = None) -> BaseAlgorithm:
    """Train a stable_baselines3 model according to a set of
    hyperparameters.
    
    Args:
        model_alg (Type[BaseAlgorithm]): A stable_baselines3 RL algorithm class.
        hyperparameters (Dict[str, Any]): kwargs for the `model_alg` constructor.
        n_timesteps (int): Number of training timesteps.
        save_file (Optional[str]): If specified, save the trained model here.
        
    Returns:
        model (BaseAlgorithm): A trained stable_baselines3 model.
        
    """
    torch.set_num_threads(1)
    
    # Create the RL model
    model = model_alg(**hyperparameters)
    
    # Train
    model.learn(n_timesteps, log_interval=1000000, progress_bar=True)
    
    if save_file is not None:
        os.makedirs(os.path.dirname(save_file), exist_ok=True)
        model.save(save_file)
    
    return model


In [7]:
def objective(
        trial: optuna.Trial, 
        model_alg: Type[BaseAlgorithm],
        sample_fn: Callable[[optuna.Trial], Dict[str, Any]]) -> float:
    """
    Objective function using by Optuna to evaluate
    one configuration (i.e., one set of hyperparameters).

    Given a trial object, it will sample hyperparameters,
    evaluate it and report the result (mean episodic reward after training)
    
    Args:
        trial (optuna.Trial): Optuna trial object.
        model_alg (Type[BaseAlgorithm]): A stable_baselines3 RL algorithm class.
        sample_fn (Callable[[optuna.Trial], Dict[str, Any]]): Function for
            generating hyperparameter sample suggestions.

    :param trial: Optuna trial object
    :return: Mean episodic reward after training
    """

    kwargs = DEFAULT_HYPERPARAMS.copy()

    # 1. Sample hyperparameters and update the keyword arguments
    kwargs.update(sample_fn(trial))

    # Create the RL model
    model = model_alg(**kwargs)

    # 2. Create envs used for evaluation using `make_vec_env`, `ENV_ID` and `N_EVAL_ENVS`
    eval_envs = make_vec_env(ENV_ID, n_envs=N_EVAL_ENVS)
    
    # 3. Create the `TrialEvalCallback` callback defined above that will periodically evaluate
    # and report the performance using `N_EVAL_EPISODES` every `EVAL_FREQ`
    # TrialEvalCallback signature:
    # TrialEvalCallback(eval_env, trial, n_eval_episodes, eval_freq, deterministic, verbose)
    eval_callback = TrialEvalCallback(
        eval_envs, 
        trial, 
        N_EVAL_EPISODES, 
        EVAL_FREQ, 
        deterministic=True, 
        verbose=False)

    nan_encountered = False
    try:
        # Train the model
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN
        print(e)
        nan_encountered = True
    finally:
        # Free memory
        model.env.close()
        eval_envs.close()

    # Tell the optimizer that the trial failed
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward

### Atari hyperparameters

In [8]:
N_TRIALS = 100  # Maximum number of trials
N_JOBS = 1 # Number of jobs to run in parallel
N_STARTUP_TRIALS = 10  # Stop random sampling after N_STARTUP_TRIALS
N_EVALUATIONS = 20  # Number of evaluations during the training
N_TIMESTEPS = 250_000 # Training budget
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_ENVS = 8
N_EVAL_EPISODES = 10
N_TRAIN_ENVS = 8
TIMEOUT = int(60 * 60 * 12)  # 12 hours


ENV_ID = "Asteroids-v4"
MODEL_ALG = PPO
HP_SAMPLER = sample_ppo_params
INIT_SAMPLER = init_ppo_params

DEFAULT_HYPERPARAMS = {
    "policy": "CnnPolicy",
    "env": make_vec_env(ENV_ID, n_envs=N_TRAIN_ENVS),
    'ent_coef': 0.01,
    'vf_coef': 0.5,
    'clip_range': 0.1,
    'batch_size': 256,
}

A.L.E: Arcade Learning Environment (version 0.7.4+069f8bd)
[Powered by Stella]


### CartPole hyperparameters

In [None]:
N_TRIALS = 10  # Maximum number of trials
N_JOBS = 2 # Number of jobs to run in parallel
N_STARTUP_TRIALS = 10  # Stop random sampling after N_STARTUP_TRIALS
N_EVALUATIONS = 2  # Number of evaluations during the training
N_TIMESTEPS = int(2e4)  # Training budget
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_ENVS = 5
N_EVAL_EPISODES = 10
N_TRAIN_ENVS = 8
TIMEOUT = int(60 * 15)  # 15 minutes

ENV_ID = "CartPole-v1"
MODEL_ALG = A2C
HP_SAMPLER = sample_a2c_params
INIT_SAMPLER = init_a2c_params

DEFAULT_HYPERPARAMS = {
    "policy": "MlpPolicy",
    "env": make_vec_env(ENV_ID, n_envs=N_TRAIN_ENVS),
}

### HPO loop

In [None]:
# Set pytorch num threads to 1 for faster training
torch.set_num_threads(1)
# Select the sampler, can be random, TPESampler, CMAES, ...
sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
# Do not prune before 1/3 of the max budget is used
pruner = MedianPruner(
    n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 4
)
# Create the study and start the hyperparameter optimization
model_name = f'{MODEL_ALG}'.split('.')[-1][:-2]
study_name = f'{ENV_ID}_{model_name}'
study = optuna.create_study(
    sampler=sampler, 
    pruner=pruner, 
    direction="maximize", 
    study_name=study_name)

# Start with default hyperparams pulled from some other source
study.enqueue_trial(INIT_SAMPLER())

# Specify the model algorithm and hyperparameter sampler to 
# create a final objective function
objective_final = lambda t: objective(t, MODEL_ALG, HP_SAMPLER)

try:
    study.optimize(
        objective_final, n_trials=N_TRIALS, n_jobs=N_JOBS, timeout=TIMEOUT)
except KeyboardInterrupt:
    pass

print("Number of finished trials: ", len(study.trials))

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

print("  User attrs:")
for key, value in trial.user_attrs.items():
    print(f"    {key}: {value}")

# Write report
study.trials_dataframe().to_csv("study_results_a2c_cartpole.csv")

fig1 = plot_optimization_history(study)
fig1.show()
try:
    fig2 = plot_param_importances(study)
    fig2.show()
except Exception as e:
    print(f'Param importance plot failed with error: {e}')

# Retraining a study

**Save best params**

In [None]:
best_params = study.best_params
best_params.update(DEFAULT_HYPERPARAMS)
print(best_params)

In [None]:

with open('content/rl-baselines3-zoo/logs/best-asteroids-20230129.json', 'w') as fd:
    json.dump(best_params, fd)

**Get top K configs**

**Hackily get top 5 configs from text**

In [9]:
top5 = [
    {'gamma': 0.060826137246005656, 'max_grad_norm': 4.512018052188801, 'exponent_n_steps': 8, 'gae_lambda': 0.0003259354802354418, 'learning_rate': 0.00013038419011988115, 'net_arch': 'small', 'activation_fn': 'tanh'},
    {'gamma': 0.07663570395711342, 'max_grad_norm': 3.478570404779269, 'exponent_n_steps': 9, 'gae_lambda': 0.0013308385822802026, 'learning_rate': 0.00019024287008661774, 'net_arch': 'tiny', 'activation_fn': 'relu'},
    {'gamma': 0.06856468429867885, 'max_grad_norm': 2.5653104623097125, 'exponent_n_steps': 9, 'gae_lambda': 0.0005156622747407014, 'learning_rate': 0.0002227256156467384, 'net_arch': 'small', 'activation_fn': 'tanh'},
    {'gamma': 0.0511020782049801, 'max_grad_norm': 2.6154552042837587, 'exponent_n_steps': 10, 'gae_lambda': 0.0007317209203167054, 'learning_rate': 0.0001927384326410831, 'net_arch': 'small', 'activation_fn': 'tanh'},
    {'gamma': 0.04389554306290792, 'max_grad_norm': 3.785866867421681, 'exponent_n_steps': 9, 'gae_lambda': 0.0010996358649999723, 'learning_rate': 0.000296739645522824, 'net_arch': 'tiny', 'activation_fn': 'relu'},
]
for t in top5:
    t.update(DEFAULT_HYPERPARAMS)
    lr = t['learning_rate']
    t['learning_rate'] = lambda x: lr * 0.5 ** int(10 * (1 - x))
    t['gamma'] = 1 - t['gamma']
    t['gae_lambda'] = 1 - t['gae_lambda']
    t['n_steps'] = int(2**t['exponent_n_steps'])
    del t['exponent_n_steps']
    t['net_arch'] = [
        {'pi': [64], 'vf': [64]}
        if t['net_arch'] == 'tiny'
        else {"pi": [64, 64], "vf": [64, 64]}
    ]
    
    t['activation_fn'] = {'tanh': nn.Tanh, 'relu': nn.ReLU}[t['activation_fn']]
    t['policy_kwargs'] = {'net_arch': t['net_arch'], 'activation_fn': t['activation_fn']}
    del t['net_arch']
    del t['activation_fn']

In [None]:
model_name = str(MODEL_ALG).split('.')[-1][:-2].lower()
n_steps = 10_000_000
for i, hyperparams in enumerate(top5):
    save_file = os.path.join(
        'content', 'rl-baselines3-zoo', 'logs', model_name, f'{i:02}.zip')
    train(MODEL_ALG, hyperparams, n_steps, save_file)



Output()

Output()

Output()

## Evaluate trained models

In [16]:
m0 = PPO.load('./content/rl-baselines3-zoo/logs/ppo/00.zip')
eval_envs = make_vec_env(ENV_ID, n_envs=N_EVAL_ENVS)

In [20]:
results = {}
for i in tqdm.tqdm(range(5), total=5):
    model = PPO.load(f'./content/rl-baselines3-zoo/logs/ppo/{i:02}.zip')
    mean_reward, std_reward = evaluate_policy(
        model, eval_envs, n_eval_episodes=100, deterministic=True)
    results[i] = f'Episode reward: {mean_reward:.2f} +/- {std_reward:.2f}'
    

print(results)

100%|█████████████████████████████████████████████| 5/5 [02:05<00:00, 25.03s/it]

{0: 'Episode reward: 795.00 +/- 276.63', 1: 'Episode reward: 520.30 +/- 180.17', 2: 'Episode reward: 715.70 +/- 219.28', 3: 'Episode reward: 949.20 +/- 340.12', 4: 'Episode reward: 761.20 +/- 307.95'}





In [22]:
for k in results:
    print(f'Top-{k}', results[k])

Top-0 Episode reward: 795.00 +/- 276.63
Top-1 Episode reward: 520.30 +/- 180.17
Top-2 Episode reward: 715.70 +/- 219.28
Top-3 Episode reward: 949.20 +/- 340.12
Top-4 Episode reward: 761.20 +/- 307.95
