In [16]:
import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
import gymnasium as gym

In [17]:
def policy_reward(model, n_eps=10):
    """Evaluate ``model`` on a fresh CartPole-v1 environment.

    Args:
        model: Trained agent/policy accepted by ``evaluate_policy``.
        n_eps: Number of evaluation episodes to run.

    Returns:
        The first element returned by ``evaluate_policy`` (the mean episode
        reward); the accompanying standard deviation is discarded.
    """
    env = gym.make("CartPole-v1")
    try:
        mean_reward, _ = evaluate_policy(
            model, env, n_eval_episodes=n_eps, render=False
        )
    finally:
        # Always release the environment, even if evaluation raises, so that
        # repeated calls do not accumulate open environments.
        env.close()
    return mean_reward

In [18]:
def train_model(n_steps, policy_kwargs, seed=None):
    """Train a PPO agent with an MLP policy on CartPole-v1.

    Args:
        n_steps: Timestep budget handed to ``model.learn``.
            NOTE(review): PPO collects experience in whole rollouts, so the
            number of environment steps actually taken may exceed this
            budget -- confirm against the PPO rollout-size setting.
        policy_kwargs: Dict forwarded to the policy constructor
            (callers in this notebook pass ``{"net_arch": [...]}``).
        seed: Optional RNG seed forwarded to ``PPO``. The default ``None``
            keeps the previous, unseeded behaviour.

    Returns:
        The trained ``PPO`` model.
    """
    model = PPO(
        "MlpPolicy",
        "CartPole-v1",
        policy_kwargs=policy_kwargs,
        seed=seed,
        verbose=0,
    )
    model.learn(total_timesteps=n_steps)
    return model

In [26]:
def objective(trial):
    """Optuna objective: sample an MLP architecture, train it, and score it.

    The trial chooses the number of hidden layers (1 to 3) and a width
    (16 to 64) for each layer. The resulting ``net_arch`` is passed to
    ``train_model`` with a 1000-timestep budget, and the value returned by
    ``policy_reward`` over 10 episodes is the trial's score.

    Args:
        trial: Object exposing ``suggest_int(name, low, high)``.

    Returns:
        Whatever ``policy_reward`` returns for the trained model.
    """
    depth = trial.suggest_int('n_layers', 1, 3)

    # Widths are suggested in layer order so parameter names line up with
    # their position in the network.
    hidden_sizes = []
    for layer_idx in range(depth):
        hidden_sizes.append(trial.suggest_int(f'layer_size_{layer_idx}', 16, 64))

    trained = train_model(1000, {"net_arch": hidden_sizes})
    return policy_reward(trained, n_eps=10)

In [27]:
# Seed the sampler so its random hyperparameter draws are repeatable after a
# kernel restart. Trial scores can still differ between runs because the PPO
# training and evaluation inside `objective` are not seeded here.
SAMPLER_SEED = 0

study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
)
study.optimize(objective, n_trials=10)

[I 2024-04-10 13:58:49,410] A new study created in memory with name: no-name-80517e7b-671b-4e20-8836-dcff4a279999
[I 2024-04-10 13:58:50,198] Trial 0 finished with value: 194.5 and parameters: {'n_layers': 1, 'layer_size_0': 43}. Best is trial 0 with value: 194.5.
[I 2024-04-10 13:58:51,153] Trial 1 finished with value: 147.2 and parameters: {'n_layers': 3, 'layer_size_0': 16, 'layer_size_1': 59, 'layer_size_2': 42}. Best is trial 0 with value: 194.5.
[I 2024-04-10 13:58:52,192] Trial 2 finished with value: 218.6 and parameters: {'n_layers': 3, 'layer_size_0': 52, 'layer_size_1': 41, 'layer_size_2': 48}. Best is trial 2 with value: 218.6.
[I 2024-04-10 13:58:52,870] Trial 3 finished with value: 171.3 and parameters: {'n_layers': 1, 'layer_size_0': 32}. Best is trial 2 with value: 218.6.
[I 2024-04-10 13:58:53,610] Trial 4 finished with value: 64.6 and parameters: {'n_layers': 2, 'layer_size_0': 64, 'layer_size_1': 44}. Best is trial 2 with value: 218.6.
[I 2024-04-10 13:58:54,450] Tria

In [28]:
# Display the hyperparameters of the best-scoring trial recorded by the study.
study.best_params

{'n_layers': 1, 'layer_size_0': 44}