In [3]:
import gymnasium as gym
from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy
import optuna

In [4]:
# Search budget for the hyperparameter study.
TIMESTEPS = 50_000  # total_timesteps given to model.learn() in each trial
N_TRIALS = 100  # n_trials given to study.optimize()

In [None]:
def optimize(trial):
    """Optuna objective: train an A2C agent on LunarLander-v3 and score it.

    Samples A2C hyperparameters from ``trial``, trains the model for
    ``TIMESTEPS`` steps, then evaluates it for 5 episodes on the same
    environment instance that was used for training.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The mean episode reward reported by ``evaluate_policy``.
    """
    env = gym.make("LunarLander-v3")

    # try/finally guarantees the environment is closed even if model
    # construction, training, or evaluation raises (or the study is
    # interrupted), instead of leaking one environment per failed trial.
    try:
        # Learning rate is sampled on a log scale; the rest are uniform.
        learning_rate = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
        gamma = trial.suggest_float("gamma", 0.95, 0.999)
        ent_coef = trial.suggest_float("ent_coef", 0.0, 0.1)
        vf_coef = trial.suggest_float("vf_coef", 0.25, 1.0)
        n_steps = trial.suggest_categorical("n_steps", [5, 20, 50, 100])
        net_arch_pi = trial.suggest_categorical("pi", [64, 128, 256])
        net_arch_vf = trial.suggest_categorical("vf", [64, 128, 256])

        # One hidden layer each for the policy (pi) and value (vf) networks.
        policy_kwargs = dict(net_arch=dict(pi=[net_arch_pi], vf=[net_arch_vf]))

        model = A2C(
            "MlpPolicy",
            env,
            learning_rate=learning_rate,
            gamma=gamma,
            ent_coef=ent_coef,
            vf_coef=vf_coef,
            n_steps=n_steps,
            policy_kwargs=policy_kwargs,
            seed=42,  # same seed for every trial
            device="cpu",
            verbose=0,
        )

        model.learn(total_timesteps=TIMESTEPS)
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
        return mean_reward
    finally:
        env.close()

In [6]:
# Seed the sampler so a fresh Restart & Run All proposes the same sequence
# of hyperparameters. TPESampler is Optuna's default sampler, so only the
# seed changes, not the search algorithm.
SAMPLER_SEED = 42
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
)
study.optimize(optimize, n_trials=N_TRIALS)
print(study.best_params)

[I 2025-05-23 21:48:36,538] A new study created in memory with name: no-name-e5573e0f-6922-4834-af07-1c05892534bb
[I 2025-05-23 21:48:48,731] Trial 0 finished with value: -772.3282312260417 and parameters: {'lr': 0.00013122529534136156, 'gamma': 0.9949411940986782, 'ent_coef': 0.09326466908046736, 'vf_coef': 0.5898289590267786, 'n_steps': 100, 'pi': 128, 'vf': 256}. Best is trial 0 with value: -772.3282312260417.
[I 2025-05-23 21:48:59,416] Trial 1 finished with value: -613.1619553789496 and parameters: {'lr': 0.0002557177746983233, 'gamma': 0.9783599694490754, 'ent_coef': 0.05943051734076615, 'vf_coef': 0.9102120564385362, 'n_steps': 50, 'pi': 128, 'vf': 128}. Best is trial 1 with value: -613.1619553789496.
[I 2025-05-23 21:49:37,393] Trial 2 finished with value: -282.14049864331025 and parameters: {'lr': 0.00048525933873478066, 'gamma': 0.9643112009079057, 'ent_coef': 0.0857236065153166, 'vf_coef': 0.8050064272434785, 'n_steps': 5, 'pi': 64, 'vf': 256}. Best is trial 2 with value: -282.14049864331025.

{'lr': 0.0006997150606687543, 'gamma': 0.9987615431097223, 'ent_coef': 0.014358534695948685, 'vf_coef': 0.9222159161633121, 'n_steps': 5, 'pi': 128, 'vf': 256}
