In [5]:
import gymnasium as gym
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
import optuna

In [6]:
TIMESTEPS = 50000
N_TRIALS = 100

In [None]:
def optimize(trial):
    env = gym.make("LunarLander-v3")

    learning_rate = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    buffer_size = trial.suggest_categorical("buffer_size", [10_000, 50_000, 100_000])
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
    gamma = trial.suggest_float("gamma", 0.95, 0.999)
    train_freq = trial.suggest_categorical("train_freq", [1, 4, 8])
    net_arch_choice = trial.suggest_categorical("net_arch", ["64_64", "128_128", "256_256"])
    net_arch_map = {
        "64_64": [64, 64],
        "128_128": [128, 128],
        "256_256": [256, 256],
    }
    exploration_fraction = trial.suggest_float("exploration_fraction", 0.1, 0.5)
    exploration_final_eps = trial.suggest_float("exploration_final_eps", 0.01, 0.1)

    policy_kwargs = dict(net_arch=net_arch_map[net_arch_choice])


    model = DQN(
        "MlpPolicy",
        env,
        learning_rate=learning_rate,
        buffer_size=buffer_size,
        batch_size=batch_size,
        gamma=gamma,
        train_freq=train_freq,
        exploration_fraction=exploration_fraction,
        exploration_final_eps=exploration_final_eps,
        policy_kwargs=policy_kwargs,
        seed=42,
        device="cpu",
        verbose=0,
    )

    model.learn(total_timesteps=TIMESTEPS)
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
    env.close()
    return mean_reward

In [8]:
study = optuna.create_study(direction="maximize")
study.optimize(optimize, n_trials=N_TRIALS)
print(study.best_params)

[I 2025-05-24 16:55:56,471] A new study created in memory with name: no-name-9c7ec415-ab8e-46bb-80a2-ed4f6b01fea2
[I 2025-05-24 16:57:18,985] Trial 0 finished with value: 155.9309749722682 and parameters: {'lr': 0.00033743299181107015, 'buffer_size': 10000, 'batch_size': 128, 'gamma': 0.9633482079429906, 'train_freq': 1, 'net_arch': '64_64', 'exploration_fraction': 0.258747896317038, 'exploration_final_eps': 0.045731665176009746}. Best is trial 0 with value: 155.9309749722682.
[I 2025-05-24 16:57:47,759] Trial 1 finished with value: -12.75380614187959 and parameters: {'lr': 0.00039633237844276785, 'buffer_size': 100000, 'batch_size': 32, 'gamma': 0.9937118899763984, 'train_freq': 4, 'net_arch': '128_128', 'exploration_fraction': 0.44818266938205376, 'exploration_final_eps': 0.08349365591589727}. Best is trial 0 with value: 155.9309749722682.
[I 2025-05-24 16:59:41,433] Trial 2 finished with value: 15.79928019491199 and parameters: {'lr': 3.5078341342619696e-05, 'buffer_size': 50000, 'b

{'lr': 0.00021924459122944128, 'buffer_size': 10000, 'batch_size': 128, 'gamma': 0.9594449582014025, 'train_freq': 1, 'net_arch': '64_64', 'exploration_fraction': 0.24785493712044818, 'exploration_final_eps': 0.04073005273312161}
