In [1]:
import gymnasium 
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
import os

from stable_baselines3 import DQN
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env


import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler

from typing import Any
from typing import Dict

import torch
import torch.nn as nn

from typing import Any
from typing import Dict

 
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
import torch
import torch.nn as nn



2023-05-15 00:31:29.340084: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-15 00:31:29.371740: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
N_TRIALS = 100
N_STARTUP_TRIALS = 5
N_EVALUATIONS = 2
N_TIMESTEPS = int(2e4)
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_EPISODES = 3

ENV_ID = "ALE/Galaxian-v5"

DEFAULT_HYPERPARAMS = {
    "policy": "MlpPolicy",
    "buffer_size": 10000,
    "env": ENV_ID,
    "seed": 0,
}

In [4]:
def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]:
    learning_rate = trial.suggest_float("lr", 1e-5, 1, log=True)
    gamma = 1.0 - trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    
    return { 
        "learning_rate": learning_rate,
        "gamma": gamma,
    }

In [5]:
class TrialEvalCallback(EvalCallback):
    """Callback used for evaluating and reporting a trial."""

    def __init__(
        self,
        eval_env: gymnasium.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            super()._on_step()
            self.eval_idx += 1
            self.trial.report(self.last_mean_reward, self.eval_idx)
            # Prune trial if need.
            if self.trial.should_prune():
                self.is_pruned = True
                return False
        return True

In [6]:
def objective(trial: optuna.Trial) -> float:
    kwargs = DEFAULT_HYPERPARAMS.copy()
    # Sample hyperparameters.
    kwargs.update(sample_a2c_params(trial))
    # Create the RL model.
    model = DQN(**kwargs)
    # Create env used for evaluation.
    eval_env = Monitor(gymnasium.make(ENV_ID))
    # Create the callback that will periodically evaluate and report the performance.
    eval_callback = TrialEvalCallback(
        eval_env, trial, n_eval_episodes=N_EVAL_EPISODES, eval_freq=EVAL_FREQ, deterministic=True
    )

    nan_encountered = False
    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN.
        print(e)
        nan_encountered = True
    finally:
        # Free memory.
        model.env.close()
        eval_env.close()

    # Tell the optimizer that the trial failed.
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward

In [7]:
torch.set_num_threads(1)

sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
    # Do not prune before 1/3 of the max budget is used.
pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3)

study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")
try:
    study.optimize(objective, n_trials=N_TRIALS, timeout=600)
except KeyboardInterrupt:
    pass

print("Number of finished trials: ", len(study.trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

print("  User attrs:")
for key, value in trial.user_attrs.items():
    print("    {}: {}".format(key, value))

[32m[I 2023-05-14 23:54:51,663][0m A new study created in memory with name: no-name-dd8efb00-067f-4b18-b356-4dc48faf90ef[0m
A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]
[32m[I 2023-05-14 23:55:06,304][0m Trial 0 finished with value: 540.0 and parameters: {'lr': 0.00010506273044970215, 'gamma': 0.00015488713970968385}. Best is trial 0 with value: 540.0.[0m
[32m[I 2023-05-14 23:55:18,290][0m Trial 1 finished with value: 540.0 and parameters: {'lr': 0.6143876437524028, 'gamma': 0.0026008600492741234}. Best is trial 0 with value: 540.0.[0m
[32m[I 2023-05-14 23:55:30,352][0m Trial 2 finished with value: 540.0 and parameters: {'lr': 2.064328962242027e-05, 'gamma': 0.0003449346328776704}. Best is trial 0 with value: 540.0.[0m
[32m[I 2023-05-14 23:55:42,319][0m Trial 3 finished with value: 540.0 and parameters: {'lr': 0.11650753586775908, 'gamma': 0.0022556316343385545}. Best is trial 0 with value: 540.0.[0m
[32m[I 2023-05-14 23:55:54,150][0m 

Number of finished trials:  50
Best trial:
  Value:  540.0
  Params: 
    lr: 0.00010506273044970215
    gamma: 0.00015488713970968385
  User attrs:


In [2]:
env_id = "ALE/Galaxian-v5"
env = gymnasium.make(env_id)

# Env used only for evaluation
eval_envs = make_vec_env(env_id, n_envs=2)
# 4000 training timesteps
budget_pendulum = 4000

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


In [12]:
# model = DQN("MlpPolicy", env, seed=0, verbose=0, buffer_size = 10000, learning_rate=0.00010506273044970215, gamma=0.00015488713970968385, )


model.learn(total_timesteps=500000, progress_bar=True, tb_log_name = "DQN", log_interval=4 )

Output()

<stable_baselines3.dqn.dqn.DQN at 0x7f5076e297c0>

In [13]:
# model = DQN.load("DQN_cartpole", env=env, verbose=1, buffer_size=4)
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100, deterministic=True)
print(f"PPO Mean episode reward: {mean_reward:.2f} +/- {std_reward:.2f}")


PPO Mean episode reward: 540.00 +/- 0.00


In [7]:
gamma = trial.suggest_float("gamma", 0.9, 0.99999, log=True)
max_grad_norm = trial.suggest_float("max_grad_norm", 0.3, 5.0, log=True)
budget = 20_000

In [14]:
model.save("DQN_cartpole")


In [8]:
# env = gymnasium.make("ALE/Galaxian-v5" )
# model = DQN("MlpPolicy", env, verbose=1, buffer_size=10000 )
model.learn(total_timesteps=90000, log_interval=1)
model.save("PPO_cartpole")



---------------------------------
| rollout/           |          |
|    ep_len_mean     | 427      |
|    ep_rew_mean     | 525      |
| time/              |          |
|    fps             | 728      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 523          |
|    ep_rew_mean          | 622          |
| time/                   |              |
|    fps                  | 492          |
|    iterations           | 2            |
|    time_elapsed         | 8            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0044382312 |
|    clip_fraction        | 0.014        |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.79        |
|    explained_variance   | 0.000481     |
|    learning_r

In [5]:
# model.save("DQN_cartpole")
env.close()

In [15]:
import gymnasium as gym

from stable_baselines3 import DQN

env = gym.make("ALE/Galaxian-v5", render_mode="human")

# model = DQN("MlpPolicy", env, verbose=1, buffer_size=10000)
# model.learn(total_timesteps=10000, log_interval=4)
# model.save("dqn_cartpole")

# del model # remove to demonstrate saving and loading

model = DQN.load("DQN_cartpole")

obs, info = env.reset()
while True:
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        obs, info = env.reset()

: 

: 