### Setup

conda create --prefix=D:/L6_SWD/Dissertation/Project/CondaEnvs/diss_test python=3.8

conda activate D:/L6_SWD/Dissertation/Project/CondaEnvs/diss_test

conda install ipykernel

#### Pip installs

pip install gymnasium[atari]
pip install gymnasium[accept-rom-license]


pip install stable-baselines3
pip install ale-py==0.7.4
pip install opencv-python

pip install tensorflow
pip install optuna


pip install rl_zoo3
pip install plotly
pip install --upgrade nbformat
pip install scikit-learn

### Import

In [9]:
import optuna
from stable_baselines3 import A2C
import torch
import torch.nn as nn
import gym
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from rl_zoo3 import linear_schedule
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_parallel_coordinate
import sklearn
from stable_baselines3.common.vec_env import VecFrameStack

### Data

In [10]:
# Search budget for the Optuna study.
N_TRIALS = 100
N_STARTUP_TRIALS = 5

# Per-trial training budget; evaluations are spaced evenly across it.
N_EVALUATIONS = 2
N_TIMESTEPS = 80_000
EVAL_FREQ = N_TIMESTEPS // N_EVALUATIONS
N_EVAL_EPISODES = 10

# Environment id and the keyword arguments shared by every A2C model.
ENV_ID = "ALE/BankHeist-v5"
DEFAULT_HYPERPARAMS = dict(policy="CnnPolicy", env=ENV_ID)

### Sample Params

In [11]:
#SOURCE: https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/rl_zoo3/hyperparams_opt.py
def Sample_A2C_Params(trial):
    """Sample one set of A2C hyperparameters from an Optuna trial.

    Args:
        trial: Optuna trial object; only ``suggest_categorical`` and
            ``suggest_float`` are called on it.

    Returns:
        dict of keyword arguments to merge into the ``A2C`` constructor call.
        ``learning_rate`` is either the sampled float or, when the sampled
        ``lr_schedule`` is ``"linear"``, the result of
        ``linear_schedule(learning_rate)``. ``policy_kwargs`` carries the
        resolved ``net_arch`` layer sizes, the activation class and
        ``ortho_init``.
    """
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    normalize_advantage = trial.suggest_categorical("normalize_advantage", [False, True])
    max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    use_rms_prop = trial.suggest_categorical("use_rms_prop", [False, True])
    gae_lambda = trial.suggest_categorical("gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    n_steps = trial.suggest_categorical("n_steps", [1024, 2048])
    lr_schedule = trial.suggest_categorical("lr_schedule", ["linear", "constant"])
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1, log=True)
    ent_coef = trial.suggest_float("ent_coef", 0.00000001, 0.1, log=True)
    # suggest_uniform is deprecated in Optuna; suggest_float without `log`
    # samples the same uniform range and avoids the deprecation warning.
    vf_coef = trial.suggest_float("vf_coef", 0, 1)
    ortho_init = trial.suggest_categorical("ortho_init", [False, True])
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium"])
    activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])

    # The sampled value is the initial rate; wrap it only for the linear schedule.
    if lr_schedule == "linear":
        learning_rate = linear_schedule(learning_rate)

    # Translate the categorical labels into the objects the policy expects.
    net_arch = {
        "small": dict(pi=[64, 64], vf=[64, 64]),
        "medium": dict(pi=[256, 256], vf=[256, 256]),
    }[net_arch]

    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]

    return {
        "n_steps": n_steps,
        "gamma": gamma,
        "gae_lambda": gae_lambda,
        "learning_rate": learning_rate,
        "ent_coef": ent_coef,
        "normalize_advantage": normalize_advantage,
        "max_grad_norm": max_grad_norm,
        "use_rms_prop": use_rms_prop,
        "vf_coef": vf_coef,
        "policy_kwargs": dict(
            net_arch=net_arch,
            activation_fn=activation_fn,
            ortho_init=ortho_init,
        ),
    }

### Trial

In [12]:
class TrialEvalCallback(EvalCallback):
    """Evaluation callback that reports each result to an Optuna trial.

    Every ``eval_freq`` calls it runs the parent evaluation, reports
    ``last_mean_reward`` to the trial and, if the trial should be pruned,
    sets ``is_pruned`` and returns ``False`` from ``_on_step``.
    """

    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        # Pruning state read by the objective after training ends.
        self.is_pruned = False
        # Counter used as the step index when reporting to the trial.
        self.eval_idx = 0
        self.trial = trial

    def _on_step(self) -> bool:
        evaluation_due = self.eval_freq > 0 and self.n_calls % self.eval_freq == 0
        if not evaluation_due:
            return True

        # Run the parent evaluation, then hand its mean reward to Optuna.
        super()._on_step()
        self.eval_idx += 1
        self.trial.report(self.last_mean_reward, self.eval_idx)

        if not self.trial.should_prune():
            return True

        # Remember the pruning decision and signal it with a False return.
        self.is_pruned = True
        return False

### Objective

In [13]:
def Objective(trial):
    """Train one A2C model with sampled hyperparameters and score it.

    Args:
        trial: Optuna trial used to sample hyperparameters and to receive
            intermediate evaluation results.

    Returns:
        The callback's ``last_mean_reward`` after training, or ``nan`` when
        training failed with an assertion/value error (treated as a failed
        trial).

    Raises:
        optuna.exceptions.TrialPruned: if the callback flagged the trial as
            pruned during training.
    """
    #Source: https://github.com/optuna/optuna-examples/blob/main/rl/sb3_simple.py
    kwargs = DEFAULT_HYPERPARAMS.copy()
    kwargs.update(Sample_A2C_Params(trial))

    # Create the RL model.
    model = A2C(**kwargs)

    # Create env used for evaluation.
    eval_env = Monitor(gym.make(ENV_ID))

    # Create the callback that will periodically evaluate and report the performance.
    eval_callback = TrialEvalCallback(
        eval_env, trial, n_eval_episodes=N_EVAL_EPISODES, eval_freq=EVAL_FREQ, deterministic=False
    )

    nan_encountered = False
    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except (AssertionError, ValueError) as e:
        # Sometimes, random hyperparams can generate NaN. Depending on where
        # the NaN is detected this surfaces as an AssertionError or as a
        # ValueError (e.g. from torch distribution argument validation);
        # either way the trial is marked failed instead of aborting the study.
        print(e)
        nan_encountered = True
    finally:
        # Free memory.
        model.env.close()
        eval_env.close()

    # Tell the optimizer that the trial failed.
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward

In [14]:
# Show the trial budget that is passed to study.optimize as n_trials.
print(N_TRIALS)

100


In [15]:
# Limit PyTorch to a single CPU thread for its own parallelism.
torch.set_num_threads(1)
# TPE sampler configured with N_STARTUP_TRIALS start-up trials.
sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)

# Do not prune before 1/3 of the max budget is used.
# NOTE: with N_EVALUATIONS = 2 the integer division below evaluates to 0,
# so no warm-up steps are actually requested.
pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3)

# In-memory study that maximises the value returned by Objective.
study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")
try:
    study.optimize(Objective, n_trials=N_TRIALS)
except KeyboardInterrupt:
    # Deliberate: a manual interrupt ends the search without raising, so the
    # cells below can still inspect `study`.
    pass

[32m[I 2023-04-27 04:04:58,320][0m A new study created in memory with name: no-name-2df3502f-63ed-42ae-8075-561c7a91fec3[0m
  vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
[32m[I 2023-04-27 04:12:15,723][0m Trial 0 finished with value: 0.0 and parameters: {'gamma': 0.95, 'normalize_advantage': False, 'max_grad_norm': 0.6, 'use_rms_prop': False, 'gae_lambda': 0.92, 'n_steps': 2048, 'lr_schedule': 'linear', 'learning_rate': 0.005826055348716905, 'ent_coef': 0.0018210429873563904, 'vf_coef': 0.28496635077750554, 'ortho_init': True, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2023-04-27 04:17:24,131][0m Trial 1 finished with value: 0.0 and parameters: {'gamma': 0.9, 'normalize_advantage': False, 'max_grad_norm': 0.6, 'use_rms_prop': True, 'gae_lambda': 0.92, 'n_steps': 1024, 'lr_schedule': 'constant', 'learning_rate': 0.0392569822675203, 'ent_coef': 0.0323344512085416, 'vf_coef': 0.3944709423307118, 'ortho_init': True, 'net_arch': 's

In [16]:
# Summarise the study: count of recorded trials, then the best trial's value
# and sampled parameters.
# NOTE(review): len(study.trials) presumably counts every recorded trial
# (including pruned/failed ones), not only completed ones — confirm.
print("Number of finished trials: ", len(study.trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
# These are the raw sampled values (e.g. "linear", "small"), not the objects
# Sample_A2C_Params maps them to.
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

# Disabled: dump of per-trial user attributes (Objective sets none).
#print("  User attrs:")
#for key, value in trial.user_attrs.items():
    #print("    {}: {}".format(key, value))

Number of finished trials:  26
Best trial:
  Value:  0.0
  Params: 
    gamma: 0.95
    normalize_advantage: False
    max_grad_norm: 0.6
    use_rms_prop: False
    gae_lambda: 0.92
    n_steps: 2048
    lr_schedule: linear
    learning_rate: 0.005826055348716905
    ent_coef: 0.0018210429873563904
    vf_coef: 0.28496635077750554
    ortho_init: True
    net_arch: small
    activation_fn: relu


In [18]:
# Build and show each study visualisation independently, so that one plot
# failing (as in the recorded "zero total variance" error) does not prevent
# the remaining figures from being displayed.
for plot_fn in (plot_optimization_history, plot_param_importances, plot_parallel_coordinate):
    try:
        plot_fn(study).show()
    except (ValueError, ImportError, RuntimeError) as e:
        print("Error during plotting")
        print(e)

Error during plotting
Encountered zero total variance in all trees.
