### Setup

conda create --prefix=D:/L6_SWD/Dissertation/Project/CondaEnvs/diss_test python=3.8

conda activate D:/L6_SWD/Dissertation/Project/CondaEnvs/diss_test

conda install ipykernel

#### Pip installs

pip install gymnasium[atari]
pip install gymnasium[accept-rom-license]


pip install stable-baselines3
pip install ale-py==0.7.4
pip install opencv-python

pip install tensorflow
pip install optuna


pip install rl_zoo3
pip install plotly
pip install --upgrade nbformat
pip install scikit-learn

### Import

In [1]:
import optuna
from stable_baselines3 import PPO
import torch
import torch.nn as nn
import gym
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
from optuna.pruners import NopPruner
from optuna.samplers import TPESampler
from rl_zoo3 import linear_schedule
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_parallel_coordinate
import sklearn
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.vec_env import VecTransposeImage
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy


  from .autonotebook import tqdm as notebook_tqdm


### Data

In [2]:
# --- Optuna study settings ---
N_TRIALS = 50           # number of trials requested from study.optimize
N_STARTUP_TRIALS = 5    # forwarded to TPESampler(n_startup_trials=...)
N_JOBS = 2              # number of jobs to run in parallel

# --- Per-trial training / evaluation budget ---
N_TIMESTEPS = 50_000
N_EVALUATIONS = 1
N_EVAL_EPISODES = 10
# Evaluation interval derived from the training budget and evaluation count.
EVAL_FREQ = N_TIMESTEPS // N_EVALUATIONS

# --- Environment and fixed model arguments ---
ENV_ID = "ALE/Amidar-v5"
DEFAULT_HYPERPARAMS = dict(policy="CnnPolicy")

### Sample Params

In [3]:
#SOURCE: https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/rl_zoo3/hyperparams_opt.py
def Sample_PPO_Params(trial):
    """Draw one PPO hyperparameter configuration from ``trial``.

    Args:
        trial: Object providing ``suggest_categorical(name, choices)`` and
            ``suggest_float(name, low, high, log=...)``.

    Returns:
        dict with the keys ``n_steps``, ``batch_size``, ``gamma``,
        ``learning_rate``, ``ent_coef``, ``clip_range``, ``n_epochs``,
        ``gae_lambda``, ``max_grad_norm``, ``vf_coef`` and ``policy_kwargs``
        (itself a dict holding ``net_arch``, ``activation_fn`` and
        ``ortho_init``).
    """
    # Lookup tables translating the sampled string labels into real objects.
    arch_presets = {
        "small": {"pi": [64, 64], "vf": [64, 64]},
        "medium": {"pi": [256, 256], "vf": [256, 256]},
    }
    activation_classes = {
        "tanh": nn.Tanh,
        "relu": nn.ReLU,
        "elu": nn.ELU,
        "leaky_relu": nn.LeakyReLU,
    }

    # The suggest_* calls below are issued in a fixed order.
    sampled_batch = trial.suggest_categorical("batch_size", [8, 16, 32, 64, 128, 256, 512])
    rollout_steps = trial.suggest_categorical("n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048])
    discount = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    step_size = trial.suggest_float("learning_rate", 1e-5, 1, log=True)
    # The schedule is fixed; to search over it instead, replace with:
    # schedule_kind = trial.suggest_categorical('lr_schedule', ['linear', 'constant'])
    schedule_kind = "constant"
    entropy_weight = trial.suggest_float("ent_coef", 0.00000001, 0.1, log=True)
    clip = trial.suggest_categorical("clip_range", [0.1, 0.2, 0.3, 0.4])
    epochs = trial.suggest_categorical("n_epochs", [1, 5, 10, 20])
    lam = trial.suggest_categorical("gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    grad_clip = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    value_weight = trial.suggest_float("vf_coef", 0, 1)
    arch_label = trial.suggest_categorical("net_arch", ["small", "medium"])
    # gSDE options (continuous actions), currently disabled:
    # log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
    # sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 8, 16, 32, 64, 128, 256])
    # Orthogonal initialization is switched off rather than searched:
    # use_ortho_init = trial.suggest_categorical('ortho_init', [False, True])
    use_ortho_init = False
    # Wider activation search, currently disabled:
    # activation_label = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    activation_label = trial.suggest_categorical("activation_fn", ["tanh", "relu"])

    # Never let the minibatch exceed n_steps.
    # TODO: account when using multiple envs
    effective_batch = min(sampled_batch, rollout_steps)

    if schedule_kind == "linear":
        step_size = linear_schedule(step_size)

    # Independent networks usually work best when not working with images.
    policy_options = dict(
        # log_std_init=log_std_init,
        net_arch=arch_presets[arch_label],
        activation_fn=activation_classes[activation_label],
        ortho_init=use_ortho_init,
    )

    return dict(
        n_steps=rollout_steps,
        batch_size=effective_batch,
        gamma=discount,
        learning_rate=step_size,
        ent_coef=entropy_weight,
        clip_range=clip,
        n_epochs=epochs,
        gae_lambda=lam,
        max_grad_norm=grad_clip,
        vf_coef=value_weight,
        # sde_sample_freq=sde_sample_freq,
        policy_kwargs=policy_options,
    )

### Trial

In [4]:
class TrialEvalCallback(EvalCallback):
    """EvalCallback that forwards each evaluation result to an Optuna trial.

    Whenever an evaluation is due, the parent evaluation step is executed,
    ``last_mean_reward`` is reported to the trial, and training is asked to
    stop if the trial says it should be pruned.

    Attributes set here:
        trial: The Optuna trial receiving the reports.
        eval_idx: Number of evaluations reported so far (used as report step).
        is_pruned: True once the trial requested pruning.
    """

    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )

        # Bookkeeping for the Optuna side.
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

        # Wrap the evaluation env (as stored by the parent) in VecTransposeImage.
        self.eval_env = VecTransposeImage(self.eval_env)

    def _on_step(self) -> bool:
        """Evaluate/report when due; return False only when the trial is pruned."""
        evaluation_due = self.eval_freq > 0 and self.n_calls % self.eval_freq == 0
        if not evaluation_due:
            return True

        # Let the parent run the evaluation, then report its latest mean reward.
        super()._on_step()
        self.eval_idx += 1
        self.trial.report(self.last_mean_reward, self.eval_idx)

        # Keep training unless the trial should be pruned.
        if not self.trial.should_prune():
            return True
        self.is_pruned = True
        return False

### Objective

In [5]:
#Source: https://github.com/optuna/optuna-examples/blob/main/rl/sb3_simple.py
def Objective(trial):
    """Train one PPO agent with hyperparameters sampled for ``trial`` and score it.

    Builds a training and an evaluation Atari environment for ``ENV_ID``,
    trains PPO for ``N_TIMESTEPS`` with a ``TrialEvalCallback`` attached, and
    finally evaluates the trained model over 100 episodes.

    Args:
        trial: The Optuna trial used to sample hyperparameters and to receive
            intermediate evaluation reports.

    Returns:
        The mean reward of the final evaluation, or ``float("nan")`` when
        training aborted with an ``AssertionError``/``ValueError`` (typically
        NaNs produced by extreme hyperparameters).

    Raises:
        optuna.exceptions.TrialPruned: If the callback flagged the trial as pruned.
    """
    # Create Envs
    envAmt = 8
    envSeed = 47

    train_env = None
    eval_env = None
    try:
        # Create train Env.
        # NOTE(review): the frame-stack depth reuses the env count (8). These are
        # unrelated quantities - confirm that stacking 8 frames is intended.
        train_env = make_atari_env(ENV_ID, n_envs=envAmt, seed=envSeed)
        train_env = VecFrameStack(train_env, n_stack=envAmt)

        # Create Eval Env.
        eval_env = make_atari_env(ENV_ID, n_envs=envAmt, seed=envSeed)
        eval_env = VecFrameStack(eval_env, n_stack=envAmt)

        # Create the RL model.
        kwargs = DEFAULT_HYPERPARAMS.copy()
        kwargs.update(Sample_PPO_Params(trial))
        model = PPO(env=train_env, **kwargs)

        # Create the callback that will periodically evaluate and report the
        # performance. The callback compares eval_freq against its call counter,
        # which advances once per vectorised step (i.e. every envAmt timesteps),
        # so the timestep-based EVAL_FREQ is converted to callback calls here.
        # Without this the evaluation never triggers within N_TIMESTEPS.
        eval_callback = TrialEvalCallback(
            eval_env,
            trial,
            n_eval_episodes=N_EVAL_EPISODES,
            eval_freq=max(EVAL_FREQ // envAmt, 1),
            deterministic=True,
        )

        try:
            model.learn(N_TIMESTEPS, callback=eval_callback)
        except (AssertionError, ValueError) as e:
            # Sometimes, random hyperparams can generate NaN; depending on where
            # it is detected this surfaces as AssertionError or ValueError.
            print(e)
            return float("nan")

        if eval_callback.is_pruned:
            raise optuna.exceptions.TrialPruned()

        mean_reward, _ = evaluate_policy(model, eval_env, n_eval_episodes=100, render=False)
        return mean_reward
    finally:
        # Always release both vectorised envs, whichever way the trial ends.
        for env in (train_env, eval_env):
            if env is not None:
                env.close()

### Start Study

In [6]:
# Search strategy: a TPE sampler configured with N_STARTUP_TRIALS start-up
# trials, combined with a no-op pruner so that no trial is pruned.
# (Alternative pruner left for reference: optuna.pruners.MedianPruner)
sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
pruner = NopPruner()

study = optuna.create_study(
    direction="maximize",
    pruner=pruner,
    sampler=sampler,
)

try:
    print("Starting Optimize")
    study.optimize(Objective, n_jobs=N_JOBS, n_trials=N_TRIALS)
except KeyboardInterrupt:
    # A manual interrupt just ends the search early; the cells below can still
    # inspect `study`.
    pass

[32m[I 2023-06-03 12:30:40,113][0m A new study created in memory with name: no-name-21a8110f-19f8-40a2-853a-dbea77a36daa[0m


Starting Optimize


### Display Info

In [None]:
print("Number of finished trials: ", len(study.trials))

# `study.best_trial` raises ValueError when the study holds no completed trial
# (e.g. the search was interrupted early or every trial failed), so guard the
# lookup instead of letting the cell crash with a traceback.
try:
    trial = study.best_trial
except ValueError as e:
    trial = None
    print("No best trial available:", e)

if trial is not None:
    print("Best trial:")
    print("  Value: ", trial.value)
    print("  Params: ")

    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

#print("  User attrs:")
#for key, value in trial.user_attrs.items():
#print("    {}: {}".format(key, value))

Number of finished trials:  5
Best trial:
  Value:  -inf
  Params: 
    batch_size: 64
    n_steps: 32
    gamma: 0.999
    learning_rate: 1.3821965734677845e-05
    ent_coef: 2.5907417682425254e-06
    clip_range: 0.3
    n_epochs: 5
    gae_lambda: 0.99
    max_grad_norm: 0.8
    vf_coef: 0.3209074648747364
    net_arch: medium
    activation_fn: tanh


In [None]:
try:
    # Build all three figures first; nothing is shown unless every one of
    # them could be created.
    fig1, fig2, fig3 = (
        plot_optimization_history(study),
        plot_param_importances(study),
        plot_parallel_coordinate(study),
    )

    for figure in (fig1, fig2, fig3):
        figure.show()

except (ValueError, ImportError, RuntimeError) as error:
    # Report the plotting problem instead of failing the cell.
    print("Error during plotting")
    print(error)