### Setup

conda create --prefix=D:/L6_SWD/Dissertation/Project/CondaEnvs/diss_test python=3.8

conda activate diss_test

conda install ipykernel

#### Pip installs

pip install gymnasium[atari]
pip install gymnasium[accept-rom-license]


pip install stable-baselines3
pip install ale-py==0.7.4
pip install opencv-python

pip install tensorflow
pip install optuna


pip install rl_zoo3
pip install plotly
pip install --upgrade nbformat
pip install scikit-learn

### Import

In [1]:
import optuna
from stable_baselines3 import A2C
import torch
import torch.nn as nn
import gym
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
from optuna.pruners import NopPruner
from optuna.samplers import TPESampler
from rl_zoo3 import linear_schedule
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_parallel_coordinate
import sklearn
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.vec_env import VecTransposeImage
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy


  from .autonotebook import tqdm as notebook_tqdm


### Data

In [2]:
N_TRIALS = 10 
N_STARTUP_TRIALS = 5
N_EVALUATIONS = 1
N_TIMESTEPS = 500000
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_EPISODES = 10
N_JOBS = 2 # Number of jobs to run in parallel

ENV_AMT = 1

ENV_ID = "ALE/Amidar-v5"
DEFAULT_HYPERPARAMS = {
    "policy": "CnnPolicy"
}

### Sample Params

In [3]:
#SOURCE: https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/rl_zoo3/hyperparams_opt.py
def Sample_A2C_Params(trial): 
    gamma = trial.suggest_categorical("gamma", [0.95, 0.98, 0.99, 0.995, 0.999, 0.9999, 0.99999])
    normalize_advantage = trial.suggest_categorical("normalize_advantage", [False, True])
    max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    use_rms_prop = trial.suggest_categorical("use_rms_prop", [False, True])
    gae_lambda = trial.suggest_categorical("gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    n_steps = trial.suggest_categorical("n_steps", [64 ,128, 256, 512, 1024])
    lr_schedule = trial.suggest_categorical("lr_schedule", ["linear", "constant"])
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1, log=True)
    ent_coef = trial.suggest_float("ent_coef", 0.00000001, 0.1, log=True)
    vf_coef = trial.suggest_float("vf_coef", 0, 1)
    ortho_init = trial.suggest_categorical("ortho_init", [False, True])
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium"])
    activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])

    if lr_schedule == "linear":
        learning_rate = linear_schedule(learning_rate)

    net_arch = {
        "small": dict(pi=[64, 64], vf=[64, 64]),
        "medium": dict(pi=[256, 256], vf=[256, 256]),
    }[net_arch]

    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]

    return {
        "n_steps": n_steps,
        "gamma": gamma,
        "gae_lambda": gae_lambda,
        "learning_rate": learning_rate,
        "ent_coef": ent_coef,
        "normalize_advantage": normalize_advantage,
        "max_grad_norm": max_grad_norm,
        "use_rms_prop": use_rms_prop,
        "vf_coef": vf_coef,
        "policy_kwargs": dict(
            net_arch=net_arch,
            activation_fn=activation_fn,
            ortho_init=ortho_init,
        ),
    }

### Trial

In [4]:
class TrialEvalCallback(EvalCallback):
    """Callback used for evaluating and reporting a trial."""

    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose
        )
        
        self.eval_env = VecTransposeImage(self.eval_env)
        
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:        
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            super()._on_step()
            self.eval_idx += 1
            self.trial.report(self.last_mean_reward, self.eval_idx)
            # Prune trial if need.
            if self.trial.should_prune():
                self.is_pruned = True
                return False
        return True

### Objective

In [5]:
#Source: https://github.com/optuna/optuna-examples/blob/main/rl/sb3_simple.py
def Objective(trial):
    #Create Envs
    envAmt = 8
    envSeed = 47

    # Create train Env.   
    env_kwargs = {
        "clip_reward ": True,           
    }
    

    train_env = make_atari_env(ENV_ID,n_envs=envAmt, seed=envSeed,wrapper_kwargs=env_kwargs)
    train_env = VecFrameStack(train_env, n_stack=envAmt)
 
    # Create Eval Env.
    eval_env = make_atari_env(ENV_ID,n_envs=envAmt, seed=envSeed,wrapper_kwargs=env_kwargs)
    eval_env = VecFrameStack(eval_env, n_stack=envAmt)


    # Create the RL model.
    kwargs = DEFAULT_HYPERPARAMS.copy()
    kwargs.update(Sample_A2C_Params(trial))

    model = A2C(env= train_env,**kwargs)

  
    
    # Create the callback that will periodically evaluate and report the performance.
    eval_callback = TrialEvalCallback(
        eval_env, trial, n_eval_episodes=N_EVAL_EPISODES, eval_freq=EVAL_FREQ, deterministic=True
    )

    nan_encountered = False
    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN.
        print(e)
        nan_encountered = True          

    #Check closing conditions
    if nan_encountered:       
        model.env.close()
        eval_env.close()      
        return float("nan")

    if eval_callback.is_pruned:
        model.env.close()
        eval_env.close()  
        raise optuna.exceptions.TrialPruned()
    
    #mean_reward, std_reward  = evaluate_policy(model, eval_env, n_eval_episodes=100, render=False)
    #model.env.close()
    #eval_env.close()      

    mean_reward = eval_callback.last_mean_reward
    model.env.close()
    eval_env.close()  

    return mean_reward

### Start Study

In [6]:
#Sampler
sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)

#Do not prune trials
#pruner = NopPruner()

pruner = optuna.pruners.SuccessiveHalvingPruner()

study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")
try:
    print("Starting Optimize")
    study.optimize(Objective, n_trials=N_TRIALS, n_jobs=N_JOBS)
except KeyboardInterrupt:
    pass

[32m[I 2023-05-29 06:35:50,670][0m A new study created in memory with name: no-name-71a94015-7e69-4732-98a3-bb03b45eaea2[0m


Starting Optimize


[32m[I 2023-05-29 07:28:02,305][0m Trial 0 finished with value: 20.0 and parameters: {'gamma': 0.95, 'normalize_advantage': False, 'max_grad_norm': 0.7, 'use_rms_prop': False, 'gae_lambda': 1.0, 'n_steps': 1024, 'lr_schedule': 'linear', 'learning_rate': 0.03022184829704538, 'ent_coef': 8.892231349916246e-05, 'vf_coef': 0.02308667368409789, 'ortho_init': False, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 0 with value: 20.0.[0m
[32m[I 2023-05-29 07:30:18,241][0m Trial 1 finished with value: 0.0 and parameters: {'gamma': 0.99, 'normalize_advantage': False, 'max_grad_norm': 5, 'use_rms_prop': False, 'gae_lambda': 0.99, 'n_steps': 1024, 'lr_schedule': 'linear', 'learning_rate': 0.00022027695743248512, 'ent_coef': 4.561855831069508e-08, 'vf_coef': 0.9987937970712781, 'ortho_init': True, 'net_arch': 'medium', 'activation_fn': 'relu'}. Best is trial 0 with value: 20.0.[0m
[32m[I 2023-05-29 07:37:04,745][0m Trial 2 finished with value: 0.0 and parameters: {'gamma': 0.99

RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 1849688064 bytes.

### Display Info

In [None]:
print("Number of finished trials: ", len(study.trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

#print("  User attrs:")
#for key, value in trial.user_attrs.items():
    #print("    {}: {}".format(key, value))

Number of finished trials:  100
Best trial:
  Value:  172.2
  Params: 
    gamma: 0.98
    normalize_advantage: False
    max_grad_norm: 0.7
    use_rms_prop: True
    gae_lambda: 0.8
    n_steps: 128
    lr_schedule: constant
    learning_rate: 0.001815853309651432
    ent_coef: 1.4242865045217283e-05
    vf_coef: 0.32837777846078803
    ortho_init: True
    net_arch: small
    activation_fn: relu


In [None]:
try:
    fig1 = plot_optimization_history(study)
    fig2 = plot_param_importances(study)
    fig3 = plot_parallel_coordinate(study)

    fig1.show()
    fig2.show()
    fig3.show()

except (ValueError, ImportError, RuntimeError) as e:
    print("Error during plotting")
    print(e)

### Test

In [None]:
kwargs = DEFAULT_HYPERPARAMS.copy()
#kwargs.update(Sample_A2C_Params(trial))

In [None]:
# Create train Env.
envAmt = 8
train_env = make_atari_env(ENV_ID,n_envs=envAmt, seed=0)
train_env = VecFrameStack(train_env, n_stack=envAmt)

In [None]:
#optunaModel = A2C(policy="CnnPolicy",env=train_env,verbose=0)

optunaModel = A2C(env= train_env,**kwargs)

In [None]:
optunaModel.learn(N_TIMESTEPS)

<stable_baselines3.a2c.a2c.A2C at 0x1aa5bab5910>

In [None]:
obs = train_env.reset()
while True:
    action, _states = optunaModel.predict(obs)
    obs, rewards, dones, info = train_env.step(action)
    train_env.render()

KeyboardInterrupt: 

In [None]:
mean_reward, std_reward  = evaluate_policy(optunaModel, train_env, n_eval_episodes=10, render=False)
print(mean_reward)
print(std_reward)

2.0
0.0


In [None]:
# Create env used for evaluation.
eval_env = make_atari_env(ENV_ID,n_envs=envAmt, seed=0)
eval_env = VecFrameStack(eval_env, n_stack=envAmt)
#eval_env = Monitor(eval_env)

In [None]:
mean_reward, std_reward  = evaluate_policy(optunaModel, eval_env, n_eval_episodes=10, render=False)
print(mean_reward)
print(std_reward)

In [None]:
# Create the callback that will periodically evaluate and report the performance.
eval_callback = TrialEvalCallback(
    eval_env, trial, n_eval_episodes=N_EVAL_EPISODES, eval_freq=EVAL_FREQ, deterministic=True
)

In [None]:
try:
    optunaModel.learn(N_TIMESTEPS, callback=eval_callback)
except AssertionError as e:
    # Sometimes, random hyperparams can generate NaN.
    print(e)

In [None]:
if eval_callback.is_pruned:
        # Free memory.
    model.env.close()
    eval_env.close()
    raise optuna.exceptions.TrialPruned()

In [None]:
# Free memory.
model.env.close()
eval_env.close()

In [None]:
print(eval_callback.last_mean_reward)