# Tuning of all Algorithms

In [1]:
import os

import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler

from stable_baselines3 import TD3
from stable_baselines3 import A2C
from stable_baselines3 import SAC
from stable_baselines3 import DDPG
from stable_baselines3 import PPO
from sb3_contrib import TQC
from sb3_contrib import TRPO
from sb3_contrib import RecurrentPPO

from stable_baselines3.common.noise import NormalActionNoise,OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.monitor import Monitor
from gym.wrappers import RecordEpisodeStatistics
from wandb.integration.sb3 import WandbCallback
from gym.envs.registration import register
from stable_baselines3.common.env_checker import check_env

import torch.nn as nn
import torch
import subprocess
from rl_zoo3 import linear_schedule

import numpy as np
import wandb

from gym import make

import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

## Register the Environment 

In [2]:
   
register(
    id="VPPBiddingEnv-TRAIN-v1",
    entry_point='vpp-gym.vpp_gym.envs.vpp_env:VPPBiddingEnv',
    max_episode_steps=1,
    kwargs={'config_path': "vpp_config_4.json",
            'log_level' : "DEBUG", # "DEBUG" , "INFO" or  "WARNING"
            'env_type' :"training",
            'render_mode' :"human", # "human", "fast_training" or None
           }
)

register(
    id="VPPBiddingEnv-TUNING-v1",
    entry_point='vpp-gym.vpp_gym.envs.vpp_env:VPPBiddingEnv',
    max_episode_steps=1,
    kwargs={'config_path': "vpp_config_4.json",
            'log_level' : "WARNING", # "DEBUG" , "INFO" or  "WARNING"
            'env_type' :"training",
            'render_mode' :"fast_training", # "human", "fast_training" or None
           }
)


register(
    id="VPPBiddingEnv-EVAL-v1",
    entry_point='vpp-gym.vpp_gym.envs.vpp_env:VPPBiddingEnv',
    max_episode_steps=1,
    kwargs={'config_path': "vpp_config_4.json",
            'log_level' : "DEBUG", # "DEBUG" , "INFO" or  "WARNING"
            'env_type' :"eval",
            'render_mode' :"human", # "human", "fast_training" or None
           }
)

register(
    id="VPPBiddingEnv-TUNING-EVAL-v1",
    entry_point='vpp-gym.vpp_gym.envs.vpp_env:VPPBiddingEnv',
    max_episode_steps=1,
    kwargs={'config_path': "vpp_config_4.json",
            'log_level' : "WARNING", # "DEBUG" , "INFO" or  "WARNING"
            'env_type' :"eval",
            'render_mode' :"fast_training", # "human", "fast_training" or None
           }
)

register(
    id="VPPBiddingEnv-TEST-v1",
    entry_point='vpp-gym.vpp_gym.envs.vpp_env:VPPBiddingEnv',
    max_episode_steps=1,
    kwargs={'config_path': "vpp_config_4.json",
            'log_level' : "INFO", # "DEBUG" , "INFO" or  "WARNING"
            'env_type' :"test",
            'render_mode' :"human", # "human", "fast_training" or None
           }
)

## Test the environment

In [3]:
# It will check your custom environment and output additional warnings if needed
env_to_check = make('VPPBiddingEnv-TEST-v1', render_mode=None)
check_env(env_to_check)

log_step: initial // slot: initial  log level = info
log_step: 1 slot: None logging_step: 1
log_step: 1 slot: None Bid Submission time (D-1) = 2020-07-02 05:00:00+00:00
log_step: 1 slot: None Gate Closure time (D-1) = 2020-07-02 06:00:00+00:00
log_step: 1 slot: None Historic Data Window: from 2020-07-01 05:00:00+00:00 to 2020-07-02 04:45:00+00:00 
log_step: 1 slot: None Forecast Data Window: from 2020-07-02 22:00:00+00:00 to 2020-07-03 21:45:00+00:00 
log_step: 1 slot: 0 Current Slot Time: (D) = 2020-07-02 22:00:00+00:00
log_step: 1 slot: 0 agents_bid_size = 14
log_step: 1 slot: 0 agents_bid_price = 1867.2778
log_step: 1 slot: 0 settlement_price_DE : 16.67
log_step: 1 slot: 0 self.activation_results['slots_won'] = 
log_step: 1 slot: 0
slot won: 	-1 
slot won: 	None 
slot won: 	None 
slot won: 	None 
slot won: 	None 
slot won: 	None
log_step: 1 slot: 0      agents bid_size = 
log_step: 1 slot: 0
size: 	14 
size: 	62 
size: 	60 
size: 	126 
size: 	73 
size: 	80
log_step: 1 slot: 0 self.a

## Globals 

In [4]:
EXPERIMENT_TAGS = ["tuning_3"]
EXPERIMENT_TIMESTEPS = 2785 #2785 #how many episodes to train
N_TRIALS = 2 #20 #how many experiments to run
N_STARTUP_TRIALS = 2 # 20 #how long to use random sampling before using TPESampler

## Offline Training and later sync logs

In [5]:
os.environ["WANDB_API_KEY"] = "0cea1eee5f42654eca0de365f0acca116367c9b4"
os.environ["WANDB_MODE"] = "offline"

## Tuning of Algorithms 

### HER

In [6]:
def sample_her_params(trial,hyperparams):
    """
    Sampler for HerReplayBuffer hyperparams.
    :param trial:
    :parma hyperparams:
    :return:
    """
    her_kwargs = trial.her_kwargs.copy()
    her_kwargs["n_sampled_goal"] = trial.suggest_int("n_sampled_goal", 1, 5)
    her_kwargs["goal_selection_strategy"] = trial.suggest_categorical(
        "goal_selection_strategy", ["final", "episode", "future"]
    )
    her_kwargs["online_sampling"] = trial.suggest_categorical("online_sampling", [True, False])
    hyperparams["replay_buffer_kwargs"] = her_kwargs
    return hyperparams


In [None]:
HYPERPARAMS_SAMPLER = {
    "A2C": sample_a2c_params,
    #"ARS": sample_ars_params,
    "DDPG": sample_ddpg_params,
    #"DQN": sample_dqn_params,
    #"QRDQN": sample_qrdqn_params,
    "SAC": sample_sac_params,
    "TQC": sample_tqc_params,
    "PPO": sample_ppo_params,
    "R_PPO": sample_rppo_params,
    "TD3": sample_td3_params,
    "TRPO": sample_trpo_params,
}

ALGOS = {
    "A2C": A2C,
    "DDPG": DDPG,
    #"DQN": DQN,
    "PPO": PPO,
    "SAC": SAC,
    "TD3": TD3,
    # SB3 Contrib,
    #"ARS": ARS,
    #"QRDQN": QRDQN,
    "TQC": TQC,
    "TRPO": TRPO,
    "R_PPO": RecurrentPPO,
}

In [9]:

# Set pytorch num threads to 1 for faster training
torch.set_num_threads(1)

sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)

# Do not prune before 1/3 of the max budget is used
pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=EXPERIMENT_TIMESTEPS // 3)


In [21]:
def optimize_agent(trial):
    """ Train the model and optimize
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
    algo = study.user_attrs["algo"]
    nan_encountered = False
    try: 
        

        model_params = HYPERPARAMS_SAMPLER[algo](trial)

        # init tracking experiment.
        # hyper-parameters, trial id are stored.
        config = dict(trial.params)
        config["trial.number"] = trial.number
        wandb.init(
            project="RL-optuna",
            entity="jlu237", 
            sync_tensorboard=True,
            config=config,
            tags=[algo] + EXPERIMENT_TAGS,
            reinit=True
        )

        env = make('VPPBiddingEnv-TUNING-v1')
        env = Monitor(env) 
        env = RecordEpisodeStatistics(env) # record stats such as returns
        
        if algo == "R_PPO": 
            model = ALGOS[algo]('MultiInputLstmPolicy', env, verbose=0, seed = 1, **model_params)
        else:
            model = ALGOS[algo]('MultiInputPolicy', env, verbose=0,  seed = 1, **model_params)
            
        print(model_params)
    
        # -------------- TRAINING -----------------
        model.learn(total_timesteps=EXPERIMENT_TIMESTEPS,
                    log_interval=1,
                    progress_bar = True,
                    callback=WandbCallback(
                        gradient_save_freq=1,
                        verbose=0))
        
        # -------------- EVALUATION -----------------
        eval_env = make('VPPBiddingEnv-TUNING-EVAL-v1')
        eval_env = RecordEpisodeStatistics(eval_env) # record stats such as returns
        episodes = 140
        for i_episode in range(episodes):
            observation = eval_env.reset()
            if algo == "R_PPO":
                lstm_states = None
                num_envs = 1
                # Episode start signals are used to reset the lstm states
                episode_starts = np.ones((num_envs,), dtype=bool)
                for t in range(1):
                    eval_env.render()
                    #logging.debug("observation : " + str(observation), extra={'log_step': str(i_episode), 'slot': 'test'})
                    action, lstm_states = model.predict(observation, state=lstm_states, episode_start=episode_starts, deterministic=True)
                    observation, reward, dones, info = eval_env.step(action)
                    episode_starts = dones
                    if dones:
                        break
            else: 
                for t in range(1):
                    eval_env.render()
                    #logging.debug("observation : " + str(observation), extra={'log_step': str(i_episode), 'slot': 'test'})
                    action, _states = model.predict(observation, deterministic = True)
                    observation, reward, done, info = eval_env.step(action)
                    if done:
                        break
        total_reward_test = info["total_reward"]
        total_profit_test = info["total_profit"]

        mean_episode_reward_test = info["total_reward"] / episodes
        mean_episode_profit_test = info["total_profit"] / episodes

        print("Total Reward on Test Set: " + str(total_reward_test))
        print("Total Profit on Test Set: " + str(total_profit_test))
        print("Mean Episode Reward: " + str(mean_episode_reward_test))
        print("Mean Episode Profit: " + str(mean_episode_profit_test))

        wandb.log({"total_reward_test": total_reward_test, 
                   "total_profit_test": total_profit_test, 
                   "mean_episode_reward_test": mean_episode_reward_test,
                   "mean_episode_profit_test": mean_episode_profit_test,
                })
        wandb.finish()
        eval_env.close()

        return_code = subprocess.run("wandb sync wandb/latest-run", shell=True)
        
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN
        print(e)
        nan_encountered = True
        
    finally:        
        # Free memory
        env.close()
        
    if nan_encountered: 
        return float("nan")

    return total_reward_test

### A2C 

In [23]:
def sample_a2c_params(trial: optuna.Trial):
    """
    Sampler for a2c hyperparams.
    :param trial:
    :return:
    """
    
    trial.using_her_replay_buffer = False
    
    n_steps_list = []
    n_step = round(EXPERIMENT_TIMESTEPS/3)

    while n_step > 1:
        n_steps_list.append(n_step)
        n_step = round(n_step/3)
    
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    normalize_advantage = trial.suggest_categorical("normalize_advantage", [False, True])
    max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5])
    # Toggle PyTorch RMS Prop (different from TF one, cf doc)
    use_rms_prop = trial.suggest_categorical("use_rms_prop", [False, True])
    gae_lambda = trial.suggest_categorical("gae_lambda", [0.7, 0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    #n_steps = trial.suggest_categorical("n_steps", n_steps_list)
    n_steps = trial.suggest_categorical("n_steps", [2,3,4,5,6,7,8,9,10])
    #n_steps = 2
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
    ent_coef = trial.suggest_categorical('ent_coef', [0.1, 0.05, 0.025, 0.01, 0.001, 0.0001,  0.00000001])
    vf_coef = trial.suggest_uniform("vf_coef", 0.3, 1)
    
    # ------- policy_kwargs --------
    lr_schedule = trial.suggest_categorical("lr_schedule", ["linear", "constant"])
    use_sde = trial.suggest_categorical("use_sde", [False, True])
    if use_sde is True:
        sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 0, 1, 2, 3])
        log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
    sde_net_arch = trial.suggest_categorical("sde_net_arch", [None, "tiny", "small"])
    full_std = trial.suggest_categorical("full_std", [False, True])
    activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    ortho_init = trial.suggest_categorical("ortho_init", [False, True])

    # NOTE: Add "verybig" to net_arch when tuning HER
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"])
    # ------- policy_kwargs --------

    if lr_schedule == "linear":
        learning_rate = linear_schedule(learning_rate)

    net_arch = {
        "small": [dict(pi=[64, 64], vf=[64, 64])],
        "medium": [dict(pi=[256, 256], vf=[256, 256])],
        "big": [dict(pi=[400, 400], vf=[400, 400])],
    }[net_arch]
    
    sde_net_arch = {
         None: None,
         "tiny": [64],
         "small": [64, 64],
    }[sde_net_arch]

    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]
    
    hyperparams = {
        "n_steps": n_steps,
        "gamma": gamma,
        "gae_lambda": gae_lambda,
        "learning_rate": learning_rate,
        "ent_coef": ent_coef,
        "normalize_advantage": normalize_advantage,
        "max_grad_norm": max_grad_norm,
        "use_rms_prop": use_rms_prop,
        "use_sde": use_sde,
        "vf_coef": vf_coef,
        "policy_kwargs": dict(
            net_arch=net_arch,
            full_std=full_std,
            activation_fn=activation_fn,
            sde_net_arch=sde_net_arch,
            ortho_init=ortho_init,
        ),
    }
    
    if trial.using_her_replay_buffer:
        hyperparams = sample_her_params(trial, hyperparams)
        
    if use_sde is True:
        hyperparams["sde_sample_freq"] = sde_sample_freq
        hyperparams["policy_kwargs"]["log_std_init"] = log_std_init
 
    return hyperparams

In [24]:
HYPERPARAMS_SAMPLER = {
    "A2C": sample_a2c_params,
    #"ARS": sample_ars_params,
    ##"DDPG": sample_ddpg_params,
    #"DQN": sample_dqn_params,
    #"QRDQN": sample_qrdqn_params,
    ##"SAC": sample_sac_params,
    ##"TQC": sample_tqc_params,
    ##"PPO": sample_ppo_params,
    ##"R_PPO": sample_rppo_params,
    ##"TD3": sample_td3_params,
    ##"TRPO": sample_trpo_params,
}

ALGOS = {
    "A2C": A2C,
    ##"DDPG": DDPG,
    #"DQN": DQN,
    ##"PPO": PPO,
    ##"SAC": SAC,
    ##"TD3": TD3,
    # SB3 Contrib,
    #"ARS": ARS,
    #"QRDQN": QRDQN,
    ##"TQC": TQC,
    ##"TRPO": TRPO,
    ##"R_PPO": RecurrentPPO,
}

In [26]:
study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")
study.set_user_attr("algo", "A2C")

try:
    study.optimize(optimize_agent, n_trials=N_TRIALS, timeout=10800)

except KeyboardInterrupt:
    print('Interrupted by keyboard.')

[32m[I 2022-11-15 15:44:57,605][0m A new study created in memory with name: no-name-033c319d-9379-40ff-bac6-7d191b13cdbb[0m


Output()

{'n_steps': 3, 'gamma': 0.995, 'gae_lambda': 0.95, 'learning_rate': 0.9802493533201759, 'ent_coef': 0.001, 'normalize_advantage': False, 'max_grad_norm': 0.9, 'use_rms_prop': False, 'use_sde': True, 'vf_coef': 0.7502462221465176, 'policy_kwargs': {'net_arch': [{'pi': [256, 256], 'vf': [256, 256]}], 'full_std': False, 'activation_fn': <class 'torch.nn.modules.activation.ReLU'>, 'sde_net_arch': None, 'ortho_init': True, 'log_std_init': -2.7341152711700327}, 'sde_sample_freq': -1}


[33m[W 2022-11-15 15:46:12,764][0m Trial 0 failed because of the following error: ValueError('Expected parameter scale (Tensor of shape (256, 12)) of distribution Normal(loc: torch.Size([256, 12]), scale: torch.Size([256, 12])) to satisfy the constraint GreaterThan(lower_bound=0.0), but found invalid values:\ntensor([[nan, nan, nan,  ..., nan, nan, nan],\n        [nan, nan, nan,  ..., nan, nan, nan],\n        [nan, nan, nan,  ..., nan, nan, nan],\n        ...,\n        [nan, nan, nan,  ..., nan, nan, nan],\n        [nan, nan, nan,  ..., nan, nan, nan],\n        [nan, nan, nan,  ..., nan, nan, nan]], grad_fn=<MulBackward0>)')[0m
Traceback (most recent call last):
  File "/Users/Jan-Lukas.Pflaum/.virtualenvs/thesis/lib/python3.8/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/8y/13f6tls56kz7j9jbb8j_dy7r0000gq/T/ipykernel_10274/1375961096.py", line 38, in optimize_agent
    model.learn(total_timesteps=EXPERIMENT_T

ValueError: Expected parameter scale (Tensor of shape (256, 12)) of distribution Normal(loc: torch.Size([256, 12]), scale: torch.Size([256, 12])) to satisfy the constraint GreaterThan(lower_bound=0.0), but found invalid values:
tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]], grad_fn=<MulBackward0>)

In [None]:
# backup 

%%time

def sample_a2c_params(trial: optuna.Trial):
    """
    Sampler for a2c hyperparams.
    :param trial:
    :return:
    """
    
    trial.using_her_replay_buffer = False
    
    n_steps_list = []
    n_step = round(EXPERIMENT_TIMESTEPS/3)

    while n_step > 1:
        n_steps_list.append(n_step)
        n_step = round(n_step/3)
    
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    normalize_advantage = trial.suggest_categorical("normalize_advantage", [False, True])
    max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5])
    # Toggle PyTorch RMS Prop (different from TF one, cf doc)
    use_rms_prop = trial.suggest_categorical("use_rms_prop", [False, True])
    gae_lambda = trial.suggest_categorical("gae_lambda", [0.7, 0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    #n_steps = trial.suggest_categorical("n_steps", n_steps_list)
    n_steps = trial.suggest_categorical("n_steps", [2,3,4,5,6,7,8,9,10])
    #n_steps = 2
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
    ent_coef = trial.suggest_categorical('ent_coef', [0.1, 0.05, 0.025, 0.01, 0.001, 0.0001,  0.00000001])
    vf_coef = trial.suggest_uniform("vf_coef", 0.3, 1)
    
    # ------- policy_kwargs --------
    lr_schedule = trial.suggest_categorical("lr_schedule", ["linear", "constant"])
    use_sde = trial.suggest_categorical("use_sde", [False, True])
    if use_sde is True:
        sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 0, 1, 2, 3])
        log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
    sde_net_arch = trial.suggest_categorical("sde_net_arch", [None, "tiny", "small"])
    full_std = trial.suggest_categorical("full_std", [False, True])
    activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    ortho_init = trial.suggest_categorical("ortho_init", [False, True])

    # NOTE: Add "verybig" to net_arch when tuning HER
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"])
    # ------- policy_kwargs --------

    if lr_schedule == "linear":
        learning_rate = linear_schedule(learning_rate)

    net_arch = {
        "small": [dict(pi=[64, 64], vf=[64, 64])],
        "medium": [dict(pi=[256, 256], vf=[256, 256])],
        "big": [dict(pi=[400, 400], vf=[400, 400])],
    }[net_arch]
    
    sde_net_arch = {
         None: None,
         "tiny": [64],
         "small": [64, 64],
    }[sde_net_arch]

    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]
    
    hyperparams = {
        "n_steps": n_steps,
        "gamma": gamma,
        "gae_lambda": gae_lambda,
        "learning_rate": learning_rate,
        "ent_coef": ent_coef,
        "normalize_advantage": normalize_advantage,
        "max_grad_norm": max_grad_norm,
        "use_rms_prop": use_rms_prop,
        "use_sde": use_sde,
        "vf_coef": vf_coef,
        "policy_kwargs": dict(
            net_arch=net_arch,
            full_std=full_std,
            activation_fn=activation_fn,
            sde_net_arch=sde_net_arch,
            ortho_init=ortho_init,
        ),
    }
    
    if trial.using_her_replay_buffer:
        hyperparams = sample_her_params(trial, hyperparams)
        
    if use_sde is True:
        hyperparams["sde_sample_freq"] = sde_sample_freq
        hyperparams["policy_kwargs"]["log_std_init"] = log_std_init
 
    return hyperparams


    
def optimize_agent_a2c(trial):
    """ Train the model and optimize
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
    
    nan_encountered = False
    try: 
        model_params = sample_a2c_params(trial)

        # init tracking experiment.
        # hyper-parameters, trial id are stored.
        config = dict(trial.params)
        config["trial.number"] = trial.number
        wandb.init(
            project="RL-optuna",
            entity="jlu237", 
            sync_tensorboard=True,
            config=config,
            tags=["A2C"] + EXPERIMENT_TAGS,
            reinit=True
        )

        env = make('VPPBiddingEnv-TUNING-v1')
        env = Monitor(env) 
        env = RecordEpisodeStatistics(env) # record stats such as returns
        model = A2C('MultiInputPolicy', env, verbose=0,  seed = 1, **model_params)
        print(model_params)
    
        # -------------- TRAINING -----------------
        model.learn(total_timesteps=EXPERIMENT_TIMESTEPS,
                    log_interval=1,
                    progress_bar = True,
                    callback=WandbCallback(
                        gradient_save_freq=1,
                        verbose=0))
        
        # -------------- EVALUATION -----------------
        eval_env = make('VPPBiddingEnv-TUNING-EVAL-v1')
        eval_env = RecordEpisodeStatistics(eval_env) # record stats such as returns
        episodes = 140
        for i_episode in range(episodes):
            observation = eval_env.reset()
            for t in range(1):
                eval_env.render()
                #logging.debug("observation : " + str(observation), extra={'log_step': str(i_episode), 'slot': 'test'})
                action, _states = model.predict(observation, deterministic = True)
                observation, reward, done, info = eval_env.step(action)
                if done:
                    break
        total_reward_test = info["total_reward"]
        total_profit_test = info["total_profit"]
        total_monthly_profit_test = info["total_profit_monthly"]

        mean_episode_reward_test = info["total_reward"] / episodes
        mean_episode_profit_test = info["total_profit"] / episodes

        print("Total Reward on Test Set: " + str(total_reward_test))
        print("Total Profit on Test Set: " + str(total_profit_test))
        print("Total Monthly Profit on Test Set: " + str(total_monthly_profit_test))
        print("Mean Episode Reward: " + str(mean_episode_reward_test))
        print("Mean Episode Profit: " + str(mean_episode_profit_test))

        wandb.log({"total_reward_test": total_reward_test, 
                   "total_profit_test": total_profit_test, 
                   "total_monthly_profit_test": total_monthly_profit_test, 
                   "mean_episode_reward_test": mean_episode_reward_test,
                   "mean_episode_profit_test": mean_episode_profit_test,
                })
        
        wandb.finish()
        eval_env.close()

        return_code = subprocess.run("wandb sync wandb/latest-run", shell=True)
        
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN
        print(e)
        nan_encountered = True
        
    finally:        
        # Free memory
        env.close()
        
    if nan_encountered: 
        return float("nan")

    return total_reward_test

    
# Set pytorch num threads to 1 for faster training
torch.set_num_threads(1)

sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)

# Do not prune before 1/3 of the max budget is used
pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=EXPERIMENT_TIMESTEPS // 3)

#study = optuna.create_study()
study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")

try:
    study.optimize(optimize_agent(algo="A2C"), n_trials=N_TRIALS, timeout=10800)
except KeyboardInterrupt:
    print('Interrupted by keyboard.')

### Tuning TD3

In [None]:
def sample_td3_params(trial: optuna.Trial):
    """
    Sampler for TD3 hyperparams.
    :param trial:
    :return:
    """
    
    trial.using_her_replay_buffer = False

    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 100, 128, 200])
    buffer_size = trial.suggest_categorical("buffer_size", [int(1e4), int(1e5), int(1e6)])
    tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02, 0.05, 0.08])
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])

    train_freq = trial.suggest_categorical("train_freq", [1, 2, 8, 32])
    #gradient_steps = train_freq 
    gradient_steps = trial.suggest_categorical("gradient_steps", [-1, 1, 2, 8, 32, 256])
    learning_starts = trial.suggest_categorical('learning_starts', [0, 1, 10, 20, 100, 200]) 
    noise_type = trial.suggest_categorical("noise_type", ["ornstein-uhlenbeck", "normal", None])
    noise_std = trial.suggest_uniform("noise_std", 0, 1)
    
    policy_delay = trial.suggest_categorical("policy_delay", [ 1, 2, 5])
    target_policy_noise = trial.suggest_categorical("target_policy_noise", [0.1, 0.2, 0.3])

    if trial.using_her_replay_buffer: 
        net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big", "verybig"])
    else:
        net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"])
    activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    
    net_arch = {
        "small": [64, 64],
        "medium": [256, 256],
        "big": [400, 300],
        "verybig": [256, 256, 256],
    }[net_arch]

    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]

    hyperparams = {
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "buffer_size": buffer_size,
        "tau": tau,
        "gamma": gamma,
        "train_freq": train_freq,
        "gradient_steps": gradient_steps,
        "learning_starts": learning_starts,
        "policy_delay" : policy_delay,
        "target_policy_noise": target_policy_noise,
        "policy_kwargs": dict(
            net_arch=net_arch, 
            activation_fn=activation_fn
        ),
    }
    
    n_actions = 12      
    if noise_type == "normal":
        hyperparams["action_noise"] = NormalActionNoise(
            mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)
        )
    elif noise_type == "ornstein-uhlenbeck":
        hyperparams["action_noise"] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)
        )

    return hyperparams


def optimize_agent_td3(trial):
    """ Train the model and optimize
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
   
    
    model_params = sample_td3_params(trial)
    
    config = dict(trial.params)
    config["trial.number"] = trial.number
    wandb.init(
        project="RL-optuna",
        entity="jlu237", 
        sync_tensorboard=True,
        config=config,
        tags=["TD3"]+EXPERIMENT_TAGS,
        reinit=True
    )
    
    env = make('VPPBiddingEnv-TUNING-v1')
    env = Monitor(env) 
    env = RecordEpisodeStatistics(env) # record stats such as returns
    model = TD3('MultiInputPolicy', env, verbose=0, seed = 1, **model_params)
    print(model_params)
    model.learn(total_timesteps=EXPERIMENT_TIMESTEPS,
                log_interval=1,
                progress_bar = True,
                callback=WandbCallback(
                    gradient_save_freq=1,
                    verbose=0))
    
    # -------------- EVALUATION -----------------
    eval_env = make('VPPBiddingEnv-TUNING-EVAL-v1')
    eval_env = RecordEpisodeStatistics(eval_env) # record stats such as returns
    episodes = 140
    for i_episode in range(episodes):
        observation = eval_env.reset()
        for t in range(1):
            eval_env.render()
            #logging.debug("observation : " + str(observation), extra={'log_step': str(i_episode), 'slot': 'test'})
            action, _states = model.predict(observation, deterministic = True)
            observation, reward, done, info = eval_env.step(action)
            if done:
                break

    eval_env.close()
    
    total_reward_test = info["total_reward"]
    total_profit_test = info["total_profit"]

    mean_episode_reward_test = info["total_reward"] / episodes
    mean_episode_profit_test = info["total_profit"] / episodes

    print("Total Reward on Test Set: " + str(total_reward_test))
    print("Total Profit on Test Set: " + str(total_profit_test))
    print("Mean Episode Reward: " + str(mean_episode_reward_test))
    print("Mean Episode Profit: " + str(mean_episode_profit_test))
    
    wandb.log({"total_reward_test": total_reward_test, 
               "total_profit_test": total_profit_test, 
               "mean_episode_reward_test": mean_episode_reward_test,
               "mean_episode_profit_test": mean_episode_profit_test,
            })
    wandb.finish()
    
    return_code = subprocess.run("wandb sync wandb/latest-run", shell=True)
    
    return total_reward_test
    
    
# Set pytorch num threads to 1 for faster training
torch.set_num_threads(1)

sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)

# Do not prune before 1/3 of the max budget is used
pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=EXPERIMENT_TIMESTEPS // 3)

#study = optuna.create_study()
study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")

try:
    study.optimize(optimize_agent_td3, n_trials=N_TRIALS, timeout=10800)
except KeyboardInterrupt:
    print('Interrupted by keyboard.')

### Tuning SAC

In [None]:
def sample_sac_params(trial: optuna.Trial):
    """
    Sampler for SAC hyperparams.
    :param trial:
    :return:
    """
    
    trial.using_her_replay_buffer = False

    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 100, 128, 200])
    buffer_size = trial.suggest_categorical("buffer_size", [int(1e4), int(1e5), int(1e6)])
    tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02, 0.05, 0.08])
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    train_freq = trial.suggest_categorical("train_freq", [1, 2, 8, 32])
    # gradient_steps takes too much time
    gradient_steps = trial.suggest_categorical("gradient_steps", [-1, 1, 2, 8, 32, 256])
    learning_starts = trial.suggest_categorical('learning_starts', [0, 1, 10, 20, 100, 200]) 

    noise_type = trial.suggest_categorical("noise_type", ["ornstein-uhlenbeck", "normal", None])
    noise_std = trial.suggest_uniform("noise_std", 0, 1)
    policy_delay = trial.suggest_categorical("policy_delay", [ 1, 2, 5])
    target_policy_noise = trial.suggest_categorical("target_policy_noise", [0.1, 0.2, 0.3])
       
    ent_coef = trial.suggest_categorical('ent_coef', ['auto',  'auto_0.1', 0.5, 0.1, 0.05, 0.01, 0.0001])

    if ent_coef == 'auto' or 'auto_0.1':
        target_entropy = trial.suggest_categorical('target_entropy', ['auto', 10 , 5, 1, 0, -1, -5, -10])

    if trial.using_her_replay_buffer: 
        net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big", "verybig"])
    else:
        net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"])
    
    
    net_arch = {
        "small": [64, 64],
        "medium": [256, 256],
        "big": [400, 300],
        "verybig": [256, 256, 256],
    }[net_arch]


    # ------- policy_kwargs --------
    use_sde = trial.suggest_categorical("use_sde", [False, True])
    if use_sde is True:
        sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 0, 1, 2, 3])
        log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
    
    #activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu'])
    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]
    # --------------------
 
    hyperparams = {
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "buffer_size": buffer_size,
        "learning_starts": learning_starts,
        "train_freq": train_freq,
        "gradient_steps": gradient_steps,
        "ent_coef": ent_coef,
        "tau": tau,
        "gamma": gamma,
        "policy_kwargs": dict(net_arch=net_arch, 
                              activation_fn=activation_fn
                             ),
    }
    
    if trial.using_her_replay_buffer:
        hyperparams = sample_her_params(trial, hyperparams)
        
    if use_sde is True:
        hyperparams["sde_sample_freq"] = sde_sample_freq
        hyperparams["policy_kwargs"]["log_std_init"] = log_std_init

        
    if ent_coef == 'auto' or '‘auto_0.1':
        hyperparams["target_entropy"] = target_entropy

    n_actions = 12      
    if noise_type == "normal":
        hyperparams["action_noise"] = NormalActionNoise(
            mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)
        )
    elif noise_type == "ornstein-uhlenbeck":
        
        hyperparams["action_noise"] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)
        )

    return hyperparams


def optimize_agent_sac(trial):
    """ Train the model and optimize
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
    
    nan_encountered = False
    try: 
        model_params = sample_sac_params(trial)

        # init tracking experiment.
        # hyper-parameters, trial id are stored.
        config = dict(trial.params)
        config["trial.number"] = trial.number
        wandb.init(
            project="RL-optuna",
            entity="jlu237", 
            sync_tensorboard=True,
            config=config,
            tags=["SAC"]+EXPERIMENT_TAGS,
            reinit=True
        )

        env = make('VPPBiddingEnv-TUNING-v1')
        env = Monitor(env) 
        env = RecordEpisodeStatistics(env) # record stats such as returns
        model = SAC('MultiInputPolicy', env, verbose=0, seed = 1, **model_params)
        print(model_params)
    
        # -------------- TRAINING -----------------
        model.learn(total_timesteps=EXPERIMENT_TIMESTEPS,
                    log_interval=1,
                    progress_bar = True,
                    callback=WandbCallback(
                        gradient_save_freq=1,
                        verbose=0))
        
        # -------------- EVALUATION -----------------
        eval_env = make('VPPBiddingEnv-TUNING-EVAL-v1')
        eval_env = RecordEpisodeStatistics(eval_env) # record stats such as returns
        episodes = 140
        for i_episode in range(episodes):
            observation = eval_env.reset()
            for t in range(1):
                eval_env.render()
                #logging.debug("observation : " + str(observation), extra={'log_step': str(i_episode), 'slot': 'test'})
                action, _states = model.predict(observation, deterministic = True)
                observation, reward, done, info = eval_env.step(action)
                if done:
                    break
        total_reward_test = info["total_reward"]
        total_profit_test = info["total_profit"]

        mean_episode_reward_test = info["total_reward"] / episodes
        mean_episode_profit_test = info["total_profit"] / episodes

        print("Total Reward on Test Set: " + str(total_reward_test))
        print("Total Profit on Test Set: " + str(total_profit_test))
        print("Mean Episode Reward: " + str(mean_episode_reward_test))
        print("Mean Episode Profit: " + str(mean_episode_profit_test))

        wandb.log({"total_reward_test": total_reward_test, 
                   "total_profit_test": total_profit_test, 
                   "mean_episode_reward_test": mean_episode_reward_test,
                   "mean_episode_profit_test": mean_episode_profit_test,
                })
        wandb.finish()
        eval_env.close()

        return_code = subprocess.run("wandb sync wandb/latest-run", shell=True)
        
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN
        print(e)
        nan_encountered = True
        
    finally:        
        # Free memory
        env.close()
        
    if nan_encountered: 
        return float("nan")

    return total_reward_test

# Set pytorch num threads to 1 for faster training
torch.set_num_threads(1)

sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)

# Do not prune before 1/3 of the max budget is used
pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=EXPERIMENT_TIMESTEPS // 3)

#study = optuna.create_study()
study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")

try:
    study.optimize(optimize_agent_sac, n_trials=N_TRIALS, timeout=10800)
except KeyboardInterrupt:
    print('Interrupted by keyboard.')

### Tuning DDPG 

In [None]:
def sample_ddpg_params(trial: optuna.Trial):
    """
    Sampler for DDPG hyperparams.
    :param trial:
    :return:
    """

    trial.using_her_replay_buffer = False

    
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
    buffer_size = trial.suggest_categorical("buffer_size", [int(1e4), int(1e5), int(1e6)])
    learning_starts = trial.suggest_categorical('learning_starts', [0, 1, 10, 20, 100, 200]) 
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 100, 128, 200])
    tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02, 0.05, 0.08])
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    train_freq = trial.suggest_categorical("train_freq", [1, 2, 8, 32])
    # gradient_steps takes too much time
    gradient_steps = trial.suggest_categorical("gradient_steps", [-1, 1, 2, 8, 32])
    noise_type = trial.suggest_categorical("noise_type", ["ornstein-uhlenbeck", "normal", None])
    noise_std = trial.suggest_uniform("noise_std", 0, 1)
    
    if trial.using_her_replay_buffer: 
        net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big", "verybig"])
    else:
        net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"])
        
    net_arch = {
        "small": [64, 64],
        "medium": [256, 256],
        "big": [400, 300],
        "verybig": [256, 256, 256],
    }[net_arch]
    
    # ------- policy_kwargs --------
    #activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu'])
    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]
    # --------------------

    hyperparams = {
        "learning_rate": learning_rate,
        "buffer_size": buffer_size,
        "learning_starts": learning_starts,
        "batch_size": batch_size,
        "tau": tau,
        "gamma": gamma,
        "train_freq": train_freq,
        "gradient_steps": gradient_steps,
        "policy_kwargs": dict(net_arch=net_arch, 
                              activation_fn=activation_fn
                             ),
    }

    n_actions = 12      
    if noise_type == "normal":
        hyperparams["action_noise"] = NormalActionNoise(
            mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)
        )
    elif noise_type == "ornstein-uhlenbeck":
        
        hyperparams["action_noise"] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)
        )

    if trial.using_her_replay_buffer:
        hyperparams = sample_her_params(trial, hyperparams)

    return hyperparams


def optimize_agent_ddpg(trial):
    """ Train the model and optimize
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
    
    nan_encountered = False
    try: 
        model_params = sample_ddpg_params(trial)

        # init tracking experiment.
        # hyper-parameters, trial id are stored.
        config = dict(trial.params)
        config["trial.number"] = trial.number
        wandb.init(
            project="RL-optuna",
            entity="jlu237", 
            sync_tensorboard=True,
            config=config,
            tags=["DDPG"] + EXPERIMENT_TAGS,
            reinit=True
        )

        env = make('VPPBiddingEnv-TUNING-v1')
        env = Monitor(env) 
        env = RecordEpisodeStatistics(env) # record stats such as returns
        model = DDPG('MultiInputPolicy', env, verbose=0,  seed = 1, **model_params)
        print(model_params)
    
        # -------------- TRAINING -----------------
        model.learn(total_timesteps=EXPERIMENT_TIMESTEPS,
                    log_interval=1,
                    progress_bar = True,
                    callback=WandbCallback(
                        gradient_save_freq=1,
                        verbose=0))
        
        # -------------- EVALUATION -----------------
        eval_env = make('VPPBiddingEnv-TUNING-EVAL-v1')
        eval_env = RecordEpisodeStatistics(eval_env) # record stats such as returns
        episodes = 140
        for i_episode in range(episodes):
            observation = eval_env.reset()
            for t in range(1):
                eval_env.render()
                #logging.debug("observation : " + str(observation), extra={'log_step': str(i_episode), 'slot': 'test'})
                action, _states = model.predict(observation, deterministic = True)
                observation, reward, done, info = eval_env.step(action)
                if done:
                    break
        total_reward_test = info["total_reward"]
        total_profit_test = info["total_profit"]

        mean_episode_reward_test = info["total_reward"] / episodes
        mean_episode_profit_test = info["total_profit"] / episodes

        print("Total Reward on Test Set: " + str(total_reward_test))
        print("Total Profit on Test Set: " + str(total_profit_test))
        print("Mean Episode Reward: " + str(mean_episode_reward_test))
        print("Mean Episode Profit: " + str(mean_episode_profit_test))

        wandb.log({"total_reward_test": total_reward_test, 
                   "total_profit_test": total_profit_test, 
                   "mean_episode_reward_test": mean_episode_reward_test,
                   "mean_episode_profit_test": mean_episode_profit_test,
                })
        wandb.finish()
        eval_env.close()

        return_code = subprocess.run("wandb sync wandb/latest-run", shell=True)
        
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN
        print(e)
        nan_encountered = True
        
    finally:        
        # Free memory
        env.close()
        
    if nan_encountered: 
        return float("nan")

    return total_reward_test

# Set pytorch num threads to 1 for faster training
torch.set_num_threads(1)

sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)

# Do not prune before 1/3 of the max budget is used
pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=EXPERIMENT_TIMESTEPS // 3)

#study = optuna.create_study()
study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")

try:
    study.optimize(optimize_agent_ddpg, n_trials=N_TRIALS, timeout=10800)
except KeyboardInterrupt:
    print('Interrupted by keyboard.')

### Tuning RecurrentPPO 

In [None]:
def sample_rppo_params(trial: optuna.Trial):
    """
    Sampler for RecurrentPPO hyperparams.
    :param trial:
    :return:
    """
    
    trial.using_her_replay_buffer = False
    
    n_steps_list = []
    n_step = round(EXPERIMENT_TIMESTEPS/3)

    while n_step > 1:
        n_steps_list.append(n_step)
        n_step = round(n_step/3)
        
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
    #n_steps = trial.suggest_categorical("n_steps", n_steps_list)
    n_steps = trial.suggest_categorical("n_steps", [2,3,4,5,6,7,8,9,10])
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 100, 128, 200])
    n_epochs = trial.suggest_categorical("n_epochs", [1, 5, 10, 20])
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    gae_lambda = trial.suggest_categorical("gae_lambda", [0.7, 0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    clip_range = trial.suggest_categorical("clip_range", [0.1, 0.2, 0.3, 0.4])
    normalize_advantage = trial.suggest_categorical("normalize_advantage", [False, True])
    ent_coef = trial.suggest_loguniform("ent_coef", 0.00000001, 0.1)
    vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
    max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    target_kl = trial.suggest_categorical("target_kl", [0.1, 0.05, 0.03, 0.02, 0.01, 0.005, 0.001])
    
    # ------- policy_kwargs --------
    lr_schedule = trial.suggest_categorical("lr_schedule", ["linear", "constant"])
    use_sde = trial.suggest_categorical("use_sde", [False, True])
    if use_sde is True:
        sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 0, 1, 2, 3])
        log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
    full_std = trial.suggest_categorical("full_std", [False, True])
    activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    ortho_init = trial.suggest_categorical("ortho_init", [False, True])

    # NOTE: Add "verybig" to net_arch when tuning HER
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"])
    # ------- policy_kwargs --------

    if lr_schedule == "linear":
        learning_rate = linear_schedule(learning_rate)

    net_arch = {
        "small": [dict(pi=[64, 64], vf=[64, 64])],
        "medium": [dict(pi=[256, 256], vf=[256, 256])],
        "big": [dict(pi=[400, 400], vf=[400, 400])],
    }[net_arch]
    
    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]
    
    hyperparams = {
        "learning_rate": learning_rate,
        "n_steps": n_steps,
        "batch_size": batch_size,
        "n_epochs": n_epochs,
        "gamma": gamma,
        "gae_lambda": gae_lambda,
        "clip_range": clip_range,
        "normalize_advantage": normalize_advantage,
        "ent_coef": ent_coef,
        "vf_coef": vf_coef,
        "max_grad_norm": max_grad_norm,
        "target_kl": target_kl,
        "policy_kwargs": dict(
            net_arch=net_arch,
            full_std=full_std,
            activation_fn=activation_fn,
            ortho_init=ortho_init,
        ),
    }
    
     
    if trial.using_her_replay_buffer:
        hyperparams = sample_her_params(trial, hyperparams)
        
    if use_sde is True:
        hyperparams["sde_sample_freq"] = sde_sample_freq
        hyperparams["policy_kwargs"]["log_std_init"] = log_std_init

    return hyperparams


def optimize_agent_rppo(trial):
    """ Train the model and optimize
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
    
    nan_encountered = False
    try: 
        model_params = sample_rppo_params(trial)

        # init tracking experiment.
        # hyper-parameters, trial id are stored.
        config = dict(trial.params)
        config["trial.number"] = trial.number
        wandb.init(
            project="RL-optuna",
            entity="jlu237", 
            sync_tensorboard=True,
            config=config,
            tags=["R_PPO"]+EXPERIMENT_TAGS,
            reinit=True
        )

        env = make('VPPBiddingEnv-TUNING-v1')
        env = Monitor(env) 
        env = RecordEpisodeStatistics(env) # record stats such as returns
        model = RecurrentPPO('MultiInputLstmPolicy', env, verbose=0, seed = 1, **model_params)
        print(model_params)
    
        # -------------- TRAINING -----------------
        model.learn(total_timesteps=EXPERIMENT_TIMESTEPS,
                    log_interval=1,
                    progress_bar = True,
                    callback=WandbCallback(
                        gradient_save_freq=1,
                        verbose=0))
        
        # -------------- EVALUATION -----------------
        eval_env = make('VPPBiddingEnv-TUNING-EVAL-v1')
        eval_env = RecordEpisodeStatistics(eval_env) # record stats such as returns
        episodes = 140
        for i_episode in range(episodes):
            observation = eval_env.reset()
            lstm_states = None
            num_envs = 1
            # Episode start signals are used to reset the lstm states
            episode_starts = np.ones((num_envs,), dtype=bool)
            for t in range(1):
                eval_env.render()
                #logging.debug("observation : " + str(observation), extra={'log_step': str(i_episode), 'slot': 'test'})
                action, lstm_states = model.predict(observation, state=lstm_states, episode_start=episode_starts, deterministic=True)
                observation, reward, dones, info = eval_env.step(action)
                episode_starts = dones
                if dones:
                    break
        
        total_reward_test = info["total_reward"]
        total_profit_test = info["total_profit"]

        mean_episode_reward_test = info["total_reward"] / episodes
        mean_episode_profit_test = info["total_profit"] / episodes

        print("Total Reward on Test Set: " + str(total_reward_test))
        print("Total Profit on Test Set: " + str(total_profit_test))
        print("Mean Episode Reward: " + str(mean_episode_reward_test))
        print("Mean Episode Profit: " + str(mean_episode_profit_test))

        wandb.log({"total_reward_test": total_reward_test, 
                   "total_profit_test": total_profit_test, 
                   "mean_episode_reward_test": mean_episode_reward_test,
                   "mean_episode_profit_test": mean_episode_profit_test,
                })
        wandb.finish()
        eval_env.close()

        return_code = subprocess.run("wandb sync wandb/latest-run", shell=True)
        
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN
        print(e)
        nan_encountered = True
        
    finally:        
        # Free memory
        env.close()
        
    if nan_encountered: 
        return float("nan")

    return total_reward_test

# Set pytorch num threads to 1 for faster training
torch.set_num_threads(1)

sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)

# Do not prune before 1/3 of the max budget is used
pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=EXPERIMENT_TIMESTEPS // 3)

#study = optuna.create_study()
study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")

try:
    study.optimize(optimize_agent_rppo, n_trials=N_TRIALS, timeout=10800)
except KeyboardInterrupt:
    print('Interrupted by keyboard.')

### Tuning TRPO 

In [None]:
def sample_trpo_params(trial):
    """
    Sampler for TRPO hyperparams.
    :param trial:
    :return:
    """
    
    trial.using_her_replay_buffer = False
    
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
    n_steps = trial.suggest_categorical("n_steps", [2,3,4,5,6,7,8,9,10])
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 100, 128, 200])
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    cg_max_steps = trial.suggest_categorical("cg_max_steps", [5, 10, 15, 20, 25, 30])
    cg_damping = trial.suggest_categorical("cg_damping", [0.5, 0.2, 0.1, 0.05, 0.01])
    line_search_shrinking_factor = trial.suggest_categorical("line_search_shrinking_factor", [0.6, 0.7, 0.8, 0.9])
    line_search_max_iter = trial.suggest_categorical("line_search_max_iter", [1, 5, 10, 15, 20])
    n_critic_updates = trial.suggest_categorical("n_critic_updates", [1, 5, 10, 20, 25])
    gae_lambda = trial.suggest_categorical("gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    normalize_advantage = trial.suggest_categorical("normalize_advantage", [False, True])
    use_sde = trial.suggest_categorical("use_sde", [False, True])
    if use_sde is True:
        sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 0, 1, 2, 3])    
    target_kl = trial.suggest_categorical("target_kl", [0.1, 0.05, 0.03, 0.02, 0.01, 0.005, 0.001])
    
    # ------- policy_kwargs --------
    if use_sde is True:
        log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
        full_std = trial.suggest_categorical('full_std', [False, True])
    sde_net_arch = trial.suggest_categorical("sde_net_arch", [None, "tiny", "small"])
    lr_schedule = trial.suggest_categorical("lr_schedule", ["linear", "constant"])
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"])
    ortho_init = trial.suggest_categorical('ortho_init', [False, True])
    activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])

    if batch_size > n_steps:
        batch_size = n_steps

    if lr_schedule == "linear":
        learning_rate = linear_schedule(learning_rate)

    net_arch = {
        "small": [dict(pi=[64, 64], vf=[64, 64])],
        "medium": [dict(pi=[256, 256], vf=[256, 256])],
        "big": [dict(pi=[400, 400], vf=[400, 400])],
    }[net_arch]
    
    sde_net_arch = {
         None: None,
         "tiny": [64],
         "small": [64, 64],
    }[sde_net_arch]

    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]

    hyperparams = {
            "learning_rate": learning_rate,
            "n_steps": n_steps,
            "batch_size": batch_size,
            "gamma": gamma,
            "cg_max_steps": cg_max_steps,
            "cg_damping": cg_damping,
            "line_search_shrinking_factor": line_search_shrinking_factor,
            "line_search_max_iter": line_search_max_iter,
            "n_critic_updates": n_critic_updates,
            "gae_lambda": gae_lambda,
            "normalize_advantage": normalize_advantage,
            "use_sde": use_sde,
            "target_kl": target_kl,
            "policy_kwargs": dict(
                net_arch=net_arch,
                ortho_init=ortho_init,
                activation_fn=activation_fn,
            ),
        }
    
     
    if trial.using_her_replay_buffer:
        hyperparams = sample_her_params(trial, hyperparams)
        
    if use_sde is True:
        hyperparams["sde_sample_freq"] = sde_sample_freq
        hyperparams["policy_kwargs"]["log_std_init"] = log_std_init
        hyperparams["policy_kwargs"]["full_std"] = full_std
        hyperparams["policy_kwargs"]["sde_net_arch"] = sde_net_arch       
        
    return hyperparams


def optimize_agent_trpo(trial):
    """ Train the model and optimize
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
    
    nan_encountered = False
    try: 
        model_params = sample_trpo_params(trial)

        # init tracking experiment.
        # hyper-parameters, trial id are stored.
        config = dict(trial.params)
        config["trial.number"] = trial.number
        wandb.init(
            project="RL-optuna",
            entity="jlu237", 
            sync_tensorboard=True,
            config=config,
            tags=["TRPO"] + EXPERIMENT_TAGS,
            reinit=True
        )

        env = make('VPPBiddingEnv-TUNING-v1')
        env = Monitor(env) 
        env = RecordEpisodeStatistics(env) # record stats such as returns
        model = TRPO('MultiInputPolicy', env, verbose=0,  seed = 1, **model_params)
        print(model_params)
    
        # -------------- TRAINING -----------------
        model.learn(total_timesteps=EXPERIMENT_TIMESTEPS,
                    log_interval=1,
                    progress_bar = True,
                    callback=WandbCallback(
                        gradient_save_freq=1,
                        verbose=0))
        
        # -------------- EVALUATION -----------------
        eval_env = make('VPPBiddingEnv-TUNING-EVAL-v1')
        eval_env = RecordEpisodeStatistics(eval_env) # record stats such as returns
        episodes = 140
        for i_episode in range(episodes):
            observation = eval_env.reset()
            for t in range(1):
                eval_env.render()
                #logging.debug("observation : " + str(observation), extra={'log_step': str(i_episode), 'slot': 'test'})
                action, _states = model.predict(observation, deterministic = True)
                observation, reward, done, info = eval_env.step(action)
                if done:
                    break
        total_reward_test = info["total_reward"]
        total_profit_test = info["total_profit"]

        mean_episode_reward_test = info["total_reward"] / episodes
        mean_episode_profit_test = info["total_profit"] / episodes

        print("Total Reward on Test Set: " + str(total_reward_test))
        print("Total Profit on Test Set: " + str(total_profit_test))
        print("Mean Episode Reward: " + str(mean_episode_reward_test))
        print("Mean Episode Profit: " + str(mean_episode_profit_test))

        wandb.log({"total_reward_test": total_reward_test, 
                   "total_profit_test": total_profit_test, 
                   "mean_episode_reward_test": mean_episode_reward_test,
                   "mean_episode_profit_test": mean_episode_profit_test,
                })
        wandb.finish()
        eval_env.close()

        return_code = subprocess.run("wandb sync wandb/latest-run", shell=True)
        
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN
        print(e)
        nan_encountered = True
        
    finally:        
        # Free memory
        env.close()
        
    if nan_encountered: 
        return float("nan")

    return total_reward_test

# Set pytorch num threads to 1 for faster training
torch.set_num_threads(1)

sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)

# Do not prune before 1/3 of the max budget is used
pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=EXPERIMENT_TIMESTEPS // 3)

#study = optuna.create_study()
study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")

try:
    study.optimize(optimize_agent_trpo, n_trials=N_TRIALS, timeout=10800)
except KeyboardInterrupt:
    print('Interrupted by keyboard.')

### Tuning PPO 

In [None]:
def sample_ppo_params(trial: optuna.Trial):
    """
    Sampler for PPO hyperparams.
    :param trial:
    :return:
    """
    
    trial.using_her_replay_buffer = False
    
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32, 64, 128, 256, 512])
    n_steps = trial.suggest_categorical("n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048])
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
    lr_schedule = "constant"
    # Uncomment to enable learning rate schedule
    # lr_schedule = trial.suggest_categorical('lr_schedule', ['linear', 'constant'])
    ent_coef = trial.suggest_loguniform("ent_coef", 0.00000001, 0.1)
    clip_range = trial.suggest_categorical("clip_range", [0.1, 0.2, 0.3, 0.4])
    n_epochs = trial.suggest_categorical("n_epochs", [1, 5, 10, 20])
    gae_lambda = trial.suggest_categorical("gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium"])
    # Uncomment for gSDE (continuous actions)
    # log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
    # Uncomment for gSDE (continuous action)
    # sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 8, 16, 32, 64, 128, 256])
    # Orthogonal initialization
    ortho_init = False
    # ortho_init = trial.suggest_categorical('ortho_init', [False, True])
    # activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])

    # TODO: account when using multiple envs
    if batch_size > n_steps:
        batch_size = n_steps

    if lr_schedule == "linear":
        learning_rate = linear_schedule(learning_rate)

    # Independent networks usually work best
    # when not working with images
    net_arch = {
        "small": [dict(pi=[64, 64], vf=[64, 64])],
        "medium": [dict(pi=[256, 256], vf=[256, 256])],
    }[net_arch]

    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]

    hyperparams = {
            "n_steps": n_steps,
            "batch_size": batch_size,
            "gamma": gamma,
            "learning_rate": learning_rate,
            "ent_coef": ent_coef,
            "clip_range": clip_range,
            "n_epochs": n_epochs,
            "gae_lambda": gae_lambda,
            "max_grad_norm": max_grad_norm,
            "vf_coef": vf_coef,
            # "sde_sample_freq": sde_sample_freq,
            "policy_kwargs": dict(
                # log_std_init=log_std_init,
                net_arch=net_arch,
                activation_fn=activation_fn,
                ortho_init=ortho_init,
            ),
        }
    
    
    
    return hyperparams


def optimize_agent_ppo(trial):
    """ Train the model and optimize
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
   
    
    model_params = sample_ppo_params(trial)
    
    # init tracking experiment.
    # hyper-parameters, trial id are stored.
    config = dict(trial.params)
    config["trial.number"] = trial.number
    wandb.init(
        project="RL-optuna",
        entity="jlu237", 
        sync_tensorboard=True,
        config=config,
        tags=["PPO"] + EXPERIMENT_TAGS,
        reinit=True
    )
    
    env = make('VPPBiddingEnv-TRAIN-v1', render_mode="human")
    env = Monitor(env) 
    env = RecordEpisodeStatistics(env) # record stats such as returns
    
    
    model = PPO('MultiInputPolicy', env, verbose=0,  seed = 1, **model_params)

    model.learn(total_timesteps=EXPERIMENT_TIMESTEPS,
                log_interval=1,
                progress_bar = True,
                callback=WandbCallback(
                    gradient_save_freq=1,
                    verbose=0))

    wandb.finish()
    return_code = subprocess.run("wandb sync wandb/latest-run", shell=True)
    print(return_code)
    
study = optuna.create_study()
try:
    study.optimize(optimize_agent_ppo, n_trials=N_TRIALS)
except KeyboardInterrupt:
    print('Interrupted by keyboard.')

### Tuning TQC 

In [None]:
def sample_tqc_params(trial: optuna.Trial):
    """
    Sampler for TQC hyperparams.
    :param trial:
    :return:
    """
    # TQC is SAC + Distributional RL
    hyperparams = sample_sac_params(trial)

    n_quantiles = trial.suggest_int("n_quantiles", 5, 50)
    top_quantiles_to_drop_per_net = trial.suggest_int("top_quantiles_to_drop_per_net", 0, n_quantiles - 1)

    hyperparams["policy_kwargs"].update({"n_quantiles": n_quantiles})
    hyperparams["top_quantiles_to_drop_per_net"] = top_quantiles_to_drop_per_net

    return hyperparams



def optimize_agent_tqc(trial):
    """ Train the model and optimize
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
   
    
    model_params = sample_tqc_params(trial)
    
    # init tracking experiment.
    # hyper-parameters, trial id are stored.
    config = dict(trial.params)
    config["trial.number"] = trial.number
    wandb.init(
        project="RL-optuna",
        entity="jlu237", 
        sync_tensorboard=True,
        config=config,
        tags=["TQC"] + EXPERIMENT_TAGS,
        reinit=True
    )
    
    env = make('VPPBiddingEnv-TRAIN-v1', render_mode="human")
    env = Monitor(env) 
    env = RecordEpisodeStatistics(env) # record stats such as returns
    
    
    model = DDPG('MultiInputPolicy', env, verbose=0,  seed = 1, **model_params)

    model.learn(total_timesteps=EXPERIMENT_TIMESTEPS,
                log_interval=1,
                progress_bar = True,
                callback=WandbCallback(
                    gradient_save_freq=1,
                    verbose=0))

    wandb.finish()
    return_code = subprocess.run("wandb sync wandb/latest-run", shell=True)
    print(return_code)
    
study = optuna.create_study()
try:
    study.optimize(optimize_agent_tqc, n_trials=N_TRIALS)
except KeyboardInterrupt:
    print('Interrupted by keyboard.')

### ARS  ???

In [None]:
from typing import List

slot_settlement_prices_DE : List[np.float32] = []
print(slot_settlement_prices_DE)
print(type(slot_settlement_prices_DE))

slot_settlement_prices_DE = [0.,0.,0.,0.,0.,0.]
print(slot_settlement_prices_DE)
print(type(slot_settlement_prices_DE))

slot_settlement_prices_DE = np.array(slot_settlement_prices_DE, dtype=np.float32)
print(slot_settlement_prices_DE)
print(type(slot_settlement_prices_DE))



In [None]:
from scipy.stats import norm

maximum_possible_VPP_capacity = 131.32
mean: float = 0. # symmetrical normal distribution at 0 
sd: float = maximum_possible_VPP_capacity/7
max_at_10_percent: float = norm.pdf(maximum_possible_VPP_capacity*0.1,mean,sd)

max_at_10_percent

In [None]:
list_false: List[int] = []
