# Performing hyperparameter optimization in the bench class

For this example, we will use a model that consumes pre-computed features.
We will use [CatBoost](https://catboost.ai/), which you need to install separately (e.g., using `pip install catboost`).

We will use [Optuna](https://optuna.org/) for hyperparameter optimization.

In [6]:
from catboost import CatBoostRegressor
from mofdscribe.bench import LogkHCO2OODBench
from mofdscribe.bench.df_model import DFModel
from sklearn.metrics import mean_squared_error
import optuna

In [11]:
# Feature columns fed to the model. Apart from the first entry, every column
# name follows the pattern "<prefix>-<property>-<depth>-all", so the list is
# assembled from its building blocks rather than written out by hand.
_DEPTHS = ("0", "1", "2", "3")
_CORE_PROPERTIES = ("chi", "Z", "I", "T", "S")
# The "D_lc" / "D_func" families carry one extra property ("alpha").
_CORE_AND_ALPHA_PROPERTIES = _CORE_PROPERTIES + ("alpha",)

# (prefix, properties) pairs, listed in the exact order the columns appear.
_COLUMN_FAMILIES = (
    ("mc_CRY", _CORE_PROPERTIES),
    ("D_mc_CRY", _CORE_PROPERTIES),
    ("sum-mc_CRY", _CORE_PROPERTIES),
    ("sum-D_mc_CRY", _CORE_PROPERTIES),
    ("D_lc", _CORE_AND_ALPHA_PROPERTIES),
    ("D_func", _CORE_AND_ALPHA_PROPERTIES),
    ("sum-D_lc", _CORE_AND_ALPHA_PROPERTIES),
    ("sum-D_func", _CORE_AND_ALPHA_PROPERTIES),
)

FEATURES = ["total_POV_gravimetric"]
FEATURES.extend(
    f"{prefix}-{prop}-{depth}-all"
    for prefix, props in _COLUMN_FAMILIES
    for prop in props
    for depth in _DEPTHS
)

# Name of the column used as the regression target.
TARGET = "logKH_CO2"

## Make sure we can make it work outside MOFBench

In [17]:
def tune(train_data, valid_data, num_trials=10, *, timeout=600):
    """Tune a ``CatBoostRegressor`` with Optuna and return the refitted best model.

    Every trial fits a model on ``train_data`` and is scored by the mean
    squared error of its predictions on ``valid_data``; the study minimizes
    that score.

    Args:
        train_data: ``(features, targets)`` pair used to fit each trial model
            and the final model.
        valid_data: ``(features, targets)`` pair used to score each trial.
        num_trials: Maximum number of Optuna trials to run.
        timeout: Time budget in seconds for the whole search, forwarded to
            ``study.optimize``. Pass ``None`` to let ``num_trials`` alone
            bound the search.

    Returns:
        A ``CatBoostRegressor`` configured with the best hyperparameters found
        and fitted on ``train_data`` only.
    """

    def fit_model(params):
        # Shared by the trials and the final refit so both are built identically.
        model = CatBoostRegressor(
            **params,
            silent=True,
        )
        model.fit(train_data[0], train_data[1])
        return model

    def objective(trial):
        """Fit one candidate configuration and return its validation MSE."""
        param = {
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
            "depth": trial.suggest_int("depth", 1, 16),
            "iterations": trial.suggest_int("iterations", 1, 10000),
            "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5, log=True),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.01, 10),
            "random_strength": trial.suggest_float("random_strength", 0.01, 10),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 0.01, 10),
        }
        model = fit_model(param)
        predictions = model.predict(valid_data[0])
        return mean_squared_error(valid_data[1], predictions)

    # NOTE: the objective never calls trial.report() / trial.should_prune(),
    # so this pruner is never consulted and every trial runs to completion.
    study = optuna.create_study(
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5), direction="minimize"
    )
    study.optimize(
        objective,
        n_trials=num_trials,
        timeout=timeout,
        callbacks=[],  # WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs) can be nice to use
    )

    # Refit from scratch with the winning hyperparameters; the trial models
    # themselves are not kept.
    return fit_model(study.best_params)

The following lines are solely for the purpose of debugging!

In [18]:
# Debugging shortcut: build the benchmark with None as its only argument and
# reach through the private `_ds` / `_df` attributes to grab the underlying
# table (presumably a pandas DataFrame, since it is sliced with `.iloc` later).
df = LogkHCO2OODBench(None)._ds._df

2022-08-04 10:21:04.368 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:118 - Dropped 639 duplicate basenames. New length 8182
2022-08-04 10:21:04.378 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:124 - Dropped 1312 duplicate graphs. New length 6870
2022-08-04 10:21:05.323 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:118 - Dropped 639 duplicate basenames. New length 8182
2022-08-04 10:21:05.333 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:124 - Dropped 1312 duplicate graphs. New length 6870
2022-08-04 10:21:05.356 | DEBUG    | mofdscribe.splitters.splitters:__init__:116 - Splitter settings | shuffle True, random state None, sample frac 1.0, q (0, 0.25, 0.5, 0.75, 1)


In [19]:
# Two disjoint 100-row slices: rows 0-99 and rows 100-199.
part_1, part_2 = df.iloc[:100], df.iloc[100:200]

In [21]:
# Smoke test of `tune`: fit on the first slice, score trials on the second.
train_split = (part_1[FEATURES], part_1[TARGET])
valid_split = (part_2[FEATURES], part_2[TARGET])
model = tune(train_split, valid_split)

[32m[I 2022-08-04 10:32:42,673][0m A new study created in memory with name: no-name-f0c55bb6-99b3-46ba-aa6d-e6ce463b16cb[0m
[32m[I 2022-08-04 10:32:47,106][0m Trial 0 finished with value: 0.9282989828252737 and parameters: {'colsample_bylevel': 0.015379121030965405, 'depth': 11, 'iterations': 9362, 'learning_rate': 0.10681725745136504, 'l2_leaf_reg': 2.5660243942202157, 'random_strength': 0.3738004901477886, 'bagging_temperature': 2.5786889048949253}. Best is trial 0 with value: 0.9282989828252737.[0m
[32m[I 2022-08-04 10:33:09,589][0m Trial 1 finished with value: 0.9463454876431397 and parameters: {'colsample_bylevel': 0.02489829296427832, 'depth': 13, 'iterations': 9334, 'learning_rate': 0.0013196665681919847, 'l2_leaf_reg': 8.537893489170106, 'random_strength': 5.099655648252827, 'bagging_temperature': 3.609814613019158}. Best is trial 0 with value: 0.9282989828252737.[0m
[32m[I 2022-08-04 10:33:23,092][0m Trial 2 finished with value: 0.8919964807222683 and parameters: {'

This seems to work.

Let's see how we implement this in a `MOFBench` class using a `mofdscribe` splitter.

In this example, we want to strictly avoid data leakage and hence will also use the `HashSplitter` in the inner (hyperparameter-tuning) loop.