# Performing hyperparameter optimization in the bench class


For this example, we will use a model that consumes pre-computed features.
We will use [CatBoost](https://catboost.ai/), which you need to install separately (e.g., using `pip install catboost`).

We will use [Optuna](https://optuna.org/) for hyperparameter optimization.


In [1]:
import optuna
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

from mofdscribe.bench import LogkHCO2OODBench
from mofdscribe.bench.df_model import DFModel
from mofdscribe.splitters import HashSplitter


In [2]:
# Property labels that occur in the feature names. The "D_lc"/"D_func" families
# use one extra label ("alpha") on top of the labels used by the "mc_CRY" families.
_BASE_PROPERTIES = ("chi", "Z", "I", "T", "S")
_EXTENDED_PROPERTIES = _BASE_PROPERTIES + ("alpha",)

# Numeric index embedded in every feature name (0 through 3).
_INDICES = range(4)

# (name prefix, property labels) pairs, in the order the columns are listed.
_FEATURE_FAMILIES = (
    ("mc_CRY", _BASE_PROPERTIES),
    ("D_mc_CRY", _BASE_PROPERTIES),
    ("sum-mc_CRY", _BASE_PROPERTIES),
    ("sum-D_mc_CRY", _BASE_PROPERTIES),
    ("D_lc", _EXTENDED_PROPERTIES),
    ("D_func", _EXTENDED_PROPERTIES),
    ("sum-D_lc", _EXTENDED_PROPERTIES),
    ("sum-D_func", _EXTENDED_PROPERTIES),
)

# Names of the pre-computed feature columns fed to the model (177 in total):
# one standalone column followed by every "<prefix>-<property>-<index>-all" name.
FEATURES = ["total_POV_gravimetric"] + [
    f"{prefix}-{label}-{index}-all"
    for prefix, labels in _FEATURE_FAMILIES
    for label in labels
    for index in _INDICES
]

# Name of the column that holds the regression target.
TARGET = "logKH_CO2"


## Make sure we can make it work outside MOFBench


In [3]:
def tune(train_data, valid_data, num_trials=10, timeout=600):
    """Tune a CatBoost regressor with Optuna and return the refitted best model.

    Every trial samples a set of hyperparameters, fits a ``CatBoostRegressor``
    on ``train_data`` and is scored by the mean squared error on ``valid_data``.
    The best hyperparameters are then used to fit a fresh model on ``train_data``.

    Args:
        train_data: ``(X, y)`` pair (accessed as ``train_data[0]`` and
            ``train_data[1]``) used to fit each trial model and the final model.
        valid_data: ``(X, y)`` pair used only to score each trial.
        num_trials (int): Maximum number of Optuna trials. Defaults to 10.
        timeout (float, optional): Time budget in seconds that is handed to
            ``study.optimize``. Defaults to 600 (the value that used to be
            hard-coded); ``None`` disables the time limit.

    Returns:
        CatBoostRegressor: Model fitted on ``train_data`` with the best
        hyperparameters found by the study.
    """

    def objective(trial):
        # Search space for the CatBoost hyperparameters.
        param = {
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
            "depth": trial.suggest_int("depth", 1, 16),
            "iterations": trial.suggest_int("iterations", 1, 10000),
            "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5, log=True),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.01, 10),
            "random_strength": trial.suggest_float("random_strength", 0.01, 10),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 0.01, 10),
        }
        model = CatBoostRegressor(
            **param,
            silent=True,
        )
        model.fit(train_data[0], train_data[1])

        # The validation MSE is the quantity the study minimizes.
        predictions = model.predict(valid_data[0])
        return mean_squared_error(valid_data[1], predictions)

    # NOTE(review): the objective never calls ``trial.report``, so the median
    # pruner presumably never prunes anything; it is kept to leave the study
    # configuration unchanged.
    study = optuna.create_study(
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5), direction="minimize"
    )
    study.optimize(
        objective,
        n_trials=num_trials,
        timeout=timeout,
        callbacks=[],  # WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs) can be nice to use
    )

    # Refit on the training split with the best hyperparameters of the study.
    model = CatBoostRegressor(
        **study.best_params,
        silent=True,
    )
    model.fit(train_data[0], train_data[1])

    return model


The following lines are solely for the purpose of debugging!


In [4]:
# Debugging only: build the bench without a model (``None``) and reach into its
# private ``_ds._df`` attribute to get the table holding the pre-computed features.
# NOTE(review): presumably a pandas DataFrame (it is sliced with ``.iloc`` below) - confirm.
df = LogkHCO2OODBench(None)._ds._df


2022-08-04 15:27:15.742 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:123 - Dropped 639 duplicate basenames. New length 8182
2022-08-04 15:27:15.751 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:129 - Dropped 1312 duplicate graphs. New length 6870
2022-08-04 15:27:16.671 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:123 - Dropped 639 duplicate basenames. New length 8182
2022-08-04 15:27:16.682 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:129 - Dropped 1312 duplicate graphs. New length 6870
2022-08-04 15:27:16.704 | DEBUG    | mofdscribe.splitters.splitters:__init__:116 - Splitter settings | shuffle True, random state None, sample frac 1.0, q (0, 0.25, 0.5, 0.75, 1)


In [5]:
# Two disjoint 100-row slices (rows 0-99 and rows 100-199) for a quick smoke test.
part_1, part_2 = df.iloc[:100], df.iloc[100:200]


In [6]:
# Fit on the first slice and score the trials on the second slice.
train_split = (part_1[FEATURES], part_1[TARGET])
valid_split = (part_2[FEATURES], part_2[TARGET])
model = tune(train_split, valid_split)


[32m[I 2022-08-04 15:27:16,797][0m A new study created in memory with name: no-name-ae210727-6a82-46ef-ad1a-cbbc2c7c86f1[0m
[32m[I 2022-08-04 15:27:22,784][0m Trial 0 finished with value: 0.8674590769964755 and parameters: {'colsample_bylevel': 0.07158822825029386, 'depth': 9, 'iterations': 2771, 'learning_rate': 0.10466163577655244, 'l2_leaf_reg': 6.870066951280304, 'random_strength': 8.041568108904727, 'bagging_temperature': 8.92881468725213}. Best is trial 0 with value: 0.8674590769964755.[0m
[32m[I 2022-08-04 15:27:42,671][0m Trial 1 finished with value: 0.938751858116806 and parameters: {'colsample_bylevel': 0.06842527731599611, 'depth': 12, 'iterations': 4813, 'learning_rate': 0.007537524931211579, 'l2_leaf_reg': 0.5540167619204627, 'random_strength': 7.528369762943663, 'bagging_temperature': 3.1143513700339427}. Best is trial 0 with value: 0.8674590769964755.[0m
[32m[I 2022-08-04 15:27:46,900][0m Trial 2 finished with value: 0.7908504336324068 and parameters: {'colsam

This seems to work.


Let's see how we implement this in a `MOFBench` class using a `mofdscribe` splitter.


In this example, we really want to avoid data leakage and hence will also use the `HashSplitter` in the inner (hyperparameter-tuning) loop.
Doing so is relatively easy: the `get_subset` method of a dataset constructs a new dataset from the given indices, which we can then pass to a splitter.


In [13]:
class MyCatBoostModel:
    """CatBoost regressor that re-tunes its hyperparameters on every ``fit`` call.

    All methods read ``self.ds``, which is not set in ``__init__`` and therefore
    has to be attached to the instance from outside before ``fit``/``predict``
    are called (presumably what the bench's ``patch_in_ds=True`` does - TODO confirm).
    """

    def __init__(self, features=FEATURES):
        """Store the feature column names and create a default regressor."""
        self.features = features
        self.model = CatBoostRegressor()

    def _feature_frame(self, idx):
        """Return the rows at positions ``idx`` restricted to the feature columns."""
        return self.ds._df.iloc[idx][self.features]

    def tune(self, idx, y):
        """Tune hyperparameters on an inner train/validation split of ``idx``.

        The inner split is produced by a ``HashSplitter`` on the subset of the
        dataset given by ``idx``. The tuned model replaces ``self.model``.
        """
        # A single 70 % / rest split is used here; a k-fold inside the tune
        # method would work as well.
        inner_splitter = HashSplitter(self.ds.get_subset(idx))
        inner_train, inner_valid = inner_splitter.train_test_split(0.7)

        # The splitter returns positions relative to the subset: use them
        # directly for ``y`` and map them through ``idx`` for the full table.
        train_data = (self._feature_frame(idx[inner_train]), y[inner_train])
        valid_data = (self._feature_frame(idx[inner_valid]), y[inner_valid])

        # Inside a method body the bare name ``tune`` resolves to the
        # module-level function, not to this method.
        self.model = tune(train_data, valid_data)

    def fit(self, idx, structures, y):
        """Tune the hyperparameters, then refit the tuned model on all of ``idx``.

        ``structures`` is accepted but not used.
        """
        self.tune(idx, y)
        self.model.fit(self._feature_frame(idx), y)

    def predict(self, idx, structures):
        """Predict for the rows at ``idx``; the predictions are also printed.

        ``structures`` is accepted but not used.
        """
        predictions = self.model.predict(self._feature_frame(idx))
        print(predictions)
        return predictions


In [16]:
# Wrap the model in the benchmark. ``patch_in_ds=True`` is needed because
# ``MyCatBoostModel`` reads ``self.ds`` without ever setting it itself.
# NOTE(review): ``debug=True`` presumably subsamples the data (the log reports
# "sample frac 0.01") - confirm.
bench = LogkHCO2OODBench(
    MyCatBoostModel(),
    name="my model",
    # Join with ", " (comma then space); the previous " ," separator put the
    # space in front of every comma.
    features=", ".join(FEATURES),
    patch_in_ds=True,
    debug=True,
)


2022-08-04 15:44:19.025 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:123 - Dropped 639 duplicate basenames. New length 8182
2022-08-04 15:44:19.034 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:129 - Dropped 1312 duplicate graphs. New length 6870
2022-08-04 15:44:19.948 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:123 - Dropped 639 duplicate basenames. New length 8182
2022-08-04 15:44:19.959 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:129 - Dropped 1312 duplicate graphs. New length 6870
2022-08-04 15:44:19.981 | DEBUG    | mofdscribe.splitters.splitters:__init__:116 - Splitter settings | shuffle True, random state None, sample frac 0.01, q (0, 0.25, 0.5, 0.75, 1)


In [17]:
# Run the benchmark and keep whatever ``bench()`` returns for later inspection.
result = bench.bench()


2022-08-04 15:44:20.129 | DEBUG    | mofdscribe.bench.mofbench:_score:230 - K-fold round 0, 54 train points, 13 test points
2022-08-04 15:44:21.065 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:123 - Dropped 639 duplicate basenames. New length 8182
2022-08-04 15:44:21.076 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:129 - Dropped 1312 duplicate graphs. New length 6870
2022-08-04 15:44:21.083 | DEBUG    | mofdscribe.splitters.splitters:__init__:116 - Splitter settings | shuffle True, random state None, sample frac 1.0, q (0, 0.25, 0.5, 0.75, 1)
2022-08-04 15:44:21.084 | DEBUG    | mofdscribe.splitters.utils:check_fraction:429 - Using fractions: train: 0.7, valid: 0, test: 0.30000000000000004
2022-08-04 15:44:21.084 | DEBUG    | mofdscribe.splitters.splitters:train_test_split:159 - Using grouped partition
[32m[I 2022-08-04 15:44:21,086][0m A new study created in memory with name: no-name-5816ed5a-12b7-49a3-87f8-80c6694becff[0m
[32m[I 2022-08-04 15:44:22,559][0m Tr

[-3.61860741 -3.7046139  -3.58547182 -3.70272673 -2.78660819 -3.55187778
 -3.55935148 -3.41970087 -2.75539022 -3.68527742 -3.81238797 -3.19262452
 -3.13289448]


2022-08-04 15:45:42.364 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:123 - Dropped 639 duplicate basenames. New length 8182
2022-08-04 15:45:42.374 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:129 - Dropped 1312 duplicate graphs. New length 6870
2022-08-04 15:45:42.381 | DEBUG    | mofdscribe.splitters.splitters:__init__:116 - Splitter settings | shuffle True, random state None, sample frac 1.0, q (0, 0.25, 0.5, 0.75, 1)
2022-08-04 15:45:42.381 | DEBUG    | mofdscribe.splitters.utils:check_fraction:429 - Using fractions: train: 0.7, valid: 0, test: 0.30000000000000004
2022-08-04 15:45:42.382 | DEBUG    | mofdscribe.splitters.splitters:train_test_split:159 - Using grouped partition
[32m[I 2022-08-04 15:45:42,384][0m A new study created in memory with name: no-name-c05fbc0b-1c0b-4ea8-ac19-ec6f615ac4bd[0m
[32m[I 2022-08-04 15:45:45,253][0m Trial 0 finished with value: 0.7889371030146659 and parameters: {'colsample_bylevel': 0.02248339281894256, 'depth': 8, 'iterat