# Performing hyperparameter optimization in the bench class


For this example we will use a model that consumes pre-computed features.
We will use [CatBoost](https://catboost.ai/), which you need to install separately (e.g., using `pip install catboost`).

We will use [Optuna](https://optuna.org/) for hyperparameter optimization.


In [1]:
import optuna
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

from mofdscribe.bench import LogkHCO2OODBench
from mofdscribe.bench.df_model import DFModel
from mofdscribe.splitters import HashSplitter
from mofdscribe.datasets.core_dataset import CoREDataset

In [2]:
# Load the dataset that ships the pre-computed features.
ds = CoREDataset()

# Names of every feature the dataset exposes; used below to select the model inputs.
FEATURES = list(ds.available_features)

# Name of the column used as the regression target.
TARGET = "outputs.logKH_CO2"


2022-08-05 14:37:41.532 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:125 - Dropped 3227 duplicate basenames. New length 2166
2022-08-05 14:37:41.620 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:131 - Dropped 62 duplicate graphs. New length 2104


## Make sure we can make it work outside MOFBench


In [3]:
def tune(train_data, valid_data, num_trials=10, timeout=600):
    """Tune a ``CatBoostRegressor`` with Optuna and return the refitted best model.

    Every trial samples a set of hyperparameters, fits a model on
    ``train_data`` and scores it with the mean squared error on ``valid_data``.

    Args:
        train_data: Pair ``(X, y)`` used to fit every candidate model.
        valid_data: Pair ``(X, y)`` used to score every candidate model.
        num_trials: Maximum number of Optuna trials to run.
        timeout: Time budget in seconds handed to ``study.optimize``; ``None``
            disables the limit. Defaults to 600. With expensive
            configurations (deep trees, many iterations) the budget can end
            the study long before ``num_trials`` trials have run.

    Returns:
        A ``CatBoostRegressor`` built from the best hyperparameters found and
        fitted on ``train_data``.
    """

    def objective(trial):
        # Keep the sampling order stable: it defines the search space seen by the sampler.
        param = {
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
            "depth": trial.suggest_int("depth", 1, 16),
            "iterations": trial.suggest_int("iterations", 1, 10000),
            "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5, log=True),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.01, 10),
            "random_strength": trial.suggest_float("random_strength", 0.01, 10),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 0.01, 10),
        }
        model = CatBoostRegressor(
            **param,
            silent=True,
        )
        model.fit(train_data[0], train_data[1])

        predictions = model.predict(valid_data[0])
        return mean_squared_error(valid_data[1], predictions)

    # NOTE: the objective never calls ``trial.report``/``trial.should_prune``,
    # so the pruner configured here is not consulted during the search.
    study = optuna.create_study(
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5), direction="minimize"
    )
    study.optimize(
        objective,
        n_trials=num_trials,
        timeout=timeout,
        callbacks=[],  # WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs) can be nice to use
    )

    # Rebuild a fresh model from the winning hyperparameters and fit it on the training part.
    model = CatBoostRegressor(
        **study.best_params,
        silent=True,
    )
    model.fit(train_data[0], train_data[1])

    return model


The following lines are solely for the purpose of debugging!


In [4]:
# Build the bench without a model (``None``) only to reach its underlying
# dataframe through the private ``_ds._df`` attributes.
df = LogkHCO2OODBench(None)._ds._df


2022-08-05 14:37:49.103 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:125 - Dropped 3227 duplicate basenames. New length 2166
2022-08-05 14:37:49.199 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:131 - Dropped 62 duplicate graphs. New length 2104
2022-08-05 14:37:53.499 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:125 - Dropped 3227 duplicate basenames. New length 2166
2022-08-05 14:37:53.537 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:131 - Dropped 62 duplicate graphs. New length 2104
2022-08-05 14:37:53.548 | DEBUG    | mofdscribe.splitters.splitters:__init__:116 - Splitter settings | shuffle True, random state None, sample frac 1.0, q (0, 0.25, 0.5, 0.75, 1)


In [5]:
# Two non-overlapping positional slices: rows 0-99 are used for fitting and
# rows 100-199 for validation in the quick check below.
part_1, part_2 = df.iloc[:100], df.iloc[100:200]


In [6]:
# Quick check of ``tune``: candidates are fitted on part_1 and scored on part_2,
# using the default number of trials.
model = tune((part_1[FEATURES], part_1[TARGET]), (part_2[FEATURES], part_2[TARGET]))


[32m[I 2022-08-05 14:38:33,761][0m A new study created in memory with name: no-name-e4a912ef-5ee2-4f6e-9ee4-cdf8da5e51d6[0m
[32m[I 2022-08-05 14:41:17,306][0m Trial 0 finished with value: 0.9928974128555632 and parameters: {'colsample_bylevel': 0.04916021967521814, 'depth': 10, 'iterations': 6508, 'learning_rate': 0.26292081565880115, 'l2_leaf_reg': 9.302100409349237, 'random_strength': 1.4880347676087942, 'bagging_temperature': 9.253479940692216}. Best is trial 0 with value: 0.9928974128555632.[0m
[32m[I 2022-08-05 15:02:20,466][0m Trial 1 finished with value: 1.0796535748049636 and parameters: {'colsample_bylevel': 0.06872108685419895, 'depth': 16, 'iterations': 3200, 'learning_rate': 0.04532176566972099, 'l2_leaf_reg': 1.4199906094824046, 'random_strength': 9.285762721519504, 'bagging_temperature': 2.8458182059019452}. Best is trial 0 with value: 0.9928974128555632.[0m


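This seems to work. Note, however, that only two of the ten requested trials were run: the second trial alone took about 21 minutes, so the 600 s `timeout` passed to `study.optimize` ended the study as soon as that trial finished.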


Let's see how we implement this in a `MOFBench` class using a `mofdscribe` splitter.


In this example, we really want to avoid data leakage and hence will also use the `HashSplitter` in the inner loop.
Doing so is relatively easy: the `get_subset` method of a dataset returns a new dataset that can itself be passed to a splitter.


In [26]:
class MyCatBoostModel:
    """CatBoost regressor whose hyperparameters are re-tuned on every ``fit`` call.

    The methods read ``self.ds``, which this class never assigns itself; it is
    presumably attached by the bench when ``patch_in_ds=True`` is passed
    (confirm against the bench implementation).
    """

    def __init__(self, features=FEATURES, num_trials=10):
        """Store the feature column names and the number of tuning trials."""
        self.num_trials = num_trials
        self.features = features
        # Placeholder estimator; ``tune`` replaces it with the tuned model.
        self.model = CatBoostRegressor()

    def _feature_frame(self, idx):
        """Return the dataset rows at positions ``idx``, restricted to the feature columns."""
        return self.ds._df.iloc[idx][self.features]

    def tune(self, idx, y):
        """Run the hyperparameter search on the rows ``idx`` with targets ``y``.

        The rows are split once more with a ``HashSplitter`` built on the
        subset, so that the inner validation split uses the same splitting
        strategy. The tuned model is stored in ``self.model``.
        """
        inner_splitter = HashSplitter(self.ds.get_subset(idx))
        # A single hold-out split keeps the example simple; a k-fold over the
        # subset could be used here instead.
        fit_pos, val_pos = inner_splitter.train_test_split(0.8)
        # The split results are used as positions into ``idx`` and ``y``;
        # indexing ``idx`` with them gives the positions in the full dataset.
        fit_features = self._feature_frame(idx[fit_pos])
        val_features = self._feature_frame(idx[val_pos])
        # ``tune`` below resolves to the module-level function, not this method.
        self.model = tune(
            (fit_features, y[fit_pos]),
            (val_features, y[val_pos]),
            self.num_trials,
        )

    def fit(self, idx, structures, y):
        """Tune the hyperparameters, then refit the tuned model on all rows in ``idx``.

        ``structures`` is accepted but not used.
        """
        self.tune(idx, y)
        self.model.fit(self._feature_frame(idx), y)

    def predict(self, idx, structures):
        """Return the model predictions for the rows ``idx``; ``structures`` is not used."""
        return self.model.predict(self._feature_frame(idx))


In [16]:
# Trial run with ``debug=True``; in the log below this run reports a sample
# fraction of 0.01 and 54 train / 13 test points in the first fold.
# ``patch_in_ds=True`` presumably makes the bench attach its dataset to the
# model as ``ds``, which ``MyCatBoostModel`` relies on -- confirm.
# NOTE(review): the separator " ," looks like a typo for ", " -- confirm.
bench = LogkHCO2OODBench(
    MyCatBoostModel(), name="my model", features=" ,".join(FEATURES), patch_in_ds=True, debug=True
)


2022-08-04 15:44:19.025 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:123 - Dropped 639 duplicate basenames. New length 8182
2022-08-04 15:44:19.034 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:129 - Dropped 1312 duplicate graphs. New length 6870
2022-08-04 15:44:19.948 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:123 - Dropped 639 duplicate basenames. New length 8182
2022-08-04 15:44:19.959 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:129 - Dropped 1312 duplicate graphs. New length 6870
2022-08-04 15:44:19.981 | DEBUG    | mofdscribe.splitters.splitters:__init__:116 - Splitter settings | shuffle True, random state None, sample frac 0.01, q (0, 0.25, 0.5, 0.75, 1)


In [17]:
# Run the benchmark; the returned object is a ``BenchResult`` (displayed further below).
result = bench.bench()


2022-08-04 15:44:20.129 | DEBUG    | mofdscribe.bench.mofbench:_score:230 - K-fold round 0, 54 train points, 13 test points
2022-08-04 15:44:21.065 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:123 - Dropped 639 duplicate basenames. New length 8182
2022-08-04 15:44:21.076 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:129 - Dropped 1312 duplicate graphs. New length 6870
2022-08-04 15:44:21.083 | DEBUG    | mofdscribe.splitters.splitters:__init__:116 - Splitter settings | shuffle True, random state None, sample frac 1.0, q (0, 0.25, 0.5, 0.75, 1)
2022-08-04 15:44:21.084 | DEBUG    | mofdscribe.splitters.utils:check_fraction:429 - Using fractions: train: 0.7, valid: 0, test: 0.30000000000000004
2022-08-04 15:44:21.084 | DEBUG    | mofdscribe.splitters.splitters:train_test_split:159 - Using grouped partition
[32m[I 2022-08-04 15:44:21,086][0m A new study created in memory with name: no-name-5816ed5a-12b7-49a3-87f8-80c6694becff[0m
[32m[I 2022-08-04 15:44:22,559][0m Tr

[-3.61860741 -3.7046139  -3.58547182 -3.70272673 -2.78660819 -3.55187778
 -3.55935148 -3.41970087 -2.75539022 -3.68527742 -3.81238797 -3.19262452
 -3.13289448]


2022-08-04 15:45:42.364 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:123 - Dropped 639 duplicate basenames. New length 8182
2022-08-04 15:45:42.374 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:129 - Dropped 1312 duplicate graphs. New length 6870
2022-08-04 15:45:42.381 | DEBUG    | mofdscribe.splitters.splitters:__init__:116 - Splitter settings | shuffle True, random state None, sample frac 1.0, q (0, 0.25, 0.5, 0.75, 1)
2022-08-04 15:45:42.381 | DEBUG    | mofdscribe.splitters.utils:check_fraction:429 - Using fractions: train: 0.7, valid: 0, test: 0.30000000000000004
2022-08-04 15:45:42.382 | DEBUG    | mofdscribe.splitters.splitters:train_test_split:159 - Using grouped partition
[32m[I 2022-08-04 15:45:42,384][0m A new study created in memory with name: no-name-c05fbc0b-1c0b-4ea8-ac19-ec6f615ac4bd[0m
[32m[I 2022-08-04 15:45:45,253][0m Trial 0 finished with value: 0.7889371030146659 and parameters: {'colsample_bylevel': 0.02248339281894256, 'depth': 8, 'iterat

[-4.05266121 -3.23464762 -2.70226256 -2.82330667 -3.97407216 -2.20919676
 -2.30515831 -2.84212156 -3.12909018 -3.95068338 -2.08661299 -3.80340152
 -4.47927697]


2022-08-04 15:46:22.720 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:123 - Dropped 639 duplicate basenames. New length 8182
2022-08-04 15:46:22.730 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:129 - Dropped 1312 duplicate graphs. New length 6870
2022-08-04 15:46:22.737 | DEBUG    | mofdscribe.splitters.splitters:__init__:116 - Splitter settings | shuffle True, random state None, sample frac 1.0, q (0, 0.25, 0.5, 0.75, 1)
2022-08-04 15:46:22.738 | DEBUG    | mofdscribe.splitters.utils:check_fraction:429 - Using fractions: train: 0.7, valid: 0, test: 0.30000000000000004
2022-08-04 15:46:22.738 | DEBUG    | mofdscribe.splitters.splitters:train_test_split:159 - Using grouped partition
[32m[I 2022-08-04 15:46:22,740][0m A new study created in memory with name: no-name-5261ed92-d040-4ec7-ba6d-2b8904f78c64[0m
[32m[I 2022-08-04 15:46:23,175][0m Trial 0 finished with value: 0.9654515919255807 and parameters: {'colsample_bylevel': 0.01997304386801485, 'depth': 4, 'iterat

[-3.04296752 -2.89693127 -3.12098484 -3.84862189 -2.25986282 -2.54504573
 -3.65971382 -2.2243226  -2.46030072 -2.64395391 -2.86685446 -3.71098824
 -3.94197044]


2022-08-04 15:46:33.374 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:123 - Dropped 639 duplicate basenames. New length 8182
2022-08-04 15:46:33.385 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:129 - Dropped 1312 duplicate graphs. New length 6870
2022-08-04 15:46:33.391 | DEBUG    | mofdscribe.splitters.splitters:__init__:116 - Splitter settings | shuffle True, random state None, sample frac 1.0, q (0, 0.25, 0.5, 0.75, 1)
2022-08-04 15:46:33.391 | DEBUG    | mofdscribe.splitters.utils:check_fraction:429 - Using fractions: train: 0.7, valid: 0, test: 0.30000000000000004
2022-08-04 15:46:33.392 | DEBUG    | mofdscribe.splitters.splitters:train_test_split:159 - Using grouped partition
[32m[I 2022-08-04 15:46:33,393][0m A new study created in memory with name: no-name-12d0e11d-eb5a-41f0-b63e-2134eb4ae2cb[0m
[32m[I 2022-08-04 15:46:41,254][0m Trial 0 finished with value: 2.7885410941153683 and parameters: {'colsample_bylevel': 0.023727644142140212, 'depth': 15, 'iter

[-2.85757107 -3.41350249 -3.2915807  -3.11396472 -3.07865049 -3.29083662
 -3.52789152 -3.88830457 -3.71440638 -2.7160054  -4.1858231  -3.63144649
 -4.22562457]


2022-08-04 15:47:16.250 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:123 - Dropped 639 duplicate basenames. New length 8182
2022-08-04 15:47:16.260 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:129 - Dropped 1312 duplicate graphs. New length 6870
2022-08-04 15:47:16.267 | DEBUG    | mofdscribe.splitters.splitters:__init__:116 - Splitter settings | shuffle True, random state None, sample frac 1.0, q (0, 0.25, 0.5, 0.75, 1)
2022-08-04 15:47:16.267 | DEBUG    | mofdscribe.splitters.utils:check_fraction:429 - Using fractions: train: 0.7, valid: 0, test: 0.30000000000000004
2022-08-04 15:47:16.267 | DEBUG    | mofdscribe.splitters.splitters:train_test_split:159 - Using grouped partition
[32m[I 2022-08-04 15:47:16,269][0m A new study created in memory with name: no-name-15d399fe-cf5d-49b7-a2c9-4b204cd9ea51[0m
[32m[I 2022-08-04 15:47:16,514][0m Trial 0 finished with value: 0.6991815496444341 and parameters: {'colsample_bylevel': 0.022695129195569888, 'depth': 1, 'itera

[-3.32927013 -3.23669601 -3.58300431 -3.47583225 -3.04673094 -2.84378802
 -3.32928865 -3.3407119  -2.39338696 -3.45640047 -3.45613196 -3.49974492
 -3.95592873]


In [18]:
# Display the ``BenchResult`` of the debug run.
result


BenchResult(start_time=datetime.datetime(2022, 8, 4, 13, 44, 20, 126292, tzinfo=datetime.timezone.utc), end_time=datetime.datetime(2022, 8, 4, 13, 47, 42, 220544, tzinfo=datetime.timezone.utc), metrics=RegressionMetricCollection(regression_metrics=[RegressionMetrics(mean_squared_error=1.324773649885319, mean_absolute_error=0.8438648987836127, r2_score=0.11883897879823824, max_error=2.611841211766282, mean_absolute_percentage_error=0.40289237423713364, top_5_in_top_5=0, top_10_in_top_10=1, top_50_in_top_50=1, top_100_in_top_100=1, top_500_in_top_500=1), RegressionMetrics(mean_squared_error=1.1322879159369008, mean_absolute_error=0.8708695405334369, r2_score=-1.0910638475959717, max_error=2.054046702028823, mean_absolute_percentage_error=0.2462515188629681, top_5_in_top_5=0, top_10_in_top_10=0, top_50_in_top_50=1, top_100_in_top_100=1, top_500_in_top_500=1), RegressionMetrics(mean_squared_error=0.6836588649093175, mean_absolute_error=0.6745806377610815, r2_score=0.3266337006687946, max_e

Now, as it ran successfully using `debug=True`, we can try a full run.


In [28]:
# Full run: ``debug=False`` and up to 100 Optuna trials per ``fit`` call.
# ``MyCatBoostModel.tune`` does not pass a time budget, so every search is
# still bounded by the default timeout of the module-level ``tune``.
# NOTE(review): the separator " ," looks like a typo for ", " -- confirm.
bench = LogkHCO2OODBench(
    MyCatBoostModel(num_trials=100),
    name="my catboost model",
    features=" ,".join(FEATURES),
    patch_in_ds=True,
    debug=False,
)


2022-08-04 17:09:26.970 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:123 - Dropped 639 duplicate basenames. New length 8182
2022-08-04 17:09:26.981 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:129 - Dropped 1312 duplicate graphs. New length 6870
2022-08-04 17:09:27.934 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:123 - Dropped 639 duplicate basenames. New length 8182
2022-08-04 17:09:27.944 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:129 - Dropped 1312 duplicate graphs. New length 6870
2022-08-04 17:09:27.969 | DEBUG    | mofdscribe.splitters.splitters:__init__:116 - Splitter settings | shuffle True, random state None, sample frac 1.0, q (0, 0.25, 0.5, 0.75, 1)


In [29]:
# Run the full benchmark; per the timestamps in the result below this took roughly 1.5 hours.
result = bench.bench()


2022-08-04 17:09:28.017 | DEBUG    | mofdscribe.bench.mofbench:_score:230 - K-fold round 0, 5493 train points, 1377 test points
2022-08-04 17:09:28.930 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:123 - Dropped 639 duplicate basenames. New length 8182
2022-08-04 17:09:28.940 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:129 - Dropped 1312 duplicate graphs. New length 6870
2022-08-04 17:09:28.969 | DEBUG    | mofdscribe.splitters.splitters:__init__:116 - Splitter settings | shuffle True, random state None, sample frac 1.0, q (0, 0.25, 0.5, 0.75, 1)
2022-08-04 17:09:28.969 | DEBUG    | mofdscribe.splitters.utils:check_fraction:429 - Using fractions: train: 0.8, valid: 0, test: 0.19999999999999996
2022-08-04 17:09:28.970 | DEBUG    | mofdscribe.splitters.splitters:train_test_split:159 - Using grouped partition
[32m[I 2022-08-04 17:09:29,189][0m A new study created in memory with name: no-name-03688dfa-0692-43c4-8003-429ceb10a523[0m
[32m[I 2022-08-04 17:09:34,336][0

In [30]:
# Display the ``BenchResult`` of the full run.
result

BenchResult(start_time=datetime.datetime(2022, 8, 4, 15, 9, 28, 14098, tzinfo=datetime.timezone.utc), end_time=datetime.datetime(2022, 8, 4, 16, 33, 10, 16847, tzinfo=datetime.timezone.utc), metrics=RegressionMetricCollection(regression_metrics=[RegressionMetrics(mean_squared_error=0.5207995001907892, mean_absolute_error=0.5522248220190105, r2_score=0.5313560475037885, max_error=3.7823977350284106, mean_absolute_percentage_error=0.2720004732376112, top_5_in_top_5=0, top_10_in_top_10=0, top_50_in_top_50=0, top_100_in_top_100=0, top_500_in_top_500=0), RegressionMetrics(mean_squared_error=0.33931323077304393, mean_absolute_error=0.42248206804102334, r2_score=0.4048151298146425, max_error=3.659926973316546, mean_absolute_percentage_error=0.13795236838609187, top_5_in_top_5=0, top_10_in_top_10=0, top_50_in_top_50=0, top_100_in_top_100=0, top_500_in_top_500=1), RegressionMetrics(mean_squared_error=0.8082637960081362, mean_absolute_error=0.6696999612360695, r2_score=0.5028078435972169, max_er