# Using hyperopt-sklearn with `Bench`


[hyperopt-sklearn](https://github.com/hyperopt/hyperopt-sklearn) is a popular library for tuning many "classical" ML models. The convenience this library brings is that it has predefined hyperparameter grids that are automatically tuned when one calls `fit`.

One difficulty is that it currently does not have a convenient interface for providing custom validation sets (see [this issue](https://github.com/hyperopt/hyperopt-sklearn/issues/152)). We will use this as one other example for how to implement a model for benchmarking with `mofdscribe`.


In [1]:
from mofdscribe.bench import LogkHCO2IDBench
from mofdscribe.datasets.core_dataset import CoREDataset
from hpsklearn import (
    HyperoptEstimator,
    gaussian_process_regressor,
    lightgbm_regression,
    power_transformer,
    standard_scaler,
    xgboost_regression,
)

from mofdscribe.splitters import HashSplitter
import numpy as np
from copy import deepcopy


In [2]:
ds = CoREDataset()

FEATURES = list(ds.available_features)

TARGET = "outputs.logKH_CO2"


2022-08-07 16:25:09.726 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:127 - Dropped 3227 duplicate basenames. New length 2166
2022-08-07 16:25:09.762 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:133 - Dropped 62 duplicate graphs. New length 2104


In [7]:
class TunedXBoost:
    def __init__(self, features):
        self.model = HyperoptEstimator(regressor=xgboost_regression("mymodel"))
        self.features = features

    def tune(self, idx, y):
        tune_splitter = HashSplitter(self.ds.get_subset(idx))
        # we will now use a simple split in two parts,
        # however, you could also use a k-fold in the tune method
        models = []
        for train_idx_, valid_idx_ in tune_splitter.k_fold(5):
            train_idx = idx[train_idx_]
            valid_idx = idx[valid_idx_]

            train_x, train_y = self.ds._df.iloc[train_idx][self.features], y[train_idx_]
            valid_x, valid_y = self.ds._df.iloc[valid_idx][self.features], y[valid_idx_]

            # we concatenate train and validation data
            # but make sure to turn of shuffling and use the last fraction of the data for validation
            x = np.concatenate([train_x, valid_x])
            y = np.concatenate([train_y, valid_y])

            valid_frac = len(valid_x) / len(x)

            model = deepcopy(self.model)
            model.fit(x, y, cv_shuffle=False, n_folds=None, valid_size=valid_frac)

            models.append((model._best_loss, model, model._best_learner))

        models = sorted(models, key=lambda x: x[0]) 
        self.model = models[0][1]

    def fit(self, idx, structures, y):
        self.tune(idx, y)
        X = self.ds._df.iloc[idx][self.features]
        self.model.fit(X, y)

    def predict(self, idx, structures):
        X = self.ds._df.iloc[idx][self.features]
        pred = self.model.predict(X)
        return pred


In [8]:
model = TunedXBoost(FEATURES)


In [12]:
bench = LogkHCO2IDBench(model, name="xgboost-hyperopt", debug=False, patch_in_ds=True)


2022-08-07 16:54:49.212 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:127 - Dropped 3227 duplicate basenames. New length 2166
2022-08-07 16:54:49.364 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:133 - Dropped 62 duplicate graphs. New length 2104
2022-08-07 16:54:53.772 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:127 - Dropped 3227 duplicate basenames. New length 2166
2022-08-07 16:54:53.806 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:133 - Dropped 62 duplicate graphs. New length 2104
2022-08-07 16:54:53.819 | DEBUG    | mofdscribe.splitters.splitters:__init__:116 - Splitter settings | shuffle True, random state None, sample frac 1.0, q (0, 0.25, 0.5, 0.75, 1)


In [13]:
report = bench.bench()


2022-08-07 16:54:54.247 | DEBUG    | mofdscribe.bench.mofbench:_score:230 - K-fold round 0, 1690 train points, 414 test points
2022-08-07 16:54:58.571 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:127 - Dropped 3227 duplicate basenames. New length 2166
2022-08-07 16:54:58.606 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:133 - Dropped 62 duplicate graphs. New length 2104
2022-08-07 16:54:58.637 | DEBUG    | mofdscribe.splitters.splitters:__init__:116 - Splitter settings | shuffle True, random state None, sample frac 1.0, q (0, 0.25, 0.5, 0.75, 1)


100%|██████████| 1/1 [01:57<00:00, 117.24s/trial, best loss: 0.898569482858713]
100%|██████████| 2/2 [07:08<00:00, 428.60s/trial, best loss: 0.898569482858713]
100%|██████████| 3/3 [15:11<00:00, 911.92s/trial, best loss: 0.8314557217203841]
100%|██████████| 4/4 [00:14<00:00, 14.98s/trial, best loss: 0.8314557217203841]
100%|██████████| 5/5 [00:34<00:00, 34.15s/trial, best loss: 0.7244665752586938]
100%|██████████| 6/6 [10:16<00:00, 616.58s/trial, best loss: 0.6952394490714696]
100%|██████████| 7/7 [03:17<00:00, 197.77s/trial, best loss: 0.6952394490714696]
100%|██████████| 8/8 [00:49<00:00, 49.25s/trial, best loss: 0.6952394490714696]
100%|██████████| 9/9 [01:16<00:00, 76.70s/trial, best loss: 0.6952394490714696]
100%|██████████| 10/10 [16:23<00:00, 983.43s/trial, best loss: 0.694418136430368]
100%|██████████| 1/1 [11:21<00:00, 681.22s/trial, best loss: 1.05180736650701]
100%|██████████| 2/2 [00:27<00:00, 27.53s/trial, best loss: 1.05180736650701]
100%|██████████| 3/3 [13:45<00:00, 825

In [11]:
report

BenchResult(start_time=datetime.datetime(2022, 8, 7, 14, 26, 50, 724208, tzinfo=datetime.timezone.utc), end_time=datetime.datetime(2022, 8, 7, 14, 41, 22, 5793, tzinfo=datetime.timezone.utc), metrics=RegressionMetricCollection(regression_metrics=[RegressionMetrics(mean_squared_error=2.2541407682886128, mean_absolute_error=1.2180623255137877, r2_score=0.033736962545019455, max_error=2.6369412290583982, mean_absolute_percentage_error=1.1498512872694546, top_5_in_top_5=1, top_10_in_top_10=1, top_50_in_top_50=1, top_100_in_top_100=1, top_500_in_top_500=1), RegressionMetrics(mean_squared_error=1.1809820371171182, mean_absolute_error=0.9447372616054199, r2_score=-0.8369989405136833, max_error=1.855924517571582, mean_absolute_percentage_error=0.36642558493148253, top_5_in_top_5=1, top_10_in_top_10=1, top_50_in_top_50=1, top_100_in_top_100=1, top_500_in_top_500=1), RegressionMetrics(mean_squared_error=1.3269701986068339, mean_absolute_error=0.9784902795550539, r2_score=-3.740793935068651, max_