## Libraries

In [None]:
import time
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
import lightgbm as lgb

from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

## Data preparation

Read in data from prepared file.

In [None]:
df = pd.read_csv('df_prepped.csv')
pd.set_option('display.max_columns', None)
df

Unnamed: 0,Year,Countries,Sand_1,Sand_2,Sand_3,Sand_4,Sand_5,Sand_6,Sand_7,Clay_1,Clay_2,Clay_3,Clay_4,Clay_5,Clay_6,Clay_7,OC_1,OC_2,OC_3,OC_4,OC_5,OC_6,OC_7,PAW_1,PAW_2,PAW_3,PAW_4,PAW_5,PAW_6,PAW_7,Y_maize_major,Farm,Sow_Maize_month_int,Harvest_Maize_month_int,sow_to_harvest_months,maize_lag-1,pcp_mean_lag-1,tmax_mean_lag-1,tmin_mean_lag-1,spi_mean_lag-1,maize_lag-2,pcp_mean_lag-2,tmax_mean_lag-2,tmin_mean_lag-2,spi_mean_lag-2,maize_lag-3,pcp_mean_lag-3,tmax_mean_lag-3,tmin_mean_lag-3,spi_mean_lag-3
0,2007,Angola,50,51,51,48,45,46,46,37,35,36,39,42,42,42,0.52,0.23,0.17,0.09,0.04,0.02,0.02,0.15,0.15,0.14,0.13,0.10,0.07,0.07,0.615357,104_Angola,9,4,7,0.554392,97.103755,301.939623,292.214020,0.093447,0.721607,129.051864,301.518536,292.496579,1.644698,0.620005,109.983325,301.786056,292.204097,0.514275
1,2007,Angola,62,64,63,59,58,59,59,27,25,26,29,31,30,30,0.11,0.05,0.07,0.04,0.02,0.02,0.01,0.11,0.10,0.10,0.09,0.07,0.07,0.03,0.257656,99_Angola,9,4,7,0.117051,59.292237,301.882929,288.092753,0.182926,0.300217,47.697564,303.988747,288.916992,0.909295,0.212699,41.130026,303.298082,288.642853,0.588172
2,2007,Angola,69,71,70,67,65,65,66,19,16,18,21,24,24,23,0.09,0.06,0.07,0.04,0.02,0.02,0.02,0.10,0.10,0.10,0.09,0.07,0.07,0.07,4.286831,108_Angola,9,4,7,3.093239,58.196545,302.891420,289.377311,0.991663,4.044452,42.130629,305.494178,290.535403,0.952237,2.295351,35.049776,304.824778,290.284886,0.371446
3,2007,Angola,60,63,61,57,53,53,53,29,26,28,32,35,36,36,0.46,0.16,0.14,0.08,0.05,0.04,0.03,0.12,0.13,0.12,0.12,0.11,0.10,0.09,0.700384,102_Angola,9,4,7,0.677797,149.210195,298.973795,287.311403,0.206751,0.907431,159.454723,299.404975,287.724299,1.374616,0.783018,174.088260,298.908208,287.362407,0.643207
4,2007,Angola,67,69,68,63,61,61,61,22,19,21,25,28,28,29,0.15,0.09,0.09,0.05,0.02,0.01,0.01,0.11,0.11,0.11,0.11,0.08,0.04,0.04,0.553450,43_Angola,9,4,7,0.412071,74.556629,304.006860,290.606725,-0.075621,0.675967,66.698670,304.644632,290.635254,1.144088,0.605584,67.404588,303.930955,290.564185,0.553079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32325,2016,Zimbabwe,73,75,74,69,66,66,66,20,18,20,24,27,27,27,0.10,0.08,0.08,0.04,0.02,0.01,0.01,0.09,0.10,0.09,0.09,0.07,0.04,0.04,0.674080,3874_Zimbabwe,5,12,7,0.841480,25.249322,302.485864,291.402194,0.213789,0.926596,60.841103,301.103214,290.648780,0.666820,0.767325,56.355874,301.345864,290.478016,0.378187
32326,2016,Zimbabwe,57,58,58,54,52,51,52,28,26,26,31,33,34,33,0.14,0.12,0.10,0.05,0.03,0.01,0.01,0.13,0.14,0.13,0.11,0.09,0.03,0.03,0.707797,3875_Zimbabwe,5,12,7,0.685066,62.751591,299.755546,287.893781,0.412781,1.038142,62.393069,299.616883,288.078306,-0.008399,0.830597,89.248975,299.499617,287.620716,1.717135
32327,2016,Zimbabwe,55,57,56,54,52,51,51,30,28,30,32,34,35,35,0.53,0.41,0.30,0.22,0.18,0.16,0.15,0.18,0.19,0.17,0.15,0.14,0.14,0.14,0.805844,3859_Zimbabwe,5,12,7,0.481472,71.453830,296.963034,286.109140,0.004455,1.242612,129.128337,296.285672,286.086518,0.809915,0.994018,126.344999,296.054577,285.788589,0.512137
32328,2016,Zimbabwe,70,72,71,67,63,63,62,17,14,15,20,24,24,24,0.14,0.09,0.10,0.06,0.04,0.03,0.03,0.10,0.10,0.10,0.10,0.09,0.09,0.08,0.595883,3871_Zimbabwe,5,12,7,0.880191,74.755430,297.273136,285.907031,0.847040,0.720637,91.456254,297.102164,285.640060,1.451785,1.000904,62.046051,297.754080,285.755915,0.087132


For faster demo testing, create subsampling.

In [None]:
subsample_size =6400 #len(df)  # subsample subset of data for faster demo
df = df.sample(n=subsample_size, random_state=0)

In [None]:
df.shape

(6400, 50)

Additionally, `Countries` and `Farm` attributes as categorical are dropped.

In [None]:
df = df.drop(['Countries','Farm'], axis=1)
df

Unnamed: 0,Year,Sand_1,Sand_2,Sand_3,Sand_4,Sand_5,Sand_6,Sand_7,Clay_1,Clay_2,Clay_3,Clay_4,Clay_5,Clay_6,Clay_7,OC_1,OC_2,OC_3,OC_4,OC_5,OC_6,OC_7,PAW_1,PAW_2,PAW_3,PAW_4,PAW_5,PAW_6,PAW_7,Y_maize_major,Sow_Maize_month_int,Harvest_Maize_month_int,sow_to_harvest_months,maize_lag-1,pcp_mean_lag-1,tmax_mean_lag-1,tmin_mean_lag-1,spi_mean_lag-1,maize_lag-2,pcp_mean_lag-2,tmax_mean_lag-2,tmin_mean_lag-2,spi_mean_lag-2,maize_lag-3,pcp_mean_lag-3,tmax_mean_lag-3,tmin_mean_lag-3,spi_mean_lag-3
31671,2016,36,37,37,36,35,37,38,38,37,39,38,40,39,38,0.36,0.27,0.17,0.09,0.06,0.04,0.04,0.14,0.14,0.13,0.13,0.11,0.10,0.10,1.404263,3,8,5,1.331958,65.930227,307.842374,297.926592,-1.009550,1.326020,73.599637,307.082711,297.484613,-0.747904,1.333140,75.197736,307.100162,297.501827,-0.550423
5268,2008,63,65,65,63,60,60,59,24,22,23,25,28,28,28,0.51,0.21,0.18,0.10,0.09,0.07,0.06,0.12,0.12,0.12,0.11,0.11,0.11,0.11,0.582666,7,1,6,0.613687,121.964673,302.328427,292.150388,-0.573583,0.614261,126.884997,302.438494,292.174520,-0.317878,0.599353,119.842708,302.844087,292.372770,-0.570364
8384,2009,47,50,48,46,44,44,44,33,30,32,36,39,40,38,0.34,0.21,0.16,0.09,0.04,0.03,0.03,0.14,0.15,0.14,0.13,0.11,0.09,0.09,0.903740,4,9,5,0.904677,119.691173,303.917189,294.797522,-0.370280,0.817856,140.625954,304.046249,294.855836,-0.207510,0.900641,143.427173,303.874416,295.091166,-0.474229
230,2007,70,72,71,67,64,63,64,23,20,21,26,29,30,29,0.21,0.16,0.10,0.06,0.02,0.02,0.01,0.12,0.12,0.12,0.11,0.08,0.08,0.04,0.488761,9,4,7,0.407101,97.546617,300.665791,289.178047,0.175777,0.694363,75.104713,302.193750,289.793626,-1.088815,0.483943,79.174936,301.040231,289.162251,0.055712
17024,2011,55,57,55,50,47,47,47,28,26,28,33,37,37,37,0.22,0.15,0.14,0.09,0.07,0.06,0.06,0.14,0.14,0.14,0.13,0.11,0.11,0.10,0.874741,11,4,5,0.730527,50.318559,302.980889,290.716062,-1.929369,0.897466,63.932737,301.710150,289.934014,0.119020,1.136947,63.204844,302.013315,289.656402,-0.684243
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4782,2008,42,45,43,41,38,38,39,39,37,40,44,47,47,47,0.37,0.32,0.25,0.21,0.17,0.12,0.13,0.14,0.14,0.13,0.13,0.11,0.11,0.11,0.788274,7,1,6,0.773447,129.109045,304.379315,295.000348,-0.822808,0.751053,126.996040,304.205070,295.118181,-0.222404,0.791632,139.050901,304.632491,295.374624,-0.520674
22540,2013,42,43,43,41,41,42,41,32,32,32,33,34,34,34,0.53,0.23,0.22,0.16,0.11,0.10,0.10,0.15,0.15,0.15,0.14,0.13,0.13,0.13,1.877263,3,12,9,1.774599,77.172279,301.233717,288.919385,-0.526513,1.645290,75.618823,300.948039,288.705750,-0.386776,1.315047,87.312301,300.899663,289.041471,-0.552497
29009,2015,35,37,36,33,32,32,33,44,42,44,47,49,50,49,0.66,0.38,0.27,0.17,0.12,0.10,0.10,0.15,0.15,0.14,0.13,0.12,0.11,0.11,2.556540,4,9,5,2.873802,118.143598,299.001946,290.852571,-1.261750,2.664543,116.989215,298.791389,290.901275,-0.520458,2.711765,133.447707,298.636294,290.667639,-0.329680
19389,2012,56,58,56,53,50,49,50,27,25,27,30,34,34,34,0.27,0.21,0.13,0.08,0.06,0.04,0.04,0.15,0.14,0.13,0.12,0.11,0.11,0.11,1.652202,7,1,6,1.551927,79.980512,300.587927,289.448816,-0.498564,1.727954,99.455218,300.778265,289.861758,0.422087,1.550873,96.044501,300.209558,289.440378,0.406880


Create train and test sets.

In [None]:
# Separate a test set, the year 2016
df_test = df[df.Year == 2016]
df_train = df[df.Year < 2016]

In [None]:
df_test.shape, df_train.shape

((594, 48), (5806, 48))

## Hyperopt

### Testing hyperopt on subsample with small number of evaluations

Select regressors

In [None]:
names = ['K-Nearest Neighbors', 'Random Forest', 'AdaBoost', 'Linear Regression']
data = {'name': names}
df_best_regression = pd.DataFrame(data)

In [None]:
regression_models = [LinearRegression(), RandomForestRegressor(), KNeighborsRegressor(), AdaBoostRegressor()]

Define search spaces

In [None]:
# Define individual search spaces manually
regression_search_spaces = [
    # Linear Regression {'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}
    {
        'fit_intercept': hp.choice('fit_intercept', [True, False]),
        'copy_X': hp.choice('copy_X', [True, False]),
        'n_jobs': hp.choice('n_jobs', [-1, 1, 2, 4]),  # Adjust the choices based on the available resources
        'positive': hp.choice('positive', [True, False])
    },

    # Random Forest {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0,
                  # 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}

    {
        'ccp_alpha': hp.uniform('ccp_alpha', 0.0, 0.5),
        'max_depth': hp.choice('max_depth', range(1, 20)),
        'max_features': hp.choice('max_features', ['sqrt', 'log2']),
        'n_estimators': hp.choice('n_estimators', range(50, 200, 1)),
    },


    # K-Nearest Neighbors {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}

    {
        'n_neighbors': hp.choice('n_neighbors', range(2, 20, 1)),
        'p': hp.choice('p', [1, 3]),
        'weights': hp.choice('weights', ['uniform', 'distance']),
        'algorithm': hp.choice('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
        'leaf_size': hp.choice('leaf_size', range(10, 40, 1)),
    },


    # AdaBoost {'base_estimator': 'deprecated', 'estimator': None, 'learning_rate': 1.0, 'loss': 'linear', 'n_estimators': 50, 'random_state': None}
    {
        'n_estimators': hp.quniform('n_estimators', 50, 200, 10),
        'learning_rate': hp.loguniform('learning_rate', -4, 0),
    },

]


Cross validation preparation

In [None]:
# X and y
X_train = df_train.drop(columns=['Y_maize_major','Year'], axis=1)
y_train = df_train['Y_maize_major']
X_test = df_test.drop(columns=['Y_maize_major','Year'], axis=1)
y_test = df_test['Y_maize_major']

# Scale to [0,1] range
sc = MinMaxScaler()
X_train_scaled = pd.DataFrame(sc.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(sc.transform(X_test), columns=X_test.columns)

Run hyperopt with MAE calculation

In [None]:
# Initialize a dataframe for results collection
hpo_results = pd.DataFrame(columns=['regressor_name', 'best_cv_score', 'runtime_hpo', 'best_params', 'test_mae'])

# Define the best baseline score
best_baseline_score = 0.2161 # Define your baseline MAE score here

# Define how many iterations should be done
n_trials = 10

# All individual trials will be saved here
trials_dict = {}

# Start the timer to measure the runtime of the entire pipeline
hpo_time_start = time.time()

# Run the HPO and get the best params for each classifier
for i in range(len(df_best_regression)):

    # To improve the readability of code, creating the following objects:
    regressor_name = df_best_regression.loc[i, 'name']
    regressor_class = regression_models[names.index(regressor_name)]  # fetch from the list not df for baseline
    regressor_search_space = regression_search_spaces[names.index(regressor_name)]

    # To improve the readability of output:
    print()
    print('----------------------------------------------------------------------')
    print(f'Using *{regressor_name}* for estimation.')
    print('----------------------------------------------------------------------')

    # A objective function for receiving the CV scores for each model
    def hyperopt_cv_score(params):
        cv = RepeatedKFold(n_splits=5, n_repeats=5)  # can be also adjusted!
        # Check if 'n_estimators' is in params
        if 'n_estimators' in params:
            # Convert 'n_estimators' to an integer
            params['n_estimators'] = int(params['n_estimators'])
        model = regressor_class.set_params(**params)  # use the classifier from the list
        return cross_val_score(model,
                               X_train_scaled, y_train,
                               cv=cv,
                               scoring='neg_mean_absolute_error',
                               error_score='raise').mean()

    # A helper function for finding the best model
    def f(params):
        global best_cv_score
        global best_params
        global best_time

        cv_score = hyperopt_cv_score(params)

        if cv_score > best_cv_score:
            # Are we beating the best baseline score?
            if cv_score < best_baseline_score:  # we are beating the baseline accuracy
                best_cv_score = cv_score  # what is the best score?
                best_params = params  # what are the best params?
                best_time = round(time.time() - start_time, 4)  # track how much time it took to find the best params

                print(f'*{regressor_name}* BEAT THE BASELINE of {best_baseline_score}!')
                print(f'Better CV score: {best_cv_score}')
                print(f'Parameter combination: {best_params}')
                print(f'Time until beating the baseline: {best_time}s')
            else:
                best_cv_score = cv_score  # what is the best score?
                best_params = params  # what are the best params?
                best_time = round(time.time() - start_time, 4)  # track how much time it took to find the best params

                # Print the results for reference
                print(f'New best CV score: {best_cv_score}')
                print(f'New best params: {best_params}')
                print(f'Time taken until new best combination found: {round(best_time, 5)}s')

        return {'loss': -cv_score,  # see the comment below
                'status': STATUS_OK}
        # Comment regarding 'negative cv_score' (from the referenced source):
        ## Since we are trying to maximize the CV score (cv_score in the code above),
        ## we must negate this value for hyperopt, since hyperopt only knows how to minimize a function.
        ## Minimizing a function f is the same as maximizing the negative of f.
        ## About FMIN: https://github.com/hyperopt/hyperopt/wiki/FMin

    # Defining global variables to be updated
    best_cv_score = float('-inf')  # best CV score (negative infinity since we want to maximize)
    best_params = None  # best hyperparameter combination
    best_time = 0  # runtime until the best CV score is computed
    trials = Trials()  # store info at each step

    # Start running the algorithm and track time
    start_time = time.time()
    ## Hyperopt function
    best = fmin(f, regressor_search_space,  # use the search space associated with the classifier
                algo=tpe.suggest,
                max_evals=n_trials,  # how many evaluations?
                trials=trials)
    # Save all trials
    trials_dict[regressor_name] = trials  # save into classifier-trials

    print()
    print('######################################################################')
    # Print the summary of the best results
    print(f'Best CV score in {n_trials} iterations: {best_cv_score} ({best_time}s until found).')

    # Compute the accuracy score on the best model of the classifier
    m = regressor_class.set_params(**best_params).fit(X_train_scaled, y_train) if best_params is not None else regressor_class
    predictions = m.predict(X_test_scaled)
    score_test = mean_absolute_error(y_test, predictions)
    print(f'MAE on test data: {score_test}.')

    # Append the best results to the df
    hpo_results = hpo_results.append({'regressor_name': regressor_name,
                                      'best_cv_score': best_cv_score,
                                      'runtime_hpo': best_time,
                                      'best_params': best_params,
                                      'test_mae': score_test}, ignore_index=True)

# Mark the end of the entire pipeline
hpo_time_end = time.time()
print()
print('######################################################################')
print(f'The duration of the entire HPO pipeline for {len(df_best_regression)} classifiers across {n_trials} trials each: ')
print(f'{round(hpo_time_end - hpo_time_start, 5)} seconds')

# Sort the model results by test MAE and then Runtime
hpo_results = hpo_results.sort_values(['test_mae', 'runtime_hpo'], ascending=[1, 0]).reset_index(drop=True)



----------------------------------------------------------------------
Using *K-Nearest Neighbors* for estimation.
----------------------------------------------------------------------
*K-Nearest Neighbors* BEAT THE BASELINE of 0.2161!
Better CV score: -0.22516607430802535
Parameter combination: {'copy_X': False, 'fit_intercept': True, 'n_jobs': -1, 'positive': True}
Time until beating the baseline: 0.7494s
*K-Nearest Neighbors* BEAT THE BASELINE of 0.2161!
Better CV score: -0.22502038496658255
Parameter combination: {'copy_X': True, 'fit_intercept': True, 'n_jobs': -1, 'positive': True}
Time until beating the baseline: 7.128s
*K-Nearest Neighbors* BEAT THE BASELINE of 0.2161!
Better CV score: -0.22497594140997876
Parameter combination: {'copy_X': True, 'fit_intercept': True, 'n_jobs': -1, 'positive': True}
Time until beating the baseline: 9.0518s
100%|██████████| 10/10 [00:10<00:00,  1.01s/trial, best loss: 0.22497594140997876]

######################################################

  hpo_results = hpo_results.append({'regressor_name': regressor_name,


*Random Forest* BEAT THE BASELINE of 0.2161!
Better CV score: -0.5894685244856288
Parameter combination: {'ccp_alpha': 0.2656579177458394, 'max_depth': 14, 'max_features': 'sqrt', 'n_estimators': 189}
Time until beating the baseline: 154.9423s
*Random Forest* BEAT THE BASELINE of 0.2161!
Better CV score: -0.4719666383894933
Parameter combination: {'ccp_alpha': 0.10968727597627398, 'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 63}
Time until beating the baseline: 181.0101s
*Random Forest* BEAT THE BASELINE of 0.2161!
Better CV score: -0.39702165382395616
Parameter combination: {'ccp_alpha': 0.05320255998740314, 'max_depth': 11, 'max_features': 'sqrt', 'n_estimators': 90}
Time until beating the baseline: 653.4424s
100%|██████████| 10/10 [14:48<00:00, 88.87s/trial, best loss: 0.39702165382395616]

######################################################################
Best CV score in 10 iterations: -0.39702165382395616 (653.4424s until found).
MAE on test data: 0.339966802687627

  hpo_results = hpo_results.append({'regressor_name': regressor_name,


*AdaBoost* BEAT THE BASELINE of 0.2161!
Better CV score: -0.3410576819378248
Parameter combination: {'algorithm': 'kd_tree', 'leaf_size': 10, 'n_neighbors': 3, 'p': 3, 'weights': 'uniform'}
Time until beating the baseline: 26.4917s
*AdaBoost* BEAT THE BASELINE of 0.2161!
Better CV score: -0.3126743159772957
Parameter combination: {'algorithm': 'kd_tree', 'leaf_size': 32, 'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
Time until beating the baseline: 537.259s
100%|██████████| 10/10 [15:07<00:00, 90.72s/trial, best loss: 0.3126743159772957]

######################################################################
Best CV score in 10 iterations: -0.3126743159772957 (537.259s until found).
MAE on test data: 0.27484357530416414.

----------------------------------------------------------------------
Using *Linear Regression* for estimation.
----------------------------------------------------------------------
  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

  hpo_results = hpo_results.append({'regressor_name': regressor_name,


*Linear Regression* BEAT THE BASELINE of 0.2161!
Better CV score: -0.2815505160032627
Parameter combination: {'learning_rate': 0.02271680295054018, 'n_estimators': 180}
Time until beating the baseline: 202.246s
100%|██████████| 10/10 [18:51<00:00, 113.17s/trial, best loss: 0.2815505160032627]

######################################################################
Best CV score in 10 iterations: -0.2815505160032627 (202.246s until found).
MAE on test data: 0.22839275989365015.

######################################################################
The duration of the entire HPO pipeline for 4 classifiers across 10 trials each: 
2949.89505 seconds


  hpo_results = hpo_results.append({'regressor_name': regressor_name,


In [None]:
# Add a column which says if the classifier beat the baseline
hpo_results['beats_bl'] = np.where(hpo_results.loc[:,'test_mae'] < best_baseline_score, 'yes', 'no')
hpo_results.to_csv('hpo_results_mae.csv', index=False)
# See the HPO results
hpo_results

Unnamed: 0,regressor_name,best_cv_score,runtime_hpo,best_params,test_mae,beats_bl
0,K-Nearest Neighbors,-0.224976,9.0518,"{'copy_X': True, 'fit_intercept': True, 'n_job...",0.194255,yes
1,Linear Regression,-0.281551,202.246,"{'learning_rate': 0.02271680295054018, 'n_esti...",0.228393,no
2,AdaBoost,-0.312674,537.259,"{'algorithm': 'kd_tree', 'leaf_size': 32, 'n_n...",0.274844,no
3,Random Forest,-0.397022,653.4424,"{'ccp_alpha': 0.05320255998740314, 'max_depth'...",0.339967,no


Run hyperopt with RMSE calculation

In [None]:
# Initialize a dataframe for results collection
hpo_results = pd.DataFrame(columns=['regressor_name', 'best_cv_score', 'runtime_hpo', 'best_params', 'test_rmse'])

# Define the best baseline score
best_baseline_score = 0.3288  # Define your baseline RMSE score here

# Define how many iterations should be done
n_trials = 10

# All individual trials will be saved here
trials_dict = {}

# Start the timer to measure the runtime of the entire pipeline
hpo_time_start = time.time()

# Run the HPO and get the best params for each classifier
for i in range(len(df_best_regression)):

    # To improve the readability of code, creating the following objects:
    regressor_name = df_best_regression.loc[i, 'name']
    regressor_class = regression_models[names.index(regressor_name)]  # fetch from the list not df for baseline
    regressor_search_space = regression_search_spaces[names.index(regressor_name)]

    # To improve the readability of output:
    print()
    print('----------------------------------------------------------------------')
    print(f'Using *{regressor_name}* for estimation.')
    print('----------------------------------------------------------------------')

    # A objective function for receiving the CV scores for each model
    def hyperopt_cv_score(params):
        cv = RepeatedKFold(n_splits=5, n_repeats=5)  # can be also adjusted!
        # Check if 'n_estimators' is in params
        if 'n_estimators' in params:
            # Convert 'n_estimators' to an integer
            params['n_estimators'] = int(params['n_estimators'])
        model = regressor_class.set_params(**params)  # use the classifier from the list
        return cross_val_score(model,
                               X_train_scaled, y_train,
                               cv=cv,
                               scoring='neg_root_mean_squared_error',
                               error_score='raise').mean()

    # A helper function for finding the best model
    def f(params):
        global best_cv_score
        global best_params
        global best_time

        cv_score = hyperopt_cv_score(params)

        if cv_score > best_cv_score:
            # Are we beating the best baseline score?
            if cv_score < best_baseline_score:  # we are beating the baseline accuracy
                best_cv_score = cv_score  # what is the best score?
                best_params = params  # what are the best params?
                best_time = round(time.time() - start_time, 4)  # track how much time it took to find the best params

                print(f'*{regressor_name}* BEAT THE BASELINE of {best_baseline_score}!')
                print(f'Better CV score: {best_cv_score}')
                print(f'Parameter combination: {best_params}')
                print(f'Time until beating the baseline: {best_time}s')
            else:
                best_cv_score = cv_score  # what is the best score?
                best_params = params  # what are the best params?
                best_time = round(time.time() - start_time, 4)  # track how much time it took to find the best params

                # Print the results for reference
                print(f'New best CV score: {best_cv_score}')
                print(f'New best params: {best_params}')
                print(f'Time taken until new best combination found: {round(best_time, 5)}s')

        return {'loss': -cv_score,  # see the comment below
                'status': STATUS_OK}
        # Comment regarding 'negative cv_score' (from the referenced source):
        ## Since we are trying to maximize the CV score (cv_score in the code above),
        ## we must negate this value for hyperopt, since hyperopt only knows how to minimize a function.
        ## Minimizing a function f is the same as maximizing the negative of f.
        ## About FMIN: https://github.com/hyperopt/hyperopt/wiki/FMin

    # Defining global variables to be updated
    best_cv_score = float('-inf')  # best CV score (negative infinity since we want to maximize)
    best_params = None  # best hyperparameter combination
    best_time = 0  # runtime until the best CV score is computed
    trials = Trials()  # store info at each step

    # Start running the algorithm and track time
    start_time = time.time()
    ## Hyperopt function
    best = fmin(f, regressor_search_space,  # use the search space associated with the classifier
                algo=tpe.suggest,
                max_evals=n_trials,  # how many evaluations?
                trials=trials)
    # Save all trials
    trials_dict[regressor_name] = trials  # save into classifier-trials

    print()
    print('######################################################################')
    # Print the summary of the best results
    print(f'Best CV score in {n_trials} iterations: {best_cv_score} ({best_time}s until found).')

    # Compute the accuracy score on the best model of the classifier
    m = regressor_class.set_params(**best_params).fit(X_train_scaled, y_train) if best_params is not None else regressor_class
    predictions = m.predict(X_test_scaled)
    score_test = mean_squared_error(y_test, predictions, squared=False)  # RMSE calculation
    print(f'RMSE on test data: {score_test}.')

    # Append the best results to the df
    hpo_results = hpo_results.append({'regressor_name': regressor_name,
                                      'best_cv_score': best_cv_score,
                                      'runtime_hpo': best_time,
                                      'best_params': best_params,
                                      'test_rmse': score_test}, ignore_index=True)

# Mark the end of the entire pipeline
hpo_time_end = time.time()
print()
print('######################################################################')
print(f'The duration of the entire HPO pipeline for {len(df_best_regression)} classifiers across {n_trials} trials each: ')
print(f'{round(hpo_time_end - hpo_time_start, 5)} seconds')

# Sort the model results by test RMSE and then Runtime
hpo_results = hpo_results.sort_values(['test_rmse', 'runtime_hpo'], ascending=[1, 0]).reset_index(drop=True)



----------------------------------------------------------------------
Using *K-Nearest Neighbors* for estimation.
----------------------------------------------------------------------
*K-Nearest Neighbors* BEAT THE BASELINE of 0.3288!
Better CV score: -0.3623819572264
Parameter combination: {'copy_X': False, 'fit_intercept': True, 'n_jobs': 2, 'positive': False}
Time until beating the baseline: 0.8318s
*K-Nearest Neighbors* BEAT THE BASELINE of 0.3288!
Better CV score: -0.3621116479134369
Parameter combination: {'copy_X': False, 'fit_intercept': True, 'n_jobs': 4, 'positive': False}
Time until beating the baseline: 2.6998s
100%|██████████| 10/10 [00:09<00:00,  1.05trial/s, best loss: 0.3621116479134369]

######################################################################
Best CV score in 10 iterations: -0.3621116479134369 (2.6998s until found).
RMSE on test data: 0.30953487637560506.

----------------------------------------------------------------------
Using *Random Forest* for

  hpo_results = hpo_results.append({'regressor_name': regressor_name,


*Random Forest* BEAT THE BASELINE of 0.3288!
Better CV score: -0.9155612004933602
Parameter combination: {'ccp_alpha': 0.2767482375397266, 'max_depth': 15, 'max_features': 'sqrt', 'n_estimators': 168}
Time until beating the baseline: 164.635s
*Random Forest* BEAT THE BASELINE of 0.3288!
Better CV score: -0.6827584936051031
Parameter combination: {'ccp_alpha': 0.06946614147828717, 'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 146}
Time until beating the baseline: 439.5392s
*Random Forest* BEAT THE BASELINE of 0.3288!
Better CV score: -0.4928853699600972
Parameter combination: {'ccp_alpha': 0.009529619928259236, 'max_depth': 19, 'max_features': 'log2', 'n_estimators': 54}
Time until beating the baseline: 818.1451s
100%|██████████| 10/10 [16:55<00:00, 101.56s/trial, best loss: 0.4928853699600972]

######################################################################
Best CV score in 10 iterations: -0.4928853699600972 (818.1451s until found).
RMSE on test data: 1.431592455506398

  hpo_results = hpo_results.append({'regressor_name': regressor_name,


*AdaBoost* BEAT THE BASELINE of 0.3288!
Better CV score: -0.5460595112423507
Parameter combination: {'algorithm': 'brute', 'leaf_size': 23, 'n_neighbors': 13, 'p': 3, 'weights': 'uniform'}
Time until beating the baseline: 147.9142s
*AdaBoost* BEAT THE BASELINE of 0.3288!
Better CV score: -0.5121827898958767
Parameter combination: {'algorithm': 'auto', 'leaf_size': 26, 'n_neighbors': 8, 'p': 1, 'weights': 'distance'}
Time until beating the baseline: 157.7449s
*AdaBoost* BEAT THE BASELINE of 0.3288!
Better CV score: -0.5101716785242867
Parameter combination: {'algorithm': 'ball_tree', 'leaf_size': 22, 'n_neighbors': 6, 'p': 1, 'weights': 'distance'}
Time until beating the baseline: 176.9758s
*AdaBoost* BEAT THE BASELINE of 0.3288!
Better CV score: -0.5081905124983025
Parameter combination: {'algorithm': 'ball_tree', 'leaf_size': 23, 'n_neighbors': 4, 'p': 3, 'weights': 'uniform'}
Time until beating the baseline: 342.7654s
100%|██████████| 10/10 [08:28<00:00, 50.85s/trial, best loss: 0.50

  hpo_results = hpo_results.append({'regressor_name': regressor_name,


*Linear Regression* BEAT THE BASELINE of 0.3288!
Better CV score: -0.5154790913613229
Parameter combination: {'learning_rate': 0.40271124032392824, 'n_estimators': 170}
Time until beating the baseline: 112.1502s
*Linear Regression* BEAT THE BASELINE of 0.3288!
Better CV score: -0.43275664917248485
Parameter combination: {'learning_rate': 0.19029926735240615, 'n_estimators': 110}
Time until beating the baseline: 207.3689s
*Linear Regression* BEAT THE BASELINE of 0.3288!
Better CV score: -0.4194773496872971
Parameter combination: {'learning_rate': 0.1040620814897169, 'n_estimators': 60}
Time until beating the baseline: 352.1191s
100%|██████████| 10/10 [14:25<00:00, 86.59s/trial, best loss: 0.4194773496872971]

######################################################################
Best CV score in 10 iterations: -0.4194773496872971 (352.1191s until found).
RMSE on test data: 1.6931108066437375.

######################################################################
The duration of the ent

  hpo_results = hpo_results.append({'regressor_name': regressor_name,


In [None]:
# Add a column which says if the classifier beat the baseline
hpo_results['beats_bl'] = np.where(hpo_results.loc[:,'test_rmse'] < best_baseline_score, 'yes', 'no')
hpo_results.to_csv('hpo_results_rmse.csv', index=False)
# See the HPO results
hpo_results

Unnamed: 0,regressor_name,best_cv_score,runtime_hpo,best_params,test_rmse,beats_bl
0,K-Nearest Neighbors,-0.362112,2.6998,"{'copy_X': False, 'fit_intercept': True, 'n_jo...",0.309535,yes
1,Random Forest,-0.492885,818.1451,"{'ccp_alpha': 0.009529619928259236, 'max_depth...",1.431592,no
2,AdaBoost,-0.508191,342.7654,"{'algorithm': 'ball_tree', 'leaf_size': 23, 'n...",1.531155,no
3,Linear Regression,-0.419477,352.1191,"{'learning_rate': 0.1040620814897169, 'n_estim...",1.693111,no


### Testing hyperopt with KNN for full dataset

Reload the full dataset and prepare for modelling

In [None]:
df = pd.read_csv('df_prepped.csv')
df.shape

(32330, 50)

In [None]:
df = df.drop(['Countries','Farm'], axis=1)

In [None]:
df_test = df[df.Year == 2016]
df_train = df[df.Year < 2016]

In [None]:
# X and y
X_train = df_train.drop(columns=['Y_maize_major','Year'], axis=1)
y_train = df_train['Y_maize_major']
X_test = df_test.drop(columns=['Y_maize_major','Year'], axis=1)
y_test = df_test['Y_maize_major']

# Scale to [0,1] range
sc = MinMaxScaler()
X_train_scaled = pd.DataFrame(sc.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(sc.transform(X_test), columns=X_test.columns)

In [None]:
names = ['K-Nearest Neighbors']
data = {'name': names}
df_best_regression = pd.DataFrame(data)

Run hyperopt with MAE

In [None]:
# Initialize a dataframe for results collection
hpo_results = pd.DataFrame(columns=['regressor_name', 'best_cv_score', 'runtime_hpo', 'best_params', 'test_mae'])

# Define the best baseline score
best_baseline_score = 0.2161 # Define your baseline MAE score here

# Define how many iterations should be done
n_trials = 200

# All individual trials will be saved here
trials_dict = {}

# Start the timer to measure the runtime of the entire pipeline
hpo_time_start = time.time()

# Run the HPO and get the best params for each classifier
for i in range(len(df_best_regression)):

    # To improve the readability of code, creating the following objects:
    regressor_name = df_best_regression.loc[i, 'name']
    regressor_class = regression_models[names.index(regressor_name)]  # fetch from the list not df for baseline
    regressor_search_space = regression_search_spaces[names.index(regressor_name)]

    # To improve the readability of output:
    print()
    print('----------------------------------------------------------------------')
    print(f'Using *{regressor_name}* for estimation.')
    print('----------------------------------------------------------------------')

    # A objective function for receiving the CV scores for each model
    def hyperopt_cv_score(params):
        cv = RepeatedKFold(n_splits=5, n_repeats=5)  # can be also adjusted!
        # Check if 'n_estimators' is in params
        if 'n_estimators' in params:
            # Convert 'n_estimators' to an integer
            params['n_estimators'] = int(params['n_estimators'])
        model = regressor_class.set_params(**params)  # use the classifier from the list
        return cross_val_score(model,
                               X_train_scaled, y_train,
                               cv=cv,
                               scoring='neg_mean_absolute_error',
                               error_score='raise').mean()

    # A helper function for finding the best model
    def f(params):
        global best_cv_score
        global best_params
        global best_time

        cv_score = hyperopt_cv_score(params)

        if cv_score > best_cv_score:
            # Are we beating the best baseline score?
            if cv_score < best_baseline_score:  # we are beating the baseline accuracy
                best_cv_score = cv_score  # what is the best score?
                best_params = params  # what are the best params?
                best_time = round(time.time() - start_time, 4)  # track how much time it took to find the best params

                print(f'*{regressor_name}* BEAT THE BASELINE of {best_baseline_score}!')
                print(f'Better CV score: {best_cv_score}')
                print(f'Parameter combination: {best_params}')
                print(f'Time until beating the baseline: {best_time}s')
            else:
                best_cv_score = cv_score  # what is the best score?
                best_params = params  # what are the best params?
                best_time = round(time.time() - start_time, 4)  # track how much time it took to find the best params

                # Print the results for reference
                print(f'New best CV score: {best_cv_score}')
                print(f'New best params: {best_params}')
                print(f'Time taken until new best combination found: {round(best_time, 5)}s')

        return {'loss': -cv_score,  # see the comment below
                'status': STATUS_OK}
        # Comment regarding 'negative cv_score' (from the referenced source):
        ## Since we are trying to maximize the CV score (cv_score in the code above),
        ## we must negate this value for hyperopt, since hyperopt only knows how to minimize a function.
        ## Minimizing a function f is the same as maximizing the negative of f.
        ## About FMIN: https://github.com/hyperopt/hyperopt/wiki/FMin

    # Defining global variables to be updated
    best_cv_score = float('-inf')  # best CV score (negative infinity since we want to maximize)
    best_params = None  # best hyperparameter combination
    best_time = 0  # runtime until the best CV score is computed
    trials = Trials()  # store info at each step

    # Start running the algorithm and track time
    start_time = time.time()
    ## Hyperopt function
    best = fmin(f, regressor_search_space,  # use the search space associated with the classifier
                algo=tpe.suggest,
                max_evals=n_trials,  # how many evaluations?
                trials=trials)
    # Save all trials
    trials_dict[regressor_name] = trials  # save into classifier-trials

    print()
    print('######################################################################')
    # Print the summary of the best results
    print(f'Best CV score in {n_trials} iterations: {best_cv_score} ({best_time}s until found).')

    # Compute the accuracy score on the best model of the classifier
    m = regressor_class.set_params(**best_params).fit(X_train_scaled, y_train) if best_params is not None else regressor_class
    predictions = m.predict(X_test_scaled)
    score_test = mean_absolute_error(y_test, predictions)
    print(f'MAE on test data: {score_test}.')

    # Append the best results to the df
    hpo_results = hpo_results.append({'regressor_name': regressor_name,
                                      'best_cv_score': best_cv_score,
                                      'runtime_hpo': best_time,
                                      'best_params': best_params,
                                      'test_mae': score_test}, ignore_index=True)

# Mark the end of the entire pipeline
hpo_time_end = time.time()
print()
print('######################################################################')
print(f'The duration of the entire HPO pipeline for {len(df_best_regression)} classifiers across {n_trials} trials each: ')
print(f'{round(hpo_time_end - hpo_time_start, 5)} seconds')

# Sort the model results by test MAE and then Runtime
hpo_results = hpo_results.sort_values(['test_mae', 'runtime_hpo'], ascending=[1, 0]).reset_index(drop=True)



----------------------------------------------------------------------
Using *K-Nearest Neighbors* for estimation.
----------------------------------------------------------------------
*K-Nearest Neighbors* BEAT THE BASELINE of 0.2161!
Better CV score: -0.23053169437032356
Parameter combination: {'copy_X': True, 'fit_intercept': True, 'n_jobs': 1, 'positive': False}
Time until beating the baseline: 3.8179s
*K-Nearest Neighbors* BEAT THE BASELINE of 0.2161!
Better CV score: -0.22885334081016126
Parameter combination: {'copy_X': False, 'fit_intercept': True, 'n_jobs': 2, 'positive': True}
Time until beating the baseline: 7.8616s
*K-Nearest Neighbors* BEAT THE BASELINE of 0.2161!
Better CV score: -0.22884628832292864
Parameter combination: {'copy_X': True, 'fit_intercept': True, 'n_jobs': -1, 'positive': True}
Time until beating the baseline: 14.4906s
*K-Nearest Neighbors* BEAT THE BASELINE of 0.2161!
Better CV score: -0.22882365345629654
Parameter combination: {'copy_X': True, 'fit_int

  hpo_results = hpo_results.append({'regressor_name': regressor_name,


In [None]:
# Add a column which says if the classifier beat the baseline
hpo_results['beats_bl'] = np.where(hpo_results.loc[:,'test_mae'] < best_baseline_score, 'yes', 'no')
hpo_results.to_csv('hpo_results_mae_knn.csv', index=False)
# See the HPO results
hpo_results

Unnamed: 0,regressor_name,best_cv_score,runtime_hpo,best_params,test_mae,beats_bl
0,K-Nearest Neighbors,-0.228814,418.8338,"{'copy_X': False, 'fit_intercept': True, 'n_jo...",0.199428,yes


Run hyperopt with RMSE

In [None]:
# Initialize a dataframe for results collection
hpo_results = pd.DataFrame(columns=['regressor_name', 'best_cv_score', 'runtime_hpo', 'best_params', 'test_rmse'])

# Define the best baseline score
best_baseline_score = 0.3288  # Define your baseline RMSE score here

# Define how many iterations should be done
n_trials = 200

# All individual trials will be saved here
trials_dict = {}

# Start the timer to measure the runtime of the entire pipeline
hpo_time_start = time.time()

# Run the HPO and get the best params for each classifier
for i in range(len(df_best_regression)):

    # To improve the readability of code, creating the following objects:
    regressor_name = df_best_regression.loc[i, 'name']
    regressor_class = regression_models[names.index(regressor_name)]  # fetch from the list not df for baseline
    regressor_search_space = regression_search_spaces[names.index(regressor_name)]

    # To improve the readability of output:
    print()
    print('----------------------------------------------------------------------')
    print(f'Using *{regressor_name}* for estimation.')
    print('----------------------------------------------------------------------')

    # A objective function for receiving the CV scores for each model
    def hyperopt_cv_score(params):
        cv = RepeatedKFold(n_splits=5, n_repeats=5)  # can be also adjusted!
        # Check if 'n_estimators' is in params
        if 'n_estimators' in params:
            # Convert 'n_estimators' to an integer
            params['n_estimators'] = int(params['n_estimators'])
        model = regressor_class.set_params(**params)  # use the classifier from the list
        return cross_val_score(model,
                               X_train_scaled, y_train,
                               cv=cv,
                               scoring='neg_root_mean_squared_error',
                               error_score='raise').mean()

    # A helper function for finding the best model
    def f(params):
        global best_cv_score
        global best_params
        global best_time

        cv_score = hyperopt_cv_score(params)

        if cv_score > best_cv_score:
            # Are we beating the best baseline score?
            if cv_score < best_baseline_score:  # we are beating the baseline accuracy
                best_cv_score = cv_score  # what is the best score?
                best_params = params  # what are the best params?
                best_time = round(time.time() - start_time, 4)  # track how much time it took to find the best params

                print(f'*{regressor_name}* BEAT THE BASELINE of {best_baseline_score}!')
                print(f'Better CV score: {best_cv_score}')
                print(f'Parameter combination: {best_params}')
                print(f'Time until beating the baseline: {best_time}s')
            else:
                best_cv_score = cv_score  # what is the best score?
                best_params = params  # what are the best params?
                best_time = round(time.time() - start_time, 4)  # track how much time it took to find the best params

                # Print the results for reference
                print(f'New best CV score: {best_cv_score}')
                print(f'New best params: {best_params}')
                print(f'Time taken until new best combination found: {round(best_time, 5)}s')

        return {'loss': -cv_score,  # see the comment below
                'status': STATUS_OK}
        # Comment regarding 'negative cv_score' (from the referenced source):
        ## Since we are trying to maximize the CV score (cv_score in the code above),
        ## we must negate this value for hyperopt, since hyperopt only knows how to minimize a function.
        ## Minimizing a function f is the same as maximizing the negative of f.
        ## About FMIN: https://github.com/hyperopt/hyperopt/wiki/FMin

    # Defining global variables to be updated
    best_cv_score = float('-inf')  # best CV score (negative infinity since we want to maximize)
    best_params = None  # best hyperparameter combination
    best_time = 0  # runtime until the best CV score is computed
    trials = Trials()  # store info at each step

    # Start running the algorithm and track time
    start_time = time.time()
    ## Hyperopt function
    best = fmin(f, regressor_search_space,  # use the search space associated with the classifier
                algo=tpe.suggest,
                max_evals=n_trials,  # how many evaluations?
                trials=trials)
    # Save all trials
    trials_dict[regressor_name] = trials  # save into classifier-trials

    print()
    print('######################################################################')
    # Print the summary of the best results
    print(f'Best CV score in {n_trials} iterations: {best_cv_score} ({best_time}s until found).')

    # Compute the accuracy score on the best model of the classifier
    m = regressor_class.set_params(**best_params).fit(X_train_scaled, y_train) if best_params is not None else regressor_class
    predictions = m.predict(X_test_scaled)
    score_test = mean_squared_error(y_test, predictions, squared=False)  # RMSE calculation
    print(f'RMSE on test data: {score_test}.')

    # Append the best results to the df
    hpo_results = hpo_results.append({'regressor_name': regressor_name,
                                      'best_cv_score': best_cv_score,
                                      'runtime_hpo': best_time,
                                      'best_params': best_params,
                                      'test_rmse': score_test}, ignore_index=True)

# Mark the end of the entire pipeline
hpo_time_end = time.time()
print()
print('######################################################################')
print(f'The duration of the entire HPO pipeline for {len(df_best_regression)} classifiers across {n_trials} trials each: ')
print(f'{round(hpo_time_end - hpo_time_start, 5)} seconds')

# Sort the model results by test RMSE and then Runtime
hpo_results = hpo_results.sort_values(['test_rmse', 'runtime_hpo'], ascending=[1, 0]).reset_index(drop=True)



----------------------------------------------------------------------
Using *K-Nearest Neighbors* for estimation.
----------------------------------------------------------------------
*K-Nearest Neighbors* BEAT THE BASELINE of 0.3288!
Better CV score: -0.3717601898055275
Parameter combination: {'copy_X': True, 'fit_intercept': True, 'n_jobs': 1, 'positive': False}
Time until beating the baseline: 3.3438s
*K-Nearest Neighbors* BEAT THE BASELINE of 0.3288!
Better CV score: -0.371648099863865
Parameter combination: {'copy_X': True, 'fit_intercept': True, 'n_jobs': 4, 'positive': False}
Time until beating the baseline: 29.9271s
*K-Nearest Neighbors* BEAT THE BASELINE of 0.3288!
Better CV score: -0.3716358693381217
Parameter combination: {'copy_X': True, 'fit_intercept': True, 'n_jobs': -1, 'positive': False}
Time until beating the baseline: 227.98s
*K-Nearest Neighbors* BEAT THE BASELINE of 0.3288!
Better CV score: -0.3716041901317964
Parameter combination: {'copy_X': True, 'fit_interce

  hpo_results = hpo_results.append({'regressor_name': regressor_name,


In [None]:
# Add a column which says if the classifier beat the baseline
hpo_results['beats_bl'] = np.where(hpo_results.loc[:,'test_rmse'] < best_baseline_score, 'yes', 'no')
hpo_results.to_csv('hpo_results_rmse_knn.csv', index=False)
# See the HPO results
hpo_results

Unnamed: 0,regressor_name,best_cv_score,runtime_hpo,best_params,test_rmse,beats_bl
0,K-Nearest Neighbors,-0.371604,370.4984,"{'copy_X': True, 'fit_intercept': True, 'n_job...",1.649738,no


Reference: https://github.com/qetdr/xAutoML-Project1/blob/main/project1_notebook.ipynb