## Libraries

In [None]:
import time
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
import lightgbm as lgb

from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

## Data preparation

Read in data from prepared file.

In [None]:
# Render every column when a frame is displayed.
pd.set_option("display.max_columns", None)

# Load the prepared dataset from the working directory and show it.
df = pd.read_csv("df_prepped.csv")
df

Unnamed: 0,Year,Countries,Sand_1,Sand_2,Sand_3,Sand_4,Sand_5,Sand_6,Sand_7,Clay_1,Clay_2,Clay_3,Clay_4,Clay_5,Clay_6,Clay_7,OC_1,OC_2,OC_3,OC_4,OC_5,OC_6,OC_7,PAW_1,PAW_2,PAW_3,PAW_4,PAW_5,PAW_6,PAW_7,Y_maize_major,Farm,Sow_Maize_month_int,Harvest_Maize_month_int,sow_to_harvest_months,maize_lag-1,pcp_mean_lag-1,tmax_mean_lag-1,tmin_mean_lag-1,spi_mean_lag-1,maize_lag-2,pcp_mean_lag-2,tmax_mean_lag-2,tmin_mean_lag-2,spi_mean_lag-2,maize_lag-3,pcp_mean_lag-3,tmax_mean_lag-3,tmin_mean_lag-3,spi_mean_lag-3
0,2007,Angola,50,51,51,48,45,46,46,37,35,36,39,42,42,42,0.52,0.23,0.17,0.09,0.04,0.02,0.02,0.15,0.15,0.14,0.13,0.10,0.07,0.07,0.615357,104_Angola,9,4,7,0.554392,97.103755,301.939623,292.214020,0.093447,0.721607,129.051864,301.518536,292.496579,1.644698,0.620005,109.983325,301.786056,292.204097,0.514275
1,2007,Angola,62,64,63,59,58,59,59,27,25,26,29,31,30,30,0.11,0.05,0.07,0.04,0.02,0.02,0.01,0.11,0.10,0.10,0.09,0.07,0.07,0.03,0.257656,99_Angola,9,4,7,0.117051,59.292237,301.882929,288.092753,0.182926,0.300217,47.697564,303.988747,288.916992,0.909295,0.212699,41.130026,303.298082,288.642853,0.588172
2,2007,Angola,69,71,70,67,65,65,66,19,16,18,21,24,24,23,0.09,0.06,0.07,0.04,0.02,0.02,0.02,0.10,0.10,0.10,0.09,0.07,0.07,0.07,4.286831,108_Angola,9,4,7,3.093239,58.196545,302.891420,289.377311,0.991663,4.044452,42.130629,305.494178,290.535403,0.952237,2.295351,35.049776,304.824778,290.284886,0.371446
3,2007,Angola,60,63,61,57,53,53,53,29,26,28,32,35,36,36,0.46,0.16,0.14,0.08,0.05,0.04,0.03,0.12,0.13,0.12,0.12,0.11,0.10,0.09,0.700384,102_Angola,9,4,7,0.677797,149.210195,298.973795,287.311403,0.206751,0.907431,159.454723,299.404975,287.724299,1.374616,0.783018,174.088260,298.908208,287.362407,0.643207
4,2007,Angola,67,69,68,63,61,61,61,22,19,21,25,28,28,29,0.15,0.09,0.09,0.05,0.02,0.01,0.01,0.11,0.11,0.11,0.11,0.08,0.04,0.04,0.553450,43_Angola,9,4,7,0.412071,74.556629,304.006860,290.606725,-0.075621,0.675967,66.698670,304.644632,290.635254,1.144088,0.605584,67.404588,303.930955,290.564185,0.553079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32325,2016,Zimbabwe,73,75,74,69,66,66,66,20,18,20,24,27,27,27,0.10,0.08,0.08,0.04,0.02,0.01,0.01,0.09,0.10,0.09,0.09,0.07,0.04,0.04,0.674080,3874_Zimbabwe,5,12,7,0.841480,25.249322,302.485864,291.402194,0.213789,0.926596,60.841103,301.103214,290.648780,0.666820,0.767325,56.355874,301.345864,290.478016,0.378187
32326,2016,Zimbabwe,57,58,58,54,52,51,52,28,26,26,31,33,34,33,0.14,0.12,0.10,0.05,0.03,0.01,0.01,0.13,0.14,0.13,0.11,0.09,0.03,0.03,0.707797,3875_Zimbabwe,5,12,7,0.685066,62.751591,299.755546,287.893781,0.412781,1.038142,62.393069,299.616883,288.078306,-0.008399,0.830597,89.248975,299.499617,287.620716,1.717135
32327,2016,Zimbabwe,55,57,56,54,52,51,51,30,28,30,32,34,35,35,0.53,0.41,0.30,0.22,0.18,0.16,0.15,0.18,0.19,0.17,0.15,0.14,0.14,0.14,0.805844,3859_Zimbabwe,5,12,7,0.481472,71.453830,296.963034,286.109140,0.004455,1.242612,129.128337,296.285672,286.086518,0.809915,0.994018,126.344999,296.054577,285.788589,0.512137
32328,2016,Zimbabwe,70,72,71,67,63,63,62,17,14,15,20,24,24,24,0.14,0.09,0.10,0.06,0.04,0.03,0.03,0.10,0.10,0.10,0.10,0.09,0.09,0.08,0.595883,3871_Zimbabwe,5,12,7,0.880191,74.755430,297.273136,285.907031,0.847040,0.720637,91.456254,297.102164,285.640060,1.451785,1.000904,62.046051,297.754080,285.755915,0.087132


In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(32330, 50)

The categorical attributes `Countries` and `Farm` are dropped as well.

In [None]:
# Remove the two categorical columns before modelling, then display the result.
df = df.drop(columns=['Countries', 'Farm'])
df

Unnamed: 0,Year,Sand_1,Sand_2,Sand_3,Sand_4,Sand_5,Sand_6,Sand_7,Clay_1,Clay_2,Clay_3,Clay_4,Clay_5,Clay_6,Clay_7,OC_1,OC_2,OC_3,OC_4,OC_5,OC_6,OC_7,PAW_1,PAW_2,PAW_3,PAW_4,PAW_5,PAW_6,PAW_7,Y_maize_major,Sow_Maize_month_int,Harvest_Maize_month_int,sow_to_harvest_months,maize_lag-1,pcp_mean_lag-1,tmax_mean_lag-1,tmin_mean_lag-1,spi_mean_lag-1,maize_lag-2,pcp_mean_lag-2,tmax_mean_lag-2,tmin_mean_lag-2,spi_mean_lag-2,maize_lag-3,pcp_mean_lag-3,tmax_mean_lag-3,tmin_mean_lag-3,spi_mean_lag-3
0,2007,50,51,51,48,45,46,46,37,35,36,39,42,42,42,0.52,0.23,0.17,0.09,0.04,0.02,0.02,0.15,0.15,0.14,0.13,0.10,0.07,0.07,0.615357,9,4,7,0.554392,97.103755,301.939623,292.214020,0.093447,0.721607,129.051864,301.518536,292.496579,1.644698,0.620005,109.983325,301.786056,292.204097,0.514275
1,2007,62,64,63,59,58,59,59,27,25,26,29,31,30,30,0.11,0.05,0.07,0.04,0.02,0.02,0.01,0.11,0.10,0.10,0.09,0.07,0.07,0.03,0.257656,9,4,7,0.117051,59.292237,301.882929,288.092753,0.182926,0.300217,47.697564,303.988747,288.916992,0.909295,0.212699,41.130026,303.298082,288.642853,0.588172
2,2007,69,71,70,67,65,65,66,19,16,18,21,24,24,23,0.09,0.06,0.07,0.04,0.02,0.02,0.02,0.10,0.10,0.10,0.09,0.07,0.07,0.07,4.286831,9,4,7,3.093239,58.196545,302.891420,289.377311,0.991663,4.044452,42.130629,305.494178,290.535403,0.952237,2.295351,35.049776,304.824778,290.284886,0.371446
3,2007,60,63,61,57,53,53,53,29,26,28,32,35,36,36,0.46,0.16,0.14,0.08,0.05,0.04,0.03,0.12,0.13,0.12,0.12,0.11,0.10,0.09,0.700384,9,4,7,0.677797,149.210195,298.973795,287.311403,0.206751,0.907431,159.454723,299.404975,287.724299,1.374616,0.783018,174.088260,298.908208,287.362407,0.643207
4,2007,67,69,68,63,61,61,61,22,19,21,25,28,28,29,0.15,0.09,0.09,0.05,0.02,0.01,0.01,0.11,0.11,0.11,0.11,0.08,0.04,0.04,0.553450,9,4,7,0.412071,74.556629,304.006860,290.606725,-0.075621,0.675967,66.698670,304.644632,290.635254,1.144088,0.605584,67.404588,303.930955,290.564185,0.553079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32325,2016,73,75,74,69,66,66,66,20,18,20,24,27,27,27,0.10,0.08,0.08,0.04,0.02,0.01,0.01,0.09,0.10,0.09,0.09,0.07,0.04,0.04,0.674080,5,12,7,0.841480,25.249322,302.485864,291.402194,0.213789,0.926596,60.841103,301.103214,290.648780,0.666820,0.767325,56.355874,301.345864,290.478016,0.378187
32326,2016,57,58,58,54,52,51,52,28,26,26,31,33,34,33,0.14,0.12,0.10,0.05,0.03,0.01,0.01,0.13,0.14,0.13,0.11,0.09,0.03,0.03,0.707797,5,12,7,0.685066,62.751591,299.755546,287.893781,0.412781,1.038142,62.393069,299.616883,288.078306,-0.008399,0.830597,89.248975,299.499617,287.620716,1.717135
32327,2016,55,57,56,54,52,51,51,30,28,30,32,34,35,35,0.53,0.41,0.30,0.22,0.18,0.16,0.15,0.18,0.19,0.17,0.15,0.14,0.14,0.14,0.805844,5,12,7,0.481472,71.453830,296.963034,286.109140,0.004455,1.242612,129.128337,296.285672,286.086518,0.809915,0.994018,126.344999,296.054577,285.788589,0.512137
32328,2016,70,72,71,67,63,63,62,17,14,15,20,24,24,24,0.14,0.09,0.10,0.06,0.04,0.03,0.03,0.10,0.10,0.10,0.10,0.09,0.09,0.08,0.595883,5,12,7,0.880191,74.755430,297.273136,285.907031,0.847040,0.720637,91.456254,297.102164,285.640060,1.451785,1.000904,62.046051,297.754080,285.755915,0.087132


Create train and test sets, then subsample each to 10% of its rows for faster testing.

In [None]:
# Temporal split: all years before 2016 are used for training,
# the year 2016 is held out as the test set.
df_train = df.loc[df['Year'] < 2016]
df_test = df.loc[df['Year'] == 2016]

In [None]:
# Keep a seeded random tenth of each split so the demo runs quickly.
subsample_size_train = len(df_train) // 10
subsample_size_test = len(df_test) // 10

df_train = df_train.sample(n=subsample_size_train, random_state=0)
df_test = df_test.sample(n=subsample_size_test, random_state=0)

In [None]:
# (rows, columns) of the subsampled test and train frames, in that order.
df_test.shape, df_train.shape

((293, 48), (2940, 48))

## Hyperopt

### Testing hyperopt on subsample with small number of evaluations

Select regressors

In [None]:
# Display names of the regressors to tune; the order matters because the
# model list and the search-space list are looked up by position in `names`.
names = [
    'Linear Regression',
    'Random Forest',
    'K-Nearest Neighbors',
    'AdaBoost',
]
data = dict(name=names)
# One row per regressor, with the display name in the 'name' column.
df_best_regression = pd.DataFrame(data)

In [None]:
# One estimator instance per entry of `names`, in the same order.
# The two randomized ensembles get a fixed seed so that repeated runs of the
# search give comparable scores; the other two estimators take no seed.
regression_models = [
    LinearRegression(),
    RandomForestRegressor(random_state=0),
    KNeighborsRegressor(),
    AdaBoostRegressor(random_state=0),
]

Define search spaces

In [None]:
# Define individual search spaces manually.
# The list is positional: entry k belongs to the k-th name in `names`.
# The "defaults" comments record each estimator's default parameters for reference.
regression_search_spaces = [
    # Linear Regression defaults: {'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}
    {
        'fit_intercept': hp.choice('fit_intercept', [True, False]),
        # Pinned instead of searched: copy_X=False allows scikit-learn to overwrite
        # the input matrix in place, which would corrupt the shared training data
        # for every regressor tuned afterwards. It does not change the fitted model,
        # so there is nothing to gain from tuning it.
        'copy_X': True,
        'n_jobs': hp.choice('n_jobs', [-1, 1, 2, 4]),  # Adjust the choices based on the available resources
        'positive': hp.choice('positive', [True, False])
    },

    # Random Forest defaults: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0,
    #                          'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
    {
        'ccp_alpha': hp.uniform('ccp_alpha', 0.0, 0.5),
        'max_depth': hp.choice('max_depth', range(1, 20)),
        'max_features': hp.choice('max_features', ['sqrt', 'log2']),
        'n_estimators': hp.choice('n_estimators', range(50, 200, 1)),
    },

    # K-Nearest Neighbors defaults: {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
    {
        'n_neighbors': hp.choice('n_neighbors', range(2, 20, 1)),
        # NOTE(review): the default p=2 is not among the choices - confirm this is intended.
        'p': hp.choice('p', [1, 3]),
        'weights': hp.choice('weights', ['uniform', 'distance']),
        'algorithm': hp.choice('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
        'leaf_size': hp.choice('leaf_size', range(10, 40, 1)),
    },

    # AdaBoost defaults: {'base_estimator': 'deprecated', 'estimator': None, 'learning_rate': 1.0, 'loss': 'linear', 'n_estimators': 50, 'random_state': None}
    {
        # quniform yields floats; the objective casts n_estimators to int before use.
        'n_estimators': hp.quniform('n_estimators', 50, 200, 10),
        'learning_rate': hp.loguniform('learning_rate', -4, 0),
    },

]


Cross validation preparation

In [None]:
# Target: the maize yield column. Features: every remaining column except
# the target itself and 'Year' (which was only needed for the split).
y_train = df_train['Y_maize_major']
y_test = df_test['Y_maize_major']
X_train = df_train.drop(columns=['Y_maize_major', 'Year'])
X_test = df_test.drop(columns=['Y_maize_major', 'Year'])

# Min-max scaling learned on the training features only and then applied to
# both splits, so test values can fall outside [0, 1].
sc = MinMaxScaler().fit(X_train)
X_train_scaled = pd.DataFrame(sc.transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(sc.transform(X_test), columns=X_test.columns)

Run hyperopt with MAE calculation

In [None]:
# Per-regressor result records; converted into the `hpo_results` frame after the loop
# (DataFrame.append is deprecated and no longer exists in pandas 2.x)
hpo_records = []

# Define the best baseline score (an MAE, so lower is better)
best_baseline_score = 0.2161 # Define your baseline MAE score here

# Define how many iterations should be done
n_trials = 10

# All individual trials will be saved here
trials_dict = {}

# Start the timer to measure the runtime of the entire pipeline
hpo_time_start = time.time()

# Run the HPO and get the best params for each regressor
for i in range(len(df_best_regression)):

    # To improve the readability of code, creating the following objects:
    regressor_name = df_best_regression.loc[i, 'name']
    regressor_class = regression_models[names.index(regressor_name)]  # shared estimator instance from the list
    regressor_search_space = regression_search_spaces[names.index(regressor_name)]

    # To improve the readability of output:
    print()
    print('----------------------------------------------------------------------')
    print(f'Using *{regressor_name}* for estimation.')
    print('----------------------------------------------------------------------')

    # An objective function for receiving the CV scores for each model
    def hyperopt_cv_score(params):
        """Return the mean negated MAE of the current regressor under repeated 5-fold CV.

        `params` is one sampled point of the search space; it is applied to the
        shared estimator instance via `set_params`. Higher (closer to 0) is better.
        """
        # Fixed seed: every trial is scored on the same splits, so differences
        # between trials come from the parameters and not from the partitioning.
        cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=0)  # can be also adjusted!
        # hp.quniform samples floats, but the estimators need an integer count
        if 'n_estimators' in params:
            params['n_estimators'] = int(params['n_estimators'])
        model = regressor_class.set_params(**params)  # use the regressor from the list
        return cross_val_score(model,
                               X_train_scaled, y_train,
                               cv=cv,
                               scoring='neg_mean_absolute_error',
                               error_score='raise').mean()

    # A helper function for finding the best model
    def f(params):
        """hyperopt objective: score `params`, track the best trial, return the loss."""
        global best_cv_score
        global best_params
        global best_time

        cv_score = hyperopt_cv_score(params)

        if cv_score > best_cv_score:
            best_cv_score = cv_score  # what is the best score?
            best_params = params  # what are the best params?
            best_time = round(time.time() - start_time, 4)  # track how much time it took to find the best params

            # cv_score is the NEGATED MAE, so it is the error itself (-cv_score)
            # that has to be smaller than the baseline error.
            if -cv_score < best_baseline_score:
                print(f'*{regressor_name}* BEAT THE BASELINE of {best_baseline_score}!')
                print(f'Better CV score: {best_cv_score}')
                print(f'Parameter combination: {best_params}')
                print(f'Time until beating the baseline: {best_time}s')
            else:
                # Print the results for reference
                print(f'New best CV score: {best_cv_score}')
                print(f'New best params: {best_params}')
                print(f'Time taken until new best combination found: {round(best_time, 5)}s')

        # hyperopt only minimizes, so the score to maximize (negated MAE) is
        # negated once more; the reported loss is therefore the plain MAE.
        # About FMIN: https://github.com/hyperopt/hyperopt/wiki/FMin
        return {'loss': -cv_score,
                'status': STATUS_OK}

    # Defining global variables to be updated
    best_cv_score = float('-inf')  # best CV score (negative infinity since we want to maximize)
    best_params = None  # best hyperparameter combination
    best_time = 0  # runtime until the best CV score is computed
    trials = Trials()  # store info at each step

    # Start running the algorithm and track time
    start_time = time.time()
    ## Hyperopt function
    best = fmin(f, regressor_search_space,  # use the search space associated with the regressor
                algo=tpe.suggest,
                max_evals=n_trials,  # how many evaluations?
                trials=trials)
    # Save all trials
    trials_dict[regressor_name] = trials  # save into regressor-trials

    print()
    print('######################################################################')
    # Print the summary of the best results
    print(f'Best CV score in {n_trials} iterations: {best_cv_score} ({best_time}s until found).')

    # Refit the regressor with the best parameters and compute the MAE on the test set.
    # If no trial was recorded, the estimator is fitted with its current parameters
    # instead of being used unfitted.
    if best_params is not None:
        regressor_class.set_params(**best_params)
    # Fit on a copy: an estimator configured with copy_X=False may overwrite its
    # input, which would silently alter the data seen by the following regressors.
    m = regressor_class.fit(X_train_scaled.copy(), y_train)
    predictions = m.predict(X_test_scaled)
    score_test = mean_absolute_error(y_test, predictions)
    print(f'MAE on test data: {score_test}.')

    # Record the best results for this regressor
    hpo_records.append({'regressor_name': regressor_name,
                        'best_cv_score': best_cv_score,
                        'runtime_hpo': best_time,
                        'best_params': best_params,
                        'test_mae': score_test})

# Mark the end of the entire pipeline
hpo_time_end = time.time()
print()
print('######################################################################')
print(f'The duration of the entire HPO pipeline for {len(df_best_regression)} classifiers across {n_trials} trials each: ')
print(f'{round(hpo_time_end - hpo_time_start, 5)} seconds')

# Build the results frame in one go
hpo_results = pd.DataFrame(hpo_records,
                           columns=['regressor_name', 'best_cv_score', 'runtime_hpo', 'best_params', 'test_mae'])

# Sort the model results by test MAE (ascending) and then runtime (descending)
hpo_results = hpo_results.sort_values(['test_mae', 'runtime_hpo'], ascending=[True, False]).reset_index(drop=True)



----------------------------------------------------------------------
Using *Linear Regression* for estimation.
----------------------------------------------------------------------
*Linear Regression* BEAT THE BASELINE of 0.2161!
Better CV score: -0.24296980351822214
Parameter combination: {'copy_X': False, 'fit_intercept': False, 'n_jobs': -1, 'positive': False}
Time until beating the baseline: 0.4993s
*Linear Regression* BEAT THE BASELINE of 0.2161!
Better CV score: -0.2381719753851962
Parameter combination: {'copy_X': False, 'fit_intercept': True, 'n_jobs': -1, 'positive': True}
Time until beating the baseline: 2.1371s
*Linear Regression* BEAT THE BASELINE of 0.2161!
Better CV score: -0.23794360018664318
Parameter combination: {'copy_X': True, 'fit_intercept': True, 'n_jobs': 1, 'positive': True}
Time until beating the baseline: 4.9545s
100%|██████████| 10/10 [00:06<00:00,  1.47trial/s, best loss: 0.23794360018664318]

############################################################

  hpo_results = hpo_results.append({'regressor_name': regressor_name,


*Random Forest* BEAT THE BASELINE of 0.2161!
Better CV score: -0.5689560338431129
Parameter combination: {'ccp_alpha': 0.17568997182691004, 'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 166}
Time until beating the baseline: 15.4454s
*Random Forest* BEAT THE BASELINE of 0.2161!
Better CV score: -0.4465440553777596
Parameter combination: {'ccp_alpha': 0.06613672666764309, 'max_depth': 19, 'max_features': 'log2', 'n_estimators': 118}
Time until beating the baseline: 86.9503s
*Random Forest* BEAT THE BASELINE of 0.2161!
Better CV score: -0.30201715068763396
Parameter combination: {'ccp_alpha': 0.005425925728565151, 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 194}
Time until beating the baseline: 242.6732s
100%|██████████| 10/10 [04:37<00:00, 27.76s/trial, best loss: 0.30201715068763396]

######################################################################
Best CV score in 10 iterations: -0.30201715068763396 (242.6732s until found).
MAE on test data: 0.22342803514806

  hpo_results = hpo_results.append({'regressor_name': regressor_name,


*K-Nearest Neighbors* BEAT THE BASELINE of 0.2161!
Better CV score: -0.42496297725946097
Parameter combination: {'algorithm': 'auto', 'leaf_size': 27, 'n_neighbors': 13, 'p': 1, 'weights': 'uniform'}
Time until beating the baseline: 2.6802s
*K-Nearest Neighbors* BEAT THE BASELINE of 0.2161!
Better CV score: -0.4028675948951836
Parameter combination: {'algorithm': 'brute', 'leaf_size': 26, 'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}
Time until beating the baseline: 6.6468s
*K-Nearest Neighbors* BEAT THE BASELINE of 0.2161!
Better CV score: -0.40070927168712017
Parameter combination: {'algorithm': 'kd_tree', 'leaf_size': 28, 'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}
Time until beating the baseline: 9.5589s
*K-Nearest Neighbors* BEAT THE BASELINE of 0.2161!
Better CV score: -0.38299779346796575
Parameter combination: {'algorithm': 'brute', 'leaf_size': 32, 'n_neighbors': 6, 'p': 1, 'weights': 'distance'}
Time until beating the baseline: 54.8343s
100%|██████████| 10/10 [02:21<00:

  hpo_results = hpo_results.append({'regressor_name': regressor_name,


*AdaBoost* BEAT THE BASELINE of 0.2161!
Better CV score: -0.29367602226851725
Parameter combination: {'learning_rate': 0.033608926527254404, 'n_estimators': 160}
Time until beating the baseline: 87.0017s
*AdaBoost* BEAT THE BASELINE of 0.2161!
Better CV score: -0.2931456899903173
Parameter combination: {'learning_rate': 0.09217094503368232, 'n_estimators': 110}
Time until beating the baseline: 619.6593s
100%|██████████| 10/10 [10:19<00:00, 61.97s/trial, best loss: 0.2931456899903173]

######################################################################
Best CV score in 10 iterations: -0.2931456899903173 (619.6593s until found).
MAE on test data: 0.24396658069163268.

######################################################################
The duration of the entire HPO pipeline for 4 classifiers across 10 trials each: 
1050.94073 seconds


  hpo_results = hpo_results.append({'regressor_name': regressor_name,


In [None]:
# Flag each regressor whose test MAE is below the baseline MAE
hpo_results['beats_bl'] = np.where(hpo_results['test_mae'].lt(best_baseline_score), 'yes', 'no')
# Save the summary table as CSV (without the index column)
hpo_results.to_csv('hpo_results_mae.csv', index=False)
# Show the HPO results
hpo_results

Unnamed: 0,regressor_name,best_cv_score,runtime_hpo,best_params,test_mae,beats_bl
0,Linear Regression,-0.237944,4.9545,"{'copy_X': True, 'fit_intercept': True, 'n_job...",0.182933,yes
1,Random Forest,-0.302017,242.6732,"{'ccp_alpha': 0.005425925728565151, 'max_depth...",0.223428,no
2,AdaBoost,-0.293146,619.6593,"{'learning_rate': 0.09217094503368232, 'n_esti...",0.243967,no
3,K-Nearest Neighbors,-0.382998,54.8343,"{'algorithm': 'brute', 'leaf_size': 32, 'n_nei...",0.281084,no


Run hyperopt with RMSE calculation

In [None]:
# Per-regressor result records; converted into the `hpo_results` frame after the loop
# (DataFrame.append is deprecated and no longer exists in pandas 2.x)
hpo_records = []

# Define the best baseline score (an RMSE, so lower is better)
best_baseline_score = 0.3288  # Define your baseline RMSE score here

# Define how many iterations should be done
n_trials = 10

# All individual trials will be saved here
trials_dict = {}

# Start the timer to measure the runtime of the entire pipeline
hpo_time_start = time.time()

# Run the HPO and get the best params for each regressor
for i in range(len(df_best_regression)):

    # To improve the readability of code, creating the following objects:
    regressor_name = df_best_regression.loc[i, 'name']
    regressor_class = regression_models[names.index(regressor_name)]  # shared estimator instance from the list
    regressor_search_space = regression_search_spaces[names.index(regressor_name)]

    # To improve the readability of output:
    print()
    print('----------------------------------------------------------------------')
    print(f'Using *{regressor_name}* for estimation.')
    print('----------------------------------------------------------------------')

    # An objective function for receiving the CV scores for each model
    def hyperopt_cv_score(params):
        """Return the mean negated RMSE of the current regressor under repeated 5-fold CV.

        `params` is one sampled point of the search space; it is applied to the
        shared estimator instance via `set_params`. Higher (closer to 0) is better.
        """
        # Fixed seed: every trial is scored on the same splits, so differences
        # between trials come from the parameters and not from the partitioning.
        cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=0)  # can be also adjusted!
        # hp.quniform samples floats, but the estimators need an integer count
        if 'n_estimators' in params:
            params['n_estimators'] = int(params['n_estimators'])
        model = regressor_class.set_params(**params)  # use the regressor from the list
        return cross_val_score(model,
                               X_train_scaled, y_train,
                               cv=cv,
                               scoring='neg_root_mean_squared_error',
                               error_score='raise').mean()

    # A helper function for finding the best model
    def f(params):
        """hyperopt objective: score `params`, track the best trial, return the loss."""
        global best_cv_score
        global best_params
        global best_time

        cv_score = hyperopt_cv_score(params)

        if cv_score > best_cv_score:
            best_cv_score = cv_score  # what is the best score?
            best_params = params  # what are the best params?
            best_time = round(time.time() - start_time, 4)  # track how much time it took to find the best params

            # cv_score is the NEGATED RMSE, so it is the error itself (-cv_score)
            # that has to be smaller than the baseline error.
            if -cv_score < best_baseline_score:
                print(f'*{regressor_name}* BEAT THE BASELINE of {best_baseline_score}!')
                print(f'Better CV score: {best_cv_score}')
                print(f'Parameter combination: {best_params}')
                print(f'Time until beating the baseline: {best_time}s')
            else:
                # Print the results for reference
                print(f'New best CV score: {best_cv_score}')
                print(f'New best params: {best_params}')
                print(f'Time taken until new best combination found: {round(best_time, 5)}s')

        # hyperopt only minimizes, so the score to maximize (negated RMSE) is
        # negated once more; the reported loss is therefore the plain RMSE.
        # About FMIN: https://github.com/hyperopt/hyperopt/wiki/FMin
        return {'loss': -cv_score,
                'status': STATUS_OK}

    # Defining global variables to be updated
    best_cv_score = float('-inf')  # best CV score (negative infinity since we want to maximize)
    best_params = None  # best hyperparameter combination
    best_time = 0  # runtime until the best CV score is computed
    trials = Trials()  # store info at each step

    # Start running the algorithm and track time
    start_time = time.time()
    ## Hyperopt function
    best = fmin(f, regressor_search_space,  # use the search space associated with the regressor
                algo=tpe.suggest,
                max_evals=n_trials,  # how many evaluations?
                trials=trials)
    # Save all trials
    trials_dict[regressor_name] = trials  # save into regressor-trials

    print()
    print('######################################################################')
    # Print the summary of the best results
    print(f'Best CV score in {n_trials} iterations: {best_cv_score} ({best_time}s until found).')

    # Refit the regressor with the best parameters and compute the RMSE on the test set.
    # If no trial was recorded, the estimator is fitted with its current parameters
    # instead of being used unfitted.
    if best_params is not None:
        regressor_class.set_params(**best_params)
    # Fit on a copy: an estimator configured with copy_X=False may overwrite its
    # input, which would silently alter the data seen by the following regressors.
    m = regressor_class.fit(X_train_scaled.copy(), y_train)
    predictions = m.predict(X_test_scaled)
    # Square root of the MSE; avoids the `squared=False` argument, which is
    # deprecated and absent from recent scikit-learn releases.
    score_test = np.sqrt(mean_squared_error(y_test, predictions))  # RMSE calculation
    print(f'RMSE on test data: {score_test}.')

    # Record the best results for this regressor
    hpo_records.append({'regressor_name': regressor_name,
                        'best_cv_score': best_cv_score,
                        'runtime_hpo': best_time,
                        'best_params': best_params,
                        'test_rmse': score_test})

# Mark the end of the entire pipeline
hpo_time_end = time.time()
print()
print('######################################################################')
print(f'The duration of the entire HPO pipeline for {len(df_best_regression)} classifiers across {n_trials} trials each: ')
print(f'{round(hpo_time_end - hpo_time_start, 5)} seconds')

# Build the results frame in one go
hpo_results = pd.DataFrame(hpo_records,
                           columns=['regressor_name', 'best_cv_score', 'runtime_hpo', 'best_params', 'test_rmse'])

# Sort the model results by test RMSE (ascending) and then runtime (descending)
hpo_results = hpo_results.sort_values(['test_rmse', 'runtime_hpo'], ascending=[True, False]).reset_index(drop=True)



----------------------------------------------------------------------
Using *Linear Regression* for estimation.
----------------------------------------------------------------------
*Linear Regression* BEAT THE BASELINE of 0.3288!
Better CV score: -0.3868316894469164
Parameter combination: {'copy_X': True, 'fit_intercept': True, 'n_jobs': 4, 'positive': True}
Time until beating the baseline: 0.4168s
*Linear Regression* BEAT THE BASELINE of 0.3288!
Better CV score: -0.38192039781586506
Parameter combination: {'copy_X': False, 'fit_intercept': True, 'n_jobs': 4, 'positive': False}
Time until beating the baseline: 1.4502s
*Linear Regression* BEAT THE BASELINE of 0.3288!
Better CV score: -0.3805624690688404
Parameter combination: {'copy_X': False, 'fit_intercept': True, 'n_jobs': 4, 'positive': False}
Time until beating the baseline: 2.4097s
100%|██████████| 10/10 [00:04<00:00,  2.12trial/s, best loss: 0.3805624690688404]

################################################################

  hpo_results = hpo_results.append({'regressor_name': regressor_name,


*Random Forest* BEAT THE BASELINE of 0.3288!
Better CV score: -0.6375666295322734
Parameter combination: {'ccp_alpha': 0.04277491500367392, 'max_depth': 10, 'max_features': 'log2', 'n_estimators': 97}
Time until beating the baseline: 19.2097s
*Random Forest* BEAT THE BASELINE of 0.3288!
Better CV score: -0.5325815779130417
Parameter combination: {'ccp_alpha': 0.017833706147461714, 'max_depth': 14, 'max_features': 'sqrt', 'n_estimators': 73}
Time until beating the baseline: 58.9603s
100%|██████████| 10/10 [05:01<00:00, 30.16s/trial, best loss: 0.5325815779130417]

######################################################################
Best CV score in 10 iterations: -0.5325815779130417 (58.9603s until found).
RMSE on test data: 1.5383144632229695.

----------------------------------------------------------------------
Using *K-Nearest Neighbors* for estimation.
----------------------------------------------------------------------
  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

  hpo_results = hpo_results.append({'regressor_name': regressor_name,


*K-Nearest Neighbors* BEAT THE BASELINE of 0.3288!
Better CV score: -0.6039470445604822
Parameter combination: {'algorithm': 'ball_tree', 'leaf_size': 19, 'n_neighbors': 9, 'p': 3, 'weights': 'distance'}
Time until beating the baseline: 42.1226s
*K-Nearest Neighbors* BEAT THE BASELINE of 0.3288!
Better CV score: -0.5910855238751407
Parameter combination: {'algorithm': 'kd_tree', 'leaf_size': 22, 'n_neighbors': 4, 'p': 3, 'weights': 'distance'}
Time until beating the baseline: 56.1241s
*K-Nearest Neighbors* BEAT THE BASELINE of 0.3288!
Better CV score: -0.5851321055789337
Parameter combination: {'algorithm': 'ball_tree', 'leaf_size': 31, 'n_neighbors': 5, 'p': 3, 'weights': 'distance'}
Time until beating the baseline: 182.0509s
100%|██████████| 10/10 [03:10<00:00, 19.10s/trial, best loss: 0.5851321055789337]

######################################################################
Best CV score in 10 iterations: -0.5851321055789337 (182.0509s until found).
RMSE on test data: 1.18047450780

  hpo_results = hpo_results.append({'regressor_name': regressor_name,


*AdaBoost* BEAT THE BASELINE of 0.3288!
Better CV score: -0.4654391461685109
Parameter combination: {'learning_rate': 0.4391853838567028, 'n_estimators': 70}
Time until beating the baseline: 27.8457s
*AdaBoost* BEAT THE BASELINE of 0.3288!
Better CV score: -0.4516744138694081
Parameter combination: {'learning_rate': 0.03820226790591132, 'n_estimators': 80}
Time until beating the baseline: 162.6616s
*AdaBoost* BEAT THE BASELINE of 0.3288!
Better CV score: -0.4483900775828862
Parameter combination: {'learning_rate': 0.03580302378682828, 'n_estimators': 120}
Time until beating the baseline: 275.2787s
*AdaBoost* BEAT THE BASELINE of 0.3288!
Better CV score: -0.4463879026121158
Parameter combination: {'learning_rate': 0.040566558900453256, 'n_estimators': 130}
Time until beating the baseline: 345.8561s
*AdaBoost* BEAT THE BASELINE of 0.3288!
Better CV score: -0.44297855994739466
Parameter combination: {'learning_rate': 0.16333272051917694, 'n_estimators': 60}
Time until beating the baseline

  hpo_results = hpo_results.append({'regressor_name': regressor_name,


In [None]:
# Flag every regressor whose test RMSE is lower than the baseline RMSE
beats_baseline = hpo_results['test_rmse'] < best_baseline_score
hpo_results['beats_bl'] = np.where(beats_baseline, 'yes', 'no')
# Persist the table to disk, then show it as the cell output
hpo_results.to_csv('hpo_results_rmse.csv', index=False)
hpo_results

Unnamed: 0,regressor_name,best_cv_score,runtime_hpo,best_params,test_rmse,beats_bl
0,Linear Regression,-0.380562,2.4097,"{'copy_X': False, 'fit_intercept': True, 'n_jo...",0.294936,yes
1,K-Nearest Neighbors,-0.585132,182.0509,"{'algorithm': 'ball_tree', 'leaf_size': 31, 'n...",1.180475,no
2,Random Forest,-0.532582,58.9603,"{'ccp_alpha': 0.017833706147461714, 'max_depth...",1.538314,no
3,AdaBoost,-0.442451,529.2484,"{'learning_rate': 0.06406448823848802, 'n_esti...",1.826196,no


### Testing hyperopt with Linear Regression on the full dataset

Reload the full dataset and prepare it for modelling.

In [None]:
# Start again from the prepared CSV so that the full dataset is used
df = pd.read_csv(filepath_or_buffer='df_prepped.csv')
# Show (rows, columns) as a quick sanity check
df.shape

(32330, 50)

In [None]:
# Remove the 'Countries' and 'Farm' columns (string-valued in the data preview)
# so they are not part of the modelling frame
df = df.drop(columns=['Countries', 'Farm'])

In [None]:
# Temporal hold-out: rows from 2016 form the test set, all earlier years the training set
is_test_year = df['Year'] == 2016
is_before_test_year = df['Year'] < 2016
df_train = df.loc[is_before_test_year]
df_test = df.loc[is_test_year]

In [None]:
# X and y
X_train = df_train.drop(columns=['Y_maize_major','Year'], axis=1)
y_train = df_train['Y_maize_major']
X_test = df_test.drop(columns=['Y_maize_major','Year'], axis=1)
y_test = df_test['Y_maize_major']

# Scale to [0,1] range
sc = MinMaxScaler()
X_train_scaled = pd.DataFrame(sc.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(sc.transform(X_test), columns=X_test.columns)

In [None]:
# Restrict the following HPO runs to a single model: Linear Regression.
# NOTE(review): this rebinds `names`, which the HPO cells below use to index into
# `regression_models` / `regression_search_spaces` via names.index(...). The lookup
# is only correct if Linear Regression is the first entry of those lists -- confirm.
names = ['Linear Regression']
data = dict(name=names)
df_best_regression = pd.DataFrame(data)

Run hyperopt with MAE

In [None]:
# Hyperparameter optimisation (HPO) with hyperopt's TPE algorithm, scored by MAE.
# For every regressor listed in df_best_regression this cell
#   1. evaluates n_trials hyperparameter combinations, scoring each one by
#      repeated 5-fold cross-validation on the scaled training data,
#   2. refits the best combination on the full training set,
#   3. evaluates it on the held-out test set and records one result row.

# Columns of the results table (one row per regressor)
result_columns = ['regressor_name', 'best_cv_score', 'runtime_hpo', 'best_params', 'test_mae']
# Result rows are collected in a list and converted to a dataframe once, after the loop.
# (DataFrame.append is deprecated since pandas 1.4 and was removed in pandas 2.0.)
hpo_records = []

# Define the best baseline score
best_baseline_score = 0.2161  # Define your baseline MAE score here

# Define how many iterations should be done
n_trials = 200

# Fixed seed for the CV splitter: every trial is scored on the same folds, so a
# "better" score reflects the hyperparameters and not a luckier random split.
cv_random_state = 42

# All individual trials will be saved here
trials_dict = {}

# Start the timer to measure the runtime of the entire pipeline
hpo_time_start = time.time()

# Run the HPO and get the best params for each regressor
for i in range(len(df_best_regression)):

    # To improve the readability of code, creating the following objects:
    regressor_name = df_best_regression.loc[i, 'name']
    # Model and search space are looked up by the position of the name in `names`
    regressor_class = regression_models[names.index(regressor_name)]
    regressor_search_space = regression_search_spaces[names.index(regressor_name)]

    # To improve the readability of output:
    print()
    print('----------------------------------------------------------------------')
    print(f'Using *{regressor_name}* for estimation.')
    print('----------------------------------------------------------------------')

    def hyperopt_cv_score(params):
        """Return the mean cross-validated score of the current regressor.

        The score is the *negated* MAE ('neg_mean_absolute_error'), i.e. it is
        <= 0 and larger is better. `params` is updated in place: a sampled
        'n_estimators' value is cast to int before it is passed to the model.
        """
        cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=cv_random_state)  # can be also adjusted!
        # Check if 'n_estimators' is in params
        if 'n_estimators' in params:
            # Convert 'n_estimators' to an integer
            params['n_estimators'] = int(params['n_estimators'])
        model = regressor_class.set_params(**params)  # use the regressor from the list
        return cross_val_score(model,
                               X_train_scaled, y_train,
                               cv=cv,
                               scoring='neg_mean_absolute_error',
                               error_score='raise').mean()

    def f(params):
        """Objective for hyperopt's fmin: score `params` and track the best trial.

        Updates the notebook-level best_cv_score / best_params / best_time whenever
        a trial improves on the best score so far, and returns the loss dictionary
        expected by hyperopt.
        """
        global best_cv_score
        global best_params
        global best_time

        cv_score = hyperopt_cv_score(params)

        if cv_score > best_cv_score:
            best_cv_score = cv_score  # what is the best score?
            best_params = dict(params)  # copy, so the stored best cannot be altered through the trial dict
            best_time = round(time.time() - start_time, 4)  # track how much time it took to find the best params

            # cv_score is the negated MAE, the baseline is a plain (positive) MAE:
            # flip the sign so that both are on the same scale before comparing.
            if -cv_score < best_baseline_score:  # we are beating the baseline error
                print(f'*{regressor_name}* BEAT THE BASELINE of {best_baseline_score}!')
                print(f'Better CV score: {best_cv_score}')
                print(f'Parameter combination: {best_params}')
                print(f'Time until beating the baseline: {best_time}s')
            else:
                # Print the results for reference
                print(f'New best CV score: {best_cv_score}')
                print(f'New best params: {best_params}')
                print(f'Time taken until new best combination found: {round(best_time, 5)}s')

        # hyperopt can only minimise, so the score to maximise is negated:
        # minimising -cv_score is the same as maximising cv_score.
        # About FMIN: https://github.com/hyperopt/hyperopt/wiki/FMin
        return {'loss': -cv_score,
                'status': STATUS_OK}

    # Defining global variables to be updated
    best_cv_score = float('-inf')  # best CV score (negative infinity since we want to maximize)
    best_params = None  # best hyperparameter combination
    best_time = 0  # runtime until the best CV score is computed
    trials = Trials()  # store info at each step

    # Start running the algorithm and track time
    start_time = time.time()
    ## Hyperopt function
    best = fmin(f, regressor_search_space,  # use the search space associated with the regressor
                algo=tpe.suggest,
                max_evals=n_trials,  # how many evaluations?
                trials=trials)
    # Save all trials
    trials_dict[regressor_name] = trials  # save into regressor-trials

    print()
    print('######################################################################')
    # Print the summary of the best results
    print(f'Best CV score in {n_trials} iterations: {best_cv_score} ({best_time}s until found).')

    # Refit on the full training set; if no trial was recorded as best, the
    # regressor is fitted with its current parameters instead of being used unfitted.
    if best_params is not None:
        regressor_class.set_params(**best_params)
    m = regressor_class.fit(X_train_scaled, y_train)
    predictions = m.predict(X_test_scaled)
    score_test = mean_absolute_error(y_test, predictions)
    print(f'MAE on test data: {score_test}.')

    # Record the best results of this regressor
    hpo_records.append({'regressor_name': regressor_name,
                        'best_cv_score': best_cv_score,
                        'runtime_hpo': best_time,
                        'best_params': best_params,
                        'test_mae': score_test})

# Mark the end of the entire pipeline
hpo_time_end = time.time()
print()
print('######################################################################')
print(f'The duration of the entire HPO pipeline for {len(df_best_regression)} classifiers across {n_trials} trials each: ')
print(f'{round(hpo_time_end - hpo_time_start, 5)} seconds')

# Build the results table, sorted by test MAE (ascending) and then runtime (descending)
hpo_results = pd.DataFrame(hpo_records, columns=result_columns)
hpo_results = hpo_results.sort_values(['test_mae', 'runtime_hpo'], ascending=[True, False]).reset_index(drop=True)



----------------------------------------------------------------------
Using *Linear Regression* for estimation.
----------------------------------------------------------------------
*Linear Regression* BEAT THE BASELINE of 0.2161!
Better CV score: -0.23054177608914553
Parameter combination: {'copy_X': False, 'fit_intercept': True, 'n_jobs': 2, 'positive': False}
Time until beating the baseline: 2.9925s
*Linear Regression* BEAT THE BASELINE of 0.2161!
Better CV score: -0.2303309940471637
Parameter combination: {'copy_X': False, 'fit_intercept': False, 'n_jobs': 2, 'positive': False}
Time until beating the baseline: 9.5755s
*Linear Regression* BEAT THE BASELINE of 0.2161!
Better CV score: -0.2288610280079438
Parameter combination: {'copy_X': True, 'fit_intercept': True, 'n_jobs': 2, 'positive': True}
Time until beating the baseline: 15.1442s
*Linear Regression* BEAT THE BASELINE of 0.2161!
Better CV score: -0.22883618191451102
Parameter combination: {'copy_X': False, 'fit_intercept': 

  hpo_results = hpo_results.append({'regressor_name': regressor_name,


In [None]:
# Add a column which says if the classifier beat the baseline
hpo_results['beats_bl'] = np.where(hpo_results.loc[:,'test_mae'] < best_baseline_score, 'yes', 'no')
hpo_results.to_csv('hpo_results_mae_lr.csv', index=False)
# See the HPO results
hpo_results

Unnamed: 0,regressor_name,best_cv_score,runtime_hpo,best_params,test_mae,beats_bl
0,Linear Regression,-0.228806,269.525,"{'copy_X': True, 'fit_intercept': True, 'n_job...",0.199428,yes


Run hyperopt with RMSE

In [None]:
# Initialize a dataframe for results collection
hpo_results = pd.DataFrame(columns=['regressor_name', 'best_cv_score', 'runtime_hpo', 'best_params', 'test_rmse'])

# Define the best baseline score
best_baseline_score = 0.3288  # Define your baseline RMSE score here

# Define how many iterations should be done
n_trials = 200

# All individual trials will be saved here
trials_dict = {}

# Start the timer to measure the runtime of the entire pipeline
hpo_time_start = time.time()

# Run the HPO and get the best params for each classifier
for i in range(len(df_best_regression)):

    # To improve the readability of code, creating the following objects:
    regressor_name = df_best_regression.loc[i, 'name']
    regressor_class = regression_models[names.index(regressor_name)]  # fetch from the list not df for baseline
    regressor_search_space = regression_search_spaces[names.index(regressor_name)]

    # To improve the readability of output:
    print()
    print('----------------------------------------------------------------------')
    print(f'Using *{regressor_name}* for estimation.')
    print('----------------------------------------------------------------------')

    # A objective function for receiving the CV scores for each model
    def hyperopt_cv_score(params):
        cv = RepeatedKFold(n_splits=5, n_repeats=5)  # can be also adjusted!
        # Check if 'n_estimators' is in params
        if 'n_estimators' in params:
            # Convert 'n_estimators' to an integer
            params['n_estimators'] = int(params['n_estimators'])
        model = regressor_class.set_params(**params)  # use the classifier from the list
        return cross_val_score(model,
                               X_train_scaled, y_train,
                               cv=cv,
                               scoring='neg_root_mean_squared_error',
                               error_score='raise').mean()

    # A helper function for finding the best model
    def f(params):
        global best_cv_score
        global best_params
        global best_time

        cv_score = hyperopt_cv_score(params)

        if cv_score > best_cv_score:
            # Are we beating the best baseline score?
            if cv_score < best_baseline_score:  # we are beating the baseline accuracy
                best_cv_score = cv_score  # what is the best score?
                best_params = params  # what are the best params?
                best_time = round(time.time() - start_time, 4)  # track how much time it took to find the best params

                print(f'*{regressor_name}* BEAT THE BASELINE of {best_baseline_score}!')
                print(f'Better CV score: {best_cv_score}')
                print(f'Parameter combination: {best_params}')
                print(f'Time until beating the baseline: {best_time}s')
            else:
                best_cv_score = cv_score  # what is the best score?
                best_params = params  # what are the best params?
                best_time = round(time.time() - start_time, 4)  # track how much time it took to find the best params

                # Print the results for reference
                print(f'New best CV score: {best_cv_score}')
                print(f'New best params: {best_params}')
                print(f'Time taken until new best combination found: {round(best_time, 5)}s')

        return {'loss': -cv_score,  # see the comment below
                'status': STATUS_OK}
        # Comment regarding 'negative cv_score' (from the referenced source):
        ## Since we are trying to maximize the CV score (cv_score in the code above),
        ## we must negate this value for hyperopt, since hyperopt only knows how to minimize a function.
        ## Minimizing a function f is the same as maximizing the negative of f.
        ## About FMIN: https://github.com/hyperopt/hyperopt/wiki/FMin

    # Defining global variables to be updated
    best_cv_score = float('-inf')  # best CV score (negative infinity since we want to maximize)
    best_params = None  # best hyperparameter combination
    best_time = 0  # runtime until the best CV score is computed
    trials = Trials()  # store info at each step

    # Start running the algorithm and track time
    start_time = time.time()
    ## Hyperopt function
    best = fmin(f, regressor_search_space,  # use the search space associated with the classifier
                algo=tpe.suggest,
                max_evals=n_trials,  # how many evaluations?
                trials=trials)
    # Save all trials
    trials_dict[regressor_name] = trials  # save into classifier-trials

    print()
    print('######################################################################')
    # Print the summary of the best results
    print(f'Best CV score in {n_trials} iterations: {best_cv_score} ({best_time}s until found).')

    # Compute the accuracy score on the best model of the classifier
    m = regressor_class.set_params(**best_params).fit(X_train_scaled, y_train) if best_params is not None else regressor_class
    predictions = m.predict(X_test_scaled)
    score_test = mean_squared_error(y_test, predictions, squared=False)  # RMSE calculation
    print(f'RMSE on test data: {score_test}.')

    # Append the best results to the df
    hpo_results = hpo_results.append({'regressor_name': regressor_name,
                                      'best_cv_score': best_cv_score,
                                      'runtime_hpo': best_time,
                                      'best_params': best_params,
                                      'test_rmse': score_test}, ignore_index=True)

# Mark the end of the entire pipeline
hpo_time_end = time.time()
print()
print('######################################################################')
print(f'The duration of the entire HPO pipeline for {len(df_best_regression)} classifiers across {n_trials} trials each: ')
print(f'{round(hpo_time_end - hpo_time_start, 5)} seconds')

# Sort the model results by test RMSE and then Runtime
hpo_results = hpo_results.sort_values(['test_rmse', 'runtime_hpo'], ascending=[1, 0]).reset_index(drop=True)



----------------------------------------------------------------------
Using *Linear Regression* for estimation.
----------------------------------------------------------------------
*Linear Regression* BEAT THE BASELINE of 0.3288!
Better CV score: -0.3718505836595213
Parameter combination: {'copy_X': True, 'fit_intercept': True, 'n_jobs': -1, 'positive': False}
Time until beating the baseline: 2.7224s
*Linear Regression* BEAT THE BASELINE of 0.3288!
Better CV score: -0.3717298266246012
Parameter combination: {'copy_X': False, 'fit_intercept': False, 'n_jobs': 4, 'positive': False}
Time until beating the baseline: 8.1831s
*Linear Regression* BEAT THE BASELINE of 0.3288!
Better CV score: -0.371721356717252
Parameter combination: {'copy_X': True, 'fit_intercept': False, 'n_jobs': -1, 'positive': False}
Time until beating the baseline: 49.675s
*Linear Regression* BEAT THE BASELINE of 0.3288!
Better CV score: -0.3716820797497892
Parameter combination: {'copy_X': True, 'fit_intercept': Fa

  hpo_results = hpo_results.append({'regressor_name': regressor_name,


In [None]:
# Add a column which says if the classifier beat the baseline
hpo_results['beats_bl'] = np.where(hpo_results.loc[:,'test_rmse'] < best_baseline_score, 'yes', 'no')
hpo_results.to_csv('hpo_results_rmse_lr.csv', index=False)
# See the HPO results
hpo_results

Unnamed: 0,regressor_name,best_cv_score,runtime_hpo,best_params,test_rmse,beats_bl
0,Linear Regression,-0.371619,568.8765,"{'copy_X': False, 'fit_intercept': True, 'n_jo...",0.329736,no


Reference: https://github.com/qetdr/xAutoML-Project1/blob/main/project1_notebook.ipynb