Goal of this notebook is to test wide range of potential models.

1. Lazypredict
-> choose promising models
2. GridSearch on selection of promising models
3. Summary of hot candidates including metric, high influencing variables, pca of cluster, etc.


Open todo's-> (maybe other notebook)
- hyperparameter optimization
- dimension reduction

# Lazy Predict

We'll use LazyRegressor as we're dealing with a supervised learning regression problem and want to check potential models for our usecase

Select models from LazyRegressor as processing failed for some in previous sessions.

In [1]:
from copy import deepcopy
from datetime import datetime

import joblib
import pandas as pd
from config import OUTPUT_DIR, TRAIN_TEST_SPLIT_ELECTRIC_FILE
from lazypredict.Supervised import REGRESSORS, LazyRegressor

In [2]:
current_date = datetime.now().strftime('%Y_%m_%d')

In [3]:
current_date = datetime.now().strftime('%Y_%m_%d')
X_train, X_test, y_train, y_test = joblib.load(TRAIN_TEST_SPLIT_ELECTRIC_FILE)

In [4]:
# example prints to validate correct output
X_train.head(2)

Unnamed: 0,member_state_AT,member_state_BE,member_state_BG,member_state_CY,member_state_CZ,member_state_DE,member_state_DK,member_state_EE,member_state_ES,member_state_FI,...,commercial_name_iX3,commercial_name_other,category_of_vehicle_M1,category_of_vehicle_M1.1,category_of_vehicle_M1G,category_of_vehicle_N1,mass_vehicle,engine_power,year,electric_range
981250,-0.18,-0.19,-0.03,-0.01,-0.06,-0.74,-0.18,-0.03,-0.17,-0.12,...,-0.09,-0.23,0.02,-0.0,-0.02,-0.0,2.07,-0.04,-0.23,-0.2
3356637,-0.18,-0.19,-0.03,-0.01,-0.06,1.36,-0.18,-0.03,-0.17,-0.12,...,-0.09,-0.23,0.02,-0.0,-0.02,-0.0,0.28,0.07,1.02,1.33


In [5]:
y_train.head(2)

981250    249.00
3356637   156.00
Name: electric_energy_consumption, dtype: float64

In [6]:
# explicit selection through mannual list (based on lazypredict.Supervised.REGRESSION)

# import sklearn.svm
# import xgboost
# import lightgbm

# model_selection = [
#     ('AdaBoostRegressor', sklearn.ensemble._weight_boosting.AdaBoostRegressor),
#     ('BaggingRegressor', sklearn.ensemble._bagging.BaggingRegressor),
#     ('BayesianRidge', sklearn.linear_model._bayes.BayesianRidge),
#     ('DecisionTreeRegressor', sklearn.tree._classes.DecisionTreeRegressor),
#     ('DummyRegressor', sklearn.dummy.DummyRegressor),
#     ('ElasticNet', sklearn.linear_model._coordinate_descent.ElasticNet),
#     ('ElasticNetCV', sklearn.linear_model._coordinate_descent.ElasticNetCV),
#     ('ExtraTreeRegressor', sklearn.tree._classes.ExtraTreeRegressor),
#     ('ExtraTreesRegressor', sklearn.ensemble._forest.ExtraTreesRegressor),
#     ('GammaRegressor', sklearn.linear_model._glm.glm.GammaRegressor),
#     ('GaussianProcessRegressor', sklearn.gaussian_process._gpr.GaussianProcessRegressor),
#     ('GradientBoostingRegressor', sklearn.ensemble._gb.GradientBoostingRegressor),
#     ('HistGradientBoostingRegressor', sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingRegressor),
#     ('HuberRegressor', sklearn.linear_model._huber.HuberRegressor),
#     ('KNeighborsRegressor', sklearn.neighbors._regression.KNeighborsRegressor),
#     ('KernelRidge', sklearn.kernel_ridge.KernelRidge),
#     ('Lars', sklearn.linear_model._least_angle.Lars),
#     ('LarsCV', sklearn.linear_model._least_angle.LarsCV),
#     ('Lasso', sklearn.linear_model._coordinate_descent.Lasso),
#     ('LassoCV', sklearn.linear_model._coordinate_descent.LassoCV),
#     ('LassoLars', sklearn.linear_model._least_angle.LassoLars),
#     ('LassoLarsCV', sklearn.linear_model._least_angle.LassoLarsCV),
#     ('LassoLarsIC', sklearn.linear_model._least_angle.LassoLarsIC),
#     ('LinearRegression', sklearn.linear_model._base.LinearRegression),
#     ('LinearSVR', sklearn.svm._classes.LinearSVR),
#     ('MLPRegressor', sklearn.neural_network._multilayer_perceptron.MLPRegressor),
#     ('NuSVR', sklearn.svm._classes.NuSVR),
#     ('OrthogonalMatchingPursuit', sklearn.linear_model._omp.OrthogonalMatchingPursuit),
#     ('OrthogonalMatchingPursuitCV', sklearn.linear_model._omp.OrthogonalMatchingPursuitCV),
#     ('PassiveAggressiveRegressor', sklearn.linear_model._passive_aggressive.PassiveAggressiveRegressor),
#     ('PoissonRegressor', sklearn.linear_model._glm.glm.PoissonRegressor),
#     ('QuantileRegressor', sklearn.linear_model._quantile.QuantileRegressor),
#     ('RANSACRegressor', sklearn.linear_model._ransac.RANSACRegressor),
#     ('RandomForestRegressor', sklearn.ensemble._forest.RandomForestRegressor),
#     ('Ridge', sklearn.linear_model._ridge.Ridge),
#     ('RidgeCV', sklearn.linear_model._ridge.RidgeCV),
#     ('SGDRegressor', sklearn.linear_model._stochastic_gradient.SGDRegressor),
#     ('SVR', sklearn.svm._classes.SVR),
#     ('TransformedTargetRegressor', sklearn.compose._target.TransformedTargetRegressor),
#     ('TweedieRegressor', sklearn.linear_model._glm.glm.TweedieRegressor),
#     ('XGBRegressor', xgboost.sklearn.XGBRegressor),
#     ('LGBMRegressor', lightgbm.sklearn.LGBMRegressor)
# ]

# selection through ignore

model_selection = REGRESSORS
"""
----
Removed due to failed executions:
----
Removed due to high compute time:
----
NuSVR
QuantileRegressor
SVR
"""
models_to_ignore = list([
    "NuSVR",
    # "MLPRegressor",
    "QuantileRegressor",
    "SVR"
])

Use chunk approach

In [7]:
class LazyPredictChunk:
    def __init__(self, models=None, predictions=None, regressors="all"):
        self.models = models
        self.predictions = predictions
        self.regressors = regressors

In [8]:
chunk_size = 1
lazy_predict_chunks = []

# split regressors into chunks
i = 0
chunk_buffer = []
for model, model_class in model_selection:
    # ignore models that are in ignore list
    if model in models_to_ignore:
        i+=1
        continue

    # append if chunk_buffer below size limit
    if len(chunk_buffer) < chunk_size:
        chunk_buffer.append((model, model_class))

    # chunk size reached or last iteration step reached
    if len(chunk_buffer) == chunk_size or (i+1) == len(REGRESSORS):
        lazy_predict_chunks.append(LazyPredictChunk(regressors=deepcopy(chunk_buffer)))
        # empty chunk_buffer
        chunk_buffer = []

    i+=1

In [9]:
print("Executing LazyRegressor through: ", len(lazy_predict_chunks), "chunks")

# save compute if executed without predictions
predictions_flag = False

for chunk in lazy_predict_chunks:
    print("Processings the following regressors in this chunk: ", chunk.regressors)
    reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None, regressors=chunk.regressors, predictions=predictions_flag)
    models,predictions = reg.fit(X_train, X_test, y_train, y_test)
    # models,predictions = reg.fit(X_train[:500], X_test[:500], y_train[:500], y_test[:500])

    # save trained models with joblib
    for model_name, model in reg.models.items():
        joblib.dump(model, f"{OUTPUT_DIR}models/{current_date}-lazy-{model_name}.pkl")

    # store results into LazyPredictChunk variables
    chunk.models = deepcopy(models) # deepcopy to not have weird references
    if predictions_flag:
        chunk.predictions = deepcopy(predictions)

Executing LazyRegressor through:  39 chunks
Processings the following regressors in this chunk:  [('AdaBoostRegressor', <class 'sklearn.ensemble._weight_boosting.AdaBoostRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [01:38<00:00, 98.53s/it]


Processings the following regressors in this chunk:  [('BaggingRegressor', <class 'sklearn.ensemble._bagging.BaggingRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:47<00:00, 47.58s/it]


Processings the following regressors in this chunk:  [('BayesianRidge', <class 'sklearn.linear_model._bayes.BayesianRidge'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:17<00:00, 17.80s/it]


Processings the following regressors in this chunk:  [('DecisionTreeRegressor', <class 'sklearn.tree._classes.DecisionTreeRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:12<00:00, 12.39s/it]


Processings the following regressors in this chunk:  [('DummyRegressor', <class 'sklearn.dummy.DummyRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:07<00:00,  7.48s/it]


Processings the following regressors in this chunk:  [('ElasticNet', <class 'sklearn.linear_model._coordinate_descent.ElasticNet'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:10<00:00, 10.23s/it]


Processings the following regressors in this chunk:  [('ElasticNetCV', <class 'sklearn.linear_model._coordinate_descent.ElasticNetCV'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:44<00:00, 44.39s/it]


Processings the following regressors in this chunk:  [('ExtraTreeRegressor', <class 'sklearn.tree._classes.ExtraTreeRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:13<00:00, 13.62s/it]


Processings the following regressors in this chunk:  [('ExtraTreesRegressor', <class 'sklearn.ensemble._forest.ExtraTreesRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [08:59<00:00, 539.32s/it]


Processings the following regressors in this chunk:  [('GammaRegressor', <class 'sklearn.linear_model._glm.glm.GammaRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:08<00:00,  8.75s/it]


Processings the following regressors in this chunk:  [('GaussianProcessRegressor', <class 'sklearn.gaussian_process._gpr.GaussianProcessRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:03<00:00,  3.63s/it]


Processings the following regressors in this chunk:  [('GradientBoostingRegressor', <class 'sklearn.ensemble._gb.GradientBoostingRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [02:29<00:00, 149.53s/it]


Processings the following regressors in this chunk:  [('HistGradientBoostingRegressor', <class 'sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:23<00:00, 23.23s/it]


Processings the following regressors in this chunk:  [('HuberRegressor', <class 'sklearn.linear_model._huber.HuberRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [01:42<00:00, 102.53s/it]


Processings the following regressors in this chunk:  [('KNeighborsRegressor', <class 'sklearn.neighbors._regression.KNeighborsRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [16:59<00:00, 1019.17s/it]


Processings the following regressors in this chunk:  [('KernelRidge', <class 'sklearn.kernel_ridge.KernelRidge'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:04<00:00,  4.11s/it]


Processings the following regressors in this chunk:  [('Lars', <class 'sklearn.linear_model._least_angle.Lars'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:10<00:00, 10.26s/it]


Processings the following regressors in this chunk:  [('LarsCV', <class 'sklearn.linear_model._least_angle.LarsCV'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:30<00:00, 30.84s/it]


Processings the following regressors in this chunk:  [('Lasso', <class 'sklearn.linear_model._coordinate_descent.Lasso'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:14<00:00, 14.59s/it]


Processings the following regressors in this chunk:  [('LassoCV', <class 'sklearn.linear_model._coordinate_descent.LassoCV'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [01:52<00:00, 112.62s/it]


Processings the following regressors in this chunk:  [('LassoLars', <class 'sklearn.linear_model._least_angle.LassoLars'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:09<00:00,  9.03s/it]


Processings the following regressors in this chunk:  [('LassoLarsCV', <class 'sklearn.linear_model._least_angle.LassoLarsCV'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:19<00:00, 19.41s/it]


Processings the following regressors in this chunk:  [('LassoLarsIC', <class 'sklearn.linear_model._least_angle.LassoLarsIC'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:16<00:00, 16.77s/it]


Processings the following regressors in this chunk:  [('LinearRegression', <class 'sklearn.linear_model._base.LinearRegression'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:14<00:00, 14.63s/it]


Processings the following regressors in this chunk:  [('LinearSVR', <class 'sklearn.svm._classes.LinearSVR'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [05:01<00:00, 301.83s/it]


Processings the following regressors in this chunk:  [('MLPRegressor', <class 'sklearn.neural_network._multilayer_perceptron.MLPRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [22:30<00:00, 1350.02s/it]


Processings the following regressors in this chunk:  [('OrthogonalMatchingPursuit', <class 'sklearn.linear_model._omp.OrthogonalMatchingPursuit'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:08<00:00,  8.97s/it]


Processings the following regressors in this chunk:  [('OrthogonalMatchingPursuitCV', <class 'sklearn.linear_model._omp.OrthogonalMatchingPursuitCV'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:19<00:00, 19.98s/it]


Processings the following regressors in this chunk:  [('PassiveAggressiveRegressor', <class 'sklearn.linear_model._passive_aggressive.PassiveAggressiveRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:12<00:00, 12.41s/it]


Processings the following regressors in this chunk:  [('PoissonRegressor', <class 'sklearn.linear_model._glm.glm.PoissonRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:13<00:00, 13.28s/it]


Processings the following regressors in this chunk:  [('RANSACRegressor', <class 'sklearn.linear_model._ransac.RANSACRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:32<00:00, 32.86s/it]


Processings the following regressors in this chunk:  [('RandomForestRegressor', <class 'sklearn.ensemble._forest.RandomForestRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [04:52<00:00, 292.67s/it]


Processings the following regressors in this chunk:  [('Ridge', <class 'sklearn.linear_model._ridge.Ridge'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:08<00:00,  8.82s/it]


Processings the following regressors in this chunk:  [('RidgeCV', <class 'sklearn.linear_model._ridge.RidgeCV'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:19<00:00, 19.11s/it]


Processings the following regressors in this chunk:  [('SGDRegressor', <class 'sklearn.linear_model._stochastic_gradient.SGDRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:10<00:00, 10.10s/it]


Processings the following regressors in this chunk:  [('TransformedTargetRegressor', <class 'sklearn.compose._target.TransformedTargetRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:13<00:00, 13.14s/it]


Processings the following regressors in this chunk:  [('TweedieRegressor', <class 'sklearn.linear_model._glm.glm.TweedieRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:08<00:00,  8.72s/it]


Processings the following regressors in this chunk:  [('XGBRegressor', <class 'xgboost.sklearn.XGBRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


100%|██████████| 1/1 [00:13<00:00, 13.01s/it]


Processings the following regressors in this chunk:  [('LGBMRegressor', <class 'lightgbm.sklearn.LGBMRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


  0%|          | 0/1 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.092728 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1566
[LightGBM] [Info] Number of data points in the train set: 304128, number of used features: 296
[LightGBM] [Info] Start training from score 168.161005


100%|██████████| 1/1 [00:11<00:00, 11.30s/it]


In [10]:
# build combined results
all_models = pd.DataFrame()
all_predictions = pd.DataFrame()

for chunk in lazy_predict_chunks:
    all_models = pd.concat([all_models, chunk.models])
    if predictions_flag:
        all_predictions = pd.concat([all_predictions, chunk.predictions])

In [11]:
output_file = OUTPUT_DIR + current_date + "-lazy_models.csv"
all_models.to_csv(output_file)
all_models

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AdaBoostRegressor,0.40,0.40,19.06,98.53
BaggingRegressor,0.74,0.74,12.54,47.58
BayesianRidge,0.86,0.86,9.27,17.8
DecisionTreeRegressor,0.74,0.74,12.6,12.39
DummyRegressor,-0.00,-0.00,24.54,7.48
ElasticNet,0.73,0.73,12.69,10.23
ElasticNetCV,0.85,0.85,9.42,44.39
ExtraTreeRegressor,0.97,0.97,3.94,13.62
ExtraTreesRegressor,0.98,0.98,3.41,539.32
GammaRegressor,0.74,0.74,12.55,8.75


In [12]:
if predictions_flag:
    output_file = OUTPUT_DIR + current_date + "-lazy_predictions.csv"
    all_predictions.to_csv(output_file)
    all_predictions

In [13]:
print("Predictions ran on X_train of shape: ", X_train.shape)

Predictions ran on X_train of shape:  (304128, 298)


In [14]:
all_models.columns

Index(['Adjusted R-Squared', 'R-Squared', 'RMSE', 'Time Taken'], dtype='object')

Models sorted by R-Squared

In [15]:
all_models.sort_values(by=['R-Squared'], ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreesRegressor,0.98,0.98,3.41,539.32
ExtraTreeRegressor,0.97,0.97,3.94,13.62
KNeighborsRegressor,0.97,0.97,4.1,1019.17
LGBMRegressor,0.96,0.96,4.95,11.3
MLPRegressor,0.96,0.96,4.95,1350.02
HistGradientBoostingRegressor,0.95,0.95,5.5,23.23
GradientBoostingRegressor,0.89,0.89,8.19,149.53
BayesianRidge,0.86,0.86,9.27,17.8
RidgeCV,0.86,0.86,9.28,19.11
Ridge,0.86,0.86,9.28,8.82


Models sorted by Time Taken

In [16]:
all_models.sort_values(by=['Time Taken'], ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MLPRegressor,0.96,0.96,4.95,1350.02
KNeighborsRegressor,0.97,0.97,4.1,1019.17
ExtraTreesRegressor,0.98,0.98,3.41,539.32
LinearSVR,0.83,0.83,10.04,301.83
RandomForestRegressor,0.74,0.74,12.53,292.67
GradientBoostingRegressor,0.89,0.89,8.19,149.53
LassoCV,0.86,0.86,9.29,112.62
HuberRegressor,0.84,0.84,9.86,102.53
AdaBoostRegressor,0.40,0.40,19.06,98.53
BaggingRegressor,0.74,0.74,12.54,47.58


In [17]:
joblib.dump(lazy_predict_chunks, f"{OUTPUT_DIR}models/{current_date}-lazy_predict_chunks.pkl")

['files/output/models/2025_03_13-lazy_predict_chunks.pkl']

In [49]:
model = joblib.load(f"{OUTPUT_DIR}models/2025_03_13-lazy-ExtraTreesRegressor.pkl")

In [50]:
y_pred = model.predict(X_test)
y_pred

array([169.        , 143.98      , 177.16      , ..., 165.91      ,
       163.28309028, 200.55      ])

In [52]:
from sklearn.metrics import mean_squared_error

mean_squared_error_reg = mean_squared_error(y_true=y_test, y_pred=y_pred)
mean_squared_error_reg

11.630973078074135

In [53]:
from sklearn.metrics import root_mean_squared_error

root_mean_squared_error_reg = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
root_mean_squared_error_reg

3.4104212464260386

In [54]:
from sklearn.metrics import r2_score

r2_score_reg = r2_score(y_true=y_test, y_pred=y_pred)
r2_score_reg

0.980678301456946