Goal of this notebook is to test wide range of potential models.

1. Lazypredict
-> choose promising models
2. GridSearch on selection of promising models
3. Summary of hot candidates including metric, high influencing variables, pca of cluster, etc.


Open todo's-> (maybe other notebook)
- hyperparameter optimization
- dimension reduction

In [2]:
import numpy as np
import pandas as pd
from config import MERGED_ELECTRIC_FILE, DENSITY_THRESHOLD, DATABASE_FILE_INDEX, DATABASE_FILE_DTYPES, REPLACE_STRING_OTHER, ELECTRIC_TARGET
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [3]:
# load dataset for electric cars
df = pd.read_csv(MERGED_ELECTRIC_FILE, dtype=DATABASE_FILE_DTYPES, index_col=DATABASE_FILE_INDEX)

# Preprocessing for electric dataset

(later added to 1_X)

In [4]:
df.columns

Index(['member_state', 'manufacturer_name_eu', 'vehicle_type',
       'commercial_name', 'category_of_vehicle', 'fuel_type', 'fuel_mode',
       'innovative_technologies', 'mass_vehicle', 'weltp_test_mass',
       'engine_capacity', 'engine_power', 'erwltp', 'year', 'electric_range',
       'electric_energy_consumption', 'fuel_consumption',
       'specific_co2_emissions'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3945297 entries, 56003781 to 134630842
Data columns (total 18 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   member_state                 object 
 1   manufacturer_name_eu         object 
 2   vehicle_type                 object 
 3   commercial_name              object 
 4   category_of_vehicle          object 
 5   fuel_type                    object 
 6   fuel_mode                    object 
 7   innovative_technologies      object 
 8   mass_vehicle                 float64
 9   weltp_test_mass              float64
 10  engine_capacity              float64
 11  engine_power                 float64
 12  erwltp                       float64
 13  year                         int64  
 14  electric_range               float64
 15  electric_energy_consumption  float64
 16  fuel_consumption             float64
 17  specific_co2_emissions       float64
dtypes: float64(9), int64(1), object(8)
mem

In [6]:
def categorize_categorical_quantitive_cols(df):
    cat_cols = pd.DataFrame.select_dtypes(df, include=["object"]).columns
    quant_cols = df.select_dtypes(exclude=["object"]).columns

    return cat_cols, quant_cols

In [7]:
cols_to_be_dropped = list()

cat_cols, quant_cols = categorize_categorical_quantitive_cols(df)

## Remove columns below density threshold

In [8]:
missing_percentage = df.isna().sum() / len(df)
print(missing_percentage)

member_state                   0.000000
manufacturer_name_eu           0.000000
vehicle_type                   0.000221
commercial_name                0.005351
category_of_vehicle            0.005234
fuel_type                      0.000000
fuel_mode                      0.000000
innovative_technologies        0.999567
mass_vehicle                   0.000016
weltp_test_mass                0.043012
engine_capacity                1.000000
engine_power                   0.069891
erwltp                         1.000000
year                           0.000000
electric_range                 0.115409
electric_energy_consumption    0.036422
fuel_consumption               0.999976
specific_co2_emissions         0.000003
dtype: float64


In [9]:
cols_to_be_dropped = list()

for col, percentage in missing_percentage.items():
    if percentage > DENSITY_THRESHOLD:
        cols_to_be_dropped.append(col)

print(f"Columns to be dropped due to availability density below threshold: {cols_to_be_dropped}")

Columns to be dropped due to availability density below threshold: ['innovative_technologies', 'engine_capacity', 'erwltp', 'fuel_consumption']


In [10]:
missing_percentage = df.isna().sum() / len(df)
print(missing_percentage)

member_state                   0.000000
manufacturer_name_eu           0.000000
vehicle_type                   0.000221
commercial_name                0.005351
category_of_vehicle            0.005234
fuel_type                      0.000000
fuel_mode                      0.000000
innovative_technologies        0.999567
mass_vehicle                   0.000016
weltp_test_mass                0.043012
engine_capacity                1.000000
engine_power                   0.069891
erwltp                         1.000000
year                           0.000000
electric_range                 0.115409
electric_energy_consumption    0.036422
fuel_consumption               0.999976
specific_co2_emissions         0.000003
dtype: float64


## Quantitative columns

- replace missing values with median of variable
- drop missing values if replacement is no option (e.g. for target variable)

In [11]:
df["specific_co2_emissions"].value_counts(normalize=True)
# -> all values are 0 -> no use for us
cols_to_be_dropped.append("specific_co2_emissions")

In [12]:
# electric_energy_consumption is our target and we should only keep rows with value
df.dropna(subset=["electric_energy_consumption"], inplace=True)

In [13]:
print("Dropping columns: ", cols_to_be_dropped)
df.drop(columns=cols_to_be_dropped, inplace=True)

# reinit
cols_to_be_dropped = list()

Dropping columns:  ['innovative_technologies', 'engine_capacity', 'erwltp', 'fuel_consumption', 'specific_co2_emissions']


In [14]:
# replace NaN with median
imputer = SimpleImputer(strategy='median')
cat_cols, quant_cols = categorize_categorical_quantitive_cols(df)
# make sure we don't accidentely manipulate target variable
quant_cols_replace = list(quant_cols)
quant_cols_replace.remove(ELECTRIC_TARGET)
# we don't want to replace anything in year column
quant_cols_replace.remove("year")

df[quant_cols] = imputer.fit_transform(df[quant_cols])

In [15]:
missing_percentage = df.isna().sum() / len(df)
print(missing_percentage)

member_state                   0.000000
manufacturer_name_eu           0.000000
vehicle_type                   0.000009
commercial_name                0.005487
category_of_vehicle            0.000000
fuel_type                      0.000000
fuel_mode                      0.000000
mass_vehicle                   0.000000
weltp_test_mass                0.000000
engine_power                   0.000000
year                           0.000000
electric_range                 0.000000
electric_energy_consumption    0.000000
dtype: float64


## Categorical Columns

### Analyze and preprocess columns based on value distribution & uniqueness

In [16]:
# analyze uniqueness
for col in cat_cols:
    len_unique = len(df[col].unique())
    print(col, " unique vals: ", len_unique)
    if len_unique == 1:
        # we don't need cols with only one value -> drop
        cols_to_be_dropped.append(col)


member_state  unique vals:  29
manufacturer_name_eu  unique vals:  72
vehicle_type  unique vals:  286
commercial_name  unique vals:  1452
category_of_vehicle  unique vals:  4
fuel_type  unique vals:  1
fuel_mode  unique vals:  1


In [17]:
print("Dropping columns: ", cols_to_be_dropped)
df.drop(columns=cols_to_be_dropped, inplace=True)

Dropping columns:  ['fuel_type', 'fuel_mode']


- Reduce number of unique values through adding a "other" value/class representing all values below a certain threshold.
- Additionally we'll replace Na values with REPLACE_STRING_OTHER

In [18]:
# replace Na values for cat_cols with REPLACE_STRING_OTHER

cat_cols, quant_cols = categorize_categorical_quantitive_cols(df)
df[cat_cols] = df[cat_cols].fillna(value=REPLACE_STRING_OTHER)

In [19]:
def reduce_unique_col_vals_through_other(df, col, threshold=0.01):
    # check if we can reduce the number of unique values in a column
    # by grouping the values that have a frequency of less than threshold
    # into a new category
    # returns the modified dataframe and the new unique values
    value_counts = df[col].value_counts(normalize=True)
    other_vals = value_counts[value_counts < threshold].index
    df[col] = df[col].apply(lambda x: REPLACE_STRING_OTHER if x in other_vals else x)
    return df

In [20]:
cat_cols, quant_cols = categorize_categorical_quantitive_cols(df)

missing_percentage = df[cat_cols].isna().sum() / len(df)
print(missing_percentage)

member_state            0.0
manufacturer_name_eu    0.0
vehicle_type            0.0
commercial_name         0.0
category_of_vehicle     0.0
dtype: float64


In [21]:
# Preprocess manufacturer_name_eu to reduce number of unique values for future encoding
reduce_unique_col_vals_through_other(df, "manufacturer_name_eu", threshold=0.01)
print("Reduced unique vals of manufacturer_name_eu to: ", len(df["manufacturer_name_eu"].unique()))
df["manufacturer_name_eu"].value_counts(normalize=True)

Reduced unique vals of manufacturer_name_eu to:  22


manufacturer_name_eu
TESLA                       0.141083
VOLKSWAGEN                  0.115710
BMW AG                      0.077433
STELLANTIS AUTO             0.073975
RENAULT                     0.066638
MERCEDES-BENZ AG            0.064153
AUDI AG                     0.045662
other                       0.045388
KIA                         0.042644
SKODA                       0.038538
DACIA                       0.035599
STELLANTIS EUROPE           0.035227
SAIC MOTOR CORPORATION      0.033021
VOLVO                       0.031792
PSA                         0.028290
HYUNDAI                     0.028079
HYUNDAI CZECH               0.023124
SEAT                        0.018831
NISSAN AUTOMOTIVE EUROPE    0.018339
FORD WERKE GMBH             0.013361
POLESTAR                    0.012542
FIAT GROUP                  0.010568
Name: proportion, dtype: float64

In [22]:
# Preprocess vehicle_type to reduce number of unique values for future encoding
reduce_unique_col_vals_through_other(df, "vehicle_type", threshold=0.0025)
print("Reduced unique vals of vehicle_type to: ", len(df["vehicle_type"].unique()))
df["vehicle_type"].value_counts(normalize=True)

Reduced unique vals of vehicle_type to:  58


vehicle_type
003        0.095796
U          0.086123
E2         0.056271
FA1        0.042240
E1         0.038021
NY         0.037694
3          0.036661
DBG        0.035602
other      0.031466
X          0.028982
AG         0.025825
FZ         0.025174
F2B        0.024680
OSE        0.021735
FML2E      0.020535
AA         0.019769
NE         0.019515
V          0.019071
451        0.018996
RCB        0.018478
GE         0.017625
K1         0.016716
CV         0.015480
ZE1        0.015161
G4C        0.014753
SEH3       0.014491
AH         0.013661
LSK        0.013359
DE         0.013081
G3XE       0.012079
B          0.011768
SG2        0.010208
ZS1        0.009953
U1X        0.009769
BMWi-1     0.009682
Y1A        0.009359
204 X      0.007333
BMWi-N     0.005759
DR         0.005653
E2EQEW     0.005594
EAM1(M)    0.004966
E          0.004851
005        0.004815
AH2        0.004644
EP21       0.004161
EB         0.003908
EP22-L     0.003801
AG0        0.003734
SC2E       0.003720
FH1    

In [23]:
# Preprocess commercial_name to reduce number of unique values for future encoding
reduce_unique_col_vals_through_other(df, "commercial_name", threshold=0.0005)
print("Reduced unique vals of commercial_name to: ", len(df["commercial_name"].unique()))
df["commercial_name"].value_counts(normalize=True)

Reduced unique vals of commercial_name to:  181


commercial_name
MODEL Y         0.071774
MODEL 3         0.055174
other           0.051414
500             0.041274
SPRING          0.035602
                  ...   
E-TRON S        0.000533
Q4 E-TRON       0.000525
iX1 xDrive30    0.000524
600             0.000519
e-tron 55       0.000515
Name: proportion, Length: 181, dtype: float64

In [24]:
# encoding
cat_cols, quant_cols = categorize_categorical_quantitive_cols(df)

ct_electric = ColumnTransformer(transformers=[("encoder", OneHotEncoder(sparse_output=False), cat_cols)], remainder="passthrough")
transformed_array = ct_electric.fit_transform(df)
encoder_feature_names = ct_electric.named_transformers_["encoder"].get_feature_names_out(cat_cols)

preserved_col_names = list(encoder_feature_names)
preserved_col_names.extend(list(quant_cols))

df_enc = pd.DataFrame(transformed_array, columns=preserved_col_names)

In [25]:
df_enc.head(2)

Unnamed: 0,member_state_AT,member_state_BE,member_state_BG,member_state_CY,member_state_CZ,member_state_DE,member_state_DK,member_state_EE,member_state_ES,member_state_FI,...,category_of_vehicle_M1,category_of_vehicle_M1.1,category_of_vehicle_M1G,category_of_vehicle_N1,mass_vehicle,weltp_test_mass,engine_power,year,electric_range,electric_energy_consumption
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1860.0,1940.0,150.0,2021.0,313.0,169.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1860.0,1940.0,150.0,2021.0,313.0,169.0


TODO consider doing PCA here :)

## Split dataset
- features, target
- train, test

In [31]:
X = df_enc.drop(columns = ELECTRIC_TARGET)
y = df_enc[ELECTRIC_TARGET]

In [32]:
# we're using higher test_size here to reduce time for compute due to X_train size
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

## Feature Scaling

In [33]:
# encode all variables that are scalable (will include encoded cat_cols)
cat_cols, quant_cols = categorize_categorical_quantitive_cols(X)

sc = StandardScaler()
X_train[quant_cols] = sc.fit_transform(X_train[quant_cols])
X_test[quant_cols] = sc.fit_transform(X_test[quant_cols])

# we're also scaling encoded columns, consider only encoding quant_cols
# X_train.iloc[:, -5:]

In [34]:
X_train.head(2)

Unnamed: 0,member_state_AT,member_state_BE,member_state_BG,member_state_CY,member_state_CZ,member_state_DE,member_state_DK,member_state_EE,member_state_ES,member_state_FI,...,commercial_name_other,category_of_vehicle_M1,category_of_vehicle_M1.1,category_of_vehicle_M1G,category_of_vehicle_N1,mass_vehicle,weltp_test_mass,engine_power,year,electric_range
3464753,-0.17657,-0.187687,-0.031708,-0.015231,-0.058773,1.358056,-0.180603,-0.026386,-0.173682,-0.119944,...,4.292588,0.014763,-0.001147,-0.014387,-0.003106,-0.275138,-0.259511,0.065535,1.024148,0.033785
254882,-0.17657,-0.187687,-0.031708,-0.015231,-0.058773,1.358056,-0.180603,-0.026386,-0.173682,-0.119944,...,-0.23296,0.014763,-0.001147,-0.014387,-0.003106,1.665811,1.680289,0.16273,-1.482669,0.146162


# Lazy Predict

We'll use LazyRegressor as we're dealing with a supervised learning regression problem and want to check potential models for our usecase

In [39]:
from lazypredict.Supervised import LazyRegressor, REGRESSORS
from copy import deepcopy

Select models from LazyRegressor as processing failed for some in previous sessions.

In [None]:
# explicit selection through mannual list (based on lazypredict.Supervised.REGRESSION)

# import sklearn
# import xgboost
# import lightgbm

# GaussianProcessRegressor model failed to execute
# KernelRidge model failed to execute
# LazyRegressor stuck at 62% (26/42) -> 25 MLPRegressor || 26 NuSVR?

# model_selection = [
#     ('AdaBoostRegressor', sklearn.ensemble._weight_boosting.AdaBoostRegressor),
#     ('BaggingRegressor', sklearn.ensemble._bagging.BaggingRegressor),
#     ('BayesianRidge', sklearn.linear_model._bayes.BayesianRidge),
#     ('DecisionTreeRegressor', sklearn.tree._classes.DecisionTreeRegressor),
#     ('DummyRegressor', sklearn.dummy.DummyRegressor),
#     ('ElasticNet', sklearn.linear_model._coordinate_descent.ElasticNet),
#     ('ElasticNetCV', sklearn.linear_model._coordinate_descent.ElasticNetCV),
#     ('ExtraTreeRegressor', sklearn.tree._classes.ExtraTreeRegressor),
#     ('ExtraTreesRegressor', sklearn.ensemble._forest.ExtraTreesRegressor),
#     ('GammaRegressor', sklearn.linear_model._glm.glm.GammaRegressor),
#     ('GaussianProcessRegressor', sklearn.gaussian_process._gpr.GaussianProcessRegressor),
#     ('GradientBoostingRegressor', sklearn.ensemble._gb.GradientBoostingRegressor),
#     ('HistGradientBoostingRegressor', sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingRegressor),
#     ('HuberRegressor', sklearn.linear_model._huber.HuberRegressor),
#     ('KNeighborsRegressor', sklearn.neighbors._regression.KNeighborsRegressor),
#     ('KernelRidge', sklearn.kernel_ridge.KernelRidge),
#     ('Lars', sklearn.linear_model._least_angle.Lars),
#     ('LarsCV', sklearn.linear_model._least_angle.LarsCV),
#     ('Lasso', sklearn.linear_model._coordinate_descent.Lasso),
#     ('LassoCV', sklearn.linear_model._coordinate_descent.LassoCV),
#     ('LassoLars', sklearn.linear_model._least_angle.LassoLars),
#     ('LassoLarsCV', sklearn.linear_model._least_angle.LassoLarsCV),
#     ('LassoLarsIC', sklearn.linear_model._least_angle.LassoLarsIC),
#     ('LinearRegression', sklearn.linear_model._base.LinearRegression),
#     ('LinearSVR', sklearn.svm._classes.LinearSVR),
#     ('MLPRegressor', sklearn.neural_network._multilayer_perceptron.MLPRegressor),
#     ('NuSVR', sklearn.svm._classes.NuSVR),
#     ('OrthogonalMatchingPursuit', sklearn.linear_model._omp.OrthogonalMatchingPursuit),
#     ('OrthogonalMatchingPursuitCV', sklearn.linear_model._omp.OrthogonalMatchingPursuitCV),
#     ('PassiveAggressiveRegressor', sklearn.linear_model._passive_aggressive.PassiveAggressiveRegressor),
#     ('PoissonRegressor', sklearn.linear_model._glm.glm.PoissonRegressor),
#     ('QuantileRegressor', sklearn.linear_model._quantile.QuantileRegressor),
#     ('RANSACRegressor', sklearn.linear_model._ransac.RANSACRegressor),
#     ('RandomForestRegressor', sklearn.ensemble._forest.RandomForestRegressor),
#     ('Ridge', sklearn.linear_model._ridge.Ridge),
#     ('RidgeCV', sklearn.linear_model._ridge.RidgeCV),
#     ('SGDRegressor', sklearn.linear_model._stochastic_gradient.SGDRegressor),
#     ('SVR', sklearn.svm._classes.SVR),
#     ('TransformedTargetRegressor', sklearn.compose._target.TransformedTargetRegressor),
#     ('TweedieRegressor', sklearn.linear_model._glm.glm.TweedieRegressor),
#     ('XGBRegressor', xgboost.sklearn.XGBRegressor),
#     ('LGBMRegressor', lightgbm.sklearn.LGBMRegressor)
# ]

# selection through ignore
model_selection = REGRESSORS
"""
----
Removed due to failed executions:
----
GaussianProcessRegressor model failed to execute
KernelRidge model failed to execute
LazyRegressor stuck at 62% (26/42) -> 25 MLPRegressor || 26 NuSVR?

----
Removed due to high compute time:
----
LassoCV
QuantileRegressor
RANSACRegressor
"""
models_to_ignore = list([
    "GaussianProcessRegressor",
     "KernelRidge",
     "LassoCV",
     "NuSVR",
     "MLPRegressor",
     "QuantileRegressor",
     "RANSACRegressor",
])

Use chunk approach

In [37]:
class LazyPredictChunk:
    def __init__(self, models=None, predictions=None, regressors="all"):
        self.models = models
        self.predictions = predictions
        self.regressors = regressors

In [40]:
chunk_size = 10
lazy_predict_chunks = []

# split regressors into chunks
i = 0
chunk_buffer = []
for model, model_class in model_selection:
    # ignore models that are in ignore list
    if model in models_to_ignore:
        i+=1
        continue

    # append if chunk_buffer below size limit
    if len(chunk_buffer) < chunk_size:
        chunk_buffer.append((model, model_class))
    
    # chunk size reached or last iteration step reached
    if len(chunk_buffer) == chunk_size or (i+1) == len(REGRESSORS):
        lazy_predict_chunks.append(LazyPredictChunk(regressors=deepcopy(chunk_buffer)))
        # empty chunk_buffer
        chunk_buffer = []

    i+=1

In [None]:
print("Executing LazyRegressor through: ", len(lazy_predict_chunks), "chunks")

for chunk in lazy_predict_chunks:
    print("Processings the following regressors in this chunk: ", chunk.regressors)
    reg = LazyRegressor(verbose=0,ignore_warnings=False, custom_metric=None, regressors=chunk.regressors)
    models,predictions = reg.fit(X_train, X_test, y_train, y_test)
    # models,predictions = reg.fit(X_train[:500], X_test[:500], y_train[:500], y_test[:500])
    
    # store results into LazyPredictChunk variables
    chunk.models = models
    chunk.predictions = predictions

Executing LazyRegressor through:  4 chunks
Processings the following regressors in this chunk:  [('AdaBoostRegressor', <class 'sklearn.ensemble._weight_boosting.AdaBoostRegressor'>), ('BaggingRegressor', <class 'sklearn.ensemble._bagging.BaggingRegressor'>), ('BayesianRidge', <class 'sklearn.linear_model._bayes.BayesianRidge'>), ('DecisionTreeRegressor', <class 'sklearn.tree._classes.DecisionTreeRegressor'>), ('DummyRegressor', <class 'sklearn.dummy.DummyRegressor'>), ('ElasticNet', <class 'sklearn.linear_model._coordinate_descent.ElasticNet'>), ('ElasticNetCV', <class 'sklearn.linear_model._coordinate_descent.ElasticNetCV'>), ('ExtraTreeRegressor', <class 'sklearn.tree._classes.ExtraTreeRegressor'>), ('ExtraTreesRegressor', <class 'sklearn.ensemble._forest.ExtraTreesRegressor'>), ('GammaRegressor', <class 'sklearn.linear_model._glm.glm.GammaRegressor'>)]
'tuple' object has no attribute '__name__'
Invalid Regressor(s)


  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# build combined results
all_models = pd.DataFrame()
all_predictions = pd.DataFrame()

for chunk in lazy_predict_chunks:
    all_models = pd.concat([all_models, chunk.models])
    all_predictions = pd.concat([all_predictions, chunk.predictions])

In [None]:
all_models

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreesRegressor,0.57,0.83,9.8,0.53
BaggingRegressor,0.49,0.80,10.7,0.1
DecisionTreeRegressor,0.43,0.77,11.3,0.04
ElasticNetCV,0.40,0.76,11.62,0.35
BayesianRidge,0.36,0.74,11.97,0.09
ExtraTreeRegressor,0.27,0.71,12.78,0.05
ElasticNet,0.19,0.68,13.47,0.04
GammaRegressor,0.17,0.67,13.63,0.2
AdaBoostRegressor,0.11,0.64,14.09,0.26
DummyRegressor,-1.51,-0.00,23.68,0.04


In [None]:
all_predictions

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreesRegressor,0.57,0.83,9.8,0.53
BaggingRegressor,0.49,0.80,10.7,0.1
DecisionTreeRegressor,0.43,0.77,11.3,0.04
ElasticNetCV,0.40,0.76,11.62,0.35
BayesianRidge,0.36,0.74,11.97,0.09
ExtraTreeRegressor,0.27,0.71,12.78,0.05
ElasticNet,0.19,0.68,13.47,0.04
GammaRegressor,0.17,0.67,13.63,0.2
AdaBoostRegressor,0.11,0.64,14.09,0.26
DummyRegressor,-1.51,-0.00,23.68,0.04
