The goal of this notebook is to test a wide range of potential models.

1. Lazypredict
-> choose promising models
2. GridSearch on selection of promising models
3. Summary of hot candidates including metric, high influencing variables, pca of cluster, etc.


Open TODOs (possibly for another notebook):
- hyperparameter optimization
- dimension reduction

In [1]:
from lazypredict.Supervised import LazyRegressor
import numpy as np
import pandas as pd
from config import MERGED_ELECTRIC_FILE, DENSITY_THRESHOLD, DATABASE_FILE_INDEX, DATABASE_FILE_DTYPES, REPLACE_STRING_OTHER, ELECTRIC_TARGET
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the merged electric-car dataset; file path, dtypes and index column come from config.
df = pd.read_csv(
    MERGED_ELECTRIC_FILE,
    index_col=DATABASE_FILE_INDEX,
    dtype=DATABASE_FILE_DTYPES,
)

# Preprocessing for electric dataset

(later added to 1_X)

In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['member_state', 'manufacturer_name_eu', 'vehicle_type',
       'commercial_name', 'category_of_vehicle', 'fuel_type', 'fuel_mode',
       'innovative_technologies', 'mass_vehicle', 'weltp_test_mass',
       'engine_capacity', 'engine_power', 'erwltp', 'year', 'electric_range',
       'electric_energy_consumption', 'fuel_consumption',
       'specific_co2_emissions'],
      dtype='object')

In [4]:
# Print the per-column dtype summary of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3945297 entries, 56003781 to 134630842
Data columns (total 18 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   member_state                 object 
 1   manufacturer_name_eu         object 
 2   vehicle_type                 object 
 3   commercial_name              object 
 4   category_of_vehicle          object 
 5   fuel_type                    object 
 6   fuel_mode                    object 
 7   innovative_technologies      object 
 8   mass_vehicle                 float64
 9   weltp_test_mass              float64
 10  engine_capacity              float64
 11  engine_power                 float64
 12  erwltp                       float64
 13  year                         int64  
 14  electric_range               float64
 15  electric_energy_consumption  float64
 16  fuel_consumption             float64
 17  specific_co2_emissions       float64
dtypes: float64(9), int64(1), object(8)
mem

In [5]:
def categorize_categorical_quantitive_cols(df):
    """Split the columns of ``df`` by dtype.

    Returns a ``(cat_cols, quant_cols)`` tuple of column indexes: the first
    holds the ``object``-dtype columns, the second every other column.
    """
    object_part = df.select_dtypes(include=["object"])
    non_object_part = df.select_dtypes(exclude=["object"])
    return object_part.columns, non_object_part.columns

In [6]:
# Split the columns by dtype and start with an empty list of columns to drop.
cat_cols, quant_cols = categorize_categorical_quantitive_cols(df)

cols_to_be_dropped = []

## Remove columns below density threshold

In [7]:
# Share of missing values per column (0 = complete, 1 = entirely missing).
missing_percentage = df.isna().sum().div(len(df))
print(missing_percentage)

member_state                  0.00
manufacturer_name_eu          0.00
vehicle_type                  0.00
commercial_name               0.01
category_of_vehicle           0.01
fuel_type                     0.00
fuel_mode                     0.00
innovative_technologies       1.00
mass_vehicle                  0.00
weltp_test_mass               0.04
engine_capacity               1.00
engine_power                  0.07
erwltp                        1.00
year                          0.00
electric_range                0.12
electric_energy_consumption   0.04
fuel_consumption              1.00
specific_co2_emissions        0.00
dtype: float64


In [8]:
# Collect every column whose missing share exceeds DENSITY_THRESHOLD.
cols_to_be_dropped = [
    col_name
    for col_name, missing_share in missing_percentage.items()
    if missing_share > DENSITY_THRESHOLD
]

print(f"Columns to be dropped due to availability density below threshold: {cols_to_be_dropped}")

Columns to be dropped due to availability density below threshold: ['innovative_technologies', 'engine_capacity', 'erwltp', 'fuel_consumption']


In [9]:
# Missing share per column, recomputed; df has not been modified since the previous computation.
row_count = len(df)
missing_percentage = df.isna().sum() / row_count
print(missing_percentage)

member_state                  0.00
manufacturer_name_eu          0.00
vehicle_type                  0.00
commercial_name               0.01
category_of_vehicle           0.01
fuel_type                     0.00
fuel_mode                     0.00
innovative_technologies       1.00
mass_vehicle                  0.00
weltp_test_mass               0.04
engine_capacity               1.00
engine_power                  0.07
erwltp                        1.00
year                          0.00
electric_range                0.12
electric_energy_consumption   0.04
fuel_consumption              1.00
specific_co2_emissions        0.00
dtype: float64


## Quantitative columns

- replace missing values with the median of the variable
- drop rows with missing values if replacement is not an option (e.g. for the target variable)

In [10]:
# Value distribution of specific_co2_emissions; not displayed, because it is
# not the last expression of the cell.
df["specific_co2_emissions"].value_counts(normalize=True)
# -> all values are 0 -> no use for us
cols_to_be_dropped.extend(["specific_co2_emissions"])

In [11]:
# electric_energy_consumption is the target: keep only rows where it is present.
df = df.dropna(subset=["electric_energy_consumption"])

In [12]:
# Remove every column collected so far.
print("Dropping columns: ", cols_to_be_dropped)
df = df.drop(columns=cols_to_be_dropped)

# Reset the drop list so later cells start from an empty list.
cols_to_be_dropped = []

Dropping columns:  ['innovative_technologies', 'engine_capacity', 'erwltp', 'fuel_consumption', 'specific_co2_emissions']


In [13]:
# Replace NaN with the column median.
imputer = SimpleImputer(strategy='median')
cat_cols, quant_cols = categorize_categorical_quantitive_cols(df)
# Make sure we don't accidentally manipulate the target variable ...
quant_cols_replace = list(quant_cols)
quant_cols_replace.remove(ELECTRIC_TARGET)
# ... and we don't want to replace anything in the year column either.
quant_cols_replace.remove("year")

# Impute only the selected feature columns. Passing all of quant_cols would
# also send the target and year through the imputer and turn the integer
# year column into floats.
df[quant_cols_replace] = imputer.fit_transform(df[quant_cols_replace])

In [14]:
# Re-check the missing share per column after the imputation step.
missing_percentage = df.isna().sum().div(len(df))
print(missing_percentage)

member_state                  0.00
manufacturer_name_eu          0.00
vehicle_type                  0.00
commercial_name               0.01
category_of_vehicle           0.00
fuel_type                     0.00
fuel_mode                     0.00
mass_vehicle                  0.00
weltp_test_mass               0.00
engine_power                  0.00
year                          0.00
electric_range                0.00
electric_energy_consumption   0.00
dtype: float64


## Categorical Columns

### Analyze and preprocess columns based on value distribution & uniqueness

In [15]:
# Report the number of distinct values per categorical column and mark
# columns holding a single value for removal.
for col in cat_cols:
    len_unique = df[col].nunique(dropna=False)
    print(col, " unique vals: ", len_unique)
    if len_unique != 1:
        continue
    # we don't need cols with only one value -> drop
    cols_to_be_dropped.append(col)


member_state  unique vals:  29
manufacturer_name_eu  unique vals:  72
vehicle_type  unique vals:  286
commercial_name  unique vals:  1452
category_of_vehicle  unique vals:  4
fuel_type  unique vals:  1
fuel_mode  unique vals:  1


In [16]:
# Remove the single-valued columns collected by the uniqueness check.
print("Dropping columns: ", cols_to_be_dropped)
df = df.drop(columns=cols_to_be_dropped)

Dropping columns:  ['fuel_type', 'fuel_mode']


- Reduce the number of unique values by adding an "other" class that represents all values whose frequency is below a certain threshold.
- Additionally, we'll replace NaN values with REPLACE_STRING_OTHER

In [17]:
# Fill missing values in every categorical column with REPLACE_STRING_OTHER.
cat_cols, quant_cols = categorize_categorical_quantitive_cols(df)

df[cat_cols] = df[cat_cols].fillna(REPLACE_STRING_OTHER)

In [18]:
def reduce_unique_col_vals_through_other(df, col, threshold=0.01, other_label=None):
    """Collapse rare values of ``df[col]`` into a single "other" category.

    Every value whose relative frequency in the column is strictly below
    ``threshold`` is replaced by ``other_label``. Missing values are not
    counted by ``value_counts`` and are therefore left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the column is overwritten on this object.
    col : str
        Name of the column to reduce.
    threshold : float, default 0.01
        Minimum relative frequency a value needs in order to be kept.
    other_label : optional
        Replacement for rare values; defaults to ``REPLACE_STRING_OTHER``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    if other_label is None:
        other_label = REPLACE_STRING_OTHER
    value_counts = df[col].value_counts(normalize=True)
    other_vals = value_counts[value_counts < threshold].index
    # Vectorised membership test instead of a Python-level lambda per row.
    df[col] = df[col].mask(df[col].isin(other_vals), other_label)
    return df

In [19]:
# Confirm that the categorical columns have no missing values left.
cat_cols, quant_cols = categorize_categorical_quantitive_cols(df)

total_rows = len(df)
missing_percentage = df[cat_cols].isna().sum() / total_rows
print(missing_percentage)

member_state           0.00
manufacturer_name_eu   0.00
vehicle_type           0.00
commercial_name        0.00
category_of_vehicle    0.00
dtype: float64


In [20]:
# Group manufacturers with a share below 1 % under REPLACE_STRING_OTHER, then
# report the remaining number of distinct values and their distribution.
reduce_unique_col_vals_through_other(df, "manufacturer_name_eu", threshold=0.01)
print("Reduced unique vals of manufacturer_name_eu to: ", df["manufacturer_name_eu"].nunique(dropna=False))
df["manufacturer_name_eu"].value_counts(normalize=True)

Reduced unique vals of manufacturer_name_eu to:  22


manufacturer_name_eu
TESLA                      0.14
VOLKSWAGEN                 0.12
BMW AG                     0.08
STELLANTIS AUTO            0.07
RENAULT                    0.07
MERCEDES-BENZ AG           0.06
AUDI AG                    0.05
other                      0.05
KIA                        0.04
SKODA                      0.04
DACIA                      0.04
STELLANTIS EUROPE          0.04
SAIC MOTOR CORPORATION     0.03
VOLVO                      0.03
PSA                        0.03
HYUNDAI                    0.03
HYUNDAI CZECH              0.02
SEAT                       0.02
NISSAN AUTOMOTIVE EUROPE   0.02
FORD WERKE GMBH            0.01
POLESTAR                   0.01
FIAT GROUP                 0.01
Name: proportion, dtype: float64

In [21]:
# Group vehicle types with a share below 0.25 % under REPLACE_STRING_OTHER, then
# report the remaining number of distinct values and their distribution.
reduce_unique_col_vals_through_other(df, "vehicle_type", threshold=0.0025)
print("Reduced unique vals of vehicle_type to: ", df["vehicle_type"].nunique(dropna=False))
df["vehicle_type"].value_counts(normalize=True)

Reduced unique vals of vehicle_type to:  58


vehicle_type
003       0.10
U         0.09
E2        0.06
FA1       0.04
E1        0.04
NY        0.04
3         0.04
DBG       0.04
other     0.03
X         0.03
AG        0.03
FZ        0.03
F2B       0.02
OSE       0.02
FML2E     0.02
AA        0.02
NE        0.02
V         0.02
451       0.02
RCB       0.02
GE        0.02
K1        0.02
CV        0.02
ZE1       0.02
G4C       0.01
SEH3      0.01
AH        0.01
LSK       0.01
DE        0.01
G3XE      0.01
B         0.01
SG2       0.01
ZS1       0.01
U1X       0.01
BMWi-1    0.01
Y1A       0.01
204 X     0.01
BMWi-N    0.01
DR        0.01
E2EQEW    0.01
EAM1(M)   0.00
E         0.00
005       0.00
AH2       0.00
EP21      0.00
EB        0.00
EP22-L    0.00
AG0       0.00
SC2E      0.00
FH1       0.00
BMWI-N    0.00
SK3       0.00
639/2     0.00
FW        0.00
CE        0.00
FE0E      0.00
OS        0.00
HX11      0.00
Name: proportion, dtype: float64

In [22]:
# Group commercial names with a share below 0.05 % under REPLACE_STRING_OTHER, then
# report the remaining number of distinct values and their distribution.
reduce_unique_col_vals_through_other(df, "commercial_name", threshold=0.0005)
print("Reduced unique vals of commercial_name to: ", df["commercial_name"].nunique(dropna=False))
df["commercial_name"].value_counts(normalize=True)

Reduced unique vals of commercial_name to:  181


commercial_name
MODEL Y        0.07
MODEL 3        0.06
other          0.05
500            0.04
SPRING         0.04
               ... 
E-TRON S       0.00
Q4 E-TRON      0.00
iX1 xDrive30   0.00
600            0.00
e-tron 55      0.00
Name: proportion, Length: 181, dtype: float64

In [23]:
# One-hot encode the categorical columns; all remaining columns pass through unchanged.
cat_cols, quant_cols = categorize_categorical_quantitive_cols(df)

ct_electric = ColumnTransformer(transformers=[("encoder", OneHotEncoder(sparse_output=False), cat_cols)], remainder="passthrough")
transformed_array = ct_electric.fit_transform(df)
encoder_feature_names = ct_electric.named_transformers_["encoder"].get_feature_names_out(cat_cols)

# The encoded columns come first in the output, followed by the passthrough columns.
preserved_col_names = list(encoder_feature_names)
preserved_col_names.extend(list(quant_cols))

# Keep the original row index so the encoded frame stays aligned with df
# (without it the frame would get a fresh 0..n-1 index).
df_enc = pd.DataFrame(transformed_array, columns=preserved_col_names, index=df.index)
# NOTE(review): df_enc is not referenced by the later cells visible here; X is built from df.

## Split dataset
- features, target
- train, test

In [24]:
# Target vector and feature frame (every column of df except the target).
y = df[ELECTRIC_TARGET]
X = df.drop(columns=ELECTRIC_TARGET)

In [25]:
# Hold out 20 % of the rows for testing; the fixed seed makes the split
# identical on every run, so results are comparable between executions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Feature Scaling

In [26]:
# Derive the categorical / quantitative column split for the feature frame X.
# NOTE(review): the following cell starts with the same statement.
cat_cols, quant_cols = categorize_categorical_quantitive_cols(X)


In [27]:
cat_cols, quant_cols = categorize_categorical_quantitive_cols(X)

# Work on explicit copies so the column assignments below do not trigger
# pandas' chained-assignment warning on the frames returned by the split.
X_train = X_train.copy()
X_test = X_test.copy()

sc = StandardScaler()
# Learn mean and standard deviation from the training data only ...
X_train[quant_cols] = sc.fit_transform(X_train[quant_cols])
# ... and apply those same statistics to the test data. Refitting on the test
# set would scale both sets differently and leak test information.
X_test[quant_cols] = sc.transform(X_test[quant_cols])

# Lazy Predict

We'll use LazyRegressor because we're dealing with a supervised regression problem and want to screen potential models for our use case.

In [None]:
# Fit LazyRegressor on the train/test split and display the returned `models` object.
reg = LazyRegressor(
    verbose=0,
    ignore_warnings=False,
    custom_metric=None,
)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

models

 19%|█▉        | 8/42 [06:03<12:15, 21.64s/it]   