# 0 Daten & Packages laden

In [1]:
import pandas as pd
import os
import gc
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

In [2]:
X_train = pd.read_parquet("/kaggle/input/axa-challenge-final/X_train_hyperparemeter_opt.parquet")
y_train = pd.read_parquet("/kaggle/input/axa-challenge-final/y_train_hyperparemeter_opt.parquet")

In [3]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 877416 entries, 3251228 to 8325804
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   tripduration          877416 non-null  int32  
 1   start station id      877416 non-null  float64
 2   end station id        877416 non-null  float64
 3   gender                877416 non-null  object 
 4   age                   877416 non-null  int32  
 5   month                 877416 non-null  int64  
 6   weekday               877416 non-null  object 
 7   time_hours            877416 non-null  int64  
 8   distance_travelled_m  877416 non-null  float32
 9   same_start_end        877416 non-null  int64  
 10  is_holiday            877416 non-null  int8   
dtypes: float32(1), float64(2), int32(2), int64(3), int8(1), object(2)
memory usage: 64.4+ MB


Datentransformationen sind notwendig weil einige Datenformate durch Parquet verloren gegangen sind

In [4]:
X_train["start station id"] = X_train["start station id"].astype("object")
X_train["end station id"] = X_train["end station id"].astype("object")
X_train["month"] = X_train["month"].astype("object")
X_train["time_hours"] = X_train["time_hours"].astype("object")

X_train["same_start_end"] = X_train["time_hours"].astype("int8")

# 6 Hyperparameter Optimierung verschiedener Modellansätze

### 6.0 Benchmark (Elastic Net nur mit Zeitvariablen)

In [5]:
%%capture
! pip install feature_engine

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import quantile_transform
from sklearn.preprocessing import FunctionTransformer
from feature_engine.encoding import RareLabelEncoder
from feature_engine.outliers import Winsorizer
from feature_engine.encoding import MeanEncoder
from sklearn.impute import SimpleImputer
from feature_engine.imputation import CategoricalImputer
from sklearn.preprocessing import KBinsDiscretizer

In [7]:
#Import der Pipelines und Definition der log_trans Funktion
import joblib

def log_trans(x):
    return np.log(x+0.01)

preprocessor_benchmark = joblib.load("/kaggle/input/axa-challenge-final/preprocessor_benchmark.joblib")


Durchführung von RandomizedSearchCV

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

Pipeline_benchmark = Pipeline([("preprocessor_pipeline", preprocessor_benchmark), 
                               ("scaling", StandardScaler(with_mean=True) ),
                            ('lasso', LogisticRegression(random_state=1, penalty="elasticnet", solver="saga"))])

#param_grid={'lasso__C': [1, 0.5, 0.1, 0.01]}
param_grid = dict(
    lasso__C=uniform(0.00001, 1),
    lasso__l1_ratio=uniform(0,1)
    )

np.random.seed(seed=1)
search = RandomizedSearchCV(Pipeline_benchmark, param_grid, n_jobs=4, cv=4, verbose=4, scoring=["roc_auc", "accuracy"], refit=False, return_train_score=True, n_iter=50)
search.fit(X_train, np.ravel(y_train))
gc.collect()

results_benchmark = pd.DataFrame(search.cv_results_)

del search
gc.collect()

Fitting 4 folds for each of 50 candidates, totalling 200 fits
[CV 1/4] END lasso__C=0.417032004702574, lasso__l1_ratio=0.7203244934421581; accuracy: (train=0.890, test=0.890) roc_auc: (train=0.720, test=0.716) total time=  25.3s
[CV 2/4] END lasso__C=0.00012437481734488664, lasso__l1_ratio=0.30233257263183977; accuracy: (train=0.890, test=0.890) roc_auc: (train=0.711, test=0.711) total time=  20.5s
[CV 1/4] END lasso__C=0.14676589081711305, lasso__l1_ratio=0.0923385947687978; accuracy: (train=0.890, test=0.890) roc_auc: (train=0.720, test=0.716) total time=  24.6s
[CV 2/4] END lasso__C=0.1862702113776709, lasso__l1_ratio=0.34556072704304774; accuracy: (train=0.890, test=0.890) roc_auc: (train=0.718, test=0.720) total time=  21.8s
[CV 1/4] END lasso__C=0.39677747423066995, lasso__l1_ratio=0.538816734003357; accuracy: (train=0.890, test=0.890) roc_auc: (train=0.720, test=0.716) total time=  25.6s
[CV 2/4] END lasso__C=0.4192045144032948, lasso__l1_ratio=0.6852195003967595; accuracy: (tra

0

In [9]:
results_benchmark

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lasso__C,param_lasso__l1_ratio,params,split0_test_roc_auc,split1_test_roc_auc,split2_test_roc_auc,...,split3_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_train_accuracy,split1_train_accuracy,split2_train_accuracy,split3_train_accuracy,mean_train_accuracy,std_train_accuracy
0,22.709976,1.426286,1.489233,0.069505,0.417032,0.720324,"{'lasso__C': 0.417032004702574, 'lasso__l1_rat...",0.715758,0.719781,0.716686,...,0.889722,0.889725,2e-06,1,0.889725,0.889725,0.889725,0.889726,0.889725,6.58012e-07
1,20.825151,1.19157,1.499437,0.038804,0.000124,0.302333,"{'lasso__C': 0.00012437481734488664, 'lasso__l...",0.708892,0.711491,0.710585,...,0.889722,0.889725,2e-06,1,0.889725,0.889725,0.889725,0.889726,0.889725,6.58012e-07
2,22.89421,1.332209,1.533854,0.10757,0.146766,0.092339,"{'lasso__C': 0.14676589081711305, 'lasso__l1_r...",0.715762,0.719778,0.716687,...,0.889722,0.889725,2e-06,1,0.889725,0.889725,0.889725,0.889726,0.889725,6.58012e-07
3,22.707212,1.593081,1.593838,0.082514,0.18627,0.345561,"{'lasso__C': 0.1862702113776709, 'lasso__l1_ra...",0.715763,0.719781,0.716689,...,0.889722,0.889725,2e-06,1,0.889725,0.889725,0.889725,0.889726,0.889725,6.58012e-07
4,22.887338,1.796734,1.485727,0.062036,0.396777,0.538817,"{'lasso__C': 0.39677747423066995, 'lasso__l1_r...",0.715757,0.719781,0.716686,...,0.889722,0.889725,2e-06,1,0.889725,0.889725,0.889725,0.889726,0.889725,6.58012e-07
5,22.805449,1.836336,1.522022,0.098244,0.419205,0.68522,"{'lasso__C': 0.4192045144032948, 'lasso__l1_ra...",0.715758,0.719781,0.716686,...,0.889722,0.889725,2e-06,1,0.889725,0.889725,0.889725,0.889726,0.889725,6.58012e-07
6,23.085974,1.905852,1.511806,0.088186,0.204462,0.878117,"{'lasso__C': 0.20446224973151744, 'lasso__l1_r...",0.715773,0.719782,0.716694,...,0.889722,0.889725,2e-06,1,0.889725,0.889725,0.889725,0.889726,0.889725,6.58012e-07
7,22.560495,1.615798,1.427529,0.006783,0.027398,0.670468,"{'lasso__C': 0.027397593197926163, 'lasso__l1_...",0.715862,0.7198,0.71674,...,0.889722,0.889725,2e-06,1,0.889725,0.889725,0.889725,0.889726,0.889725,6.58012e-07
8,22.995677,1.520745,1.451638,0.031603,0.417315,0.55869,"{'lasso__C': 0.417314802367127, 'lasso__l1_rat...",0.715757,0.719781,0.716685,...,0.889722,0.889725,2e-06,1,0.889725,0.889725,0.889725,0.889726,0.889725,6.58012e-07
9,22.646399,1.499469,1.482648,0.057,0.140397,0.198101,"{'lasso__C': 0.14039693859523378, 'lasso__l1_r...",0.715765,0.71978,0.71669,...,0.889722,0.889725,2e-06,1,0.889725,0.889725,0.889725,0.889726,0.889725,6.58012e-07


Wegschreiben der Ergebnisse

In [10]:
results_benchmark.to_csv(f"lasso_benchmark.csv", index=False)

In [11]:
del results_benchmark
gc.collect()

21

### 6.1 Elastic Net Regression

Durchführung von RandomizedSearchCV

In [12]:
#Import der Pipelines und Definition der log_trans Funktion
import joblib

def log_trans(x):
    return np.log(x+0.01)

preprocessor_final_1 = joblib.load("/kaggle/input/axa-challenge-final/preprocessor_final_1.joblib")
preprocessor_final_2 = joblib.load("/kaggle/input/axa-challenge-final/preprocessor_final_2.joblib")

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
#import joblib

#preprocessor_final_1 = joblib.load("/kaggle/input/axa-challenge-final/preprocessor_final_1.joblib")
#preprocessor_final_2 = joblib.load("/kaggle/input/axa-challenge-final/preprocessor_final_2.joblib")

#Alle Schritte in eine Pipeline
Pipeline_lasso_1 = Pipeline([("preprocessor_pipeline", preprocessor_final_1), 
                            ("scaling", StandardScaler(with_mean=True) ),
                            ('lasso', LogisticRegression(random_state=1, penalty="elasticnet", solver="saga"))])

Pipeline_lasso_2 = Pipeline([("preprocessor_pipeline", preprocessor_final_2), 
                             ("scaling", StandardScaler(with_mean=True)),
                            ('lasso', LogisticRegression(random_state=1, penalty="elasticnet", solver="saga"))])


lasso_pipeline_list = [Pipeline_lasso_1, Pipeline_lasso_2]


#param_grid={'lasso__C': [1, 0.5, 0.1, 0.01]}
param_grid = dict(
    lasso__C=uniform(0.00001, 1),
    lasso__l1_ratio=uniform(0,1)
    )


dict_results_lasso= {}

i=1
for p in lasso_pipeline_list:
    
    np.random.seed(seed=1)
    search = RandomizedSearchCV(p, param_grid, n_jobs=4, cv=4, verbose=4, scoring=["roc_auc", "accuracy"], refit=False, return_train_score=True, n_iter=50)
    search.fit(X_train, np.ravel(y_train) )
    
#     dict_results_ridge[i] = pd.DataFrame({"ridge__alpha": search.cv_results_["params"][0]["ridge__alpha"], 
#                                "ridge__tol": search.cv_results_["params"][0]["ridge__tol"],
#                                "mean_fit_time": search.cv_results_["mean_fit_time"],
#                                 "mean_test_roc_auc": search.cv_results_["mean_test_roc_auc"],
#                                 "mean_test_accuracy": search.cv_results_["mean_test_accuracy"],
#                                 "mean_train_roc_auc": search.cv_results_["mean_train_roc_auc"],
#                                 "mean_train_accuracy": search.cv_results_["mean_train_accuracy"]})
    
    dict_results_lasso[i] = pd.DataFrame(search.cv_results_)
    
    print(i)
    i +=1
    gc.collect()
    
    


Fitting 4 folds for each of 50 candidates, totalling 200 fits

[CV 4/4] END lasso__C=0.9033895205622537, lasso__l1_ratio=0.5736794866722859; accuracy: (train=0.890, test=0.890) roc_auc: (train=0.718, test=0.721) total time=  25.2s
[CV 2/4] END lasso__C=0.417032004702574, lasso__l1_ratio=0.7203244934421581; accuracy: (train=0.946, test=0.946) roc_auc: (train=0.932, test=0.931) total time= 2.9min





[CV 1/4] END lasso__C=0.00288032703115897, lasso__l1_ratio=0.6171449136207239; accuracy: (train=0.890, test=0.890) roc_auc: (train=0.719, test=0.716) total time=  23.0s
[CV 1/4] END lasso__C=0.417032004702574, lasso__l1_ratio=0.7203244934421581; accuracy: (train=0.947, test=0.946) roc_auc: (train=0.932, test=0.931) total time= 2.5min
[CV 3/4] END lasso__C=0.14676589081711305, lasso__l1_ratio=0.0923385947687978; accuracy: (train=0.946, test=0.946) roc_auc: (train=0.932, test=0.931) total time=  31.3s
[CV 3/4] END lasso__C=0.1862702113776709, lasso__l1_ratio=0.34556072704304774; accuracy: (train=0.946, test=0.946) roc_auc: (train=0.932, test=0.931) total time=  29.2s
[CV 1/4] END lasso__C=0.39677747423066995, lasso__l1_ratio=0.538816734003357; accuracy: (train=0.947, test=0.946) roc_auc: (train=0.932, test=0.931) total time= 1.9min

[CV 3/4] END lasso__C=0.00288032703115897, lasso__l1_ratio=0.6171449136207239; accuracy: (train=0.890, test=0.890) roc_auc: (train=0.719, test=0.716) total 

In [14]:
dict_results_lasso[1]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lasso__C,param_lasso__l1_ratio,params,split0_test_roc_auc,split1_test_roc_auc,split2_test_roc_auc,...,split3_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_train_accuracy,split1_train_accuracy,split2_train_accuracy,split3_train_accuracy,mean_train_accuracy,std_train_accuracy
0,109.338971,55.030666,2.999161,0.127006,0.417032,0.720324,"{'lasso__C': 0.417032004702574, 'lasso__l1_rat...",0.930555,0.930995,0.931452,...,0.947035,0.946357,0.00042,26,0.946513,0.946481,0.946447,0.946212,0.946413,0.000119
1,24.440899,0.233409,2.934089,0.074388,0.000124,0.302333,"{'lasso__C': 0.00012437481734488664, 'lasso__l...",0.915711,0.916713,0.916851,...,0.946183,0.94545,0.000468,50,0.945596,0.945494,0.945607,0.945213,0.945478,0.000159
2,30.296763,4.855773,3.059804,0.232956,0.146766,0.092339,"{'lasso__C': 0.14676589081711305, 'lasso__l1_r...",0.930547,0.930988,0.931443,...,0.947031,0.946358,0.000414,11,0.94652,0.946484,0.946453,0.946219,0.946419,0.000118
3,66.993221,24.952238,2.999102,0.05727,0.18627,0.345561,"{'lasso__C': 0.1862702113776709, 'lasso__l1_ra...",0.930549,0.93099,0.931446,...,0.947031,0.946356,0.000416,33,0.946516,0.946479,0.946455,0.946219,0.946417,0.000116
4,84.479128,40.052903,2.936867,0.047316,0.396777,0.538817,"{'lasso__C': 0.39677747423066995, 'lasso__l1_r...",0.930555,0.930995,0.931452,...,0.947045,0.946358,0.000425,11,0.946511,0.946482,0.946452,0.94621,0.946414,0.000119
5,112.354905,55.534087,3.070508,0.107795,0.419205,0.68522,"{'lasso__C': 0.4192045144032948, 'lasso__l1_ra...",0.930555,0.930995,0.931452,...,0.947045,0.946358,0.000425,11,0.946513,0.946482,0.946449,0.946212,0.946414,0.000119
6,51.992871,19.414321,3.172278,0.191613,0.204462,0.878117,"{'lasso__C': 0.20446224973151744, 'lasso__l1_r...",0.930548,0.93099,0.931446,...,0.947022,0.946355,0.000414,41,0.946514,0.946473,0.946446,0.946221,0.946413,0.000114
7,27.096481,1.26279,3.010583,0.18402,0.027398,0.670468,"{'lasso__C': 0.027397593197926163, 'lasso__l1_...",0.93046,0.930919,0.931372,...,0.947031,0.946372,0.000406,3,0.946578,0.946499,0.946493,0.94626,0.946458,0.000119
8,85.905634,41.348127,3.079961,0.167328,0.417315,0.55869,"{'lasso__C': 0.417314802367127, 'lasso__l1_rat...",0.930555,0.930995,0.931452,...,0.947045,0.946358,0.000426,11,0.946511,0.946484,0.946452,0.946209,0.946414,0.00012
9,29.117614,2.838167,3.019679,0.21465,0.140397,0.198101,"{'lasso__C': 0.14039693859523378, 'lasso__l1_r...",0.930546,0.930987,0.931443,...,0.947031,0.946362,0.000412,6,0.946516,0.946476,0.946453,0.946219,0.946416,0.000116


In [15]:
dict_results_lasso[2]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lasso__C,param_lasso__l1_ratio,params,split0_test_roc_auc,split1_test_roc_auc,split2_test_roc_auc,...,split3_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_train_accuracy,split1_train_accuracy,split2_train_accuracy,split3_train_accuracy,mean_train_accuracy,std_train_accuracy
0,16.542554,0.791911,2.408758,0.070775,0.417032,0.720324,"{'lasso__C': 0.417032004702574, 'lasso__l1_rat...",0.929109,0.929443,0.930135,...,0.947067,0.946336,0.000441,28,0.946548,0.946418,0.946438,0.946148,0.946388,0.000147
1,16.232665,1.145779,2.439714,0.117391,0.000124,0.302333,"{'lasso__C': 0.00012437481734488664, 'lasso__l...",0.917597,0.918859,0.91878,...,0.946228,0.945448,0.000554,50,0.945744,0.945499,0.945709,0.945317,0.945567,0.000172
2,16.518961,0.863559,2.536274,0.061114,0.146766,0.092339,"{'lasso__C': 0.14676589081711305, 'lasso__l1_r...",0.929102,0.929437,0.930129,...,0.947081,0.946346,0.000443,7,0.946546,0.946429,0.946438,0.946151,0.946391,0.000146
3,16.607996,0.81351,2.554893,0.166712,0.18627,0.345561,"{'lasso__C': 0.1862702113776709, 'lasso__l1_ra...",0.929104,0.929439,0.930131,...,0.947072,0.946342,0.00044,12,0.946543,0.946425,0.946438,0.946148,0.946388,0.000146
4,16.601283,0.650435,2.780057,0.277366,0.396777,0.538817,"{'lasso__C': 0.39677747423066995, 'lasso__l1_r...",0.929108,0.929443,0.930135,...,0.947067,0.946339,0.00044,17,0.946546,0.946418,0.946437,0.946146,0.946387,0.000147
5,16.59795,0.609153,2.492088,0.076151,0.419205,0.68522,"{'lasso__C': 0.4192045144032948, 'lasso__l1_ra...",0.929109,0.929443,0.930135,...,0.947067,0.946337,0.000441,26,0.946548,0.946418,0.946437,0.946148,0.946388,0.000147
6,16.531961,0.49386,2.431261,0.04138,0.204462,0.878117,"{'lasso__C': 0.20446224973151744, 'lasso__l1_r...",0.929105,0.929441,0.930133,...,0.947072,0.946339,0.000441,17,0.946549,0.946418,0.946441,0.946145,0.946388,0.000149
7,15.819022,0.988545,2.478615,0.069613,0.027398,0.670468,"{'lasso__C': 0.027397593197926163, 'lasso__l1_...",0.92906,0.929411,0.930091,...,0.947099,0.946345,0.000456,9,0.946557,0.946429,0.946453,0.946157,0.946399,0.000148
8,16.640223,1.032537,2.540058,0.208939,0.417315,0.55869,"{'lasso__C': 0.417314802367127, 'lasso__l1_rat...",0.929109,0.929443,0.930135,...,0.947067,0.946339,0.00044,17,0.946548,0.946418,0.946437,0.946148,0.946388,0.000147
9,16.674832,1.145753,2.472865,0.183681,0.140397,0.198101,"{'lasso__C': 0.14039693859523378, 'lasso__l1_r...",0.929102,0.929437,0.930129,...,0.947086,0.946347,0.000445,6,0.946548,0.946429,0.946437,0.946151,0.946391,0.000146


Wegschreiben der Ergebnisse

In [16]:
for i in range(2):
    dict_results_lasso[i+1].to_csv(f"lasso_pipeline_{i+1}_results.csv", index=False)