# 0 Daten & Packages laden

In [1]:
import pandas as pd
import os
import gc
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

In [2]:
X_train = pd.read_parquet("/kaggle/input/axa-challenge-final/X_train_hyperparemeter_opt.parquet")
y_train = pd.read_parquet("/kaggle/input/axa-challenge-final/y_train_hyperparemeter_opt.parquet")

In [3]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 877416 entries, 3251228 to 8325804
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   tripduration          877416 non-null  int32  
 1   start station id      877416 non-null  float64
 2   end station id        877416 non-null  float64
 3   gender                877416 non-null  object 
 4   age                   877416 non-null  int32  
 5   month                 877416 non-null  int64  
 6   weekday               877416 non-null  object 
 7   time_hours            877416 non-null  int64  
 8   distance_travelled_m  877416 non-null  float32
 9   same_start_end        877416 non-null  int64  
 10  is_holiday            877416 non-null  int8   
dtypes: float32(1), float64(2), int32(2), int64(3), int8(1), object(2)
memory usage: 64.4+ MB


Datentransformationen sind notwendig weil einige Datenformate durch Parquet verloren gegangen sind

In [4]:
X_train["start station id"] = X_train["start station id"].astype("object")
X_train["end station id"] = X_train["end station id"].astype("object")
X_train["month"] = X_train["month"].astype("object")
X_train["time_hours"] = X_train["time_hours"].astype("object")

X_train["same_start_end"] = X_train["time_hours"].astype("int8")

# 6 Hyperparameter Optimierung verschiedener Modellansätze

### 6.4 LGBM

Laden der Pakete und Pipelines

In [5]:
%%capture
! pip install feature_engine

In [6]:
#Laden der Pakete
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import quantile_transform
from sklearn.preprocessing import FunctionTransformer
from feature_engine.encoding import RareLabelEncoder
from feature_engine.outliers import Winsorizer
from feature_engine.encoding import MeanEncoder
from sklearn.impute import SimpleImputer
from feature_engine.imputation import CategoricalImputer
from sklearn.preprocessing import KBinsDiscretizer

In [7]:
#Import der Pipelines und Definition der log_trans Funktion
import joblib

def log_trans(x):
    return np.log(x+0.01)

preprocessor_final_1 = joblib.load("/kaggle/input/axa-challenge-final/preprocessor_final_1.joblib")
preprocessor_final_2 = joblib.load("/kaggle/input/axa-challenge-final/preprocessor_final_2.joblib")

Durchführung von RandomizedSearchCV

In [8]:
from lightgbm import LGBMClassifier
#from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform


#Alle Schritte in eine Pipeline
Pipeline_lgbm_1 = Pipeline([("preprocessor_pipeline", preprocessor_final_1), 
                            ('lgbm', LGBMClassifier(random_state=1, reg_alpha=0.4) )])

Pipeline_lgbm_2 = Pipeline([("preprocessor_pipeline", preprocessor_final_2), 
                            ('lgbm', LGBMClassifier(random_state=1, reg_alpha=0.4) )])


lgbm_pipeline_list = [Pipeline_lgbm_1, Pipeline_lgbm_2]


param_grid_lgbm = dict(
    lgbm__n_estimators=randint(200, 600),
    lgbm__learning_rate=uniform(0.001, 0.1),
    lgbm__num_leaves=randint(31, 200),
    lgbm__max_depth=randint(-3, 10),
    lgbm__min_child_samples=randint(50, 100),
    )


dict_results_lgbm= {}

i=1
for p in lgbm_pipeline_list:
    np.random.seed(seed=99)
    search = RandomizedSearchCV(p, param_grid_lgbm, n_jobs=4, cv=4, verbose=4, scoring=["roc_auc", "accuracy"], refit=False, return_train_score=True, n_iter=40)
    search.fit(X_train, np.ravel(y_train) )
        
    dict_results_lgbm[i] = pd.DataFrame(search.cv_results_)
    
    print(i)
    i +=1
    gc.collect()

Fitting 4 folds for each of 40 candidates, totalling 160 fits
[CV 2/4] END lgbm__learning_rate=0.06822785586307918, lgbm__max_depth=6, lgbm__min_child_samples=90, lgbm__n_estimators=401, lgbm__num_leaves=99; accuracy: (train=0.953, test=0.951) roc_auc: (train=0.954, test=0.944) total time= 1.4min
[CV 1/4] END lgbm__learning_rate=0.08180499633648478, lgbm__max_depth=-2, lgbm__min_child_samples=73, lgbm__n_estimators=454, lgbm__num_leaves=66; accuracy: (train=0.955, test=0.951) roc_auc: (train=0.966, test=0.943) total time= 1.3min
[CV 1/4] END lgbm__learning_rate=0.005669572050695326, lgbm__max_depth=-2, lgbm__min_child_samples=98, lgbm__n_estimators=421, lgbm__num_leaves=90; accuracy: (train=0.951, test=0.950) roc_auc: (train=0.940, test=0.935) total time= 2.2min
[CV 4/4] END lgbm__learning_rate=0.07797930281899398, lgbm__max_depth=1, lgbm__min_child_samples=73, lgbm__n_estimators=330, lgbm__num_leaves=171; accuracy: (train=0.946, test=0.947) roc_auc: (train=0.918, test=0.920) total tim



[CV 3/4] END lgbm__learning_rate=0.08106581636464107, lgbm__max_depth=-3, lgbm__min_child_samples=88, lgbm__n_estimators=581, lgbm__num_leaves=90; accuracy: (train=0.957, test=0.951) roc_auc: (train=0.975, test=0.945) total time= 1.7min
[CV 3/4] END lgbm__learning_rate=0.047623813470984225, lgbm__max_depth=-1, lgbm__min_child_samples=90, lgbm__n_estimators=483, lgbm__num_leaves=153; accuracy: (train=0.956, test=0.951) roc_auc: (train=0.973, test=0.946) total time= 1.9min
[CV 3/4] END lgbm__learning_rate=0.06317125445882107, lgbm__max_depth=2, lgbm__min_child_samples=79, lgbm__n_estimators=349, lgbm__num_leaves=136; accuracy: (train=0.950, test=0.949) roc_auc: (train=0.938, test=0.937) total time=  43.6s
[CV 3/4] END lgbm__learning_rate=0.08985617542080104, lgbm__max_depth=-2, lgbm__min_child_samples=95, lgbm__n_estimators=549, lgbm__num_leaves=62; accuracy: (train=0.956, test=0.951) roc_auc: (train=0.969, test=0.945) total time= 1.5min
[CV 3/4] END lgbm__learning_rate=0.018442025303130

In [9]:
dict_results_lgbm[1]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lgbm__learning_rate,param_lgbm__max_depth,param_lgbm__min_child_samples,param_lgbm__n_estimators,param_lgbm__num_leaves,params,...,split3_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_train_accuracy,split1_train_accuracy,split2_train_accuracy,split3_train_accuracy,mean_train_accuracy,std_train_accuracy
0,57.814052,0.354775,24.942893,0.336975,0.068228,6,90,401,99,"{'lgbm__learning_rate': 0.06822785586307918, '...",...,0.952109,0.951421,0.000399,6,0.953358,0.953355,0.95346,0.953169,0.953336,0.000105
1,56.643465,0.209848,22.571567,0.712651,0.081805,-2,73,454,66,"{'lgbm__learning_rate': 0.08180499633648478, '...",...,0.952055,0.951294,0.000451,20,0.955217,0.955223,0.955246,0.954858,0.955136,0.000161
2,89.989086,6.395338,31.686865,0.816324,0.00567,-2,98,421,90,"{'lgbm__learning_rate': 0.005669572050695326, ...",...,0.950509,0.949917,0.000354,30,0.950619,0.950588,0.950599,0.950321,0.950532,0.000122
3,30.266797,0.656304,7.09385,0.11667,0.077979,1,73,330,171,"{'lgbm__learning_rate': 0.07797930281899398, '...",...,0.946953,0.946273,0.000407,38,0.946458,0.946382,0.946459,0.946139,0.94636,0.000131
4,26.302572,0.244048,6.560417,0.155978,0.034677,1,89,279,78,"{'lgbm__learning_rate': 0.03467723072241872, '...",...,0.944264,0.943645,0.000504,40,0.943875,0.943603,0.943669,0.943589,0.943684,0.000114
5,43.992972,1.505099,10.721162,0.103465,0.010361,2,96,428,93,"{'lgbm__learning_rate': 0.010361309329775258, ...",...,0.946657,0.945925,0.000528,39,0.946104,0.946022,0.946102,0.945814,0.94601,0.000118
6,80.248247,3.80117,41.43422,0.390939,0.030227,-3,71,439,130,"{'lgbm__learning_rate': 0.030226911595916162, ...",...,0.952091,0.95141,0.000406,7,0.954807,0.954636,0.954693,0.954399,0.954634,0.000149
7,58.935284,2.851311,23.26659,0.782037,0.041976,7,55,340,127,"{'lgbm__learning_rate': 0.04197553539636337, '...",...,0.951922,0.95132,0.000351,16,0.95338,0.953393,0.953466,0.953097,0.953334,0.000141
8,93.872564,5.213218,51.570973,1.27802,0.02988,8,51,531,145,"{'lgbm__learning_rate': 0.029880160409220303, ...",...,0.9521,0.951427,0.000393,4,0.954702,0.954702,0.954656,0.954519,0.954645,7.5e-05
9,77.743178,0.669698,31.639978,1.806316,0.008761,-2,91,323,192,"{'lgbm__learning_rate': 0.008761383660001487, ...",...,0.951261,0.950565,0.000445,26,0.951918,0.952068,0.952055,0.951755,0.951949,0.000126


In [10]:
dict_results_lgbm[2]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lgbm__learning_rate,param_lgbm__max_depth,param_lgbm__min_child_samples,param_lgbm__n_estimators,param_lgbm__num_leaves,params,...,split3_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_train_accuracy,split1_train_accuracy,split2_train_accuracy,split3_train_accuracy,mean_train_accuracy,std_train_accuracy
0,60.299151,3.609391,23.528423,1.267769,0.068228,6,90,401,99,"{'lgbm__learning_rate': 0.06822785586307918, '...",...,0.952205,0.951485,0.000429,11,0.953444,0.953488,0.95357,0.953286,0.953447,0.000103
1,56.713781,2.393085,21.100888,1.606245,0.081805,-2,73,454,66,"{'lgbm__learning_rate': 0.08180499633648478, '...",...,0.952223,0.951442,0.000459,17,0.955097,0.954922,0.95494,0.954702,0.954915,0.000141
2,95.975707,6.345927,30.075379,1.728331,0.00567,-2,98,421,90,"{'lgbm__learning_rate': 0.005669572050695326, ...",...,0.95081,0.950195,0.000373,30,0.950872,0.950871,0.950942,0.950685,0.950843,9.5e-05
3,28.82176,0.673603,6.88281,0.30685,0.077979,1,73,330,171,"{'lgbm__learning_rate': 0.07797930281899398, '...",...,0.947359,0.946597,0.000455,38,0.946835,0.946727,0.946818,0.946491,0.946718,0.000137
4,25.596562,0.385685,6.010258,0.119755,0.034677,1,89,279,78,"{'lgbm__learning_rate': 0.03467723072241872, '...",...,0.944204,0.943784,0.000372,40,0.944055,0.943707,0.943826,0.943659,0.943812,0.000153
5,42.838229,1.348461,10.466051,0.517891,0.010361,2,96,428,93,"{'lgbm__learning_rate': 0.010361309329775258, ...",...,0.947035,0.946267,0.000542,39,0.9465,0.946391,0.946364,0.946143,0.94635,0.00013
6,80.283895,5.291366,41.157383,1.594888,0.030227,-3,71,439,130,"{'lgbm__learning_rate': 0.030226911595916162, ...",...,0.952292,0.951509,0.000466,7,0.95465,0.954583,0.954623,0.954336,0.954548,0.000125
7,61.068681,3.830735,23.65515,1.472939,0.041976,7,55,340,127,"{'lgbm__learning_rate': 0.04197553539636337, '...",...,0.952178,0.951439,0.000427,18,0.953517,0.953453,0.953623,0.953261,0.953463,0.000132
8,98.453525,6.731687,54.646398,1.992593,0.02988,8,51,531,145,"{'lgbm__learning_rate': 0.029880160409220303, ...",...,0.952251,0.951486,0.000471,10,0.954805,0.954761,0.954849,0.954568,0.954746,0.000107
9,77.527042,0.741104,30.145871,2.006821,0.008761,-2,91,323,192,"{'lgbm__learning_rate': 0.008761383660001487, ...",...,0.951471,0.950765,0.000408,26,0.952336,0.952252,0.952369,0.952017,0.952243,0.000138


Wegschreiben der Ergebnisse

In [11]:
for i in range(2):
    dict_results_lgbm[i+1].to_csv(f"lgbm_pipeline_{i+1}_results.csv", index=False)