# 0 Load data

In [1]:
import pandas as pd
import os
import gc
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

In [2]:
X_train = pd.read_parquet("/kaggle/input/axa-challenge-final/X_train_hyperparemeter_opt.parquet")
y_train = pd.read_parquet("/kaggle/input/axa-challenge-final/y_train_hyperparemeter_opt.parquet")

In [3]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 877416 entries, 3251228 to 8325804
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   tripduration          877416 non-null  int32  
 1   start station id      877416 non-null  float64
 2   end station id        877416 non-null  float64
 3   gender                877416 non-null  object 
 4   age                   877416 non-null  int8   
 5   month                 877416 non-null  int64  
 6   weekday               877416 non-null  object 
 7   time_hours            877416 non-null  int64  
 8   distance_travelled_m  877416 non-null  float32
 9   same_start_end        877416 non-null  int64  
 10  is_holiday            877416 non-null  int8   
dtypes: float32(1), float64(2), int32(1), int64(3), int8(2), object(2)
memory usage: 61.9+ MB


In [4]:
X_train["start station id"] = X_train["start station id"].astype("object")
X_train["end station id"] = X_train["end station id"].astype("object")
X_train["month"] = X_train["month"].astype("object")
X_train["time_hours"] = X_train["time_hours"].astype("object")

X_train["same_start_end"] = X_train["time_hours"].astype("int8")

# 6 Model CV

### 6.4 RF

In [5]:
! pip install feature_engine

Collecting feature_engine
  Downloading feature_engine-1.4.0-py2.py3-none-any.whl (276 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.4/276.4 kB[0m [31m968.4 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature_engine
Successfully installed feature_engine-1.4.0
[0m

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import quantile_transform
from sklearn.preprocessing import FunctionTransformer
from feature_engine.encoding import RareLabelEncoder
from feature_engine.outliers import Winsorizer
from feature_engine.encoding import MeanEncoder
from sklearn.impute import SimpleImputer
from feature_engine.imputation import CategoricalImputer
from sklearn.preprocessing import KBinsDiscretizer

In [7]:
import joblib

def log_trans(x):
    return np.log(x+0.01)

preprocessor_final_1 = joblib.load("/kaggle/input/axa-challenge-final/preprocessor_final_1.joblib")
preprocessor_final_2 = joblib.load("/kaggle/input/axa-challenge-final/preprocessor_final_2.joblib")

In [8]:
from sklearn.ensemble import RandomForestClassifier
#from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

#import joblib

#preprocessor_final_1 = joblib.load("/kaggle/input/axa-challenge-final/preprocessor_final_1.joblib")
#preprocessor_final_2 = joblib.load("/kaggle/input/axa-challenge-final/preprocessor_final_2.joblib")

#Alle Schritte in eine Pipeline
Pipeline_rf_1 = Pipeline([("preprocessor_pipeline", preprocessor_final_1), 
                            ('rf', RandomForestClassifier(random_state=1) )])

Pipeline_rf_2 = Pipeline([("preprocessor_pipeline", preprocessor_final_2), 
                            ('rf', RandomForestClassifier(random_state=1) )])


rf_pipeline_list = [Pipeline_rf_1, Pipeline_rf_2]


param_grid_rf = dict(
    rf__n_estimators=randint(20, 70),
    rf__min_samples_split=uniform(0.001, 0.05),
    rf__max_features=["sqrt", "log2", None],
    )

#param_grid={'ridge__alpha': [0.1, 0.5], "ridge__tol": [1e-2, 1e-3]}

dict_results_rf= {}

i=1
for p in rf_pipeline_list:
    
    np.random.seed(seed=1)
    search = RandomizedSearchCV(p, param_grid_rf, n_jobs=4, cv=4, verbose=4, scoring=["roc_auc", "accuracy"], refit=False, return_train_score=True, n_iter=50)
    search.fit(X_train, np.ravel(y_train) )
    
#     dict_results_ridge[i] = pd.DataFrame({"ridge__alpha": search.cv_results_["params"][0]["ridge__alpha"], 
#                                "ridge__tol": search.cv_results_["params"][0]["ridge__tol"],
#                                "mean_fit_time": search.cv_results_["mean_fit_time"],
#                                 "mean_test_roc_auc": search.cv_results_["mean_test_roc_auc"],
#                                 "mean_test_accuracy": search.cv_results_["mean_test_accuracy"],
#                                 "mean_train_roc_auc": search.cv_results_["mean_train_roc_auc"],
#                                 "mean_train_accuracy": search.cv_results_["mean_train_accuracy"]})
    
    dict_results_rf[i] = pd.DataFrame(search.cv_results_)
    
    print(i)
    i +=1
    gc.collect()
    
    


Fitting 4 folds for each of 50 candidates, totalling 200 fits
[CV 1/4] END rf__max_features=log2, rf__min_samples_split=0.050859240546943435, rf__n_estimators=28; accuracy: (train=0.909, test=0.909) roc_auc: (train=0.907, test=0.904) total time=  47.6s
[CV 1/4] END rf__max_features=log2, rf__min_samples_split=0.016116628631591988, rf__n_estimators=35; accuracy: (train=0.936, test=0.936) roc_auc: (train=0.913, test=0.910) total time= 1.2min
[CV 1/4] END rf__max_features=sqrt, rf__min_samples_split=0.0056169297384398905, rf__n_estimators=32; accuracy: (train=0.946, test=0.945) roc_auc: (train=0.926, test=0.921) total time= 1.4min
[CV 2/4] END rf__max_features=log2, rf__min_samples_split=0.034487301840174005, rf__n_estimators=45; accuracy: (train=0.932, test=0.933) roc_auc: (train=0.908, test=0.907) total time= 1.3min
[CV 1/4] END rf__max_features=None, rf__min_samples_split=0.04331554583430086, rf__n_estimators=57; accuracy: (train=0.945, test=0.944) roc_auc: (train=0.920, test=0.916) to



1
Fitting 4 folds for each of 50 candidates, totalling 200 fits

[CV 2/4] END rf__max_features=None, rf__min_samples_split=0.04529710496553873, rf__n_estimators=69; accuracy: (train=0.944, test=0.944) roc_auc: (train=0.919, test=0.918) total time= 9.8min
[CV 2/4] END rf__max_features=None, rf__min_samples_split=0.04642675754598996, rf__n_estimators=40; accuracy: (train=0.944, test=0.944) roc_auc: (train=0.919, test=0.918) total time= 5.7min
[CV 2/4] END rf__max_features=None, rf__min_samples_split=0.0017910621423278142, rf__n_estimators=45; accuracy: (train=0.949, test=0.948) roc_auc: (train=0.948, test=0.935) total time= 9.7min
[CV 2/4] END rf__max_features=log2, rf__min_samples_split=0.05086614252257403, rf__n_estimators=39; accuracy: (train=0.920, test=0.921) roc_auc: (train=0.905, test=0.904) total time=  59.4s
[CV 2/4] END rf__max_features=log2, rf__min_samples_split=0.038138241504354575, rf__n_estimators=33; accuracy: (train=0.926, test=0.926) roc_auc: (train=0.906, test=0.905) t

In [9]:
dict_results_rf[1]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__max_features,param_rf__min_samples_split,param_rf__n_estimators,params,split0_test_roc_auc,split1_test_roc_auc,...,split3_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_train_accuracy,split1_train_accuracy,split2_train_accuracy,split3_train_accuracy,mean_train_accuracy,std_train_accuracy
0,43.295128,0.161378,4.288374,0.040932,log2,0.050859,28,"{'rf__max_features': 'log2', 'rf__min_samples_...",0.904344,0.904205,...,0.912092,0.908471,0.003754,50,0.908582,0.9101,0.902967,0.912071,0.90843,0.003388
1,65.098071,0.304638,5.175323,0.144507,log2,0.016117,35,"{'rf__max_features': 'log2', 'rf__min_samples_...",0.910285,0.911433,...,0.940302,0.938523,0.002128,29,0.936181,0.940968,0.937868,0.93993,0.938736,0.00185
2,77.313239,1.190884,5.653748,0.20883,sqrt,0.005617,32,"{'rf__max_features': 'sqrt', 'rf__min_samples_...",0.920745,0.919251,...,0.945677,0.945404,0.000416,5,0.9457,0.945621,0.945741,0.945136,0.945549,0.000243
3,71.01196,0.478623,5.446903,0.071484,log2,0.034487,45,"{'rf__max_features': 'log2', 'rf__min_samples_...",0.907394,0.90679,...,0.931959,0.932111,0.000352,40,0.93209,0.932499,0.932844,0.931426,0.932215,0.000528
4,503.107129,7.687496,6.124331,0.140888,,0.043316,57,"{'rf__max_features': None, 'rf__min_samples_sp...",0.916126,0.918166,...,0.944888,0.944405,0.000356,8,0.944577,0.944422,0.944378,0.944244,0.944405,0.000119
5,601.741082,7.751715,6.627957,0.211438,,0.027227,62,"{'rf__max_features': None, 'rf__min_samples_sp...",0.92123,0.923045,...,0.944888,0.944405,0.000356,8,0.944577,0.944422,0.944378,0.944244,0.944405,0.000119
6,61.503598,0.680933,5.038101,0.048712,sqrt,0.002369,24,"{'rf__max_features': 'sqrt', 'rf__min_samples_...",0.925334,0.925026,...,0.946912,0.946423,0.000355,2,0.947066,0.946704,0.946385,0.946321,0.946619,0.000296
7,82.279761,0.589099,5.913468,0.059002,log2,0.022535,50,"{'rf__max_features': 'log2', 'rf__min_samples_...",0.909194,0.909203,...,0.936518,0.935761,0.000652,34,0.935828,0.935932,0.935606,0.935991,0.935839,0.000147
8,104.380629,0.923533,6.320288,0.206398,sqrt,0.039919,61,"{'rf__max_features': 'sqrt', 'rf__min_samples_...",0.906573,0.90641,...,0.932862,0.932466,0.000541,38,0.93216,0.932657,0.933178,0.932491,0.932621,0.000368
9,62.719044,0.715655,5.120714,0.087439,log2,0.041138,42,"{'rf__max_features': 'log2', 'rf__min_samples_...",0.905653,0.907356,...,0.923161,0.921676,0.001766,46,0.923238,0.921485,0.919433,0.9232,0.921839,0.001559


In [10]:
dict_results_rf[2]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__max_features,param_rf__min_samples_split,param_rf__n_estimators,params,split0_test_roc_auc,split1_test_roc_auc,...,split3_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_train_accuracy,split1_train_accuracy,split2_train_accuracy,split3_train_accuracy,mean_train_accuracy,std_train_accuracy
0,45.23915,0.161299,3.693568,0.069878,log2,0.050859,28,"{'rf__max_features': 'log2', 'rf__min_samples_...",0.9058,0.907417,...,0.934558,0.931674,0.003448,50,0.932787,0.933667,0.92646,0.933976,0.931723,0.00307
1,67.006356,1.068355,4.407224,0.096459,log2,0.016117,35,"{'rf__max_features': 'log2', 'rf__min_samples_...",0.915292,0.917427,...,0.944719,0.94436,0.000451,24,0.944941,0.943507,0.945197,0.943938,0.944396,0.000696
2,70.703537,0.63339,4.741167,0.028508,sqrt,0.005617,32,"{'rf__max_features': 'sqrt', 'rf__min_samples_...",0.924118,0.924224,...,0.946593,0.946004,0.00062,7,0.946469,0.946738,0.945966,0.945903,0.946269,0.000348
3,73.858474,0.736714,4.785088,0.163145,log2,0.034487,45,"{'rf__max_features': 'log2', 'rf__min_samples_...",0.909518,0.908032,...,0.943347,0.939048,0.003302,37,0.941436,0.934766,0.937404,0.942761,0.939092,0.003183
4,316.605899,4.635897,5.144775,0.033312,,0.043316,57,"{'rf__max_features': None, 'rf__min_samples_sp...",0.917009,0.918955,...,0.944888,0.944405,0.000356,10,0.944577,0.944422,0.944378,0.944244,0.944405,0.000119
5,374.004148,10.938436,5.87437,0.153322,,0.027227,62,"{'rf__max_features': None, 'rf__min_samples_sp...",0.922335,0.923486,...,0.944888,0.944405,0.000356,10,0.944577,0.944422,0.944378,0.944244,0.944405,0.000119
6,57.636729,0.803438,4.47696,0.046606,sqrt,0.002369,24,"{'rf__max_features': 'sqrt', 'rf__min_samples_...",0.929496,0.931112,...,0.948362,0.947411,0.000671,2,0.947636,0.947654,0.947517,0.947628,0.947609,5.4e-05
7,87.65288,0.394179,5.320474,0.075378,log2,0.022535,50,"{'rf__max_features': 'log2', 'rf__min_samples_...",0.914258,0.91474,...,0.944587,0.943611,0.001269,28,0.944723,0.944201,0.941993,0.944042,0.94374,0.001039
8,94.914609,1.307624,5.410754,0.136019,sqrt,0.039919,61,"{'rf__max_features': 'sqrt', 'rf__min_samples_...",0.908698,0.907329,...,0.938752,0.937129,0.001182,42,0.935939,0.936618,0.937954,0.938389,0.937225,0.000988
9,66.321976,1.665115,4.455423,0.055874,log2,0.041138,42,"{'rf__max_features': 'log2', 'rf__min_samples_...",0.907267,0.907259,...,0.939112,0.935422,0.002173,44,0.934328,0.933313,0.935608,0.938767,0.935504,0.002052


In [11]:
for i in range(2):
    dict_results_rf[i+1].to_csv(f"rf_pipeline_{i+1}_results.csv", index=False)