# 0 Daten & Packages laden

In [1]:
import pandas as pd
import os
import gc
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

In [2]:
X_train = pd.read_parquet("/kaggle/input/axa-challenge-final/X_train_hyperparemeter_opt.parquet")
y_train = pd.read_parquet("/kaggle/input/axa-challenge-final/y_train_hyperparemeter_opt.parquet")

In [3]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 877416 entries, 3251228 to 8325804
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   tripduration          877416 non-null  int32  
 1   start station id      877416 non-null  float64
 2   end station id        877416 non-null  float64
 3   gender                877416 non-null  object 
 4   age                   877416 non-null  int32  
 5   month                 877416 non-null  int64  
 6   weekday               877416 non-null  object 
 7   time_hours            877416 non-null  int64  
 8   distance_travelled_m  877416 non-null  float32
 9   same_start_end        877416 non-null  int64  
 10  is_holiday            877416 non-null  int8   
dtypes: float32(1), float64(2), int32(2), int64(3), int8(1), object(2)
memory usage: 64.4+ MB


Datentransformationen sind notwendig weil einige Datenformate durch Parquet verloren gegangen sind

In [4]:
X_train["start station id"] = X_train["start station id"].astype("object")
X_train["end station id"] = X_train["end station id"].astype("object")
X_train["month"] = X_train["month"].astype("object")
X_train["time_hours"] = X_train["time_hours"].astype("object")

X_train["same_start_end"] = X_train["time_hours"].astype("int8")

# 6 Model CV

### 6.3 Neural Network

In [5]:
%%capture
! pip install feature_engine

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import quantile_transform
from sklearn.preprocessing import FunctionTransformer
from feature_engine.encoding import RareLabelEncoder
from feature_engine.outliers import Winsorizer
from feature_engine.encoding import MeanEncoder
from sklearn.impute import SimpleImputer
from feature_engine.imputation import CategoricalImputer
from sklearn.preprocessing import KBinsDiscretizer

In [7]:
#Import der Pipelines und Definition der log_trans Funktion

import joblib

def log_trans(x):
    return np.log(x+0.01)

preprocessor_final_1 = joblib.load("/kaggle/input/axa-challenge-final/preprocessor_final_1.joblib")
preprocessor_final_2 = joblib.load("/kaggle/input/axa-challenge-final/preprocessor_final_2.joblib")

In [8]:
import tensorflow as tf

def auc(y_true, y_pred):
    auc = tf.keras.metrics.AUC(y_true, y_pred)[1]
    K.get_session().run(tf.local_variables_initializer())
    return auc

In [9]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=4, random_state=0, shuffle=True)

kf.get_n_splits(X_train)

4

# Pipeline 1

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout
from tensorflow.keras.callbacks import EarlyStopping


early_stop = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=10)

i=1
history_dict = {}


try:
    del X_train_tmp, y_train_tmp, X_test_tmp, y_test_tmp, NN
except:
    pass

gc.collect()


for train_index, test_index in kf.split(X_train):
    #Split test and train
    
    X_train_tmp, X_test_tmp = X_train.reset_index(drop=True).iloc[train_index, :], X_train.reset_index(drop=True).iloc[test_index,:]
    y_train_tmp, y_test_tmp = y_train.reset_index(drop=True)["usertype"][train_index], y_train.reset_index(drop=True)["usertype"][test_index]
    
    X_train_tmp = preprocessor_final_1.fit_transform(X_train_tmp, y_train_tmp)
    X_test_tmp = preprocessor_final_1.transform(X_test_tmp)
    
    #Create model
    NN = Sequential()
    NN.add(Dense(units=1024,activation='relu'))
    NN.add(Dropout(0.3))
    NN.add(Dense(units=512,activation='relu'))
    NN.add(Dropout(0.3))
    NN.add(Dense(units=256,activation='relu'))
    NN.add(Dropout(0.3))
    NN.add(Dense(units=128,activation='relu'))
    NN.add(Dropout(0.1))
    NN.add(Dense(units=32,activation='relu'))
    NN.add(Dropout(0.1))
    NN.add(Dense(units=1,activation='sigmoid'))
    
    NN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', tf.keras.metrics.AUC()])
    
    print("Start Fitting")
    
    history = NN.fit(x=X_train_tmp, 
          y=y_train_tmp, 
          epochs=100,
          batch_size=10240,
          validation_data=(X_test_tmp, y_test_tmp), verbose=0,
          callbacks=[early_stop]
          )
    
    history_dict[i] = [np.max(history.history["val_accuracy"]), np.max(history.history[ list(history.history.keys())[-1] ])]
    print(f"Step {i} finished")
    i += 1
    
    del X_train_tmp, y_train_tmp, X_test_tmp, y_test_tmp, NN
    gc.collect()

mean_acc = (history_dict[1][0] + history_dict[2][0] + history_dict[3][0] + history_dict[4][0])/4
mean_auc = (history_dict[1][1] + history_dict[2][1] + history_dict[3][1] + history_dict[4][1])/4




2022-12-11 16:58:42.996711: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-11 16:58:43.106644: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-11 16:58:43.107440: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-11 16:58:43.109633: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Start Fitting


2022-12-11 16:58:46.492978: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 00044: early stopping
Step 1 finished
Start Fitting
Epoch 00025: early stopping
Step 2 finished
Start Fitting
Epoch 00035: early stopping
Step 3 finished
Start Fitting
Epoch 00027: early stopping
Step 4 finished


In [11]:
print(f"Mean_CV_Accuracy: {mean_acc}")
print(f"Mean_CV_AUC: {mean_auc}")

Mean_CV_Accuracy: 0.9505183398723602
Mean_CV_AUC: 0.943131148815155


# Pipeline 2

In [12]:
i=1
history_dict = {}


try:
    del X_train_tmp, y_train_tmp, X_test_tmp, y_test_tmp, NN
except:
    pass

gc.collect()


for train_index, test_index in kf.split(X_train):
    #Split test and train
    
    X_train_tmp, X_test_tmp = X_train.reset_index(drop=True).iloc[train_index, :], X_train.reset_index(drop=True).iloc[test_index,:]
    y_train_tmp, y_test_tmp = y_train.reset_index(drop=True)["usertype"][train_index], y_train.reset_index(drop=True)["usertype"][test_index]
    
    X_train_tmp = preprocessor_final_2.fit_transform(X_train_tmp, y_train_tmp)
    X_test_tmp = preprocessor_final_2.transform(X_test_tmp)
    
    #Create model
    NN = Sequential()
    NN.add(Dense(units=1024,activation='relu'))
    NN.add(Dropout(0.3))
    NN.add(Dense(units=512,activation='relu'))
    NN.add(Dropout(0.3))
    NN.add(Dense(units=256,activation='relu'))
    NN.add(Dropout(0.3))
    NN.add(Dense(units=128,activation='relu'))
    NN.add(Dropout(0.1))
    NN.add(Dense(units=32,activation='relu'))
    NN.add(Dropout(0.1))
    NN.add(Dense(units=1,activation='sigmoid'))
    
    NN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', tf.keras.metrics.AUC()])
    
    print("Start Fitting")
    
    history = NN.fit(x=X_train_tmp, 
          y=y_train_tmp, 
          epochs=100,
          batch_size=10240,
          validation_data=(X_test_tmp, y_test_tmp), verbose=0,
          callbacks=[early_stop]
          )
    
    history_dict[i] = [np.max(history.history["val_accuracy"]), np.max(history.history[ list(history.history.keys())[-1] ])]
    print(f"Step {i} finished")
    i += 1
    
    del X_train_tmp, y_train_tmp, X_test_tmp, y_test_tmp, NN
    gc.collect()

mean_acc = (history_dict[1][0] + history_dict[2][0] + history_dict[3][0] + history_dict[4][0])/4
mean_auc = (history_dict[1][1] + history_dict[2][1] + history_dict[3][1] + history_dict[4][1])/4


Start Fitting
Epoch 00041: early stopping
Step 1 finished
Start Fitting
Epoch 00023: early stopping
Step 2 finished
Start Fitting
Epoch 00072: early stopping
Step 3 finished
Start Fitting
Epoch 00051: early stopping
Step 4 finished


In [13]:
print(f"Mean_CV_Accuracy: {mean_acc}")
print(f"Mean_CV_AUC: {mean_auc}")

Mean_CV_Accuracy: 0.9505092203617096
Mean_CV_AUC: 0.9418958574533463


# Save final model

In [14]:
del X_train, y_train
gc.collect()

42

In [15]:
X_train = pd.read_parquet("/kaggle/input/axa-challenge-final/X_train.parquet")
y_train = pd.read_parquet("/kaggle/input/axa-challenge-final/y_train.parquet")

In [16]:
X_train["start station id"] = X_train["start station id"].astype("object")
X_train["end station id"] = X_train["end station id"].astype("object")
X_train["month"] = X_train["month"].astype("object")
X_train["time_hours"] = X_train["time_hours"].astype("object")

X_train["same_start_end"] = X_train["time_hours"].astype("int8")

In [17]:
train_tmp = pd.concat([X_train, y_train], axis=1).sample(frac=0.5, random_state=1)

In [18]:
X_train = train_tmp.drop("usertype", axis=1)
y_train = train_tmp["usertype"]

In [19]:
X_train = preprocessor_final_1.fit_transform(X_train, y_train)


NN = Sequential()
NN.add(Dense(units=1024,activation='relu'))
NN.add(Dropout(0.3))
NN.add(Dense(units=512,activation='relu'))
NN.add(Dropout(0.3))
NN.add(Dense(units=256,activation='relu'))
NN.add(Dropout(0.3))
NN.add(Dense(units=128,activation='relu'))
NN.add(Dropout(0.1))
NN.add(Dense(units=32,activation='relu'))
NN.add(Dropout(0.1))
NN.add(Dense(units=1,activation='sigmoid'))

NN.compile(loss='binary_crossentropy', optimizer='adam')

NN.fit(x=X_train, 
          y=y_train, 
          epochs=30,
          batch_size=10240,
          verbose=1,
          )

NN.save('Final_NN.h5')

2022-12-11 17:06:25.248708: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1400357304 exceeds 10% of free system memory.
2022-12-11 17:06:26.918281: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1400357304 exceeds 10% of free system memory.


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
