In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

In [2]:
df_train = pd.read_csv("notebook_insights/preprocessed_train.csv")
df_train.head()

Unnamed: 0,TARGET_FLAG,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,EDUCATION,...,Student,z_Blue Collar,Commercial,Minivan,Panel Truck,Pickup,Sports Car,Van,z_SUV,Highly Urban/ Urban
0,0,0,60.0,0,11.0,11.117643,0,1.0,0,3,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,0,43.0,0,11.0,11.423537,0,12.457811,0,0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0,0,35.0,1,10.0,9.682779,0,11.729576,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,0,0,51.0,0,14.0,,0,12.63216,1,0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0,0,50.0,0,,11.652566,0,12.404616,1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


# Data preprocessing

In [3]:
TARGET = 'TARGET_FLAG'

y = df_train[TARGET].values
X = df_train.drop(columns=TARGET)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

numeric_features = ['AGE',
                    'INCOME',
                    'HOME_VAL',
                    'BLUEBOOK',
                    'OLDCLAIM',
                    'CAR_AGE']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values = np.nan, strategy='constant', fill_value=0))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),],
    remainder='passthrough')

# Model definition

In [5]:
from xgboost import XGBClassifier

model = XGBClassifier(objective='binary:logistic',
                              use_label_encoder=False,
                              eval_metric='logloss',
                              random_state='42')

pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('clf', model)])

# Split the data

We split the data into two folds: one for searching hyperparameters, which we name X_for_gridsearch, the second is to measure the generalization power of our model, which we name X_future_validation.

In [6]:
from sklearn.model_selection import train_test_split, StratifiedKFold

random_state = 4
n_splits = 4

# We keep one fold for validation
X_for_gridsearch, X_future_validation, y_for_gridsearch, y_future_validation \
= train_test_split(X, y, test_size=0.1, random_state=random_state, stratify=y)

# We use k-fold on the remaining data to search over hyper-parameters
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

# Model training

We perform k-fold in order to obtain model performances.

In [7]:
from sklearn.model_selection import RandomizedSearchCV

param_grid  = dict(clf__n_estimators  = np.linspace(100, 300, 6).astype(int),
                           clf__max_depth     = [5, 8, 10, 15, 20, 30, 50, 75, 100],
                           clf__alpha         = [0.01, 0.05, 0.1, 0.3, 0.5, 1, 10],
                           clf__learning_rate = [0.1, 0.08, 0.05, 0.02, 0.01],
                 )

grid = RandomizedSearchCV(pipe,
                          param_distributions=param_grid,
                          cv=kf,
                          verbose=1,
                          scoring='f1',
                          random_state=42,
                          n_iter=20,
                         )
grid.fit(X_for_gridsearch, y_for_gridsearch)
print(grid.best_score_)

Fitting 4 folds for each of 20 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  1.3min finished


0.5467310085748781


In [8]:
cv_results = pd.DataFrame(grid.cv_results_)
cv_results = cv_results.sort_values("mean_test_score", ascending=False)
cv_results[['split0_test_score',
            'split1_test_score',
            'split2_test_score',
            'mean_test_score',
            'std_test_score',
            'rank_test_score']].head(4)

Unnamed: 0,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
2,0.544578,0.576151,0.521845,0.546731,0.019334,1
6,0.526829,0.5549,0.532338,0.537015,0.010661,2
4,0.515971,0.560479,0.515152,0.531359,0.018396,3
8,0.517282,0.540925,0.526961,0.528734,0.008426,4


# Model evaluation

We evaluate the model on the never seen validation set. We got an improvement compared to the baseline. Further feature engineering could help increasing the F1 score.

In [54]:
from sklearn.metrics import f1_score

y_pred = grid.predict(X_future_validation)
f1_score(y_future_validation, y_pred)

0.6036745406824148