In this notebook, we set our workflow in order to train a machine learning model.

In [9]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

In [10]:
df_train = pd.read_csv("notebook_insights/preprocessed_train.csv")
df_train.head()

Unnamed: 0,TARGET_FLAG,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,EDUCATION,...,Student,z_Blue Collar,Commercial,Minivan,Panel Truck,Pickup,Sports Car,Van,z_SUV,Highly Urban/ Urban
0,0,0,60.0,0,11.0,11.117643,0,1.0,0,3,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,0,43.0,0,11.0,11.423537,0,12.457811,0,0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0,0,35.0,1,10.0,9.682779,0,11.729576,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,0,0,51.0,0,14.0,,0,12.63216,1,0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0,0,50.0,0,,11.652566,0,12.404616,1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


# Data preprocessing

In [11]:
TARGET = 'TARGET_FLAG'

y = df_train[TARGET].values
X = df_train.drop(columns=TARGET)

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

numeric_features = ['AGE',
                    'YOJ',
                    'INCOME',
                    'HOME_VAL',
                    'BLUEBOOK',
                    'OLDCLAIM',
                    'CAR_AGE']

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('imputer', SimpleImputer(missing_values = np.nan, strategy='constant', fill_value=0))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),],
    remainder='passthrough')

# Model definition

In [13]:
from sklearn.linear_model import LogisticRegression

pipe = Pipeline(steps=[('preprocessor', preprocessor),                # In order to perform preprocessing
                      ('clf', LogisticRegression(random_state=1,      # In order to instanciate a model
                                                solver='liblinear',
                                                max_iter=300))])

# Split the data

We split the data into two folds: one for searching hyperparameters, which we name X_for_gridsearch, the second is to measure the generalization power of our model, which we name X_future_validation.

In [14]:
from sklearn.model_selection import train_test_split, StratifiedKFold

random_state = 4
n_splits = 4

# We keep one fold for validation
X_for_gridsearch, X_future_validation, y_for_gridsearch, y_future_validation \
= train_test_split(X, y, test_size=0.1, random_state=random_state, stratify=y)

# We use k-fold on the remaining data to search over hyper-parameters
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

# Model training

We perform k-fold in order to obtain model performances.

In [15]:
from sklearn.model_selection import GridSearchCV

param_grid = dict(clf__penalty = ['l1', 'l2'],
                  clf__C       = np.logspace(-2, 3, 6))

grid = GridSearchCV(pipe, 
                    param_grid=param_grid,
                    cv=kf, 
                    n_jobs=1, 
                    verbose=1,
                    scoring='f1',
                    return_train_score=True)
grid.fit(X_for_gridsearch, y_for_gridsearch)
print(grid.best_score_)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.5110115064182642


[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:    5.4s finished


In [16]:
cv_results = pd.DataFrame(grid.cv_results_)
cv_results = cv_results.sort_values("mean_test_score", ascending=False)
cv_results[['param_clf__C',
            'param_clf__penalty', 
            'split0_test_score',
            'split1_test_score',
            'split2_test_score',
            'mean_test_score',
            'std_test_score',
            'rank_test_score']].head(4)

Unnamed: 0,param_clf__C,param_clf__penalty,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
9,100,l2,0.505051,0.55164,0.503145,0.511012,0.024832,1
11,1000,l2,0.505051,0.55164,0.503145,0.511012,0.024832,1
7,10,l2,0.505051,0.54878,0.503145,0.510297,0.023665,3
8,100,l1,0.505051,0.552311,0.498113,0.509921,0.025598,4


# Model evaluation

We evaluate the model on the never seen validation set. The baseline is pretty low here we need either stronger model or feature engineering.

In [17]:
from sklearn.metrics import f1_score

y_pred = grid.predict(X_future_validation)
f1_score(y_future_validation, y_pred)

0.5166666666666667