In [1]:
# General data handling/io
import datetime
import pandas as pd
import numpy as np
import pickle

# ML libs
from sklearn.model_selection import (train_test_split,
                                     RepeatedStratifiedKFold,
                                     cross_val_score)
from sklearn.feature_selection import RFECV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (GradientBoostingClassifier, 
                              AdaBoostClassifier, 
                              RandomForestClassifier)

# Importing libs for SMOTE technique
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Turning off future warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [2]:
def base_data():
    
    return pd.read_csv('processed_data.csv')

In [3]:
def select_features(df):
    
    features = [f for f in df.columns if f != 'serious_delinquencies_in_past_2_years']
    
    X = df[features]
    Y = df['serious_delinquencies_in_past_2_years'].values
    
    return X, Y

In [4]:
def feature_select_recursive(df, model, name):
    
    X, Y = select_features(df)
        
    rfe = RFECV(model, cv=RepeatedStratifiedKFold(4),
              scoring='roc_auc')
    
    rfe = rfe.fit(X, Y)
    
    cols = list(X.columns)
    features = pd.Series(rfe.support_,index = cols)
    
    selected_features_rfe = features[features==True].index.tolist()
    
    print('\n')
    print('Selected Features for:', name)
    print(selected_features_rfe)
    print('\n')

    return selected_features_rfe

In [5]:
def model_selection():
    
    df = base_data()
    
    models = list()
    
    # Try these models first
    models.append(('Logistic', LogisticRegression())) 
    models.append(('KNN', KNeighborsClassifier(n_neighbors=5)))
    models.append(('GBC', GradientBoostingClassifier()))
    models.append(('Random Forest', RandomForestClassifier()))
    models.append(('CART', DecisionTreeClassifier()))
    
    for name, model in models:
        
        print('\n{}'.format(name))
        print('------------------------------------------------------')
        print('------------------------------------------------------')
        
        # Split out X, Y
        X, Y = select_features(df)
        
        if name != 'KNN':
        # Use RFECV
            features = feature_select_recursive(df, model, name)
        
        # Apply the feature selection
            X = X[features]
        
        # Break out train and test sets (using 80% train 20% test)
        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
        
        # Pipeline Steps
        steps = [('over', SMOTE()),
                 #('under', RandomUnderSampler(sampling_strategy='majority')), 
                 ('model', model)]
        
        # Pipeline
        pipeline = Pipeline(steps=steps)
        
        cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
        
        scores = cross_val_score(pipeline, X, Y, scoring='roc_auc', cv=cv, n_jobs=-1)
        
        print('{}: Mean ROC AUC: {} Std Dev: {}'.format(name, np.mean(scores).round(3),
                                                        np.std(scores).round(3)))
        
        # Fit the model to the training split
        pipeline.fit(X_train, y_train)
        
        # Make predictions
        predictions = pipeline.predict(X_test)
                
        print('\n Confusion Matrix:')
        print(confusion_matrix(y_test, predictions))
        
        print('\n Classification Report')
        print(classification_report(y_test, predictions))
        
        print('\n------------------------------------------------------')

In [6]:
model_selection()


Logistic
------------------------------------------------------
------------------------------------------------------


Selected Features for: Logistic
['age', 'debt_ratio', 'composite_overdue']


Logistic: Mean ROC AUC: 0.813 Std Dev: 0.007

 Confusion Matrix:
[[6124  835]
 [ 798 1106]]

 Classification Report
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      6959
           1       0.57      0.58      0.58      1904

    accuracy                           0.82      8863
   macro avg       0.73      0.73      0.73      8863
weighted avg       0.82      0.82      0.82      8863


------------------------------------------------------

KNN
------------------------------------------------------
------------------------------------------------------
KNN: Mean ROC AUC: 0.744 Std Dev: 0.009

 Confusion Matrix:
[[5270 1689]
 [ 742 1162]]

 Classification Report
              precision    recall  f1-score   support

           0       0.