In [43]:
# General data handling/io
import datetime
import pandas as pd
import numpy as np
import pickle

# ML libs
from sklearn.model_selection import (train_test_split,
                                     RepeatedStratifiedKFold,
                                     cross_val_score)
from sklearn.feature_selection import RFECV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (GradientBoostingClassifier, 
                              AdaBoostClassifier, 
                              RandomForestClassifier)

# Importing libs for SMOTE technique
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Turning off future warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [44]:
def base_data():
    
    return pd.read_csv('processed_data.csv')

In [45]:
def select_features(df):
    
    features = [f for f in df.columns if f != 'serious_delinquencies_in_past_2_years']
    
    X = df[features]
    Y = df['serious_delinquencies_in_past_2_years'].values
    
    return X, Y

In [46]:
def feature_select_recursive(df, model, name):
    
    X, Y = select_features(df)
        
    rfe = RFECV(model, cv=RepeatedStratifiedKFold(4),
              scoring='roc_auc')
    
    rfe = rfe.fit(X, Y)
    
    cols = list(X.columns)
    features = pd.Series(rfe.support_,index = cols)
    
    selected_features_rfe = features[features==True].index.tolist()
    
    print('\n')
    print('Selected Features for:', name)
    print(selected_features_rfe)
    print('\n')

    return selected_features_rfe

In [47]:
def model_selection():
    
    df = base_data()
    
    models = list()
    
    # Try these models first
    models.append(('CART', DecisionTreeClassifier()))
    models.append(('Random Forest', RandomForestClassifier()))
    models.append(('GBC', GradientBoostingClassifier()))
    models.append(('Logistic', LogisticRegression()))
    models.append(('KNN', KNeighborsClassifier()))
    
    for name, model in models:
        
        print('\n{}'.format(name))
        print('------------------------------------------------------')
        print('------------------------------------------------------')
        
        # Split out X, Y
        X, Y = select_features(df)
    
        # Use RFECV
        features = feature_select_recursive(df, model, name)
        
        # Apply the feature selection
        X = X[features]
        
        # Break out train and test sets (using 80% train 20% test)
        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
        
        # Pipeline Steps
        steps = [('over', SMOTE(sampling_strategy=0.4)),
                 #('under', RandomUnderSampler(sampling_strategy=0.5)), 
                 ('model', model)]
        
        # Pipeline
        pipeline = Pipeline(steps=steps)
        
        cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
        scores = cross_val_score(pipeline, X, Y, scoring='roc_auc', cv=cv, n_jobs=-1)
        print('{}: Mean ROC AUC: {}'.format(name, np.mean(scores)))
        
        # Fit the model
        pipeline.fit(X_train, y_train)
        
        # Make predictions
        predictions = pipeline.predict(X_test)
                
        print('\n Confusion Matrix:')
        print(confusion_matrix(y_test, predictions))
        
        print('\n Classification Report')
        print(classification_report(y_test, predictions))
        
        print('\n------------------------------------------------------')

In [48]:
model_selection()


CART
------------------------------------------------------
------------------------------------------------------


Selected Features for: CART
['age', 'monthly_revenue', 'debt_ratio', 'rated_exposure', 'composite_overdue']


CART: Mean ROC AUC: 0.6544304396425424

 Confusion Matrix:
[[5610 1345]
 [ 910  998]]

 Classification Report
              precision    recall  f1-score   support

           0       0.86      0.81      0.83      6955
           1       0.43      0.52      0.47      1908

    accuracy                           0.75      8863
   macro avg       0.64      0.66      0.65      8863
weighted avg       0.77      0.75      0.75      8863


------------------------------------------------------

Random Forest
------------------------------------------------------
------------------------------------------------------


Selected Features for: Random Forest
['age', 'monthly_revenue', 'debt_ratio', 'rated_exposure', 'composite_overdue']


Random Forest: Mean ROC AUC: 0.79