In [1]:
import datetime
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (GradientBoostingClassifier, 
                              AdaBoostClassifier, 
                              RandomForestClassifier)

In [2]:
def base_data():
    
    return pd.read_csv('processed_data.csv')

In [3]:
def select_features(df):
    
    features = [f for f in df.columns if f != 'serious_delinquencies_in_past_2_years']
    
    X = df[features]
    Y = df['serious_delinquencies_in_past_2_years'].values
    
    return X, Y

In [4]:
def feature_select_recursive(df, model, name):
    
    X, Y = select_features(df)
        
    rfe = RFECV(model, cv=StratifiedKFold(4),
              scoring='accuracy')
    
    rfe = rfe.fit(X, Y)
    
    cols = list(X.columns)
    features = pd.Series(rfe.support_,index = cols)
    
    selected_features_rfe = features[features==True].index.tolist()
    
    print('\n')
    print('Selected Features for:', name)
    print(selected_features_rfe)
    print('\n')

    return selected_features_rfe

In [5]:
def model_selection():
    
    df = base_data()
    
    models = list()
    
    # Try these models first
    models.append(('CART', DecisionTreeClassifier()))
    models.append(('Random Forest', RandomForestClassifier()))
    models.append(('GBC', GradientBoostingClassifier()))
    models.append(('Logistic', LogisticRegression()))
    
    for name, model in models:
        
        # Split out X, Y
        X, Y = select_features(df)
    
        # Use RFECV
        features = feature_select_recursive(df, model, name)
        
        # Apply the feature selection
        X = X[features]
        
        # Break out train and test sets (using 80% train 20% test)
        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
        
        # Fit the model
        model.fit(X_train, y_train)
        
        # Make predictions
        predictions = model.predict(X_test)
        
        print('\n{}'.format(name))
        print('------------------------------------------------------')
        print('------------------------------------------------------')
        
        print('\n Confusion Matrix:')
        print(confusion_matrix(y_test, predictions))
        
        print('\n Classification Report')
        print(classification_report(y_test, predictions))
        
        if name != 'Logistic':
            print('\n Feature Importances')
            print(sorted(zip(X.columns, model.feature_importances_),
                         key=lambda l: l[1],reverse=True))
        print('\n------------------------------------------------------')

In [6]:
model_selection()



Selected Features for: CART
['monthly_revenue', 'debt_ratio', 'composite_overdue']



CART
------------------------------------------------------
------------------------------------------------------

 Confusion Matrix:
[[5890 1065]
 [1006  902]]

 Classification Report
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      6955
           1       0.46      0.47      0.47      1908

    accuracy                           0.77      8863
   macro avg       0.66      0.66      0.66      8863
weighted avg       0.77      0.77      0.77      8863


 Feature Importances
[('debt_ratio', 0.4445180512777122), ('composite_overdue', 0.3027487987737805), ('monthly_revenue', 0.2527331499485074)]

------------------------------------------------------


Selected Features for: Random Forest
['age', 'monthly_revenue', 'debt_ratio', 'rated_exposure', 'composite_overdue']



Random Forest
------------------------------------------------------
---------