In [51]:
import datetime
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (GradientBoostingClassifier, 
                              AdaBoostClassifier, 
                              RandomForestClassifier)

In [52]:
def base_data(path='processed_data.csv'):
    """Load the pre-processed modelling data set from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to ``'processed_data.csv'``,
        which is resolved relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pandas.read_csv`` with default
        options.
    """
    return pd.read_csv(path)

In [53]:
def select_features(df, target='serious_delinquencies_in_past_2_years'):
    """Split a frame into a feature matrix and a target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the predictors and the target column.
    target : str, optional
        Name of the column to predict. Every other column is treated as a
        feature. Defaults to ``'serious_delinquencies_in_past_2_years'``.

    Returns
    -------
    X : pandas.DataFrame
        All columns of ``df`` except ``target`` (a new frame; ``df`` is not
        modified).
    Y : numpy.ndarray
        Values of the ``target`` column.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    if target not in df.columns:
        # Same exception type the plain column lookup would raise, but with
        # a message that says which column was expected.
        raise KeyError(
            "target column {!r} not found in DataFrame".format(target))

    X = df.drop(columns=[target])
    Y = df[target].values

    return X, Y

In [54]:
def model_selection(test_size=0.2, random_state=0):
    """Fit several baseline classifiers and print hold-out diagnostics.

    Loads the data with ``base_data()``, separates features and target with
    ``select_features()``, makes a single train/test split, then fits each
    candidate model on the training part and prints, for the test part:

    * the confusion matrix,
    * the classification report,
    * feature importances sorted in descending order, for models that
      expose ``feature_importances_`` after fitting.

    Parameters
    ----------
    test_size : float, optional
        Fraction of rows held out for evaluation (default 0.2, i.e. an
        80% / 20% train/test split).
    random_state : int, optional
        Seed used for the train/test split and for the tree-based
        estimators so that repeated runs print the same numbers.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    df = base_data()

    # The split does not depend on the model, so do it once: every model is
    # then trained and scored on exactly the same rows.
    X, Y = select_features(df)
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)

    # Try these models first. The tree-based estimators are randomised, so
    # they are seeded to make the printed metrics reproducible.
    models = [
        ('CART', DecisionTreeClassifier(random_state=random_state)),
        ('Random Forest', RandomForestClassifier(random_state=random_state)),
        ('GBC', GradientBoostingClassifier(random_state=random_state)),
        ('Logistic', LogisticRegression()),
    ]

    for name, model in models:

        # Fit the model
        model.fit(X_train, y_train)

        # Make predictions
        predictions = model.predict(X_test)

        print('\n{}'.format(name))
        print('------------------------------------------------------')
        print('------------------------------------------------------')

        print('\n Confusion Matrix:')
        print(confusion_matrix(y_test, predictions))

        print('\n Classification Report')
        print(classification_report(y_test, predictions))

        # Check for the attribute instead of matching on the display name,
        # so adding another model without importances cannot raise here.
        if hasattr(model, 'feature_importances_'):
            print('\n Feature Importances')
            print(sorted(zip(X.columns, model.feature_importances_),
                         key=lambda pair: pair[1], reverse=True))
        print('\n------------------------------------------------------')

In [55]:
# Run the model comparison; the per-model diagnostics are printed below.
model_selection()


CART
------------------------------------------------------
------------------------------------------------------

 Confusion Matrix:
[[5825 1130]
 [ 986  922]]

 Classification Report
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      6955
           1       0.45      0.48      0.47      1908

    accuracy                           0.76      8863
   macro avg       0.65      0.66      0.66      8863
weighted avg       0.77      0.76      0.76      8863


 Feature Importances
[('debt_ratio', 0.30812112109114476), ('composite_overdue', 0.29233202853787194), ('monthly_revenue', 0.19963314326617004), ('age', 0.15076190519056235), ('rated_exposure', 0.049151801914250844)]

------------------------------------------------------

Random Forest
------------------------------------------------------
------------------------------------------------------

 Confusion Matrix:
[[6430  525]
 [1004  904]]

 Classification Report
              pr