In [24]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.utils import resample

import time
import random

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, make_scorer, precision_score, accuracy_score,f1_score, recall_score
from sklearn.linear_model import SGDClassifier
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ClassPredictionError
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ROCAUC
from yellowbrick.classifier import PrecisionRecallCurve
from sklearn.model_selection import cross_val_predict, cross_val_score


from sklearn.feature_selection import RFE
from boruta import BorutaPy

In [27]:
random_seed=42

In [17]:
df_train = pd.read_csv('train.csv')


In [18]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,acc_now_delinq,annual_inc,joint_app,chargeoff_within_12_mths,collection_recovery_fee,collections_12_mths_ex_med,delinq_2yrs,delinq_amnt,dti,...,tax_liens.1,term.1,total_acc.1,total_pymnt.1,total_pymnt_inv.1,total_rec_int.1,total_rec_late_fee.1,total_rec_prncp.1,earliest_cr_line_date.1,loan_status
0,52309,0.0,0.017296,0.0,0.0,0.0,0.0,0.0,0.0,0.042479,...,0.0,1.0,0.305085,0.556515,0.557348,0.299021,1.000981e-11,0.65,0.579996,0
1,38014,0.0,0.008719,0.0,0.0,0.0,0.0,0.0,0.0,0.017815,...,0.0,1.0,0.09322,0.732477,0.733149,0.595846,1.000981e-11,0.7275,0.738631,0
2,49445,0.0,0.008719,0.0,0.0,0.0,0.0,0.0,0.0,0.03174,...,0.0,0.0,0.398305,0.138887,0.140101,0.056731,1.000981e-11,0.175,0.713353,0
3,25480,0.0,0.012007,0.0,0.0,0.0,0.0,0.0,0.0,0.021917,...,0.0,0.0,0.237288,0.149915,0.151119,0.027469,1.000981e-11,0.21,0.80934,0
4,22883,0.0,0.008576,0.0,0.0,0.0,0.0,0.0,0.0,0.039267,...,0.0,1.0,0.381356,0.488459,0.489354,0.065163,1.000981e-11,0.695,0.693332,0


In [19]:
df_train.drop('Unnamed: 0',axis=1,inplace=True)

In [20]:
X_train=df_train.drop('loan_status',axis=1)
y_train=df_train.loan_status

In [21]:
X_train.shape 

(34563, 1652)

1652 variables, far too many to be implemented. Can this be reduced?

Take best model from imbalance strategy comparison (logistic regression with oversampling)

In [22]:
df_majority = df_train[df_train['loan_status']==0]
df_minority = df_train[df_train['loan_status']==1]

In [28]:
df_minority_oversampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=len(df_majority),    # to match majority class
                                 random_state=random_seed)
df_os = pd.concat([df_minority_oversampled, df_majority])

In [30]:
y_train_os = df_os['loan_status']
X_train_os = df_os.drop('loan_status', axis=1)

In [32]:
#best model
model = LogisticRegression(C=10, class_weight = 'balanced', max_iter = 100)

In [33]:
model.fit(X_train_os,y_train_os)

LogisticRegression(C=10, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [34]:
importance =abs( model.coef_[0])
importance

array([ 0.43554461,  0.36690862,  0.73423179, ...,  6.77633025,
       42.37786181,  0.09244833])

In [None]:
df_train.reset_index()

In [41]:
feature_name = X_train_os.columns.values
summary_table = pd.DataFrame(columns = ['Feature name'], data = feature_name)
summary_table['Coefficients'] = np.transpose(model.coef_)
summary_table.index = summary_table.index + 1
# summary_table.loc[0] = ['Intercept', model.intercept_[0]] # intercept is not in column names
summary_table = summary_table.sort_values(by='Coefficients',ascending=False).reset_index(drop=True)
summary_table

Unnamed: 0,Feature name,Coefficients
0,funded_amnt,19.079208
1,funded_amnt.1,19.079208
2,funded_amnt_inv.1,16.838856
3,funded_amnt_inv,16.838856
4,loan_amnt,15.842987
...,...,...
1647,total_pymnt,-18.111103
1648,total_pymnt_inv,-19.119052
1649,total_pymnt_inv.1,-19.119052
1650,total_rec_prncp,-42.377862


In [42]:
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(n_jobs = -1, max_depth = 5)

In [43]:
boruta_selector = BorutaPy(forest, n_estimators = 'auto', random_state = 0,perc=90,verbose=2)
boruta_selector.fit(np.array(X_train), np.array(y_train))

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	1652
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	1652
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	1652
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	1652
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	1652
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	1652
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	1652
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	123
Rejected: 	1529
Iteration: 	9 / 100
Confirmed: 	39
Tentative: 	84
Rejected: 	1529
Iteration: 	10 / 100
Confirmed: 	39
Tentative: 	84
Rejected: 	1529
Iteration: 	11 / 100
Confirmed: 	39
Tentative: 	84
Rejected: 	1529
Iteration: 	12 / 100
Confirmed: 	45
Tentative: 	49
Rejected: 	1558
Iteration: 	13 / 100
Confirmed: 	45
Tentative: 	49
Rejected: 	1558
Iteration: 	14 / 100
Confirmed: 	45
Tentative: 	49
Rejected: 	1558
Iteration: 	15 / 100
Confirmed: 	45
Tentative: 	49
Rejected: 	1558
Iteration: 	16 / 1

BorutaPy(alpha=0.05,
         estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                         criterion='mse', max_depth=5,
                                         max_features='auto',
                                         max_leaf_nodes=None, max_samples=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                                         min_samples_leaf=1,
                                         min_samples_split=2,
                                         min_weight_fraction_leaf=0.0,
                                         n_estimators=220, n_jobs=-1,
                                         oob_score=False,
                                         random_state=RandomState(MT19937) at 0x129A75380,
                                         verbose=0, warm_start=False),
         max_iter=100, n_estimators='auto', perc=90,
         random_state=R

In [44]:
features = [f for f in X_train.columns]
len(features)

1652

In [45]:
final_features = []
indexes = np.where(boruta_selector.support_ == True) or np.where(boruta_selector.support_weak_ == True)
for x in np.nditer(indexes):
    final_features.append(features[x])
print(final_features)
len(final_features)

['annual_inc', 'funded_amnt', 'funded_amnt_inv', 'installment', 'issue_d', 'last_pymnt_amnt', 'loan_amnt', 'pub_rec_bankruptcies', 'recoveries', 'term', 'total_rec_int', 'total_rec_late_fee', 'total_rec_prncp', 'debt_settlement_flag_N', 'debt_settlement_flag_Y', 'earliest_cr_line_Aug-2005', 'earliest_cr_line_Dec-1999', 'earliest_cr_line_Dec-2006', 'earliest_cr_line_Feb-1985', 'earliest_cr_line_Jan-2006', 'earliest_cr_line_Jul-1985', 'earliest_cr_line_Jun-1976', 'earliest_cr_line_Jun-1997', 'earliest_cr_line_May-1969', 'earliest_cr_line_Nov-2003', 'earliest_cr_line_Sep-1983', 'zip_code_029xx', 'zip_code_076xx', 'zip_code_081xx', 'zip_code_119xx', 'zip_code_180xx', 'zip_code_184xx', 'zip_code_199xx', 'zip_code_372xx', 'zip_code_381xx', 'zip_code_386xx', 'zip_code_605xx', 'zip_code_751xx', 'zip_code_874xx', 'zip_code_954xx', 'zip_code_978xx', 'annual_inc.1', 'funded_amnt.1', 'funded_amnt_inv.1', 'installment.1', 'issue_d.1', 'last_pymnt_amnt.1', 'loan_amnt.1', 'recoveries.1', 'term.1', 't

52

In [47]:
X_boruta = X_train[final_features]

In [48]:
X_train_os_boruta = X_train_os[final_features]

In [49]:
X_train_os_boruta

Unnamed: 0,annual_inc,funded_amnt,funded_amnt_inv,installment,issue_d,last_pymnt_amnt,loan_amnt,pub_rec_bankruptcies,recoveries,term,...,funded_amnt.1,funded_amnt_inv.1,installment.1,issue_d.1,last_pymnt_amnt.1,loan_amnt.1,recoveries.1,term.1,total_rec_late_fee.1,total_rec_prncp.1
4197,0.010149,0.179487,0.20000,0.148906,0.755706,0.001118,0.179487,0.000000,0.032008,0.0,...,0.179487,0.20000,0.148906,0.755706,0.001118,0.179487,0.032008,0.0,2.944063e-02,0.060308
27419,0.007719,0.435897,0.45000,0.402978,0.526712,0.015970,0.435897,0.000000,0.100952,0.0,...,0.435897,0.45000,0.402978,0.526712,0.015970,0.435897,0.100952,0.0,1.000981e-11,0.019250
26554,0.007290,0.358974,0.37500,0.245321,0.725107,0.009990,0.358974,0.166667,0.000000,1.0,...,0.358974,0.37500,0.245321,0.725107,0.009990,0.358974,0.000000,1.0,1.000981e-11,0.070240
26379,0.017867,0.743590,0.75000,0.535441,0.534487,0.020995,0.743590,0.000000,0.162351,1.0,...,0.743590,0.75000,0.535441,0.534487,0.020995,0.743590,0.162351,1.0,1.000981e-11,0.081211
19190,0.013665,0.384615,0.40000,0.232383,0.649110,0.009499,0.384615,0.000000,0.053565,1.0,...,0.384615,0.40000,0.232383,0.649110,0.009499,0.384615,0.053565,1.0,1.000981e-11,0.089429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34557,0.023728,0.871795,0.87500,0.482210,0.961625,0.864113,0.871795,0.000000,0.000000,1.0,...,0.871795,0.87500,0.482210,0.961625,0.864113,0.871795,0.000000,1.0,1.000981e-11,0.875000
34559,0.010577,0.410256,0.42500,0.327207,0.709807,0.211803,0.410256,0.000000,0.000000,0.0,...,0.410256,0.42500,0.327207,0.709807,0.211803,0.410256,0.000000,0.0,1.000981e-11,0.425000
34560,0.013722,0.343590,0.36000,0.274404,0.763230,0.238712,0.343590,0.000000,0.000000,0.0,...,0.343590,0.36000,0.274404,0.763230,0.238712,0.343590,0.000000,0.0,1.000981e-11,0.360000
34561,0.016581,1.000000,1.00000,0.856741,0.771006,0.348888,1.000000,0.000000,0.000000,0.0,...,1.000000,1.00000,0.856741,0.771006,0.348888,1.000000,0.000000,0.0,1.000981e-11,1.000000


In [50]:
#pipeline for logistic regression

pipe_logreg = Pipeline([
    ('clf', LogisticRegression())
])

parameters_logreg = {
    "clf__C": [0.1,1,10],
    "clf__class_weight": ['balanced'],
    "clf__solver": ['newton-cg','lbfgs','sag','saga'],
    "clf__max_iter":[10,50,100],
    "clf__penalty": ['l1','l2'],
    'clf__tol':[0.00005,0.0001, 0.0005]
}

gs_logreg = GridSearchCV(estimator=pipe_logreg,
            param_grid=parameters_logreg,
            scoring='recall',
            cv=KFold(5,shuffle=True,random_state=42), 
            return_train_score = True, verbose=1,n_jobs=-1)

# Fit using grid search
best_model_base = gs_logreg.fit(X_train_os_boruta, y_train_os)

# Best accuracy
print('Best recall score: %.3f' % gs_logreg.best_score_)

# Best params
print('\nBest params:\n', gs_logreg.best_params_)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   52.0s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 1080 out of 1080 | elapsed:  9.8min finished


Best recall score: 0.991

Best params:
 {'clf__C': 10, 'clf__class_weight': 'balanced', 'clf__max_iter': 100, 'clf__penalty': 'l1', 'clf__solver': 'saga', 'clf__tol': 5e-05}


In [60]:
X_train_os_fs = X_train_os[['annual_inc', 'funded_amnt', 'funded_amnt_inv', 'installment', 'issue_d', 'last_pymnt_amnt', 'loan_amnt', 'pub_rec_bankruptcies', 'recoveries', 'term', 'total_rec_int', 'total_rec_late_fee', 'total_rec_prncp', 'debt_settlement_flag_N']]

In [61]:
#pipeline for logistic regression

pipe_logreg = Pipeline([
    ('clf', LogisticRegression())
])

parameters_logreg = {
    "clf__C": [0.1,1,10],
    "clf__class_weight": ['balanced'],
    "clf__solver": ['newton-cg','lbfgs','sag','saga'],
    "clf__max_iter":[10,50,100],
    "clf__penalty": ['l1','l2'],
    'clf__tol':[0.00005,0.0001, 0.0005]
}

gs_logreg = GridSearchCV(estimator=pipe_logreg,
            param_grid=parameters_logreg,
            scoring='recall',
            cv=KFold(5,shuffle=True,random_state=42), 
            return_train_score = True, verbose=1,n_jobs=-1)

# Fit using grid search
best_model_fs = gs_logreg.fit(X_train_os_fs, y_train_os)

# Best accuracy
print('Best recall score: %.3f' % gs_logreg.best_score_)

# Best params
print('\nBest params:\n', gs_logreg.best_params_)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 347 tasks      | elapsed:   59.3s
[Parallel(n_jobs=-1)]: Done 753 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1080 out of 1080 | elapsed:  3.6min finished


Best recall score: 0.991

Best params:
 {'clf__C': 10, 'clf__class_weight': 'balanced', 'clf__max_iter': 100, 'clf__penalty': 'l1', 'clf__solver': 'saga', 'clf__tol': 5e-05}
