In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


In [2]:
trainDF = pd.read_csv(Path('Resources/2019loans.csv'))
testDF = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
del trainDF['Unnamed: 0']
trainDF.head()

Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,29.99,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,11.26,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,11.28,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,18.08,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,27.77,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [4]:
del testDF['Unnamed: 0']
testDF.head()

Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,19.75,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,11.52,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,6.74,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,12.13,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,16.08,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [5]:
# Convert categorical data to numeric and separate target feature for training data
trainDF = pd.get_dummies(trainDF, columns=['home_ownership', 'verification_status', 'application_type', 'hardship_flag',
                                         'debt_settlement_flag', 'initial_list_status', 'pymnt_plan', 'loan_status'])
trainDF.head()

Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y,initial_list_status_f,initial_list_status_w,pymnt_plan_n,loan_status_high_risk,loan_status_low_risk
0,57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,0.0,...,0,1,0,1,0,0,1,1,0,1
1,141451,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,0.0,...,0,1,0,1,0,0,1,1,0,1
2,321143,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,0.0,...,0,1,0,1,0,0,1,1,0,1
3,11778,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,1.0,...,0,1,0,1,0,0,1,1,0,1
4,169382,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,0.0,...,0,1,0,1,0,0,1,1,0,1


In [6]:
# Convert categorical data to numeric and separate target feature for testing data
testDF = pd.get_dummies(testDF, columns=['home_ownership', 'verification_status', 'application_type', 'hardship_flag',
                                         'debt_settlement_flag', 'initial_list_status', 'pymnt_plan', 'loan_status'])
testDF.head()

Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,initial_list_status_f,initial_list_status_w,pymnt_plan_n,loan_status_high_risk,loan_status_low_risk
0,67991,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,...,1,0,1,0,1,0,1,1,0,1
1,25429,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,...,1,0,1,0,1,0,1,1,0,1
2,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,...,1,0,1,0,1,0,1,1,0,1
3,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,...,1,0,1,0,1,0,1,1,0,1
4,37505,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,...,1,0,1,0,1,0,1,1,0,1


In [7]:
# add missing dummy variables to testing set
testDF['debt_settlement_flag_Y'] = 0
testDF.head()

Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,initial_list_status_f,initial_list_status_w,pymnt_plan_n,loan_status_high_risk,loan_status_low_risk,debt_settlement_flag_Y
0,67991,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,...,0,1,0,1,0,1,1,0,1,0
1,25429,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,...,0,1,0,1,0,1,1,0,1,0
2,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,...,0,1,0,1,0,1,1,0,1,0
3,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,...,0,1,0,1,0,1,1,0,1,0
4,37505,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,...,0,1,0,1,0,1,1,0,1,0


In [16]:
# Train the Logistic Regression model on the unscaled data and print the model score
ytrain = trainDF['loan_status_high_risk']
Xtrain = trainDF.drop('loan_status_high_risk', axis=1)

ytest = testDF['loan_status_high_risk']
Xtest = testDF.drop('loan_status_high_risk', axis=1)

unscaledClass = LogisticRegression()
unscaledClass.fit(Xtrain, ytrain)

print(f"Testing Data Score for unscaled Logistic Regression Model: {unscaledClass.score(Xtest, ytest)}")


Testing Data Score for unscaled Logistic Regression Model: 0.5161633347511697


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
# Train a Random Forest Classifier model and print the model score
unscaledRandForest = RandomForestClassifier(random_state=1, n_estimators=3).fit(Xtrain, ytrain)

print(f'Testing Score for unscaled Random Forest Classifier Model: {unscaledRandForest.score(Xtest, ytest)}')

Testing Score for unscaled Random Forest Classifier Model: 0.5068056146320714


In [11]:
# Scale the data
scaler = StandardScaler()
XtrainScaled = scaler.fit_transform(Xtrain)
XtestScaled = scaler.fit_transform(Xtest)

In [15]:
# Train the Logistic Regression model on the scaled data and print the model score
selLogReg = LogisticRegression().fit(XtrainScaled, ytrain)
print(f'Testing Score for selected scaled Logistic Regression model: {selLogReg.score(XtestScaled, ytest)}')


Testing Score for selected scaled Logistic Regression model: 0.6316461080391322


In [19]:
# Train a Random Forest Classifier model on the scaled data and print the model score
selRandForest = RandomForestClassifier(random_state=1, n_estimators=3).fit(XtrainScaled, ytrain)

print(f'Testing Score for selected scaled Random Forest Classifier Model: {selRandForest.score(XtestScaled, ytest)}')


Testing Score for selected scaled Random Forest Classifier Model: 0.5097830710336028


The better fitting model for the given dataframes between the two, looks to be favoring the Logistic Regression model by approx. 13%. Another important factor is that the Logistic Regression model provides a clearer understanding of the DF by scrutinizing between scaled/unscaled data. The Random Forest Classifier model displays insignificant changes between scaled/unscaled data. 