In [15]:
import numpy as np
import pandas as pd
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


In [16]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [17]:
train_df.head(10)

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N
5,31295,31295,6500.0,0.1614,228.98,MORTGAGE,65000.0,Not Verified,low_risk,n,...,92.3,50.0,1.0,0.0,64660.0,39487.0,12800.0,42060.0,N,N
6,173238,173238,40000.0,0.124,897.89,MORTGAGE,171000.0,Not Verified,low_risk,n,...,88.9,75.0,0.0,0.0,772030.0,83509.0,41200.0,81300.0,N,N
7,116755,116755,5000.0,0.1719,178.74,MORTGAGE,55000.0,Not Verified,low_risk,n,...,90.0,0.0,0.0,0.0,17075.0,11244.0,500.0,14275.0,N,N
8,170311,170311,6000.0,0.1102,196.49,OWN,36000.0,Not Verified,low_risk,n,...,100.0,0.0,1.0,0.0,65200.0,21103.0,43300.0,12100.0,N,N
9,254794,254794,10625.0,0.1102,347.95,RENT,73000.0,Source Verified,low_risk,n,...,91.9,30.0,0.0,0.0,41869.0,11859.0,24500.0,10169.0,N,N


In [18]:
# Convert categorical data to numeric and separate target feature for training data
yes_no = {'Y':1,'N':0}
train_df2 = train_df.replace({'hardship_flag':yes_no, 'debt_settlement_flag':yes_no})
home_ownership = {'ANY':0,'RENT':1,'MORTGAGE':2,'OWN':3}
train_df3 = train_df2.replace({'home_ownership':home_ownership})
verification = {'Not Verified':0,'Source Verified':1,'Verified':1}
train_df4 = train_df3.replace({'verification_status':verification})
loan_status = {'low_risk':1,'high_risk':0}
train_df5 = train_df4.replace({'loan_status':loan_status})
list_status = {'w':0,'f':1}
train_df6 = train_df5.replace({'initial_list_status':list_status})
app_type = {'Individual':1,'Joint App':0}
train_df7 = train_df6.replace({'application_type':app_type})

train_df8 = train_df7.drop(['index','pymnt_plan'],axis='columns')

file_path = Path('Resources/cleaned_2019_credit_data.csv')
train_df8.to_csv(file_path, index=False)

In [19]:
train_df9 = train_df8.drop(['Unnamed: 0'],axis='columns')
train_df9.head()

X_train = train_df9.drop('loan_status', axis=1)
y_train = train_df9['loan_status'].values
print(X_train.select_dtypes(include=[object]))

Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]

[12180 rows x 0 columns]


In [20]:
# Convert categorical data to numeric and separate target feature for testing data
test_df2 = test_df.replace({'hardship_flag':yes_no, 'debt_settlement_flag':yes_no})
test_df3 = test_df2.replace({'home_ownership':home_ownership})
test_df4 = test_df3.replace({'verification_status':verification})
test_df5 = test_df4.replace({'loan_status':loan_status})
test_df6 = test_df5.replace({'initial_list_status':list_status})
test_df7 = test_df6.replace({'application_type':app_type})

test_df8 = test_df7.drop(['index','pymnt_plan'],axis='columns')

file_path = Path('Resources/cleaned_2020_credit_data.csv')
test_df8.to_csv(file_path, index=False)

In [21]:
test_df9 = test_df8.drop(['Unnamed: 0'],axis='columns')
test_df9.head(10)

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,dti,delinq_2yrs,inq_last_6mths,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,40000.0,0.0819,814.7,2,140000.0,0,1,19.75,0.0,1.0,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,0,0
1,6000.0,0.1524,208.7,1,55000.0,0,1,11.52,2.0,0.0,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,0,0
2,3600.0,0.1695,128.27,1,42000.0,0,1,6.74,0.0,0.0,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,0,0
3,20000.0,0.1524,478.33,1,100000.0,0,1,12.13,0.0,2.0,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,0,0
4,3600.0,0.124,120.27,1,50000.0,0,1,16.08,0.0,3.0,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,0,0
5,27000.0,0.0819,549.93,1,57000.0,1,1,31.41,0.0,0.0,...,96.2,14.3,0.0,0.0,125240.0,52062.0,98400.0,11629.0,0,0
6,15000.0,0.0881,310.0,2,50000.0,0,1,24.44,0.0,1.0,...,100.0,0.0,0.0,0.0,192786.0,49901.0,43600.0,43276.0,0,0
7,19000.0,0.0881,602.52,2,70000.0,0,1,30.94,0.0,0.0,...,100.0,33.3,0.0,0.0,782289.0,161240.0,44000.0,105289.0,0,0
8,27575.0,0.1171,912.07,3,142000.0,1,1,17.9,0.0,0.0,...,100.0,22.2,1.0,0.0,488561.0,89586.0,89500.0,72659.0,0,0
9,31500.0,0.1774,795.45,2,100000.0,1,1,25.05,2.0,1.0,...,96.2,28.6,0.0,0.0,562627.0,108891.0,27900.0,115627.0,0,0


In [22]:
# add missing dummy variables to testing set
test_df10 = pd.get_dummies(test_df9)

X_test = test_df10.drop('loan_status', axis=1)
y_test = test_df10['loan_status'].values

In [23]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression(solver='lbfgs',max_iter=200)
classifier.fit(X_train, y_train)
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.6545155993431856
Testing Data Score: 0.5197788175244576


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
# Train a Random Forest Classifier model and print the model score
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train, y_train)
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')

Training Score: 1.0
Testing Score: 0.6399404508719694


In [25]:
# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [26]:
# Train the Logistic Regression model on the scaled data and print the model score
classifier.fit(X_train_scaled, y_train)
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Training Data Score: 0.7061576354679803
Testing Data Score: 0.6601446193109315


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
# Train a Random Forest Classifier model on the scaled data and print the model score
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Testing Score: 0.5


In [None]:
#Observations