In [86]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

In [87]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [88]:
train_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,7000.0,0.1894,256.38,MORTGAGE,75000.0,Not Verified,n,28.62,0.0,2.0,...,87.5,0.0,0.0,352260.0,62666.0,35000.0,10000.0,N,N,low_risk
1,40000.0,0.1614,975.71,MORTGAGE,102000.0,Source Verified,n,11.72,2.0,0.0,...,0.0,0.0,0.0,294664.0,109911.0,9000.0,71044.0,N,N,low_risk
2,11000.0,0.2055,294.81,RENT,45000.0,Verified,n,37.25,1.0,3.0,...,7.7,0.0,0.0,92228.0,36007.0,33000.0,46328.0,N,N,low_risk
3,4000.0,0.1612,140.87,MORTGAGE,38000.0,Not Verified,n,42.89,1.0,0.0,...,100.0,0.0,0.0,284273.0,52236.0,13500.0,52017.0,N,N,low_risk
4,14000.0,0.1797,505.93,MORTGAGE,43000.0,Source Verified,n,22.16,1.0,0.0,...,25.0,0.0,0.0,120280.0,88147.0,33300.0,78680.0,N,N,low_risk


In [89]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12180 entries, 0 to 12179
Data columns (total 84 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   12180 non-null  float64
 1   int_rate                    12180 non-null  float64
 2   installment                 12180 non-null  float64
 3   home_ownership              12180 non-null  object 
 4   annual_inc                  12180 non-null  float64
 5   verification_status         12180 non-null  object 
 6   pymnt_plan                  12180 non-null  object 
 7   dti                         12180 non-null  float64
 8   delinq_2yrs                 12180 non-null  float64
 9   inq_last_6mths              12180 non-null  float64
 10  open_acc                    12180 non-null  float64
 11  pub_rec                     12180 non-null  float64
 12  revol_bal                   12180 non-null  float64
 13  total_acc                   121

In [90]:
test_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,40000.0,0.1033,856.4,RENT,128700.0,Source Verified,n,12.47,0.0,1.0,...,57.1,0.0,0.0,63915.0,49510.0,49400.0,14515.0,Y,N,low_risk
1,24450.0,0.143,572.72,MORTGAGE,44574.0,Not Verified,n,15.05,0.0,1.0,...,0.0,0.0,0.0,136425.0,19439.0,15500.0,18925.0,N,N,low_risk
2,13500.0,0.143,316.23,OWN,60000.0,Not Verified,n,28.72,0.0,0.0,...,0.0,0.0,0.0,82124.0,65000.0,5400.0,61724.0,Y,N,low_risk
3,10625.0,0.1774,268.31,RENT,60000.0,Verified,n,15.7,0.0,4.0,...,20.0,0.0,0.0,54855.0,50335.0,23200.0,26255.0,N,N,low_risk
4,6375.0,0.1862,232.46,RENT,60000.0,Source Verified,n,35.5,0.0,0.0,...,75.0,0.0,0.0,90445.0,56541.0,15300.0,72345.0,N,N,low_risk


In [91]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4702 entries, 0 to 4701
Data columns (total 84 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   4702 non-null   float64
 1   int_rate                    4702 non-null   float64
 2   installment                 4702 non-null   float64
 3   home_ownership              4702 non-null   object 
 4   annual_inc                  4702 non-null   float64
 5   verification_status         4702 non-null   object 
 6   pymnt_plan                  4702 non-null   object 
 7   dti                         4702 non-null   float64
 8   delinq_2yrs                 4702 non-null   float64
 9   inq_last_6mths              4702 non-null   float64
 10  open_acc                    4702 non-null   float64
 11  pub_rec                     4702 non-null   float64
 12  revol_bal                   4702 non-null   float64
 13  total_acc                   4702 

### Data Preprocessing

In [92]:
# Convert categorical data to numeric and separate target feature for training data
X_train = train_df.drop(columns=['target'])
y_train = train_df['target']
X_train_dummies = pd.get_dummies(X_train)
X_train_dummies.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,7000.0,0.1894,256.38,75000.0,28.62,0.0,2.0,20.0,0.0,40414.0,...,0,1,1,0,1,0,1,0,1,0
1,40000.0,0.1614,975.71,102000.0,11.72,2.0,0.0,10.0,0.0,43531.0,...,0,1,0,1,1,0,1,0,1,0
2,11000.0,0.2055,294.81,45000.0,37.25,1.0,3.0,23.0,0.0,8242.0,...,1,1,0,1,1,0,1,0,1,0
3,4000.0,0.1612,140.87,38000.0,42.89,1.0,0.0,7.0,0.0,12767.0,...,0,1,0,1,0,1,1,0,1,0
4,14000.0,0.1797,505.93,43000.0,22.16,1.0,0.0,22.0,0.0,11182.0,...,0,1,0,1,1,0,1,0,1,0


In [93]:
# Convert categorical data to numeric and separate target feature for testing data
X_test = test_df.drop(columns=['target'])
y_test = test_df['target']
X_test_dummies = pd.get_dummies(X_test)
X_test_dummies.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,40000.0,0.1033,856.4,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,1,0,1,0,1,1,0,0,1,1
1,24450.0,0.143,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,0,0,1,0,1,1,0,1,0,1
2,13500.0,0.143,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,0,0,1,0,1,1,0,0,1,1
3,10625.0,0.1774,268.31,60000.0,15.7,0.0,4.0,17.0,0.0,6216.0,...,0,1,1,0,1,1,0,1,0,1
4,6375.0,0.1862,232.46,60000.0,35.5,0.0,0.0,13.0,0.0,12681.0,...,1,0,1,0,1,1,0,1,0,1


In [94]:
# add missing dummy variables to testing set
print(X_train_dummies.columns.symmetric_difference(X_test_dummies.columns))
X_test_dummies['debt_settlement_flag_Y'] = 0
X_test_dummies.head()

Index(['debt_settlement_flag_Y'], dtype='object')


Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,40000.0,0.1033,856.4,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,0,1,0,1,1,0,0,1,1,0
1,24450.0,0.143,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,0,1,0,1,1,0,1,0,1,0
2,13500.0,0.143,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,0,1,0,1,1,0,0,1,1,0
3,10625.0,0.1774,268.31,60000.0,15.7,0.0,4.0,17.0,0.0,6216.0,...,1,1,0,1,1,0,1,0,1,0
4,6375.0,0.1862,232.46,60000.0,35.5,0.0,0.0,13.0,0.0,12681.0,...,0,1,0,1,1,0,1,0,1,0


### Modeling with Unscaled Data

#### PREDICTION
Random Forest does not perform well when the values of numerical features of the test data are outside the range of the training data, whereas Logistic regression can perform well even if the test data is not scaled to fit the range of the training data. For this reason, when using unscaled data, it is more likely that Logistic Regression will perform better than Random Forest.

In [95]:
# Train the Logistic Regression model on the unscaled data and print the model score
lr1 = LogisticRegression(max_iter=10000).fit(X_train_dummies, y_train)
print(f"Training Data Score: {lr1.score(X_train_dummies, y_train)}")
print(f"Testing Data Score: {lr1.score(X_test_dummies, y_test)}")

Training Data Score: 0.7048440065681445
Testing Data Score: 0.562951935346661


In [96]:
# Train a Random Forest Classifier model and print the model score
rf1 = RandomForestClassifier(random_state=42).fit(X_train_dummies, y_train)
print(f"Training Data Score: {rf1.score(X_train_dummies, y_train)}")
print(f"Testing Data Score: {rf1.score(X_test_dummies, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.644193960017014


In [97]:
lr1_y_pred = lr1.predict(X_test_dummies)
rf1_y_pred = rf1.predict(X_test_dummies)

print("Logistic Regression Confusion Matrix:")
print(confusion_matrix(y_test, lr1_y_pred, labels = ["high_risk", "low_risk"], normalize = "all"))
print("Random Forest Classifier Confusion Matrix:")
print(confusion_matrix(y_test, rf1_y_pred, labels = ["high_risk", "low_risk"], normalize = "all"))

Logistic Regression Confusion Matrix:
[[0.17481923 0.32518077]
 [0.11186729 0.38813271]]
Random Forest Classifier Confusion Matrix:
[[0.40748618 0.09251382]
 [0.26329222 0.23670778]]


#### RESULT

The prediction that logistic regression would perform better with this unscaled data was incorrect; Random Forest performed better in this instance (0.64 > 0.56). Likely, this means the numerical data in our test set does not vary considerably outside the range of the numerical data in our training set.

Considering the confusion matrices, the Logistic regression model predicts more loans to be low_risk, leading to a higher false negative rate (65%) whereas the Random Forest predicts more loans to be high risk, leading to a higher false positive rate (52.6%). In this context, this means using the Random Forest classifier to predict the risk of a new loan is a much more conservative approach. It is more likely to predict what may be a low risk loan to be a high risk loan.

### Modeling with Scaled Data

#### PREDICTION

As we already know the values of the numerical features in the test data likely do not differ significantly from those in the training data, it is unlikely that scaling will vastly improve the Random Forest model. However, as Logisitic Regression uses Gradient Descent to find the best result, it is highly sensitive to data scaling. When features vary greatly in magnitude, Logisitic Regression assumes features with a large magnitude are more important. Thus, feature scaling is a helpful tool to put all features of a dataset into the same range, regardless of their relevance to the model. For this reason, I think Logistic Regression will perform better than Random Forest with this scaled data.

In [98]:
# Scale the data
scaler = StandardScaler().fit(X_train_dummies)
X_train_scaled = scaler.transform(X_train_dummies)
X_test_scaled = scaler.transform(X_test_dummies)

In [99]:
# Train the Logistic Regression model on the scaled data and print the model score
lr2 = LogisticRegression(max_iter=10000).fit(X_train_scaled, y_train)
print(f"Training Data Score: {lr2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {lr2.score(X_test_scaled, y_test)}")

Training Data Score: 0.710919540229885
Testing Data Score: 0.7601020842194811


In [100]:
# Train a Random Forest Classifier model on the scaled data and print the model score
rf2 = RandomForestClassifier().fit(X_train_scaled, y_train)
print(f"Training Data Score: {rf2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {rf2.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.635899617184177


In [104]:
lr2_y_pred = lr2.predict(X_test_scaled)
rf2_y_pred = rf2.predict(X_test_scaled)

print("Logistic Regression Confusion Matrix:")
print(confusion_matrix(y_test, lr2_y_pred, labels = ["high_risk", "low_risk"], normalize = "all"))
print("Random Forest Classifier Confusion Matrix:")
print(confusion_matrix(y_test, rf2_y_pred, labels = ["high_risk", "low_risk"], normalize = "all"))

Logistic Regression Confusion Matrix:
[[0.37494683 0.12505317]
 [0.11484475 0.38515525]]
Random Forest Classifier Confusion Matrix:
[[0.41535517 0.08464483]
 [0.27945555 0.22054445]]


#### RESULT

The prediction that Logistic regression would perform better with this scaled data was correct (0.76 > 0.63).