In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the 2019 loans (training set) and the 2020 Q1 loans (testing set)
train_df, test_df = (
    pd.read_csv(Path(csv_file))
    for csv_file in ('Resources/2019loans.csv', 'Resources/2020Q1loans.csv')
)

In [3]:
# Peek at the first five rows of the training data
train_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [4]:
# Peek at the first five rows of the testing data
test_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [5]:
# One-hot encode the categorical columns of the training data with
# pd.get_dummies(), then list the resulting columns so they can be compared
# against the testing data's columns to spot any that are missing.
train_df = train_df.pipe(pd.get_dummies)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12180 entries, 0 to 12179
Data columns (total 96 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Unnamed: 0                           12180 non-null  int64  
 1   index                                12180 non-null  int64  
 2   loan_amnt                            12180 non-null  float64
 3   int_rate                             12180 non-null  float64
 4   installment                          12180 non-null  float64
 5   annual_inc                           12180 non-null  float64
 6   dti                                  12180 non-null  float64
 7   delinq_2yrs                          12180 non-null  float64
 8   inq_last_6mths                       12180 non-null  float64
 9   open_acc                             12180 non-null  float64
 10  pub_rec                              12180 non-null  float64
 11  revol_bal                   

In [6]:
# One-hot encode the categorical columns of the testing data with
# pd.get_dummies(), then list the resulting columns so they can be compared
# against the training data's columns to spot any that are missing.
test_df = test_df.pipe(pd.get_dummies)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4702 entries, 0 to 4701
Data columns (total 95 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Unnamed: 0                           4702 non-null   int64  
 1   index                                4702 non-null   int64  
 2   loan_amnt                            4702 non-null   float64
 3   int_rate                             4702 non-null   float64
 4   installment                          4702 non-null   float64
 5   annual_inc                           4702 non-null   float64
 6   dti                                  4702 non-null   float64
 7   delinq_2yrs                          4702 non-null   float64
 8   inq_last_6mths                       4702 non-null   float64
 9   open_acc                             4702 non-null   float64
 10  pub_rec                              4702 non-null   float64
 11  revol_bal                     

In [7]:
# The test data appears to have no debt_settlement_flag_Y column. Confirm by
# selecting rows where debt_settlement_flag_N is 0 (i.e. rows that would have
# been flagged Y); an empty result means no such rows exist.
test_df.loc[test_df['debt_settlement_flag_N'].eq(0)]

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,loan_status_high_risk,loan_status_low_risk,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N


In [8]:
# Reload both CSVs from disk: the frames above were overwritten with their
# one-hot encoded versions while working out which dummy variables the
# testing set is missing.
train_df, test_df = map(
    pd.read_csv,
    (Path('Resources/2019loans.csv'), Path('Resources/2020Q1loans.csv')),
)

In [9]:
# Drop the 'Unnamed: 0' column from both dataframes
train_df = train_df.drop(columns=['Unnamed: 0'])
test_df = test_df.drop(columns=['Unnamed: 0'])

In [10]:
# Drop the 'index' column from both dataframes
train_df = train_df.drop(columns=['index'])
test_df = test_df.drop(columns=['index'])

In [11]:
# Training data: split off the target ('loan_status'), one-hot encode the
# remaining features, and label-encode the target as integers.
X_train = pd.get_dummies(train_df.drop(columns=['loan_status']))
y_train = LabelEncoder().fit_transform(train_df['loan_status'])
y_train

array([1, 1, 1, ..., 0, 0, 0])

In [12]:
# Testing data: split off the target ('loan_status') and one-hot encode the
# remaining features.
X_test = pd.get_dummies(test_df.drop(columns=['loan_status']))
# Encode the test labels with an encoder fitted on the *training* labels so the
# integer codes mean the same thing in both sets. A separately fitted encoder
# would assign different codes if the test set lacked one of the classes;
# with this form an unseen test label raises instead of being silently remapped.
y_test = LabelEncoder().fit(train_df['loan_status']).transform(test_df['loan_status'])
y_test

array([1, 1, 1, ..., 0, 0, 0])

In [13]:
# Align the testing features with the training features. Any dummy column that
# exists in X_train but not in X_test (here: debt_settlement_flag_Y) is added
# and filled with 0, columns not present in X_train are dropped, and the column
# order is made identical to X_train so the fitted models see the same layout.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
X_test.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,9471.0,...,0,1,0,1,1,0,1,0,1,0
1,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,1280.0,...,0,1,0,1,1,0,1,0,1,0
2,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,4757.0,...,0,1,0,1,1,0,1,0,1,0
3,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,12731.0,...,0,1,0,1,1,0,1,0,1,0
4,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,10413.0,...,0,1,0,1,1,0,1,0,1,0


## Prediction on Model Performance (Before Scaling) 

Personally, I see this as a classification problem because we are assigning labels to data in order to predict which of two categories (high risk or low risk) a loan belongs to. Thus, I expect the Logistic Regression model (a classification model) to produce the higher accuracy score.

In [14]:
# Fit a Logistic Regression model on the unscaled features, then report its
# accuracy on the training and testing sets.
model = LogisticRegression(max_iter=15000).fit(X_train, y_train)

print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

Training Data Score: 0.6998357963875205
Testing Data Score: 0.5695448745214802


In [15]:
# Train a Random Forest Classifier model on the unscaled data and print the model score.
# A single fit is sufficient: random_state is fixed, so fitting a second time
# on the same data would only rebuild the same forest.
model = RandomForestClassifier(random_state = 1, n_estimators = 50).fit(X_train, y_train)

print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

Training Data Score: 0.9999178981937603
Testing Data Score: 0.6412165036154828


In [16]:
# Standardize the features: learn the scaling parameters from the training
# data only, then apply that same transformation to the testing data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Prediction on Model Performance (After Scaling)

Using StandardScaler(), I predict our Logistic Regression model will produce a higher accuracy score. However, in my opinion, since our Random Forest Classifier model was not the model best suited for making our predictions on high/low risk loans in the first place, I don't expect its accuracy score to change very much.

In [17]:
# Fit a Logistic Regression model on the scaled features, then report its
# accuracy on the training and testing sets.
model = LogisticRegression(max_iter=15000)
model.fit(X_train_scaled, y_train)
print(f'Training Score: {model.score(X_train_scaled, y_train)}')
print(f'Testing Score: {model.score(X_test_scaled, y_test)}')

Training Score: 0.7080459770114943
Testing Score: 0.7679710761378137


In [18]:
# Fit a Random Forest Classifier on the scaled features, then report its
# accuracy on the training and testing sets.
model = RandomForestClassifier(n_estimators=50, random_state=1)
model.fit(X_train_scaled, y_train)
print(f'Training Score: {model.score(X_train_scaled, y_train)}')
print(f'Testing Score: {model.score(X_test_scaled, y_test)}')

Training Score: 0.9999178981937603
Testing Score: 0.6420672054444917


## Final Thoughts

I still believe the Logistic Regression model is the best option for making our predictions on high/low risk loans, as I do think this is a classification problem. After seeing this model's testing accuracy rise from roughly 0.57 before scaling to roughly 0.77 after scaling, it became clear to me that scaling our data is absolutely a necessary step in preprocessing.

As for our Random Forest Classifier model, again, I do not think this was the best model for making our prediction because this is not a clustering problem. After looking at the accuracy scores (roughly 1.00 on the training data versus roughly 0.64 on the testing data), I do think the model is overfitting our training data and the noisy parameters are confusing our model.