# Credit Risk Evaluator
Jack Cohen

## Initialize Workspace

In [1]:
# Import dependencies.
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [2]:
# Read the 2019 loans into the training frame and the 2020 Q1 loans into the testing frame.
train_path = Path('Resources/2019loans.csv')
test_path = Path('Resources/2020Q1loans.csv')
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [3]:
# Preview the first five rows of the training data.
train_df.head(n=5)

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,7000.0,0.1894,256.38,MORTGAGE,75000.0,Not Verified,n,28.62,0.0,2.0,...,87.5,0.0,0.0,352260.0,62666.0,35000.0,10000.0,N,N,low_risk
1,40000.0,0.1614,975.71,MORTGAGE,102000.0,Source Verified,n,11.72,2.0,0.0,...,0.0,0.0,0.0,294664.0,109911.0,9000.0,71044.0,N,N,low_risk
2,11000.0,0.2055,294.81,RENT,45000.0,Verified,n,37.25,1.0,3.0,...,7.7,0.0,0.0,92228.0,36007.0,33000.0,46328.0,N,N,low_risk
3,4000.0,0.1612,140.87,MORTGAGE,38000.0,Not Verified,n,42.89,1.0,0.0,...,100.0,0.0,0.0,284273.0,52236.0,13500.0,52017.0,N,N,low_risk
4,14000.0,0.1797,505.93,MORTGAGE,43000.0,Source Verified,n,22.16,1.0,0.0,...,25.0,0.0,0.0,120280.0,88147.0,33300.0,78680.0,N,N,low_risk


In [4]:
# Preview the first five rows of the testing data.
test_df.head(n=5)

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,40000.0,0.1033,856.4,RENT,128700.0,Source Verified,n,12.47,0.0,1.0,...,57.1,0.0,0.0,63915.0,49510.0,49400.0,14515.0,Y,N,low_risk
1,24450.0,0.143,572.72,MORTGAGE,44574.0,Not Verified,n,15.05,0.0,1.0,...,0.0,0.0,0.0,136425.0,19439.0,15500.0,18925.0,N,N,low_risk
2,13500.0,0.143,316.23,OWN,60000.0,Not Verified,n,28.72,0.0,0.0,...,0.0,0.0,0.0,82124.0,65000.0,5400.0,61724.0,Y,N,low_risk
3,10625.0,0.1774,268.31,RENT,60000.0,Verified,n,15.7,0.0,4.0,...,20.0,0.0,0.0,54855.0,50335.0,23200.0,26255.0,N,N,low_risk
4,6375.0,0.1862,232.46,RENT,60000.0,Source Verified,n,35.5,0.0,0.0,...,75.0,0.0,0.0,90445.0,56541.0,15300.0,72345.0,N,N,low_risk


## Data Preprocessing

In [11]:
# Verify that the train and test frames share exactly the same columns.
# The check runs in both directions so that a column present only in the
# testing data is reported as well; nothing is printed when the sets match.
missing_from_test = [col for col in train_df.columns if col not in test_df.columns]
missing_from_train = [col for col in test_df.columns if col not in train_df.columns]
for col in missing_from_test:
    print(f"Only in train_df: {col}")
for col in missing_from_train:
    print(f"Only in test_df: {col}")

In [33]:
# Prepare the training data:
#   * y_train - the 'target' column encoded as integers,
#   * X_train - every remaining column, with categorical columns one-hot encoded.
y_train = LabelEncoder().fit_transform(train_df['target'])
X_train = pd.get_dummies(train_df.drop(columns='target'))
X_train.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,7000.0,0.1894,256.38,75000.0,28.62,0.0,2.0,20.0,0.0,40414.0,...,0,1,1,0,1,0,1,0,1,0
1,40000.0,0.1614,975.71,102000.0,11.72,2.0,0.0,10.0,0.0,43531.0,...,0,1,0,1,1,0,1,0,1,0
2,11000.0,0.2055,294.81,45000.0,37.25,1.0,3.0,23.0,0.0,8242.0,...,1,1,0,1,1,0,1,0,1,0
3,4000.0,0.1612,140.87,38000.0,42.89,1.0,0.0,7.0,0.0,12767.0,...,0,1,0,1,0,1,1,0,1,0
4,14000.0,0.1797,505.93,43000.0,22.16,1.0,0.0,22.0,0.0,11182.0,...,0,1,0,1,1,0,1,0,1,0


In [34]:
# Convert categorical data to numeric.
# Separate target feature for testing data.
# The encoder is fitted on the training labels and only *applied* to the
# testing labels, so both sets are guaranteed to share the same
# label-to-integer mapping. A testing label that never appears in the
# training data raises an error instead of being silently remapped.
y_test = LabelEncoder().fit(train_df['target']).transform(test_df['target'])
X_test = test_df.drop('target', axis=1)
X_test = pd.get_dummies(X_test)
X_test.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,40000.0,0.1033,856.4,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,1,0,1,0,1,1,0,0,1,1
1,24450.0,0.143,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,0,0,1,0,1,1,0,1,0,1
2,13500.0,0.143,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,0,0,1,0,1,1,0,0,1,1
3,10625.0,0.1774,268.31,60000.0,15.7,0.0,4.0,17.0,0.0,6216.0,...,0,1,1,0,1,1,0,1,0,1
4,6375.0,0.1862,232.46,60000.0,35.5,0.0,0.0,13.0,0.0,12681.0,...,1,0,1,0,1,1,0,1,0,1


In [35]:
# List the dummy columns that exist in the training features but not in the testing features.
absent_from_test = [col for col in X_train.columns if col not in X_test.columns]
for col in absent_from_test:
    print(col)

debt_settlement_flag_Y


In [36]:
# Align the testing features with the training features instead of
# hard-coding the one column that happened to be missing: any dummy column
# absent from the testing set is added and filled with 0, any column the
# training set does not have is dropped, and the column order is made
# identical to X_train. Re-running this cell is safe (the result is the same).
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Show number of columns.
print(len(X_test.columns))

92


In [45]:
# Inspect the integer-encoded training labels produced by LabelEncoder.
y_train

array([1, 1, 1, ..., 0, 0, 0])

## Consider Supervised Learning Models

### Prediction

 I predict that the Logistic Regression model will be better for credit risk prediction. The LR model estimates the probability of an outcome given specific inputs and identifies which factors contribute more to that outcome. This is a robust method for the large number of features in this dataset. However, because the feature values span widely varying orders of magnitude, we'd expect the accuracy of the LR model to increase when the data is scaled, as scaling allows the model to more accurately discover which features contribute more to the outcome.


 The Random Forest Classifier will be very accurate on training data, as the decision trees are created with this data in mind. But we'd expect the performance to decrease with test data, as only one training set was used to create the model and this training set is undersampled. The model averages predictions from all trees, giving more weight to less important factors. We would not expect scaling the data to increase accuracy for the RF model, as the accuracy will increase with more training sets.


### Logistic Regression

In [46]:
# Train the Logistic Regression model on the unscaled data.
# Print the model score.
# LogisticRegression is imported with the other dependencies in the first cell.
# NOTE(review): the recorded run of this cell reported that the solver reached
# its iteration limit on the unscaled data (see the message in the output).
classifier = LogisticRegression()

classifier.fit(X_train, y_train)

print(f"Training Data Score (LR): {classifier.score(X_train, y_train)}")
print(f"Testing Data Score (LR): {classifier.score(X_test, y_test)}")

Training Data Score (LR): 0.6529556650246305
Testing Data Score (LR): 0.5089323692045938


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [47]:
# Create confusion matrix for the unscaled Logistic Regression model.
# confusion_matrix is imported with the other dependencies in the first cell.
# In the result, rows correspond to the true classes and columns to the
# predicted classes.
y_true = y_test
y_pred = classifier.predict(X_test)
confusion_matrix(y_true, y_pred)

array([[ 715, 1636],
       [ 673, 1678]])

### Random Forest Classifier

In [48]:
# Train a Random Forest Classifier model on the unscaled data.
# Print the model score.
# RandomForestClassifier is imported with the other dependencies in the first cell.
# random_state is fixed so the fitted forest is reproducible between runs.
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train, y_train)
print(f'Training Score (RF): {clf.score(X_train, y_train)}')
print(f'Testing Score (RF): {clf.score(X_test, y_test)}')

Training Score (RF): 1.0
Testing Score (RF): 0.646958740961293


## Scaled Data Model

### Scale the Data

In [49]:
# Standardize the features. The scaler learns its statistics from the training
# features only and the same fitted scaler is then applied to the testing features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Logistic Regression

In [50]:
# Fit a Logistic Regression model on the scaled features and report its
# accuracy on the scaled training and testing sets.
# NOTE(review): the recorded run of this cell still reported that the solver
# reached its iteration limit; consider raising max_iter on both LR models
# together so the scaled/unscaled comparison stays like-for-like.
classifier_scaled = LogisticRegression().fit(X_train_scaled, y_train)

scaled_lr_train_score = classifier_scaled.score(X_train_scaled, y_train)
print(f"Training Data Score (Scaled LR): {scaled_lr_train_score}")
scaled_lr_test_score = classifier_scaled.score(X_test_scaled, y_test)
print(f"Testing Data Score (Scaled LR): {scaled_lr_test_score}")

Training Data Score (Scaled LR): 0.710919540229885
Testing Data Score (Scaled LR): 0.7598894087622289


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Random Forest Classifier

In [51]:
# Fit a Random Forest Classifier on the scaled features, using the same
# settings as the unscaled model, and report its accuracy on both sets.
clf_scaled = RandomForestClassifier(n_estimators=500, random_state=1)
clf_scaled.fit(X_train_scaled, y_train)

scaled_rf_train_score = clf_scaled.score(X_train_scaled, y_train)
print(f'Training Score (Scaled RF): {scaled_rf_train_score}')
scaled_rf_test_score = clf_scaled.score(X_test_scaled, y_test)
print(f'Testing Score (Scaled RF): {scaled_rf_test_score}')

Training Score (Scaled RF): 1.0
Testing Score (Scaled RF): 0.6480221182475542


## Results

#### Unscaled
* Training Data Score (LR): 0.6529556650246305
* Testing Data Score (LR): 0.5089323692045938
* Training Score (RF): 1.0
* Testing Score (RF): 0.646958740961293

#### Scaled
* Training Data Score (Scaled LR): 0.710919540229885
* Testing Data Score (Scaled LR): 0.7598894087622289
* Training Score (Scaled RF): 1.0
* Testing Score (Scaled RF): 0.6480221182475542

#### Discussion
 As expected, the Logistic Regression model's performance increased after the data was scaled. Scaling the data lets the model compare each feature's contribution to the result without distortion from the raw magnitude of the values. The scaled LR model also appears to be more robust on unseen data, as its test score is greater than its training score.
 
 The Random Forest Classifier model performed perfectly on the training data because the decision trees were created to fit the data. But since we don't have a lot of training data, or different datasets to train the model, the model performs poorly on the test data. This is also because the training data is undersampled and has 92 features, so the model is less able to find which features are more important. As expected, the size of the values within each feature does not matter as much. This can be seen from the fact that the RF model's performance barely changed when the data was scaled.