# Credit Risk Resampling Techniques

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

# Read the CSV and Perform Basic Data Cleaning

In [3]:
columns = [
    "loan_amnt", "int_rate", "installment", "home_ownership",
    "annual_inc", "verification_status", "issue_d", "loan_status",
    "pymnt_plan", "dti", "delinq_2yrs", "inq_last_6mths",
    "open_acc", "pub_rec", "revol_bal", "total_acc",
    "initial_list_status", "out_prncp", "out_prncp_inv", "total_pymnt",
    "total_pymnt_inv", "total_rec_prncp", "total_rec_int", "total_rec_late_fee",
    "recoveries", "collection_recovery_fee", "last_pymnt_amnt", "next_pymnt_d",
    "collections_12_mths_ex_med", "policy_code", "application_type", "acc_now_delinq",
    "tot_coll_amt", "tot_cur_bal", "open_acc_6m", "open_act_il",
    "open_il_12m", "open_il_24m", "mths_since_rcnt_il", "total_bal_il",
    "il_util", "open_rv_12m", "open_rv_24m", "max_bal_bc",
    "all_util", "total_rev_hi_lim", "inq_fi", "total_cu_tl",
    "inq_last_12m", "acc_open_past_24mths", "avg_cur_bal", "bc_open_to_buy",
    "bc_util", "chargeoff_within_12_mths", "delinq_amnt", "mo_sin_old_il_acct",
    "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", "mo_sin_rcnt_tl", "mort_acc",
    "mths_since_recent_bc", "mths_since_recent_inq", "num_accts_ever_120_pd", "num_actv_bc_tl",
    "num_actv_rev_tl", "num_bc_sats", "num_bc_tl", "num_il_tl",
    "num_op_rev_tl", "num_rev_accts", "num_rev_tl_bal_gt_0",
    "num_sats", "num_tl_120dpd_2m", "num_tl_30dpd", "num_tl_90g_dpd_24m",
    "num_tl_op_past_12m", "pct_tl_nvr_dlq", "percent_bc_gt_75", "pub_rec_bankruptcies",
    "tax_liens", "tot_hi_cred_lim", "total_bal_ex_mort", "total_bc_limit",
    "total_il_high_credit_limit", "hardship_flag", "debt_settlement_flag"
]

target = ["loan_status"]

In [4]:
# Load the data
file_path = "https://kenw-data.s3.us-east-2.amazonaws.com/LoanStats_2019Q1.csv"
df = pd.read_csv(file_path, skiprows=1)[:-2]
df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

# Remove the `Issued` loan status
issued_mask = df['loan_status'] != 'Issued'
df = df.loc[issued_mask]

# convert interest rate to numerical
df['int_rate'] = df['int_rate'].str.replace('%', '')
df['int_rate'] = df['int_rate'].astype('float') / 100


# Convert the target column values to low_risk and high_risk based on their values
x = {'Current': 'low_risk'}   
df = df.replace(x)

x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-2019,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-2019,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,Mar-2019,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


In [17]:
encode_columns_df = df.select_dtypes(exclude=['float64'])
encode_columns_df.columns

Index(['home_ownership', 'verification_status', 'issue_d', 'loan_status',
       'pymnt_plan', 'initial_list_status', 'next_pymnt_d', 'application_type',
       'hardship_flag', 'debt_settlement_flag'],
      dtype='object')

In [130]:
# encode text columns
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# transform text columns
df['home_ownership'] = le.fit_transform(df['home_ownership'])
df['verification_status'] = le.fit_transform(df['verification_status'])
df['issue_d'] = le.fit_transform(df['issue_d'])
df['pymnt_plan'] = le.fit_transform(df['pymnt_plan'])
df['initial_list_status'] = le.fit_transform(df['initial_list_status'])
df['next_pymnt_d'] = le.fit_transform(df['next_pymnt_d'])
df['application_type'] = le.fit_transform(df['application_type'])
df['hardship_flag'] = le.fit_transform(df['hardship_flag'])
df['debt_settlement_flag'] = le.fit_transform(df['debt_settlement_flag'])


# Split the Data into Training and Testing

In [131]:
# Create our features
X = df.drop(columns="loan_status")

# Create our target
y = df["loan_status"]

In [132]:
X.describe()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,1.812779,88213.71,0.669994,0.805542,0.0,21.778153,0.217766,...,95.057627,30.626217,0.125972,0.0,210033.2,61338.43,29734.128558,55722.4,0.0,0.0
std,10277.34859,0.04813,288.062432,0.941313,115580.0,0.719105,0.714932,0.0,20.199244,0.718367,...,8.326426,33.631463,0.336732,0.0,192808.8,57387.98,26795.394232,50958.45,0.0,0.0
min,1000.0,0.06,30.89,0.0,40.0,0.0,0.0,0.0,0.0,0.0,...,20.0,0.0,0.0,0.0,3600.0,235.0,100.0,127.0,0.0,0.0
25%,9000.0,0.0881,265.73,1.0,50000.0,0.0,0.0,0.0,13.89,0.0,...,93.0,0.0,0.0,0.0,66977.0,26503.0,11600.0,22880.0,0.0,0.0
50%,15000.0,0.118,404.56,1.0,73000.0,1.0,1.0,0.0,19.76,0.0,...,100.0,20.0,0.0,0.0,146710.0,45357.0,22100.0,42000.0,0.0,0.0
75%,24000.0,0.1557,648.1,3.0,104000.0,1.0,1.0,0.0,26.66,0.0,...,100.0,50.0,0.0,0.0,303640.0,76570.0,39300.0,72499.0,0.0,0.0
max,40000.0,0.3084,1676.23,3.0,8797500.0,2.0,2.0,0.0,999.0,18.0,...,100.0,100.0,4.0,0.0,3292782.0,1295455.0,509400.0,1426964.0,0.0,0.0


In [133]:
# Check the balance of our target values
y.value_counts()

low_risk     68470
high_risk      347
Name: loan_status, dtype: int64

In [134]:
# Create X_train, X_test, y_train, y_test

# Normal train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
Counter(y_train)

Counter({'low_risk': 51352, 'high_risk': 260})

# Oversampling

In this section, you will compare two oversampling algorithms to determine which algorithm results in the best performance. You will oversample the data using the naive random oversampling algorithm and the SMOTE algorithm. For each algorithm, be sure to complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

### Naive Random Oversampling

In [135]:
# Resample the training data with the RandomOversampler

from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'low_risk': 51352, 'high_risk': 51352})

In [136]:
# Train the Logistic Regression model using the resampled data

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [137]:
# Calculated the balanced accuracy score

from sklearn.metrics import balanced_accuracy_score

y_pred = model.predict(X_test)
bas = balanced_accuracy_score(y_test, y_pred)
print(bas)

0.6453289741389383


In [138]:
# Display the confusion matrix

from sklearn.metrics import confusion_matrix

# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"])
print(cm)

[[   56    31]
 [ 6043 11075]]


In [139]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

# Displaying results
print("Random Over Sampler Confusion Matrix")
display(cm_df)
print(f"Random Over Sampler Balanced Accuracy Score: {bas}")
print("")
print("Random Over Sampler Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Random Over Sampler Confusion Matrix


Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,56,31
Actual Low Risk,6043,11075


Random Over Sampler Balanced Accuracy Score: 0.6453289741389383

Random Over Sampler Classification Report
                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.01      0.64      0.65      0.02      0.65      0.42        87
   low_risk       1.00      0.65      0.64      0.78      0.65      0.42     17118

avg / total       0.99      0.65      0.64      0.78      0.65      0.42     17205



### SMOTE Oversampling

In [140]:
# Resample the training data with SMOTE
# YOUR CODE HERE
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'low_risk': 51352, 'high_risk': 51352})

In [141]:
# Train the Logistic Regression model using the resampled data

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [142]:
# Calculated the balanced accuracy score

y_pred = model.predict(X_test)
bas = balanced_accuracy_score(y_test, y_pred)
print(bas)

0.648176349960316


In [143]:
# Display the confusion matrix

# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"])
print(cm)

[[   54    33]
 [ 5552 11566]]


In [144]:
# Print the imbalanced classification report

# Displaying results
print("SMOTE Confusion Matrix")
display(cm_df)
print(f"SMOTE Balanced Accuracy Score: {bas}")
print("")
print("SMOTE Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

SMOTE Confusion Matrix


Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,54,33
Actual Low Risk,5552,11566


SMOTE Balanced Accuracy Score: 0.648176349960316

SMOTE Classification Report
                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.01      0.62      0.68      0.02      0.65      0.42        87
   low_risk       1.00      0.68      0.62      0.81      0.65      0.42     17118

avg / total       0.99      0.68      0.62      0.80      0.65      0.42     17205



# Undersampling

In this section, you will test an undersampling algorithms to determine which algorithm results in the best performance compared to the oversampling algorithms above. You will undersample the data using the Cluster Centroids algorithm and complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

In [145]:
# Resample the data using the ClusterCentroids resampler

from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'high_risk': 260, 'low_risk': 260})

In [146]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [147]:
# Calculated the balanced accuracy score

y_pred = model.predict(X_test)
bas = balanced_accuracy_score(y_test, y_pred)
print(bas)

0.5107992796451406


In [148]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"])
print(cm)

[[   55    32]
 [10452  6666]]


In [149]:
# Print the imbalanced classification report

# Displaying results
print("Cluster Centroids Confusion Matrix")
display(cm_df)
print(f"Cluster Centroids Balanced Accuracy Score: {bas}")
print("")
print("Cluster Centroids Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Cluster Centroids Confusion Matrix


Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,55,32
Actual Low Risk,10452,6666


Cluster Centroids Balanced Accuracy Score: 0.5107992796451406

Cluster Centroids Classification Report
                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.01      0.63      0.39      0.01      0.50      0.25        87
   low_risk       1.00      0.39      0.63      0.56      0.50      0.24     17118

avg / total       0.99      0.39      0.63      0.56      0.50      0.24     17205



# Combination (Over and Under) Sampling

In this section, you will test a combination over- and under-sampling algorithm to determine if the algorithm results in the best performance compared to the other sampling algorithms above. You will resample the data using the SMOTEENN algorithm and complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

In [150]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({'high_risk': 68458, 'low_risk': 62022})

In [151]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [152]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
bas = balanced_accuracy_score(y_test, y_pred)
print(bas)

0.6567517152745044


In [153]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"])
print(cm)

[[  64   23]
 [7226 9892]]


In [154]:
# Print the imbalanced classification report

# Displaying results
print("SMOTEENN Confusion Matrix")
display(cm_df)
print(f"SMOTEENN Balanced Accuracy Score: {bas}")
print("")
print("SMOTEENN Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

SMOTEENN Confusion Matrix


Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,64,23
Actual Low Risk,7226,9892


SMOTEENN Balanced Accuracy Score: 0.6567517152745044

SMOTEENN Classification Report
                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.01      0.74      0.58      0.02      0.65      0.43        87
   low_risk       1.00      0.58      0.74      0.73      0.65      0.42     17118

avg / total       0.99      0.58      0.73      0.73      0.65      0.42     17205



## Credit Risk Machine Learning Models

### Overview
The purpose of this project was to build and evaluate multiple machine learning models to assess loan risk.  The models use 85 features to predict whether a loan is a high or low risk.

#### Resources:
+ Data file:  https://kenw-data.s3.us-east-2.amazonaws.com/LoanStats_2019Q1.csv
+ Python 3.7.7
+ Scikit-learn 0.22.1
+ Imbalanced-learn 0.6.2

#### Jupyter Notebooks
+ credit_risk_resampling.ipynb:  contains all oversampling, undersampling, and combination sampling models
+ credit_risk_ensemble.ipynb:  contains all ensemble models

#### Machine Learning Models
•	Oversampling
o	Random Over Sampler
o	SMOTE
•	Undersampling
o	Cluster Centroids
•	Combination
o	SMOTEENN
•	Ensemble
o	Balanced Random Forest Classifier
o	Easy Ensemble Classifier

#### Machine Learning Model Results
<table class="table table-striped">
                        <thead class="thead-light">
                          <tr>
                            <th>Model</th>
                            <th>Balanced Accuracy Score</th>
                            <th>Precision</th>
                            <th>Recall</th>
                          </tr>
                        </thead>
                        <tbody>
                          <tr>
                            <td>Random Over Sampler</td>
                            <td>0.65</td>
                            <td>0.01</td>
                            <td>0.64</td>
                          </tr>
                          <tr>
                            <td>SMOTE</td>
                            <td>0.65</td>
                            <td>0.01</td>
                            <td>0.62</td>
                          </tr>
                          <tr>
                            <td>Cluster Centroids</td>
                            <td>0.51</td>
                            <td>0.01</td>
                            <td>0.63</td>
                          </tr>
	            <tr>
                            <td>SMOTEENN</td>
                            <td>0.66</td>
                            <td>0.01</td>
                            <td>0.74</td>
                          </tr>
            <tr>
                            <td>Balanced Random Forest Classifier</td>
                            <td>0.75</td>
                            <td>0.03</td>
                            <td>0.61</td>
                         </tr>
           <tr>
                            <td>Easy Ensemble Classifier</td>
                            <td>0.92</td>
                            <td>0.07</td>
                            <td>0.91</td>
                          </tr>
                        </tbody>
                    </table>

#### Resampling Models
The Cluster Centroids model had the lowest balanced accuracy score at 51%.  The three other resampling models all came in much closer to each other with the SMOTEENN model at 66%, and the Random Over Sampler and SMOTE models both at 65%.  Each of these models had a high number of false-positive predictions, ranging from 5.6K to 10.5K.  The high number of false-positive predictions drove the precision of each model to be 1%.  False-negative predictions came in between 23 and 33 out of the 87 total actual high-risk loans.  The high false-negative predictions lead to the recall (or sensitivity) scores ranging between 62% and 74% for the resampling models.  Based on the high number of false-positives, along with the high number of false-negatives, we can not recommend any of these models.

#### Ensemble Models
We found greater success when we tested the ensemble machine-learning models.  The Balanced Random Forest Classifier (BRFC) model returned a balanced accuracy score of 75%, a precision of 3%, and a recall of 61%, a marked improvement over the results of the resampling models.  The BRFC model still produced 34 false-negative (evidenced in the recall rate), which is concerning.  The Easy Ensemble Classifier (EEC) model was by far the best performing model.  The EEC model has a balance accuracy score of 92%, a precision of 7% (predicting 1K false-positives), and a recall of 91% (8 false-negative predictions).  We recommend the EEC model to support the decision making for loan officers and underwriters.  Additional analysis of the model may help inform the loan approval teams on the “grey-areas” to look for in an application – where the model may be giving a false-positive or false-negative reading.
