# Credit Risk Resampling Techniques

## Initial Imports

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import ClusterCentroids
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import confusion_matrix
from collections import Counter

# Read the CSV into DataFrame

In [3]:
file_path = Path('Resources/lending_data.csv')

df = pd.read_csv(file_path)

df.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,own,52800,0.431818,5,1,22800,low_risk
1,8400.0,6.692,own,43600,0.311927,3,0,13600,low_risk
2,9000.0,6.963,rent,46100,0.349241,3,0,16100,low_risk
3,10700.0,7.664,own,52700,0.43074,5,1,22700,low_risk
4,10800.0,7.698,mortgage,53000,0.433962,5,1,23000,low_risk


# Split the Data into Training and Testing

We need to split our dataset to distinguish our features from our target variable

In [4]:
X = df.drop(columns="loan_status")

y = df["loan_status"]

In [5]:
X.describe()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,49221.949804,0.377318,3.82661,0.392308,19221.949804
std,2093.223153,0.889495,8371.635077,0.081519,1.904426,0.582086,8371.635077
min,5000.0,5.25,30000.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,44800.0,0.330357,3.0,0.0,14800.0
50%,9500.0,7.172,48100.0,0.376299,4.0,0.0,18100.0
75%,10400.0,7.528,51400.0,0.416342,4.0,1.0,21400.0
max,23800.0,13.235,105200.0,0.714829,16.0,3.0,75200.0


Checking the balance of our target values - not quite balanced!

In [38]:
y.value_counts()

low_risk     75036
high_risk     2500
Name: loan_status, dtype: int64

Splitting the data observations into four subsets: X_train, X_test, y_train, y_test

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)

X_train.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
25729,10100.0,7.402,mortgage,50300,0.403579,4,1,20300
20176,9200.0,7.049,mortgage,46900,0.360341,3,0,16900
55080,8800.0,6.87,mortgage,45200,0.336283,3,0,15200
10410,8900.0,6.919,mortgage,45700,0.343545,3,0,15700
63110,8500.0,6.731,own,43900,0.316629,3,0,13900


## Data Pre-Processing

Let's find the text labels in our data set and encode them as integers

In [8]:
label_encoder = LabelEncoder()

In [9]:
label_encoder.fit(X_train['homeowner'])

print(label_encoder.classes_)

['mortgage' 'own' 'rent']


In [40]:
X_train['homeowner encoded'] = label_encoder.transform(X_train['homeowner'])

"""The labels are the same in the test data set so we can use the same LabelEncoder object we fitted our train dataset to"""
X_test['homeowner encoded'] =label_encoder.transform(X_test['homeowner'])

X_train.drop(columns='homeowner', inplace=True)

X_test.drop(columns='homeowner', inplace=True)

display(X_train.head())
display(X_test.head())

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,homeowner encoded
25729,10100.0,7.402,50300,0.403579,4,1,20300,0
20176,9200.0,7.049,46900,0.360341,3,0,16900,0
55080,8800.0,6.87,45200,0.336283,3,0,15200,0
10410,8900.0,6.919,45700,0.343545,3,0,15700,0
63110,8500.0,6.731,43900,0.316629,3,0,13900,1


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,homeowner encoded
37214,11500.0,8.017,56000,0.464286,5,1,26000,1
2791,9700.0,7.227,48600,0.382716,4,0,18600,0
75901,16900.0,10.312,77600,0.613402,10,2,47600,1
70752,10600.0,7.632,52400,0.427481,5,1,22400,0
54071,11300.0,7.936,55300,0.457505,5,1,25300,0


Let's proceed with scaling our data set now that it's been transformed all to integers

In [11]:
data_scaler = StandardScaler()

In [12]:
data_scaler.fit(X_train)

StandardScaler()

In [13]:
loans_data_scaled = data_scaler.transform(X_train)

loans_data_scaled_test = data_scaler.transform(X_test)

print(f"Standardized training data set")
print(loans_data_scaled[:5])
print()
print(f"Standardized testing data set")
print(loans_data_scaled_test[:5])

Standardized training data set
[[ 0.14045627  0.1230916   0.12859477  0.32187276  0.09115188  1.04495557
   0.12859477 -0.90997878]
 [-0.28969527 -0.27393966 -0.27772578 -0.20875028 -0.43434275 -0.67471435
  -0.27772578 -0.90997878]
 [-0.48087373 -0.47526712 -0.48088605 -0.50399739 -0.43434275 -0.67471435
  -0.48088605 -0.90997878]
 [-0.43307912 -0.42015513 -0.42113303 -0.41487981 -0.43434275 -0.67471435
  -0.42113303 -0.90997878]
 [-0.62425758 -0.63160521 -0.63624391 -0.74520356 -0.43434275 -0.67471435
  -0.63624391  0.59151332]]

Standardized testing data set
[[ 0.80958089  0.81480327  0.80977921  1.06689092  0.61664652  1.04495557
   0.80977921  0.59151332]
 [-0.05072219 -0.07373693 -0.07456551  0.06584169  0.09115188 -0.67471435
  -0.07456551 -0.90997878]
 [ 3.39049014  3.39606878  3.39110974  2.89689483  3.24411968  2.76462549
   3.39110974  0.59151332]
 [ 0.37942935  0.38178051  0.37955746  0.61521057  0.61664652  1.04495557
   0.37955746 -0.90997878]
 [ 0.71399166  0.72369978  0

# Simple Logistic Regression

Let's instantiate our first logistic model and used an unaltered dataset to make predictions

In [42]:
model = LogisticRegression(solver='lbfgs', random_state=1)

model.fit(X_train, y_train)

LogisticRegression(random_state=1)

### Performance metrics

Balanced accuracy score

In [45]:
y_pred = model.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

0.9543211898288821

Confusion matrix

In [51]:
confusion_matrix(y_test, y_pred)

array([[  571,    54],
       [   93, 18666]], dtype=int64)

Imbalanced classification report

In [52]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.86      0.91      1.00      0.89      0.95      0.90       625
   low_risk       1.00      1.00      0.91      1.00      0.95      0.92     18759

avg / total       0.99      0.99      0.92      0.99      0.95      0.92     19384



# Oversampling

In this section, we will compare two oversampling algorithms to determine which algorithm results in the best performance. 

### Naive Random Oversampling

In [53]:
ros = RandomOverSampler(random_state=3)

X_resampled_ros, y_resampled_ros = ros.fit_resample(X_train, y_train)



Let's check the balance in our target values - now this is a  balanced set!

In [54]:
Counter(y_resampled_ros)

Counter({'low_risk': 56277, 'high_risk': 56277})

Let's create and train a logistric regression model with our new **randomly oversampled** training data set

In [50]:
model_ros = LogisticRegression(solver='lbfgs', random_state=3)

model_ros.fit(X_resampled_ros, y_resampled_ros)

y_pred_ros = model_ros.predict(X_test)

0.9948279972279972

### Performance metrics

Balanced accuracy score

In [63]:
balanced_accuracy_score(y_test, y_pred_ros)

0.9948279972279972

Confusion matrix

In [64]:
confusion_matrix(y_test, y_pred_ros)

array([[  622,     3],
       [  104, 18655]], dtype=int64)


Imbalanced classification report

In [65]:

print(classification_report_imbalanced(y_test, y_pred_ros))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.86      1.00      0.99      0.92      0.99      0.99       625
   low_risk       1.00      0.99      1.00      1.00      0.99      0.99     18759

avg / total       1.00      0.99      1.00      0.99      0.99      0.99     19384



### SMOTE Oversampling

Resampling the training data with SMOTE

In [59]:
X_resampled_smote, y_resampled_smote = SMOTE(random_state=4, sampling_strategy=1.0).fit_resample(
    X_train, y_train)

Counter(y_resampled_smote)

Counter({'low_risk': 56277, 'high_risk': 56277})

Let's train our Logistic Regression model using the SMOTE resampled data

In [68]:
model_smote = LogisticRegression(solver='lbfgs', random_state=4)

model_smote.fit(X_resampled_smote, y_resampled_smote)

y_pred_smote = model_smote.predict(X_test)

### Performance metrics

Balanced Accuracy score

In [69]:

balanced_accuracy_score(y_test, y_pred_smote)

0.9948279972279972

Confusion matrix

In [61]:
confusion_matrix(y_test, y_pred_smote)

array([[  622,     3],
       [  104, 18655]], dtype=int64)

Imbalanced classification report

In [27]:

print(classification_report_imbalanced(y_test, y_pred_smote))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.86      1.00      0.99      0.92      0.99      0.99       625
   low_risk       1.00      0.99      1.00      1.00      0.99      0.99     18759

avg / total       1.00      0.99      1.00      0.99      0.99      0.99     19384



# Undersampling

In this section, we will test an undersampling algorithm to determine which algorithm results in the best performance compared to the oversampling algorithms above. You will undersample the data using the Cluster Centroids algorithm and complete the folliowing steps:

Let's resample the data using the ClusterCentroids resampler

In [66]:
cc = ClusterCentroids(random_state=5)

X_resampled_cc, y_resampled_cc = cc.fit_resample(X_train, y_train)

Counter(y_resampled_cc)

Counter({'high_risk': 1875, 'low_risk': 1875})

Let's train the Logistic Regression model using the resampled data

In [67]:
model_cc = LogisticRegression(solver='lbfgs', random_state=1)

model_cc.fit(X_resampled_cc, y_resampled_cc)

y_pred_cc=model_cc.predict(X_test)

### Performance metrics

Balanced accuracy score

In [70]:
balanced_accuracy_score(y_test, y_pred_cc)

0.9836813049736126

Confusion matrix

In [71]:
confusion_matrix(y_test, y_pred_cc)

array([[  608,    17],
       [  102, 18657]], dtype=int64)

Imbalanced classification report

In [72]:
print(classification_report_imbalanced(y_test, y_pred_cc))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.86      0.97      0.99      0.91      0.98      0.97       625
   low_risk       1.00      0.99      0.97      1.00      0.98      0.97     18759

avg / total       0.99      0.99      0.97      0.99      0.98      0.97     19384



# Combination (Over and Under) Sampling

In this section, we will test a combination over- and under-sampling algorithm to determine if the algorithm results in the best performance compared to the other sampling algorithms above.

Let's resample the training data with SMOTEENN

In [77]:
sm = SMOTEENN(random_state=1)

X_resampled_sm, y_resampled_sm = sm.fit_resample(X_train, y_train)

Counter(y_resampled_sm)

Counter({'high_risk': 55299, 'low_risk': 55918})

In [78]:
model_sm = LogisticRegression(solver='lbfgs', random_state=1)

model_sm.fit(X_resampled_sm, y_resampled_sm)

y_pred_smoteen = model_sm.predict(X_test)

### Performance metrics 

Balanced accuracy score

In [79]:

balanced_accuracy_score(y_test, y_pred_smoteen)

0.994748035609574

Confusion Matrix

In [36]:

confusion_matrix(y_test, y_pred_smoteen)

array([[  622,     3],
       [  107, 18652]], dtype=int64)

Imbalanced classification report

In [80]:
print(classification_report_imbalanced(y_test, y_pred_smoteen))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.85      1.00      0.99      0.92      0.99      0.99       625
   low_risk       1.00      0.99      1.00      1.00      0.99      0.99     18759

avg / total       1.00      0.99      1.00      0.99      0.99      0.99     19384



# Conclusions

1. Which model had the best balanced accuracy score?

- Although all models yielded relatively high balanced accurary scores (close to 1), the models with marginally highest (& equal) scores were the ones using oversampled data from SMOTE and Naive Random Sample techniques.

2. Which model had the best recall score?

- All models yield an average recall score of 0.99 which is great. The model with the highest recall score for "high risk" classifications was the model using oversampled and SMOTEEN sampled data, while the highest 
  recall for "low risk" classification was the simple logistic regression model using no resampled data.

3. Which model had the best geometric mean score?

 -   The models with oversampled and SMOTEEN sampled datasets produced an excellent geometric mean score of 0.99.
