# Credit Risk Ensemble Techniques

In [104]:
import warnings
warnings.filterwarnings('ignore')

In [105]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [106]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from imblearn.ensemble import EasyEnsembleClassifier

# Read the CSV and Perform Basic Data Cleaning

In [123]:
# Load the data
file_path = Path('Resources/lending_data.csv')
df = pd.read_csv(file_path)

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

df['interest_rate'] = df['interest_rate'].astype('float') / 100

# Convert the target column values to low_risk and high_risk based on their values
x = {'Current': 'low_risk'}   
df = df.replace(x)

x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.drop("homeowner", inplace=True, axis=1)

df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700,0.07672,52800,0.431818,5,1,22800,low_risk
1,8400,0.06692,43600,0.311927,3,0,13600,low_risk
2,9000,0.06963,46100,0.349241,3,0,16100,low_risk
3,10700,0.07664,52700,0.43074,5,1,22700,low_risk
4,10800,0.07698,53000,0.433962,5,1,23000,low_risk


In [124]:
df.dtypes

loan_size             int64
interest_rate       float64
borrower_income       int64
debt_to_income      float64
num_of_accounts       int64
derogatory_marks      int64
total_debt            int64
loan_status          object
dtype: object

# Split the Data into Training and Testing

In [125]:
# Create our features
X = df.drop(columns="loan_status")

# Create our target
y = df["loan_status"]

In [126]:
X.describe()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,0.072923,49221.949804,0.377318,3.82661,0.392308,19221.949804
std,2093.223153,0.008895,8371.635077,0.081519,1.904426,0.582086,8371.635077
min,5000.0,0.0525,30000.0,0.0,0.0,0.0,0.0
25%,8700.0,0.06825,44800.0,0.330357,3.0,0.0,14800.0
50%,9500.0,0.07172,48100.0,0.376299,4.0,0.0,18100.0
75%,10400.0,0.07528,51400.0,0.416342,4.0,1.0,21400.0
max,23800.0,0.13235,105200.0,0.714829,16.0,3.0,75200.0


In [127]:
# Check the balance of our target values
df['loan_status'].value_counts()

low_risk     75036
high_risk     2500
Name: loan_status, dtype: int64

In [128]:
# Split the X and y into X_train, X_test, y_train, y_test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test= train_test_split(X, 
                                                   y, 
                                                   random_state=78, 
                                                   stratify=y)
X_train.shape

(58152, 7)

In [129]:
# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier onely, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [130]:
# Resample the training data with the BalancedRandomForestClassifier
#rf_model = RandomForestClassifier(n_estimator= 100, random_state= 1)
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)
rf_model = rf_model.fit(X_train_scaled, y_train)

In [131]:
# Calculated the balanced accuracy score
predictions = rf_model.predict(X_test_scaled)

acc_score = accuracy_score(y_test, predictions)
acc_score

0.9925196037969459

In [132]:
# Display the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

In [134]:
# Print the imbalanced classification report
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,567,58
Actual 1,87,18672


Accuracy Score : 0.9925196037969459
Classification Report
              precision    recall  f1-score   support

   high_risk       0.87      0.91      0.89       625
    low_risk       1.00      1.00      1.00     18759

    accuracy                           0.99     19384
   macro avg       0.93      0.95      0.94     19384
weighted avg       0.99      0.99      0.99     19384



In [135]:
# List the features sorted in descending order by feature importance
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.31438430043865656, 'interest_rate'),
 (0.163037390825129, 'debt_to_income'),
 (0.15797079298161734, 'total_debt'),
 (0.14910235058086385, 'borrower_income'),
 (0.11997828284918705, 'loan_size'),
 (0.09539228623823517, 'num_of_accounts'),
 (0.0001345960863110296, 'derogatory_marks')]

### Easy Ensemble Classifier

In [136]:
# Train the EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100,random_state=1)
eec

eec = eec.fit(X_train_scaled, y_train)

In [137]:
predictions = eec.predict(X_test_scaled)

In [138]:
# Calculated the balanced accuracy score
acc_score2 = accuracy_score(y_test, predictions)
acc_score2

0.9942220387948824

In [139]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = eec.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  625,     0],
       [18759,     0]], dtype=int64)

In [140]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.03      1.00      0.00      0.06      0.00      0.00       625
   low_risk       0.00      0.00      1.00      0.00      0.00      0.00     18759

avg / total       0.00      0.03      0.97      0.00      0.00      0.00     19384

