# Credit Risk Ensemble Techniques

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# Load the data
file_path = Path('lending_data.csv')
df = pd.read_csv(file_path)
binary_encoded = pd.get_dummies(df, columns=["homeowner", "loan_status"])
binary_encoded.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,homeowner_mortgage,homeowner_own,homeowner_rent,loan_status_high_risk,loan_status_low_risk
0,10700.0,7.672,52800,0.431818,5,1,22800,0,1,0,0,1
1,8400.0,6.692,43600,0.311927,3,0,13600,0,1,0,0,1
2,9000.0,6.963,46100,0.349241,3,0,16100,0,0,1,0,1
3,10700.0,7.664,52700,0.43074,5,1,22700,0,1,0,0,1
4,10800.0,7.698,53000,0.433962,5,1,23000,1,0,0,0,1


# Split the Data into Training and Testing

In [5]:
# Create our features
X = binary_encoded.copy().drop(columns=['loan_status_high_risk', 'loan_status_low_risk'])

# Create our target
y = binary_encoded['loan_status_high_risk'].ravel()

In [6]:
X.describe()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,homeowner_mortgage,homeowner_own,homeowner_rent
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,49221.949804,0.377318,3.82661,0.392308,19221.949804,0.497472,0.398911,0.103616
std,2093.223153,0.889495,8371.635077,0.081519,1.904426,0.582086,8371.635077,0.499997,0.489678,0.304764
min,5000.0,5.25,30000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,44800.0,0.330357,3.0,0.0,14800.0,0.0,0.0,0.0
50%,9500.0,7.172,48100.0,0.376299,4.0,0.0,18100.0,0.0,0.0,0.0
75%,10400.0,7.528,51400.0,0.416342,4.0,1.0,21400.0,1.0,1.0,0.0
max,23800.0,13.235,105200.0,0.714829,16.0,3.0,75200.0,1.0,1.0,1.0


In [7]:
# Check the balance of our target values
y[:5]

array([0, 0, 0, 0, 0], dtype=uint8)

In [8]:
# Split the X and y into X_train, X_test, y_train, y_test
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier onely, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [9]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf_model = BalancedRandomForestClassifier(n_estimators=500, random_state=78)
brf_model = brf_model.fit(X_train_scaled, y_train)

In [10]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = brf_model.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.9934806218057921

In [11]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

In [12]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      0.99      1.00      0.99      0.99     18784
          1       0.80      0.99      0.99      0.89      0.99      0.99       600

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



In [13]:
# List the features sorted in descending order by feature importance
importances = brf_model.feature_importances_
sorted(zip(brf_model.feature_importances_, X.columns), reverse=True)

[(0.18605022327848544, 'debt_to_income'),
 (0.17629768153593894, 'borrower_income'),
 (0.17132595185964838, 'interest_rate'),
 (0.162703330196479, 'total_debt'),
 (0.15135371996694472, 'num_of_accounts'),
 (0.13053440139269215, 'loan_size'),
 (0.019202080501346416, 'derogatory_marks'),
 (0.0010665465358247945, 'homeowner_own'),
 (0.000994194344886187, 'homeowner_mortgage'),
 (0.00047187038775387344, 'homeowner_rent')]

### Easy Ensemble Classifier

In [14]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier 
eec_model = EasyEnsembleClassifier(random_state=78)
eec_model.fit(X_train, y_train) 

EasyEnsembleClassifier(random_state=78)

In [15]:
# Calculated the balanced accuracy score
eec_y_pred = eec_model.predict(X_test_scaled)
balanced_accuracy_score(y_test, eec_y_pred)

0.5

In [16]:
# Display the confusion matrix
print(confusion_matrix(y_test, eec_y_pred))

[[18784     0]
 [  600     0]]


In [17]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, eec_y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      1.00      0.00      0.98      0.00      0.00     18784
          1       0.00      0.00      1.00      0.00      0.00      0.00       600

avg / total       0.94      0.97      0.03      0.95      0.00      0.00     19384

