In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
file_path = "out.csv"
df = pd.read_csv(file_path)
df = df.drop("Unnamed: 0",axis = 1)
df.head()

Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,verification_status,purpose,grades_value,label
0,5.0,0,10.65,12,1,24.0,2,1,5,0.0
1,2.5,0,15.27,1,1,30.0,1,6,4,1.0
2,2.4,0,15.96,12,1,12.252,0,8,4,0.0
3,10.0,0,13.49,12,1,49.2,1,0,4,0.0
4,3.0,0,12.69,2,1,80.0,1,0,5,0.0


In [4]:
target = ["label"]

In [5]:
df

Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,verification_status,purpose,grades_value,label
0,5.0,0,10.65,12,1,24.000,2,1,5,0.0
1,2.5,0,15.27,1,1,30.000,1,6,4,1.0
2,2.4,0,15.96,12,1,12.252,0,8,4,0.0
3,10.0,0,13.49,12,1,49.200,1,0,4,0.0
4,3.0,0,12.69,2,1,80.000,1,0,5,0.0
...,...,...,...,...,...,...,...,...,...,...
887374,10.0,0,11.99,9,1,31.000,2,2,5,0.0
887375,24.0,0,11.99,12,2,79.000,2,4,5,0.0
887376,13.0,0,15.99,6,1,35.000,2,2,3,0.0
887377,12.0,0,19.99,2,1,64.400,1,2,2,0.0


# Split the Data into Training and Testing

In [6]:
# Create our features
X = df.drop("label", axis=1)
X = pd.get_dummies(X)

# Create our target
y = df.label.copy()

In [7]:
X.describe()

Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,verification_status,purpose,grades_value
count,887379.0,887379.0,887379.0,887379.0,887379.0,887379.0,887379.0,887379.0,887379.0
mean,14.755265,0.0,13.24674,6.983232,1.696728,75.02725,1.027408,2.1289,4.201597
std,8.435456,0.0,4.381867,4.265429,0.639508,64.69835,0.79238,1.646615,1.312599
min,0.5,0.0,5.32,0.0,0.0,0.0,0.0,0.0,0.0
25%,8.0,0.0,9.99,3.0,1.0,45.0,0.0,1.0,3.0
50%,13.0,0.0,12.99,7.0,2.0,65.0,1.0,2.0,4.0
75%,20.0,0.0,16.2,12.0,2.0,90.0,2.0,2.0,5.0
max,35.0,0.0,28.99,12.0,3.0,9500.0,2.0,13.0,6.0


In [8]:
# Check the balance of our target values
y.value_counts()

0.0    811490
1.0     75889
Name: label, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(Counter(y_train))
print(Counter(y_test))

Counter({0.0: 608823, 1.0: 56711})
Counter({0.0: 202667, 1.0: 19178})


# Balanced Random Forest Classifier


In [10]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
rf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
rf_model.fit(X_train, y_train)
print(Counter(y_train))

Counter({0.0: 608823, 1.0: 56711})


In [11]:
# Calculate predictions
y_pred = rf_model.predict(X_test)

In [12]:
# Calculated the balanced accuracy score
from sklearn.metrics import confusion_matrix, accuracy_score
balanced_accuracy_score(y_test,y_pred)

0.6460359719780795

In [13]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index = ["Actual Low_Risk", "Actual High_Risk"],columns=["Predicted Low_risk","Predicted High_risk"])
cm_df

Unnamed: 0,Predicted Low_risk,Predicted High_risk
Actual Low_Risk,129870,72797
Actual High_Risk,6688,12490


In [14]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.95      0.64      0.65      0.77      0.65      0.42    202667
        1.0       0.15      0.65      0.64      0.24      0.65      0.42     19178

avg / total       0.88      0.64      0.65      0.72      0.65      0.42    221845



In [15]:
# List the features sorted in descending order by feature importance
importances = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
for importance in importances:
    print(f'{importance[1]}:  {importance[0]*100:.1f}%')

annual_inc:  24.9%
int_rate:  24.3%
loan_amnt:  22.8%
emp_length:  11.2%
purpose:  7.0%
grades_value:  3.7%
home_ownership:  3.1%
verification_status:  3.0%
term:  0.0%


# Easy Ensemble AdaBoost Classifier

In [17]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
EE_model = EasyEnsembleClassifier(n_estimators=100, random_state=1)
EE_model.fit(X_train, y_train)
print(Counter(y_train))

Counter({0.0: 608823, 1.0: 56711})


In [18]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test,y_pred)

0.6460359719780795

In [21]:
# Display the confusion matrix
cm2 = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm2_df = pd.DataFrame(
    cm2, index=["Actual Low_Risk", "Actual High_Risk"], columns=["Predicted Low_Risk", "Predicted High_Risk"])

cm2_df

Unnamed: 0,Predicted Low_Risk,Predicted High_Risk
Actual Low_Risk,129870,72797
Actual High_Risk,6688,12490


In [20]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.95      0.64      0.65      0.77      0.65      0.42    202667
        1.0       0.15      0.65      0.64      0.24      0.65      0.42     19178

avg / total       0.88      0.64      0.65      0.72      0.65      0.42    221845

