# Ensemble Learning

## Initial Imports

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import calendar

In [57]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier

## Read the CSV and Perform Basic Data Cleaning

In [4]:
# Load the data
file_path = Path('Resources/LoanStats_2019Q1.csv')
df_loan_stats = pd.read_csv(file_path)

#Basic cleaning
df_loan_stats.dropna(inplace=True)
# Preview the data
df_loan_stats.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-2019,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-2019,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,Mar-2019,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


## Split the Data into Training and Testing

In [27]:
# Create our features
X = df_loan_stats.copy()
X.drop("loan_status", axis=1, inplace=True)


#We would need to encode the data for it be able to scale correctly.  Without enconding we can get error of converting STR to INT
# Fitting and encoding the columns with the LabelEncoder
# Create the LabelEncoder instance
le = LabelEncoder()

# Encode Home Ownership column
le.fit(X["home_ownership"])
X["home_ownership"] = le.transform(X["home_ownership"])

# Encode Verification Status Column
le.fit(X["verification_status"])
X["verification_status"] = le.transform(X["verification_status"])

# Encode Payment Plan Column
le.fit(X["pymnt_plan"])
X["pymnt_plan"] = le.transform(X["pymnt_plan"])

# Encode Hardship Flag Column
le.fit(X["hardship_flag"])
X["hardship_flag"] = le.transform(X["hardship_flag"])

# Encode Debt Settlement Column
le.fit(X["debt_settlement_flag"])
X["debt_settlement_flag"] = le.transform(X["debt_settlement_flag"])

# Encode Application Type Column
le.fit(X["application_type"])
X["application_type"] = le.transform(X["application_type"])

# Encode Initial list STatus Column
le.fit(X["initial_list_status"])
X["initial_list_status"] = le.transform(X["initial_list_status"])

#We will split month and year to enable encoding. Currently the format is Mar-2019 and we want to split to Mar, 2019 in two columns
X[['issue_mon', 'issue_year']] = X['issue_d'].str.split('-', expand=True)
X[['next_pay_mon', 'next_pay_year']] = X['next_pymnt_d'].str.split('-', expand=True)


#We can drop the original issue_d, next_pymnt_d columns now as we have split into two columns.
X.drop(columns=["issue_d","next_pymnt_d"] , axis=1, inplace=True)
# Preview the DataFrame
X.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,issue_mon,issue_year,next_pay_mon,next_pay_year
0,10500.0,0.1719,375.35,3,66000.0,1,0,27.24,0.0,0.0,...,65687.0,38199.0,2000.0,61987.0,0,0,Mar,2019,May,2019
1,25000.0,0.2,929.09,1,105000.0,2,0,20.23,0.0,0.0,...,271427.0,60641.0,41200.0,49197.0,0,0,Mar,2019,May,2019
2,20000.0,0.2,529.88,1,56000.0,2,0,24.26,0.0,0.0,...,60644.0,45684.0,7500.0,43144.0,0,0,Mar,2019,May,2019
3,10000.0,0.164,353.55,3,92000.0,2,0,31.44,0.0,1.0,...,99506.0,68784.0,19700.0,76506.0,0,0,Mar,2019,May,2019
4,22000.0,0.1474,520.39,1,52000.0,0,0,18.76,0.0,1.0,...,219750.0,25919.0,27600.0,20000.0,0,0,Mar,2019,May,2019


In [28]:
# Months dictionary, also the data has only first three characters of the month and the full month name
name_to_num = {name[0:3]: num for num, name in enumerate(calendar.month_name) if num}
name_to_num

{'Jan': 1,
 'Feb': 2,
 'Mar': 3,
 'Apr': 4,
 'May': 5,
 'Jun': 6,
 'Jul': 7,
 'Aug': 8,
 'Sep': 9,
 'Oct': 10,
 'Nov': 11,
 'Dec': 12}

In [29]:
#Apply the months encoding
X["issue_mon"] = X["issue_mon"].apply(lambda x: name_to_num[x])
X["next_pay_mon"] = X["next_pay_mon"].apply(lambda x: name_to_num[x])
# Preview the DataFrame
X.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,issue_mon,issue_year,next_pay_mon,next_pay_year
0,10500.0,0.1719,375.35,3,66000.0,1,0,27.24,0.0,0.0,...,65687.0,38199.0,2000.0,61987.0,0,0,3,2019,5,2019
1,25000.0,0.2,929.09,1,105000.0,2,0,20.23,0.0,0.0,...,271427.0,60641.0,41200.0,49197.0,0,0,3,2019,5,2019
2,20000.0,0.2,529.88,1,56000.0,2,0,24.26,0.0,0.0,...,60644.0,45684.0,7500.0,43144.0,0,0,3,2019,5,2019
3,10000.0,0.164,353.55,3,92000.0,2,0,31.44,0.0,1.0,...,99506.0,68784.0,19700.0,76506.0,0,0,3,2019,5,2019
4,22000.0,0.1474,520.39,1,52000.0,0,0,18.76,0.0,1.0,...,219750.0,25919.0,27600.0,20000.0,0,0,3,2019,5,2019


In [30]:
# Create our target

#y = df_loan_stats["loan_status"].values.reshape(-1, 1)
y = df_loan_stats["loan_status"]
y 

0        low_risk
1        low_risk
2        low_risk
3        low_risk
4        low_risk
           ...   
68812    low_risk
68813    low_risk
68814    low_risk
68815    low_risk
68816    low_risk
Name: loan_status, Length: 68817, dtype: object

In [31]:
X.describe()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,issue_mon,next_pay_mon
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,1.812779,88213.71,0.669994,0.0,21.778153,0.217766,0.497697,...,0.125972,0.0,210033.2,61338.43,29734.128558,55722.4,0.0,0.0,1.726172,4.616839
std,10277.34859,0.04813,288.062432,0.941313,115580.0,0.719105,0.0,20.199244,0.718367,0.758122,...,0.336732,0.0,192808.8,57387.98,26795.394232,50958.45,0.0,0.0,0.743862,0.486161
min,1000.0,0.06,30.89,0.0,40.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3600.0,235.0,100.0,127.0,0.0,0.0,1.0,4.0
25%,9000.0,0.0881,265.73,1.0,50000.0,0.0,0.0,13.89,0.0,0.0,...,0.0,0.0,66977.0,26503.0,11600.0,22880.0,0.0,0.0,1.0,4.0
50%,15000.0,0.118,404.56,1.0,73000.0,1.0,0.0,19.76,0.0,0.0,...,0.0,0.0,146710.0,45357.0,22100.0,42000.0,0.0,0.0,2.0,5.0
75%,24000.0,0.1557,648.1,3.0,104000.0,1.0,0.0,26.66,0.0,1.0,...,0.0,0.0,303640.0,76570.0,39300.0,72499.0,0.0,0.0,2.0,5.0
max,40000.0,0.3084,1676.23,3.0,8797500.0,2.0,0.0,999.0,18.0,5.0,...,4.0,0.0,3292782.0,1295455.0,509400.0,1426964.0,0.0,0.0,3.0,5.0


In [32]:
# Check the balance of our target values
Counter(y)

Counter({'low_risk': 68470, 'high_risk': 347})

In [38]:
# Split the X and y into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

Counter({'low_risk': 51366, 'high_risk': 246})

## Data Pre-Processing

Scale the training and testing data using the `StandardScaler` from `sklearn`. Remember that when scaling the data, you only scale the features data (`X_train` and `X_testing`).

In [39]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [40]:
X_train

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,issue_mon,issue_year,next_pay_mon,next_pay_year
3903,1600.0,0.0881,50.74,2,35964.0,0,0,24.12,0.0,0.0,...,148200.0,14292.0,58300.0,15000.0,0,0,3,2019,5,2019
28390,9000.0,0.0756,280.21,2,41000.0,0,0,16.89,1.0,0.0,...,68292.0,52587.0,24500.0,40492.0,0,0,2,2019,5,2019
15470,10000.0,0.1033,214.10,1,112000.0,2,0,17.75,0.0,0.0,...,416023.0,78944.0,65400.0,80642.0,0,0,2,2019,5,2019
4279,36000.0,0.1033,1167.21,3,120000.0,0,0,19.95,0.0,1.0,...,116200.0,73826.0,85200.0,30000.0,0,0,3,2019,5,2019
57514,18000.0,0.0881,570.81,1,51000.0,2,0,19.11,1.0,0.0,...,221075.0,126195.0,36500.0,96715.0,0,0,1,2019,5,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49100,8400.0,0.0646,257.30,1,37992.0,0,0,33.16,0.0,0.0,...,119052.0,67055.0,52800.0,66252.0,0,0,1,2019,4,2019
20609,30000.0,0.0646,918.93,1,217000.0,0,0,19.56,0.0,2.0,...,358467.0,206670.0,137300.0,84967.0,0,0,2,2019,4,2019
21440,20000.0,0.1180,442.88,1,35000.0,0,0,16.19,0.0,0.0,...,140700.0,7641.0,13700.0,12000.0,0,0,2,2019,4,2019
50057,11625.0,0.1447,273.34,3,30000.0,0,0,18.96,0.0,2.0,...,96441.0,79650.0,22700.0,66941.0,0,0,1,2019,4,2019


In [41]:
# Fit the Standard Scaler with the training data
# When fitting scaling functions, only train on the training dataset
X_scaler = scaler.fit(X_train)

In [42]:
# Scale the training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Display the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier only, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [46]:
# Resample the training data with the BalancedRandomForestClassifier
brf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

# Fit the model
brf_model = brf_model.fit(X_train_scaled, y_train)

# Making predictions using the testing data
brf_predictions = brf_model.predict(X_test_scaled)

In [62]:
# Calculated the balanced accuracy score
balanced_acc_score = balanced_accuracy_score(y_test, brf_predictions)
print(f"BRF Balanced Accuracy Score : {balanced_acc_score}")

BRF Balanced Accuracy Score : 0.776501530531912


In [58]:
# Display the confusion matrix
# Calculating the confusion matrix
cm = confusion_matrix(y_test, brf_predictions)
brf_cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
print("Confusion Matrix")
display(brf_cm_df)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,68,33
Actual 1,2057,15047


In [59]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, brf_predictions))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.03      0.67      0.88      0.06      0.77      0.58       101
   low_risk       1.00      0.88      0.67      0.94      0.77      0.60     17104

avg / total       0.99      0.88      0.67      0.93      0.77      0.60     17205



In [56]:
# List the features sorted in descending order by feature importance

importances = brf_model.feature_importances_

importances_sorted = sorted(zip(importances, X.columns), reverse=True)
importances_sorted

[(0.07508896270423472, 'total_rec_prncp'),
 (0.06805552433207127, 'last_pymnt_amnt'),
 (0.06428598030847282, 'total_rec_int'),
 (0.05517002732483654, 'total_pymnt_inv'),
 (0.04576012545878166, 'total_pymnt'),
 (0.03582960682662024, 'issue_mon'),
 (0.03378148146507487, 'int_rate'),
 (0.022134116871346667, 'installment'),
 (0.018146655637864096, 'mths_since_recent_inq'),
 (0.01705516099533549, 'dti'),
 (0.016492012697462866, 'out_prncp'),
 (0.016360760716168665, 'mo_sin_old_il_acct'),
 (0.01596492187550957, 'mths_since_rcnt_il'),
 (0.015946136249899674, 'total_bal_il'),
 (0.015777836767279015, 'avg_cur_bal'),
 (0.01504552488282979, 'total_bal_ex_mort'),
 (0.014472580412664717, 'tot_hi_cred_lim'),
 (0.014399542546896321, 'mo_sin_old_rev_tl_op'),
 (0.014126392995969989, 'max_bal_bc'),
 (0.013800328307379596, 'il_util'),
 (0.013740051065680584, 'bc_util'),
 (0.013721854719226443, 'annual_inc'),
 (0.013586903085491039, 'total_rev_hi_lim'),
 (0.013458478325583138, 'all_util'),
 (0.01345344447

### Easy Ensemble Classifier

In [60]:
# Train the Classifier
ensemble_model = EasyEnsembleClassifier(n_estimators=100, random_state=1)

# Fit the model
ensemble_model = ensemble_model.fit(X_train_scaled, y_train)

# Making predictions using the testing data
ensemble_predictions = ensemble_model.predict(X_test_scaled)

In [61]:
# Calculated the balanced accuracy score
ensemble_balanced_acc_score = balanced_accuracy_score(y_test, ensemble_predictions)
print(f"Ensemble Balanced Accuracy Score : {ensemble_balanced_acc_score}")

Ensemble Balanced Accuracy Score : 0.9320985653289371


In [64]:
# Display the confusion matrix
# Calculating the confusion matrix
cm = confusion_matrix(y_test, ensemble_predictions)
ensemble_cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
print("Confusion Matrix")
display(ensemble_cm_df)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,93,8
Actual 1,968,16136


In [65]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, ensemble_predictions))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.09      0.92      0.94      0.16      0.93      0.87       101
   low_risk       1.00      0.94      0.92      0.97      0.93      0.87     17104

avg / total       0.99      0.94      0.92      0.97      0.93      0.87     17205



### Final Questions

1. Which model had the best balanced accuracy score?

    The Easy Ensemble Classifier model has the best balanced accuracy score viz 0.93 for Easy Ensemble Classifier vs 0.77 for Balanced Random Forest Classifier.

2. Which model had the best recall score?

    The Easy Ensemble Classifier model has the best recall with avg score of 0.94 compared to 0.88 for Balanced Random Forest Classifier.

3. Which model had the best geometric mean score?

    Again, the Easy Ensemble Classifier model has the best recall with avg score of 0.93 compared to 0.77 for Balanced Random Forest Classifier.

4. What are the top three features?

    The top three features are:
    - total_rec_prncp
    - last_pymnt_amnt
    - total_rec_int