# Ensemble Models

In [180]:
import pandas as pd
import numpy as np
import pickle
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from mlxtend.evaluate import bias_variance_decomp
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, roc_auc_score, auc


### Reading the preprocessed Phishing Websites dataset

In [2]:
df = pd.read_csv("../PhishingWebsites-ReducedAndStandardized.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f10,f11,f12,f13,f14,f15,f16,f17,f18,t
0,0,-0.432366,-1.926665,0.958237,0.402154,-1.486897,-0.589921,1.151029,0.827776,-0.361705,...,-1.063235,-0.704374,-1.55527,-1.383683,-1.371855,-0.390849,0.4196,-1.301501,-1.291381,-1.0
1,1,0.518745,-0.34113,0.958237,0.402154,-1.486897,-0.589921,1.151029,0.827776,-0.361705,...,-1.063235,-0.704374,-0.347097,0.722709,0.821486,-0.390849,0.4196,-0.078232,0.107014,-1.0
2,2,-0.432366,-1.133898,0.178515,0.402154,-1.486897,-0.589921,-0.603608,0.827776,-0.361705,...,0.940526,-0.704374,0.861076,0.722709,-1.371855,-0.390849,0.4196,-1.301501,0.107014,-1.0
3,3,0.518745,-0.34113,0.178515,0.402154,-1.486897,-0.589921,-2.358245,-1.208056,-0.361705,...,-1.063235,1.4197,0.861076,0.722709,-1.371855,-0.390849,0.4196,-1.301501,0.107014,-1.0
4,4,-0.432366,-0.34113,0.178515,0.402154,-1.486897,-0.589921,1.151029,0.827776,-0.361705,...,-1.063235,-0.704374,-0.347097,0.722709,0.821486,-0.390849,0.4196,1.145038,0.107014,1.0


The first column must be removed as it only holds row indices

In [3]:
df.drop(['Unnamed: 0'], axis = 1, inplace = True)
df.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,t
0,-0.432366,-1.926665,0.958237,0.402154,-1.486897,-0.589921,1.151029,0.827776,-0.361705,1.463648,-1.063235,-0.704374,-1.55527,-1.383683,-1.371855,-0.390849,0.4196,-1.301501,-1.291381,-1.0
1,0.518745,-0.34113,0.958237,0.402154,-1.486897,-0.589921,1.151029,0.827776,-0.361705,-1.154365,-1.063235,-0.704374,-0.347097,0.722709,0.821486,-0.390849,0.4196,-0.078232,0.107014,-1.0
2,-0.432366,-1.133898,0.178515,0.402154,-1.486897,-0.589921,-0.603608,0.827776,-0.361705,-1.154365,0.940526,-0.704374,0.861076,0.722709,-1.371855,-0.390849,0.4196,-1.301501,0.107014,-1.0
3,0.518745,-0.34113,0.178515,0.402154,-1.486897,-0.589921,-2.358245,-1.208056,-0.361705,0.154642,-1.063235,1.4197,0.861076,0.722709,-1.371855,-0.390849,0.4196,-1.301501,0.107014,-1.0
4,-0.432366,-0.34113,0.178515,0.402154,-1.486897,-0.589921,1.151029,0.827776,-0.361705,0.154642,-1.063235,-0.704374,-0.347097,0.722709,0.821486,-0.390849,0.4196,1.145038,0.107014,1.0


In [4]:
df.shape

(11055, 20)

The data can be split into the feautures and the target class

In [5]:
X = df.iloc[:,0:-1]
y = df['t']

In [6]:
y.value_counts()

 1.0    6157
-1.0    4898
Name: t, dtype: int64

Training and testing sets must be produced

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y, test_size=0.3)

## Ensemble Model 1: XGBoost

XGBoost is a gradient-boosted tree algorithm, which will be used since the best performing of the weak learners was found to be the decision tree (with max_depth=10)

In [141]:
xgb_model = xgb.XGBClassifier(max_depth=10, objective='binary:logistic', random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)



In [142]:
print("Accuracy on training set: {:.3f}".format(xgb_model.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(xgb_model.score(X_test, y_test)))

Accuracy on training set: 0.990
Accuracy on test set: 0.971


In [143]:
cf_rep_xgb = classification_report(y_test, y_pred_xgb, target_names = ['Not Phishing', 'Phishing'])
print(cf_rep_xgb)

              precision    recall  f1-score   support

Not Phishing       0.97      0.96      0.97      1470
    Phishing       0.97      0.98      0.97      1847

    accuracy                           0.97      3317
   macro avg       0.97      0.97      0.97      3317
weighted avg       0.97      0.97      0.97      3317



In [144]:
confusion_matrix = metrics.confusion_matrix(y_test, y_pred_xgb)
confusion_matrix = confusion_matrix.astype(int)

layout = {
    "title": "Confusion Matrix", 
    "title_x": 0.5,
    "xaxis": {"title": "Predicted"}, 
    "yaxis": {"title": "Real"},
    "width": 500
}

fig = go.Figure(data=go.Heatmap(z=confusion_matrix,
                                x=['Not Phishing','Phishing'],
                                y=['Not Phishing','Phishing']),
                layout=layout)
fig.show()

In [107]:
train_sizes, train_scores, test_scores = learning_curve(estimator = xgb_model, X = X, y = y, cv = 5, scoring = "accuracy", train_sizes = [0.2,0.4,0.6,0.8,1])
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

fig = go.Figure([
    go.Scatter(
        name = 'Training Set',
        x = train_sizes,
        y = train_mean,
        mode = 'lines',
        line = dict(color='rgb(123,104,238)', dash = 'dot')
    ),
    go.Scatter(
        name='Train Upper Bound',
        x = train_sizes, 
        y = train_mean + train_std,
        mode = 'lines',
        line=dict(width=0),
        showlegend = False
    ),
    go.Scatter(
        name='Train Lower Bound',
        x = train_sizes,
        y = train_mean - train_std,
        line=dict(width=0),
        mode='lines',
        fillcolor='rgba(123,104,238,0.3)',
        fill='tonexty',
        showlegend = False
    ),
    go.Scatter(
        name = 'Test Set',
        x = train_sizes,
        y = test_mean,
        mode = 'lines',
        line = dict(color='rgb(102,205,170)')
    ),
    go.Scatter(
        name='Test Upper Bound',
        x = train_sizes, 
        y = test_mean + test_std,
        mode = 'lines',
        line=dict(width=0),
        showlegend = False
    ),
    go.Scatter(
        name='Test Lower Bound',
        x = train_sizes,
        y = test_mean - test_std,
        line=dict(width=0),
        mode='lines',
        fillcolor='rgba(102,205,170,0.3)',
        fill='tonexty',
        showlegend = False
    )
])
fig.update_layout(
    xaxis_title = 'Training Set Size',
    yaxis_title = 'Accuracy',
    title='XGBoost Learning Curve',
    title_x = 0.5,
    hovermode = "x"
)
fig.show()



In [87]:
y_train_loss = y_train
y_train_loss = y_train_loss.replace(-1,0) #param for decomp takes 0-1 loss, classes must be 0 and 1 not -1 and 1 
y_test_loss = y_test
y_test_loss = y_test_loss.replace(-1,0)

7157     1.0
4485     1.0
10269    0.0
485      0.0
127      1.0
        ... 
10730    0.0
7739     0.0
5886     1.0
8018     1.0
10311    0.0
Name: t, Length: 7738, dtype: float64


In [108]:
loss, bias, var = bias_variance_decomp(xgb_model, X_train.values, y_train_loss.values, X_test.values, y_test_loss.values, num_rounds=20, random_seed=42)
print('Loss: %.3f' % loss)
print('Bias: %.3f' % bias)
print('Variance: %.3f' % var)

Loss: 0.035
Bias: 0.029
Variance: 0.017


In [109]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_xgb)

fig = px.area(
    x=fpr, y=tpr,
    title=f'XGBoost ROC Curve (AUC={auc(fpr, tpr):.4f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig.update_layout(title_x=0.5)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

## Ensemble Model 2: Stacking Classifier

Stacking implements a set of heterogeneous weak learners, and can be used to combine the 3 weak learners (with their best hyperparameter values). In stacking, the predictions each all the base models are fed into a final model (meta-classifier). The final prediction is then made by the meta-classifier. This differs from the voting classifier, which simply aggregates the predictions of the base models.  

In [36]:
models = [
    ('svc',SVC(kernel='rbf', C=1)),
    ('dt',DecisionTreeClassifier(max_depth=10, random_state=42)),
    ('lr',LogisticRegression(solver='liblinear', C=1, random_state=42))
]
stacking = StackingClassifier(estimators=models) #final estimator default: LR
stacking.fit(X_train, y_train)
y_pred_st = stacking.predict(X_test)

In [37]:
print("Accuracy on training set: {:.3f}".format(stacking.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(stacking.score(X_test, y_test)))

Accuracy on training set: 0.967
Accuracy on test set: 0.961


In [45]:
cf_rep_st = classification_report(y_test, y_pred_st, target_names = ['Not Phishing', 'Phishing'])
print(cf_rep_st)

              precision    recall  f1-score   support

Not Phishing       0.96      0.95      0.95      1470
    Phishing       0.96      0.97      0.96      1847

    accuracy                           0.96      3317
   macro avg       0.96      0.96      0.96      3317
weighted avg       0.96      0.96      0.96      3317



In [38]:
confusion_matrix = metrics.confusion_matrix(y_test, y_pred_st)
confusion_matrix = confusion_matrix.astype(int)

layout = {
    "title": "Confusion Matrix", 
    "title_x": 0.5,
    "xaxis": {"title": "Predicted"}, 
    "yaxis": {"title": "Real"},
    "width": 500
}

fig = go.Figure(data=go.Heatmap(z=confusion_matrix,
                                x=['Not Phishing','Phishing'],
                                y=['Not Phishing','Phishing']),
                layout=layout)
fig.show()

In [147]:
train_sizes, train_scores, test_scores = learning_curve(estimator = stacking, X = X, y = y, cv = 5, scoring = "accuracy", train_sizes = [0.2,0.4,0.6,0.8,1])
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

In [90]:
loss, bias, var = bias_variance_decomp(stacking, X_train.values, y_train_loss.values, X_test.values, y_test_loss.values, num_rounds=20, random_seed=42)
print('Loss: %.3f' % loss)
print('Bias: %.3f' % bias)
print('Variance: %.3f' % var)

MSE: 0.047
Bias: 0.039
Variance: 0.023


In [65]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_st)

fig = px.area(
    x=fpr, y=tpr,
    title=f'Stacking ROC Curve (AUC={auc(fpr, tpr):.4f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig.update_layout(title_x=0.5)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

## Ensemble Model 3: Random Forest

Random forests uses bagging on decision trees, and will be utilized because once again the decision tree was the best performing of the 3 weak learners.

In [8]:
scores = []
criteria = ['gini', 'entropy', 'log_loss']
combinations = []
maxScore = 0
bestEst = 0
bestCri = ''
for i in range(100,1050,50): #no of trees
    for c in criteria: #no of leaf nodes
        rf = RandomForestClassifier(n_estimators=i, criterion=c, n_jobs=-1, random_state=42)
        rf.fit(X_train, y_train)
        y_pred_rf = rf.predict(X_test)
        x = metrics.accuracy_score(y_test, y_pred_rf)
        scores.append(x)
        combinations.append('%d %s' % (i,c))
        if x > maxScore:
            maxScore = x
            bestEst = i
            bestCri = c
fig = px.line(x = combinations, y = scores)
fig.update_layout(xaxis_title = 'No. of Estimators & Criterion', yaxis_title = 'Accuracy')
fig.show()

In [9]:
print('The best model used %d estimators with %s criterion, producing an accuracy of %f' % (bestEst, bestCri, maxScore))
rf = RandomForestClassifier(n_estimators=i, criterion=c, n_jobs=-1, random_state=42)
rf.fit(X_train, y_train)

The best model used 450 estimators with entropy criterion, producing an accuracy of 0.970757


In [14]:
print("Accuracy on training set: {:.3f}".format(rf.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rf.score(X_test, y_test)))

Accuracy on training set: 0.990
Accuracy on test set: 0.970


In [46]:
cf_rep_rf = classification_report(y_test, y_pred_rf, target_names = ['Not Phishing', 'Phishing'])
print(cf_rep_rf)

              precision    recall  f1-score   support

Not Phishing       0.97      0.96      0.97      1470
    Phishing       0.97      0.98      0.97      1847

    accuracy                           0.97      3317
   macro avg       0.97      0.97      0.97      3317
weighted avg       0.97      0.97      0.97      3317



In [33]:
confusion_matrix = metrics.confusion_matrix(y_test, y_pred_rf)
confusion_matrix = confusion_matrix.astype(int)

layout = {
    "title": "Confusion Matrix", 
    "title_x": 0.5,
    "xaxis": {"title": "Predicted"}, 
    "yaxis": {"title": "Real"},
    "width": 500
}

fig = go.Figure(data=go.Heatmap(z=confusion_matrix,
                                x=['Not Phishing','Phishing'],
                                y=['Not Phishing','Phishing']),
                layout=layout)
fig.show()

In [48]:
train_sizes, train_scores, test_scores = learning_curve(estimator = rf, X = X, y = y, cv = 5, scoring = "accuracy", train_sizes = [0.2,0.4,0.6,0.8,1])
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

fig = go.Figure([
    go.Scatter(
        name = 'Training Set',
        x = train_sizes,
        y = train_mean,
        mode = 'lines',
        line = dict(color='rgb(123,104,238)', dash = 'dot')
    ),
    go.Scatter(
        name='Train Upper Bound',
        x = train_sizes, 
        y = train_mean + train_std,
        mode = 'lines',
        line=dict(width=0),
        showlegend = False
    ),
    go.Scatter(
        name='Train Lower Bound',
        x = train_sizes,
        y = train_mean - train_std,
        line=dict(width=0),
        mode='lines',
        fillcolor='rgba(123,104,238,0.3)',
        fill='tonexty',
        showlegend = False
    ),
    go.Scatter(
        name = 'Test Set',
        x = train_sizes,
        y = test_mean,
        mode = 'lines',
        line = dict(color='rgb(102,205,170)')
    ),
    go.Scatter(
        name='Test Upper Bound',
        x = train_sizes, 
        y = test_mean + test_std,
        mode = 'lines',
        line=dict(width=0),
        showlegend = False
    ),
    go.Scatter(
        name='Test Lower Bound',
        x = train_sizes,
        y = test_mean - test_std,
        line=dict(width=0),
        mode='lines',
        fillcolor='rgba(102,205,170,0.3)',
        fill='tonexty',
        showlegend = False
    )
])
fig.update_layout(
    xaxis_title = 'Training Set Size',
    yaxis_title = 'Accuracy',
    title='Random Forest Learning Curve',
    title_x = 0.5,
    hovermode = "x"
)
fig.show()

In [92]:
loss, bias, var = bias_variance_decomp(rf, X_train.values, y_train_loss.values, X_test.values, y_test_loss.values, num_rounds=20, random_seed=42)
print('Loss: %.3f' % loss)
print('Bias: %.3f' % bias)
print('Variance: %.3f' % var)

Loss: 0.036
Bias: 0.030
Variance: 0.016


In [64]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_rf)

fig = px.area(
    x=fpr, y=tpr,
    title=f'Random Forest ROC Curve (AUC={auc(fpr, tpr):.4f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig.update_layout(title_x=0.5)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

### Comparing the 3 Models

- Accuracy: % of correct classifications (no. of correctly classified instances/total no. of instances)
- Precision: ratio of correctly classified +ve samples (TP/TP+FP)
- Recall: measures ability to detective +ve samples (TP/TP+FN) [ratio of +ve samples correctly classified to total no. of +ve sample] 
- F1-Score: harmonic mean of precision and recall (2pr/p+r)

In [110]:
accuracies = [accuracy_score(y_test, y_pred_xgb), accuracy_score(y_test, y_pred_st), accuracy_score(y_test, y_pred_rf)]
precisions = [precision_score(y_test, y_pred_xgb), precision_score(y_test, y_pred_st), precision_score(y_test, y_pred_rf,)] 
recalls = [recall_score(y_test, y_pred_xgb), recall_score(y_test, y_pred_st), recall_score(y_test, y_pred_rf)]
f1scores = [f1_score(y_test, y_pred_xgb), f1_score(y_test, y_pred_st), f1_score(y_test, y_pred_rf)]
pMetrics = {'Model': ['XGBoost', 'Stacking', 'RF'], 'Accuracy': accuracies, 'Precision': precisions, 'Recall': recalls, 'F1-Score': f1scores}
PMdf = pd.DataFrame(pMetrics)
fig = px.bar(PMdf, x = 'Model', y = ['Accuracy', 'Precision', 'Recall', 'F1-Score'], barmode = 'group', text_auto = True, title = 'Comparison of the Ensemble Models', color_discrete_sequence=px.colors.qualitative.T10)
fig.update_layout(title_x=0.5)
fig.show()

## Hyper-Tuning Best Ensemble Model

Since the XGBoost was found to be the highest perfoming of all 3 ensemble models, it will be further refined by adjusting the hyperparameters. 

In [None]:
paramters = {
    'n_estimators':[50, 100, 150],
    'max_depth':[4, 6, 8, 10],
    'learning_rate':[0.01, 0.1, 0.3, 0.4],
    'gamma':[0, 2, 4, 6], #min loss reduction to make split
    'objective':['binary:logistic']
}
xgb_tuned = GridSearchCV(estimator=xgb.XGBClassifier(),param_grid=paramters,scoring='accuracy') #default cv: 5-fold
xgb_tuned.fit(X_train,y_train)

In [165]:
xgb_tuned.best_params_

{'gamma': 0,
 'learning_rate': 0.4,
 'max_depth': 6,
 'n_estimators': 150,
 'objective': 'binary:logistic'}

In [166]:
y_pred_xgbt = xgb_tuned.predict(X_test)
print("Accuracy on training set: {:.3f}".format(xgb_tuned.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(xgb_tuned.score(X_test, y_test)))
cf_rep_xgbt = classification_report(y_test, y_pred_xgbt, target_names = ['Not Phishing', 'Phishing'])
print(cf_rep_xgbt)

Accuracy on training set: 0.989
Accuracy on test set: 0.971
              precision    recall  f1-score   support

Not Phishing       0.98      0.96      0.97      1470
    Phishing       0.97      0.98      0.97      1847

    accuracy                           0.97      3317
   macro avg       0.97      0.97      0.97      3317
weighted avg       0.97      0.97      0.97      3317



In [167]:
confusion_matrix = metrics.confusion_matrix(y_test, y_pred_xgbt)
confusion_matrix = confusion_matrix.astype(int)

layout = {
    "title": "Confusion Matrix", 
    "title_x": 0.5,
    "xaxis": {"title": "Predicted"}, 
    "yaxis": {"title": "Real"},
    "width": 500
}

fig = go.Figure(data=go.Heatmap(z=confusion_matrix,
                                x=['Not Phishing','Phishing'],
                                y=['Not Phishing','Phishing']),
                layout=layout)
fig.show()

Since the grid search produced a worse performing model, hyper-tuning will be reattempted using randomized search.

In [None]:
paramters = {
    'n_estimators':[20, 40, 60, 80, 100, 120],
    'max_depth':[2, 4, 6, 8, 10],
    'learning_rate':[0.01, 0.05, 0.1, 0.2, 0.3, 0.4],
    'gamma':[0, 0.5, 1, 2, 5, 10], #min loss reduction to make split
    'objective':['binary:logistic']
}
xgb_tuned2 = RandomizedSearchCV(estimator=xgb.XGBClassifier(objective='binary:logistic'),param_distributions=paramters,scoring='accuracy') #default cv: 5-fold
xgb_tuned2.fit(X_train,y_train)

In [169]:
xgb_tuned2.best_params_

{'objective': 'binary:logistic',
 'n_estimators': 120,
 'max_depth': 6,
 'learning_rate': 0.4,
 'gamma': 0}

In [170]:
y_pred_xgbt2 = xgb_tuned2.predict(X_test)
print("Accuracy on training set: {:.3f}".format(xgb_tuned2.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(xgb_tuned2.score(X_test, y_test)))
cf_rep_xgbt2 = classification_report(y_test, y_pred_xgbt2, target_names = ['Not Phishing', 'Phishing'])
print(cf_rep_xgbt2)

Accuracy on training set: 0.989
Accuracy on test set: 0.971
              precision    recall  f1-score   support

Not Phishing       0.98      0.96      0.97      1470
    Phishing       0.97      0.98      0.97      1847

    accuracy                           0.97      3317
   macro avg       0.97      0.97      0.97      3317
weighted avg       0.97      0.97      0.97      3317



In [146]:
confusion_matrix = metrics.confusion_matrix(y_test, y_pred_xgbt2)
confusion_matrix = confusion_matrix.astype(int)

layout = {
    "title": "Confusion Matrix", 
    "title_x": 0.5,
    "xaxis": {"title": "Predicted"}, 
    "yaxis": {"title": "Real"},
    "width": 500
}

fig = go.Figure(data=go.Heatmap(z=confusion_matrix,
                                x=['Not Phishing','Phishing'],
                                y=['Not Phishing','Phishing']),
                layout=layout)
fig.show()

In [171]:
accuracies = [accuracy_score(y_test, y_pred_xgb), accuracy_score(y_test, y_pred_xgbt), accuracy_score(y_test, y_pred_xgbt2)]
precisions = [precision_score(y_test, y_pred_xgb), precision_score(y_test, y_pred_xgbt), precision_score(y_test, y_pred_xgbt2)] 
recalls = [recall_score(y_test, y_pred_xgb), recall_score(y_test, y_pred_xgbt), recall_score(y_test, y_pred_xgbt2)]
f1scores = [f1_score(y_test, y_pred_xgb), f1_score(y_test, y_pred_xgbt),f1_score(y_test, y_pred_xgbt2)]
pMetrics = {'Model': ['XGBoost', 'XGB Grid Search', 'XGB Randomized Search'], 'Accuracy': accuracies, 'Precision': precisions, 'Recall': recalls, 'F1-Score': f1scores}
PMdf = pd.DataFrame(pMetrics)
fig = px.bar(PMdf, x = 'Model', y = ['Accuracy', 'Precision', 'Recall', 'F1-Score'], barmode = 'group', text_auto = True, title = 'XGB Before and After Hyper-Tuning', color_discrete_sequence=px.colors.qualitative.T10)
fig.update_layout(title_x=0.5)

Thus, the final, hyper-tuned ensemble model is xgb_tuned2; an XGBooster with 120 estimators, max depth of 6, 0.4 learning rate and gamma of 0 (as well as obviously objective = 'binary:logistic')

In [181]:
finalEnsemble = pickle.dump(xgb_tuned2, open('finalEnsemble.sav','wb'))

## Comparing Hyper-Tuned Ensemble Model With Best Weak Learner

In [183]:
#best weak learner: decision tree with max depth 10
tree = DecisionTreeClassifier(random_state=42, max_depth=10)
tree.fit(X_train, y_train)
y_pred_dt = tree.predict(X_test)

In [185]:
accuracies = [accuracy_score(y_test, y_pred_xgbt2), accuracy_score(y_test, y_pred_dt)]
precisions = [precision_score(y_test, y_pred_xgbt2), precision_score(y_test, y_pred_dt)] 
recalls = [recall_score(y_test, y_pred_xgbt2), recall_score(y_test, y_pred_dt)]
f1scores = [f1_score(y_test, y_pred_xgbt2), f1_score(y_test, y_pred_dt)]
pMetrics = {'Model': ['Ensemble', 'DTree'], 'Accuracy': accuracies, 'Precision': precisions, 'Recall': recalls, 'F1-Score': f1scores}
PMdf = pd.DataFrame(pMetrics)
fig = px.bar(PMdf, x = 'Model', y = ['Accuracy', 'Precision', 'Recall', 'F1-Score'], barmode = 'group', text_auto = True, title = 'Ensemble Model vs Weak Learner', color_discrete_sequence=px.colors.qualitative.T10)
fig.update_layout(title_x=0.5)