[Github](https://github.com/leon-pscl/CPE312_Predictive_Analytics_Using_Machine_Learning/tree/621558b0e773c4ceca8564f39841ed8b4177a9d3/Midterm/HOA5.3)

# Activity 5.3: Bagging, Boosting, and Stacking

Intended Learning Outcomes (ILOs):
* Demonstrate the use of bagging technique for classification and regression tasks
* Demonstrate boosting and stacking models in solving an identified problem.

Resources:

* Jupyter Notebook<br>
* emails.csv

## Procedure:

For this activity, you need to perform the following tasks:

1. Check the following resources for a review of Bagging and Boosting and Stacking.
   * [Bagging_Exercises.ipynb](https://drive.google.com/file/d/1O-xLD-n1lgqMoXL79FHZcO_ePJd3RgDA/view?usp=sharing)
   * [Boosting_and_Stacking_Exercises_ANSWERS.ipynb](https://drive.google.com/file/d/1jswsZAkeoWJV8TBM3hdB16tbB-KKlh75/view?usp=sharing)

2. Using your own dataset, perform bagging.

### Data Wrangling setup

In [35]:
#libraries needed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
from sklearn.model_selection import train_test_split
#KNN
from sklearn.neighbors import KNeighborsClassifier
#SVM
from sklearn.svm import SVC
#Logistic Regression
from sklearn.linear_model import LogisticRegression
#Decision Tree
from sklearn.tree import DecisionTreeClassifier
#Random Forest and Bagging
from sklearn.ensemble import (RandomForestClassifier, BaggingClassifier)
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    roc_auc_score
)
#gridsearch
from sklearn.model_selection import GridSearchCV

In [37]:
#import data
email = pd.read_csv('emails.csv')
email.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [39]:
#read dimensions and dtypes
print("Number of datapoints (rows):", email.shape[0])
print("Number of columns:", email.shape[1])
print("\nData types:\n", email.dtypes)

Number of datapoints (rows): 5172
Number of columns: 3002

Data types:
 Email No.     object
the            int64
to             int64
ect            int64
and            int64
               ...  
military       int64
allowing       int64
ff             int64
dry            int64
Prediction     int64
Length: 3002, dtype: object


In [41]:
#Data cleaning
#we don't need the email identifier, so remove it
no_label_email = email.drop(['Email No.'], axis=1)
no_label_email.head()

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,0,0,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,13,24,6,6,2,102,1,27,18,...,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,8,0,0,4,...,0,0,0,0,0,0,0,0,0,0
3,0,5,22,0,5,1,51,2,10,1,...,0,0,0,0,0,0,0,0,0,0
4,7,6,17,1,5,2,57,0,9,3,...,0,0,0,0,0,0,0,1,0,0


In [43]:
#check for missing values
print("\nMissing values per column:\n", no_label_email.isnull().sum())


Missing values per column:
 the           0
to            0
ect           0
and           0
for           0
             ..
military      0
allowing      0
ff            0
dry           0
Prediction    0
Length: 3001, dtype: int64


In [45]:
#Implement point-biserial correlation to see which columns may have the highest correlation to the prediction
from scipy.stats import pointbiserialr
import pandas as pd

target = 'Prediction'
results = []

#Loop through predictor columns
for col in no_label_email.drop(columns=[target]).select_dtypes(include=np.number).columns:
    corr, p_val = pointbiserialr(no_label_email[target], no_label_email[col])
    results.append({"Feature": col, "Correlation": corr, "AbsCorrelation": abs(corr), "p-value": p_val})

#Put into DataFrame and sort
results_df = pd.DataFrame(results)
top_20 = results_df.sort_values(by="AbsCorrelation", ascending=False).head(20)

print(top_20[['Feature', 'Correlation', 'p-value']])

        Feature  Correlation       p-value
160      thanks    -0.271433  4.926148e-88
37          hpl    -0.266518  7.952302e-85
418       hanks    -0.266070  1.547126e-84
785       thank    -0.262384  3.521933e-82
99         more     0.258152  1.607936e-79
81     attached    -0.236558  1.048551e-66
68        daren    -0.236180  1.711801e-66
52    forwarded    -0.230765  1.761133e-63
42          our     0.228187  4.495713e-62
317     subject    -0.227754  7.714997e-62
2311         hp    -0.225846  8.229589e-61
363        able     0.222219  6.968407e-59
290        best     0.221703  1.301847e-58
2833         ur     0.220253  7.483086e-58
1092        sex     0.220092  9.079039e-58
1361        sec     0.217402  2.241652e-56
242       money     0.217215  2.799293e-56
647        soft     0.213382  2.498362e-54
1615         dr     0.212413  7.671000e-54
2041         mo     0.210056  1.146765e-52


In [47]:
#prep for modelling
X = no_label_email.drop('Prediction', axis=1)
y = no_label_email['Prediction']

#split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

In [49]:
#base learners
knn_model = KNeighborsClassifier(n_neighbors=11)

logreg_model = LogisticRegression(max_iter=1000, random_state=42)

### Bagging with KNN

In [23]:
#base performance - KNN
for _ in tqdm(range(1), desc="Training & Evaluating Base KNN"):
    knn_model.fit(X_train, y_train)
    knn_pred = knn_model.predict(X_test)
    knn_proba = knn_model.predict_proba(X_test)[:, 1]
    print("Base KNN Accuracy:", accuracy_score(y_test, knn_pred))
    print("Base KNN ROC-AUC:", roc_auc_score(y_test, knn_proba))

Training & Evaluating Base KNN: 100%|████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.10it/s]

Base KNN Accuracy: 0.8698453608247423
Base KNN ROC-AUC: 0.9333887625592274





In [38]:
#Bagging with KNN as base
bagging_knn = BaggingClassifier(
    estimator=KNeighborsClassifier(n_neighbors=11),
    n_estimators=50,
    max_samples=0.7,
    bootstrap=True,
    random_state=42
)
for _ in tqdm(range(1), desc="Bagging with KNN as base"):
    bagging_knn.fit(X_train, y_train)
    bag_knn_pred = bagging_knn.predict(X_test)
    bag_knn_proba = bagging_knn.predict_proba(X_test)[:, 1]
    
    print("Bagging + KNN Accuracy:", accuracy_score(y_test, bag_knn_pred))
    print("Bagging + KNN ROC-AUC:", roc_auc_score(y_test, bag_knn_proba))

Bagging with KNN as base: 100%|██████████████████████████████████████████████████████████| 1/1 [00:29<00:00, 29.42s/it]

Bagging + KNN Accuracy: 0.8646907216494846
Bagging + KNN ROC-AUC: 0.9338766065292956





### Bagging with Logistic Regression

In [37]:
#base performance - Logistic Regression
for _ in tqdm(range(1), desc="Training & Evaluating Base Logistic Regression"):
    logreg_model.fit(X_train, y_train)
    logreg_pred = logreg_model.predict(X_test)
    logreg_proba = logreg_model.predict_proba(X_test)[:, 1]
    print("Base Logistic Regression Accuracy:", accuracy_score(y_test, logreg_pred))
    print("Base Logistic Regression ROC-AUC:", roc_auc_score(y_test, logreg_proba))

Training & Evaluating Base Logistic Regression: 100%|████████████████████████████████████| 1/1 [00:05<00:00,  5.23s/it]

Base Logistic Regression Accuracy: 0.9710051546391752
Base Logistic Regression ROC-AUC: 0.9910986005790017





In [39]:
#Bagging with Logistic Regression as base
bagging_logreg = BaggingClassifier(
    estimator=LogisticRegression(max_iter=1000, random_state=42),
    n_estimators=50,
    max_samples=0.8,
    bootstrap=True,
    random_state=42
)
for _ in tqdm(range(1), desc="Bagging with Logistic Regression as base"):
    bagging_logreg.fit(X_train, y_train)
    bag_logreg_pred = bagging_logreg.predict(X_test)
    bag_logreg_proba = bagging_logreg.predict_proba(X_test)[:, 1]
    
    print("Bagging + Logistic Regression Accuracy:", accuracy_score(y_test, bag_logreg_pred))
    print("Bagging + Logistic Regression ROC-AUC:", roc_auc_score(y_test, bag_logreg_proba))

Bagging with Logistic Regression as base: 100%|█████████████████████████████████████████| 1/1 [01:43<00:00, 103.26s/it]

Bagging + Logistic Regression Accuracy: 0.9697164948453608
Bagging + Logistic Regression ROC-AUC: 0.9927274184338907





### Bagging with Decision Tree Classifier

In [19]:
#base: DecisionTreeClassifier w/ Gridsearch
dt_param_grid = {
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 5]
}

dt_grid = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    dt_param_grid,
    cv=3,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1
)

for _ in tqdm(range(1), desc="GridSearch for Base Decision Tree"):
    dt_grid.fit(X_train, y_train)

best_dt = dt_grid.best_estimator_
dt_pred = best_dt.predict(X_test)
dt_proba = best_dt.predict_proba(X_test)[:, 1]

print("Best Params (Base Decision Tree):", dt_grid.best_params_)
print("Base Decision Tree Accuracy:", accuracy_score(y_test, dt_pred))
print("Base Decision Tree ROC-AUC:", roc_auc_score(y_test, dt_proba))

GridSearch for Base Decision Tree:   0%|                                                         | 0/1 [00:00<?, ?it/s]

Fitting 3 folds for each of 36 candidates, totalling 108 fits


GridSearch for Base Decision Tree: 100%|█████████████████████████████████████████████████| 1/1 [00:11<00:00, 11.07s/it]

Best Params (Base Decision Tree): {'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 2}
Base Decision Tree Accuracy: 0.9052835051546392
Base Decision Tree ROC-AUC: 0.9461237941639036





In [None]:
# Bagging with Decision Tree
bagging_param_grid = {
    "n_estimators": [100, 300, 600],#number of trees
    "max_samples": [0.7],#fraction of training data
    "estimator__max_depth": [None, 5, 10],#tune tree depth
    "estimator__min_samples_split": [2],#tune split rules
}

bagging_dt = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=42),
    bootstrap=True,
    random_state=42
)

bagging_grid = GridSearchCV(
    bagging_dt,
    bagging_param_grid,
    cv=3,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1
)

for _ in tqdm(range(1), desc="GridSearch for Bagging with Decision Tree"):
    bagging_grid.fit(X_train, y_train)

best_bagging_dt = bagging_grid.best_estimator_
bag_dt_pred = best_bagging_dt.predict(X_test)
bag_dt_proba = best_bagging_dt.predict_proba(X_test)[:, 1]

print("Best Params (Bagging + Decision Tree):", bagging_grid.best_params_)
print("Bagging + Decision Tree Accuracy:", accuracy_score(y_test, bag_dt_pred))
print("Bagging + Decision Tree ROC-AUC:", roc_auc_score(y_test, bag_dt_proba))

### **Analysis**

As seen from the accuracy scores above, bagging yields slightly worse results when using KNN and Logistic Regression as base models. This may be because bagging just reduces variance to improve stability and consequently, improve the results. That's helpful for models that are unstable, such as decision trees, but is actually not helpful for those that are already stable and have low variance, such as KNN and Logistic Regression. As a result, bagging multiple models of these on bootstrapped samples doesn't add much, and may even hurt performance slightly. Bagging also doesn't reduce bias, only variance, so it's not helpful in Logister Regression either.

Furthermore, bootstrap sampling reduces the effective traning size. KNN and Logistic Regression benefit more from utilizing the full dataset, so that hurts their performance too.

As seen from the results of using bagging on decision tree classifiers, it greatly improves performance in terms of accuracy and ROC-AUC, meaning that it can differentiate and classify properly at an accurate level.

3. Using your own dataset, perform boosting (AdaBoost, XGBoost, and etc) and stacking.

### Boosting

#### Base

In [None]:
#base: DecisionTreeClassifier w/ Gridsearch
dt_param_grid = {
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 5]
}

dt_grid = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    dt_param_grid,
    cv=3,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1
)

for _ in tqdm(range(1), desc="GridSearch for Base Decision Tree"):
    dt_grid.fit(X_train, y_train)

best_dt = dt_grid.best_estimator_
dt_pred = best_dt.predict(X_test)
dt_proba = best_dt.predict_proba(X_test)[:, 1]

print("Best Params (Base Decision Tree):", dt_grid.best_params_)
print("Base Decision Tree Accuracy:", accuracy_score(y_test, dt_pred))
print("Base Decision Tree ROC-AUC:", roc_auc_score(y_test, dt_proba))

#### AdaBoost

In [None]:
from tqdm import tqdm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score

ada_param_grid = {
    "n_estimators": [50, 100, 300],
    "learning_rate": [0.01, 0.1, 1.0],
    "estimator__max_depth": [1, 2]#depth of decision stump/tree
}

adaboost = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(random_state=42),
    random_state=42
)

grid_search_ada = GridSearchCV(
    adaboost,
    ada_param_grid,
    cv=3,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1
)

for _ in tqdm(range(1), desc="GridSearch for AdaBoost"):
    grid_search_ada.fit(X_train, y_train)

best_adaboost = grid_search_ada.best_estimator_
ada_pred = best_adaboost.predict(X_test)
ada_proba = best_adaboost.predict_proba(X_test)[:, 1]

print("Best Params (AdaBoost):", grid_search_ada.best_params_)
print("AdaBoost Accuracy:", accuracy_score(y_test, ada_pred))
print("AdaBoost ROC-AUC:", roc_auc_score(y_test, ada_proba))


#### XGBoost

In [None]:
from tqdm import tqdm
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score

xgb_param_grid = {
    "n_estimators": [100, 300, 500],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [3, 5, 7],
    "subsample": [0.7, 0.8, 1.0],
    "colsample_bytree": [0.7, 0.8, 1.0]
}

xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42
)

grid_search_xgb = GridSearchCV(
    xgb,
    xgb_param_grid,
    cv=3,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1
)

for _ in tqdm(range(1), desc="GridSearch for XGBoost"):
    grid_search_xgb.fit(X_train, y_train)

best_xgb = grid_search_xgb.best_estimator_
xgb_pred = best_xgb.predict(X_test)
xgb_proba = best_xgb.predict_proba(X_test)[:, 1]

print("Best Params (XGBoost):", grid_search_xgb.best_params_)
print("XGBoost Accuracy:", accuracy_score(y_test, xgb_pred))
print("XGBoost ROC-AUC:", roc_auc_score(y_test, xgb_proba))

#### CatBoost

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score
from tqdm import tqdm
cat_param_grid = {
    "iterations": [200, 300, 500],
    "learning_rate": [0.01, 0.05, 0.1],
    "depth": [4, 6, 8],
    "l2_leaf_reg": [1, 3, 5]
}

catboost = CatBoostClassifier(
    verbose=0,
    random_state=42
)

grid_search_cat = GridSearchCV(
    catboost,
    cat_param_grid,
    cv=3,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1
)

for _ in tqdm(range(1), desc="GridSearch for CatBoost"):
    grid_search_cat.fit(X_train, y_train)

best_cat = grid_search_cat.best_estimator_
cat_pred = best_cat.predict(X_test)
cat_proba = best_cat.predict_proba(X_test)[:, 1]

print("Best Params (CatBoost):", grid_search_cat.best_params_)
print("CatBoost Accuracy:", accuracy_score(y_test, cat_pred))
print("CatBoost ROC-AUC:", roc_auc_score(y_test, cat_proba))


### Stacking

I used logistic regression, KNN, and SVM, then stacked RandomForest on top of it as the final estimator. Then, I ran GridSearch on the final estimator (which is Random Forest).

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from tqdm import tqdm

stacking_clf = StackingClassifier(
    estimators=[
        ('lr', LogisticRegression(max_iter=1000, random_state=42)),
        ('knn', KNeighborsClassifier(n_neighbors=11)),
        ('svc', SVC(probability=True, random_state=42))
    ],
    final_estimator=RandomForestClassifier(random_state=43),
    cv=5,
    n_jobs=-1
)

#GridSearch for final estimator
param_grid = {
    "final_estimator__n_estimators": [50, 100, 200, 300],
    "final_estimator__max_depth": [None, 5, 10, 20],
    "final_estimator__min_samples_split": [2, 5, 10]
}

for _ in tqdm(range(1), desc="Tuning Meta-Learner with GridSearch"):
    grid_search = GridSearchCV(
        estimator=stacking_clf,
        param_grid=param_grid,
        cv=3,
        scoring="roc_auc",
        n_jobs=-1,
        verbose=2
    )

    grid_search.fit(X_train, y_train)

    print("Best Params for Meta-Learner (RF):", grid_search.best_params_)
    print("Best ROC-AUC Score (CV):", grid_search.best_score_)

    best_model = grid_search.best_estimator_
    test_pred = best_model.predict(X_test)
    test_proba = best_model.predict_proba(X_test)[:, 1]

    print("Stacking Test Accuracy:", accuracy_score(y_test, test_pred))
    print("Stacking Test ROC-AUC:", roc_auc_score(y_test, test_proba))


4. For stacking, identify the different models you used and their performances vs their performance when stacked.

#### Logistic Regression Only vs when stacked

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from tqdm import tqdm

logreg_model = LogisticRegression(max_iter=1000, random_state=42)

for _ in tqdm(range(1), desc="Training & Evaluating Base Logistic Regression"):
    logreg_model.fit(X_train, y_train)
    logreg_pred = logreg_model.predict(X_test)
    logreg_proba = logreg_model.predict_proba(X_test)[:, 1]

    base_acc = accuracy_score(y_test, logreg_pred)
    base_auc = roc_auc_score(y_test, logreg_proba)

    print("Base Logistic Regression Accuracy:", base_acc)
    print("Base Logistic Regression ROC-AUC:", base_auc)

#Compare with stacking model
stack_pred = best_model.predict(X_test)#best_model is from GridSearch
stack_proba = best_model.predict_proba(X_test)[:, 1]

stack_acc = accuracy_score(y_test, stack_pred)
stack_auc = roc_auc_score(y_test, stack_proba)

print("Comparison")
print(f"Logistic Regression Accuracy: {base_acc:.4f} | Stacking Accuracy: {stack_acc:.4f}")
print(f"Logistic Regression ROC-AUC:  {base_auc:.4f} | Stacking ROC-AUC:  {stack_auc:.4f}")


#### KNN Only vs when stacked

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors=11)

for _ in tqdm(range(1), desc="Training & Evaluating Base KNN"):
    knn_model.fit(X_train, y_train)
    knn_pred = knn_model.predict(X_test)
    knn_proba = knn_model.predict_proba(X_test)[:, 1]

    base_acc = accuracy_score(y_test, knn_pred)
    base_auc = roc_auc_score(y_test, knn_proba)

    print("Base KNN Accuracy:", base_acc)
    print("Base KNN ROC-AUC:", base_auc)

#Compare with Stacking Model
stack_pred = best_model.predict(X_test)#best_model is from GridSearch
stack_proba = best_model.predict_proba(X_test)[:, 1]

stack_acc = accuracy_score(y_test, stack_pred)
stack_auc = roc_auc_score(y_test, stack_proba)

print("\n--- Comparison ---")
print(f"KNN Accuracy: {base_acc:.4f} | Stacking Accuracy: {stack_acc:.4f}")
print(f"KNN ROC-AUC:  {base_auc:.4f} | Stacking ROC-AUC:  {stack_auc:.4f}")


#### SVM Only vs when stacked

In [None]:
from sklearn.svm import SVC

svm_model = SVC(probability=True, random_state=42)

for _ in tqdm(range(1), desc="Training & Evaluating Base SVM"):
    svm_model.fit(X_train, y_train)
    svm_pred = svm_model.predict(X_test)
    svm_proba = svm_model.predict_proba(X_test)[:, 1]

    base_acc = accuracy_score(y_test, svm_pred)
    base_auc = roc_auc_score(y_test, svm_proba)

    print("Base SVM Accuracy:", base_acc)
    print("Base SVM ROC-AUC:", base_auc)


#Compare with stacking Model
stack_pred = best_model.predict(X_test)#best_model is from GridSearch
stack_proba = best_model.predict_proba(X_test)[:, 1]

stack_acc = accuracy_score(y_test, stack_pred)
stack_auc = roc_auc_score(y_test, stack_proba)

print("\n--- Comparison ---")
print(f"SVM Accuracy: {base_acc:.4f} | Stacking Accuracy: {stack_acc:.4f}")
print(f"SVM ROC-AUC:  {base_auc:.4f} | Stacking ROC-AUC:  {stack_auc:.4f}")

5. Evaluate the different ensemble learning methods used in this activity.

### Bagging

#### Classification Report

#### Confusion Matrix

#### Visualizations

### Boosting

#### Classification Report

#### Confusion Matrix

#### Visualizations

### Stacking

#### Classification Report

#### Confusion Matrix

#### Visualizations

## Conclusion