# PROJECT 4.1 - ENSEMBLE LEARNING TECHNIQUE
## `Ensemble Model Processing:`
`DecisionTree, AdaBoost, XGBoost, GradienBoost & Adaboost test`

In [1]:
# FOR PROJECT ROOT
# Setup cell
import sys
import os
import warnings

import pandas as pd
warnings.filterwarnings('ignore')

# Add project root to path
if '..' not in sys.path:
    sys.path.append('../')

# Verify path
print("Current working directory:", os.getcwd())
print("Python path includes:", [p for p in sys.path if 'Project' in p])

Current working directory: /media/anhvt/DATA/10_AIO_VN/AIOVN_Main/Project 4.1_Ensemble Learning/notebooks
Python path includes: ['/media/anhvt/DATA/10_AIO_VN/AIOVN_Main/Project 4.1_Ensemble Learning', '/media/anhvt/DATA/10_AIO_VN/AIOVN_Main/Project 4.1_Ensemble Learning/notebooks']


## Load Library

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor, LGBMClassifier
import optuna

## 1. Import Data

Use `joblib` to export embedding vectorizer

In [4]:
from utils.data_storage import load_all_vectorizers
# tfidf_data, bow_data, embeddings_data, targets = load_all_vectorizers()
Xtrain_tfidf, Xtest_tfidf, Xtrain_bow, Xtest_bow, Xtrain_em, Xtest_em, y_train, y_test = load_all_vectorizers()

Succesfully loaded all vectorizers and vector representations


In [5]:
print(Xtrain_tfidf.shape)
print(Xtest_tfidf.shape)

(1600, 18450)
(400, 18450)


## 2. Test Model

In [8]:
def decision_tree_model(X_train, X_test, y_train, y_test):
    """
    This model to verify the accuracy of the decision tree in classification model
    :return: prediction, accuracy of the decision tree
    """
    param_grid = {
        "max_depth": 10,
        "min_samples_split": 20,
        "min_samples_leaf": 10,
        "criterion": "gini",
        "max_features": "sqrt",
        "random_state": 42
    }
    dtc = DecisionTreeClassifier(**param_grid)
    dtc.fit(X_train, y_train)

    y_pred = dtc.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return y_pred, score, report

In [9]:
print("Decision Tree with Empirical Parameters...")
_, dt_tf_accuracy, dt_tf_report = decision_tree_model(Xtrain_tfidf, Xtest_tfidf, y_train, y_test)
_, dt_bow_accuracy, dt_bow_report = decision_tree_model(Xtrain_bow, Xtest_bow, y_train, y_test)
_, dt_em_accuracy, dt_em_report = decision_tree_model(Xtrain_em, Xtest_em, y_train, y_test)

print(f" TfIdf vectorizer with DT accuracy: {dt_tf_accuracy}")
print(f" BoW vectorizer with DT accuracy: {dt_bow_accuracy}")
print(f" Emb. vectorizer with DT accuracy: {dt_em_accuracy}")

print(f" TfIdf vectorizer with DT report: \n{dt_tf_report}")
print(f" BoW vectorizer with DT report: \n{dt_bow_report}")
print(f" Emb. vectorizer with DT report: \n{dt_em_report}")

Decision Tree with Empirical Parameters...
 TfIdf vectorizer with DT accuracy: 0.4675
 BoW vectorizer with DT accuracy: 0.46
 Emb. vectorizer with DT accuracy: 0.5475
 TfIdf vectorizer with DT report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.35      0.31      0.33        59
           2       0.49      0.24      0.32        75
           3       0.69      0.50      0.58       113
           4       0.41      0.86      0.56       109
           5       0.00      0.00      0.00        35

    accuracy                           0.47       400
   macro avg       0.32      0.32      0.30       400
weighted avg       0.45      0.47      0.42       400

 BoW vectorizer with DT report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.33      0.37      0.35        59
           2       0.49      0.23      0.31        75
     

In [14]:
def random_forest_model(X_train, X_test, y_train, y_test):
    """
    Random Forest model shall learn the data from Vectorizers (TfIdf, BoW, EmbeddingVectorizer) then perform the trial on test set. After learning, from test set, the model will raise the error value then compare the performance of Random Forest to each method
    :return accuracy report & classification report for model with each method
    """
    para_grid = {
        "n_estimators": 100,
        "max_depth": 15,
        "min_samples_split": 10,
        "min_samples_leaf":5,
        "max_features": "sqrt",
        "oob_score": True
    }
    # first train
    rfc = RandomForestClassifier(**para_grid)
    rfc.fit(X_train, y_train)
    y_pred = rfc.predict(X_test)

    # Calculate metrics then export report:
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return y_pred, accuracy, report

In [15]:
print("Random Forest with Empirical Parameters...")
_, rf_tf_accuracy, rf_tf_report = random_forest_model(Xtrain_tfidf, Xtest_tfidf, y_train, y_test)
_, rf_bow_accuracy, rf_bow_report = random_forest_model(Xtrain_bow, Xtest_bow, y_train, y_test)
_, rf_em_accuracy, rf_em_report = random_forest_model(Xtrain_em, Xtest_em, y_train, y_test)

print(f" TfIdf vectorizer with RF accuracy: {rf_tf_accuracy}")
print(f" BoW vectorizer with RF accuracy: {rf_bow_accuracy}")
print(f" Emb. vectorizer with RF accuracy: {rf_em_accuracy}")

print(f" TfIdf vectorizer with RF report: \n{rf_tf_report}")
print(f" BoW vectorizer with RF report: \n{rf_bow_report}")
print(f" Emb. vectorizer with RF report: \n{rf_em_report}")

Random Forest with Empirical Parameters...
 TfIdf vectorizer with RF accuracy: 0.7275
 BoW vectorizer with RF accuracy: 0.7325
 Emb. vectorizer with RF accuracy: 0.8175
 TfIdf vectorizer with RF report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.98      0.85      0.91        59
           2       0.76      0.59      0.66        75
           3       0.84      0.82      0.83       113
           4       0.58      0.95      0.72       109
           5       0.00      0.00      0.00        35

    accuracy                           0.73       400
   macro avg       0.53      0.54      0.52       400
weighted avg       0.68      0.73      0.69       400

 BoW vectorizer with RF report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       1.00      0.83      0.91        59
           2       0.77      0.57      0.66        75
   

In [12]:
def ada_boost_model(X_train, X_test, y_train, y_test):
    """
    This model to verify the accuracy of the ada_boost in classification model
    :return: prediction, accuracy of the adaboost in classification model
    """
    param_grid = {
        "n_estimators": 100,
        "learning_rate": 0.1,
        "algorithm": "SAMME",
        "random_state": 42
    }
    adb = AdaBoostClassifier(**param_grid)
    adb.fit(X_train, y_train)

    # fit the parameters then report
    y_pred = adb.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return y_pred, score, report

In [13]:
print("AdaBoost with Empirical Parameters...")
_, ada_tf_accuracy, ada_tf_report = ada_boost_model(Xtrain_tfidf, Xtest_tfidf, y_train, y_test)
_, ada_bow_accuracy, ada_bow_report = ada_boost_model(Xtrain_bow, Xtest_bow, y_train, y_test)
_, ada_em_accuracy, ada_em_report = ada_boost_model(Xtrain_em, Xtest_em, y_train, y_test)

print(f" TfIdf vectorizer with AdaBoost accuracy: {ada_tf_accuracy}")
print(f" BoW vectorizer with AdaBoost accuracy: {ada_bow_accuracy}")
print(f" Emb. vectorizer with AdaBoost accuracy: {ada_em_accuracy}")

print(f" TfIdf vectorizer with AdaBoost report: \n{ada_tf_report}")
print(f" BoW vectorizer with AdaBoost report: \n{ada_bow_report}")
print(f" Emb. vectorizer with AdaBoost report: \n{ada_em_report}")

AdaBoost with Empirical Parameters...
 TfIdf vectorizer with AdaBoost accuracy: 0.5
 BoW vectorizer with AdaBoost accuracy: 0.5
 Emb. vectorizer with AdaBoost accuracy: 0.7175
 TfIdf vectorizer with AdaBoost report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       1.00      0.37      0.54        59
           2       0.00      0.00      0.00        75
           3       0.86      0.63      0.72       113
           4       0.36      0.98      0.53       109
           5       0.00      0.00      0.00        35

    accuracy                           0.50       400
   macro avg       0.37      0.33      0.30       400
weighted avg       0.49      0.50      0.43       400

 BoW vectorizer with AdaBoost report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       1.00      0.39      0.56        59
           2       0.00      0.00     

In [16]:
def gradient_boost_model(X_train, X_test, y_train, y_test):
    """
    This model to verify the accuracy of the gradient boost in classification model
    :return: prediction, accuracy of the gradient boost in classification model
    """
    param_grid = {
        "max_depth": 5,
        "learning_rate": 0.1,
        "subsample": 0.8,
        "min_samples_split": 20, # 20-100
        "min_samples_leaf": 10, # 10-20
        "validation_fraction": 0.1 # 0.1-0.2
        }
    gb = GradientBoostingClassifier(**param_grid)
    gb.fit(X_train, y_train)

    # fit the parameters then report
    y_pred = gb.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return y_pred, score, report

In [17]:
print("Gradient Boost with Empirical Parameters...")
_, gb_tf_accuracy, gb_tf_report = gradient_boost_model(Xtrain_tfidf, Xtest_tfidf, y_train, y_test)
_, gb_bow_accuracy, gb_bow_report = gradient_boost_model(Xtrain_bow, Xtest_bow, y_train, y_test)
_, gb_em_accuracy, gb_em_report = gradient_boost_model(Xtrain_em, Xtest_em, y_train, y_test)

print(f" TfIdf vectorizer with GradientBoost accuracy: {gb_tf_accuracy}")
print(f" BoW vectorizer with GradientBoost accuracy: {gb_bow_accuracy}")
print(f" Emb. vectorizer with GradientBoost accuracy: {gb_em_accuracy}")

print(f" TfIdf vectorizer with GradientBoost report: \n{gb_tf_report}")
print(f" BoW vectorizer with GradientBoost report: \n{gb_bow_report}")
print(f" Emb. vectorizer with GradientBoost report: \n{gb_em_report}")

Gradient Boost with Empirical Parameters...
 TfIdf vectorizer with GradientBoost accuracy: 0.8125
 BoW vectorizer with GradientBoost accuracy: 0.7975
 Emb. vectorizer with GradientBoost accuracy: 0.8325
 TfIdf vectorizer with GradientBoost report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.93      0.97      0.95        59
           2       0.77      0.80      0.78        75
           3       0.89      0.82      0.86       113
           4       0.75      0.92      0.82       109
           5       0.68      0.43      0.53        35

    accuracy                           0.81       400
   macro avg       0.67      0.66      0.66       400
weighted avg       0.80      0.81      0.80       400

 BoW vectorizer with GradientBoost report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.95      0.97      0.96        59
 

In [78]:
def xgboost_model(X_train, X_test, y_train, y_test):
    """
    This model to verify the accuracy of the XGboost in classification model
    :return: prediction, accuracy of the XGboost in classification model
    """
    param_grid = {
        "max_depth": 6,
        "learning_rate": 0.1,
        "colsample_bytree": 0.1,
        "reg_lambda": 1
        }
    xgb = XGBClassifier(**param_grid)
    xgb.fit(X_train, y_train)

    # fit the parameters then report
    y_pred = xgb.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return y_pred, score, report

In [79]:
print("XGBoost with Empirical Parameters...")
_, xgb_tf_accuracy, xgb_tf_report = xgboost_model(Xtrain_tfidf, Xtest_tfidf, y_train, y_test)
_, xgb_bow_accuracy, xgb_bow_report = xgboost_model(Xtrain_bow, Xtest_bow, y_train, y_test)
_, xgb_em_accuracy, xgb_em_report = xgboost_model(Xtrain_em, Xtest_em, y_train, y_test)

print(f" TfIdf vectorizer with XGBoost accuracy: {gb_tf_accuracy}")
print(f" BoW vectorizer with XGBoost accuracy: {gb_bow_accuracy}")
print(f" Emb. vectorizer with XGBoost accuracy: {gb_em_accuracy}")

XGBoost with Empirical Parameters...
 TfIdf vectorizer with XGBoost accuracy: 0.7875
 BoW vectorizer with XGBoost accuracy: 0.7675
 Emb. vectorizer with XGBoost accuracy: 0.8225


![LightGBM_Core_Parameters](lightGBM_EmpiricalParameters.png)

In [82]:
def lightGBM_model(X_train, X_test, y_train, y_test):
    """
    This model to verify the accuracy of the lightGBM in classification model
    :return: accuracy of the lightGBM in classification model
    """
    lgb_params = {
            # Core boosting parameters
            'n_estimators': 100,              # Good starting point, fast training
            'learning_rate': 0.1,             # Standard learning rate
            'max_depth': -1,                  # No limit (LightGBM uses num_leaves instead)
            'num_leaves': 31,                 # 2^5 - 1, good default for most datasets

            # Tree structure control
            'min_child_samples': 20,          # Prevent overfitting on small datasets
            'min_child_weight': 0.001,        # Minimum sum of hessian in child
            'min_split_gain': 0.0,            # Minimum loss reduction for split

            # Feature sampling (regularization)
            'feature_fraction': 0.8,          # Use 80% of features per tree
            'bagging_fraction': 0.8,          # Use 80% of data per tree
            'bagging_freq': 5,                # Perform bagging every 5 iterations

            # Regularization
            'lambda_l1': 0.0,                 # L1 regularization
            'lambda_l2': 0.0,                 # L2 regularization (LightGBM has natural reg)

            # Performance
            'objective': 'binary',            # Will be auto-detected for multiclass
            'metric': 'binary_logloss',       # Will be auto-detected
            'boosting_type': 'gbdt',          # Gradient Boosting Decision Tree
            'verbosity': -1,                  # No output
            'random_state': 42,
            'n_jobs': -1,                     # Use all CPU cores

            # Early stopping (optional, for large datasets)
            'early_stopping_rounds': None,   # Disabled for baseline
            'categorical_feature': 'auto'     # Auto-detect categorical features
        }
    # fit model
    lgb = LGBMClassifier(**lgb_params)
    lgb.fit(X_train, y_train)

    # prediction & export accuracy
    y_pred = lgb.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return y_pred, score, report

In [83]:
print("lightGBM with Empirical Parameters...")
_, lgb_tf_accuracy, lgb_tf_report = xgboost_model(Xtrain_tfidf, Xtest_tfidf, y_train, y_test)
_, lgb_bow_accuracy, lgb_bow_report = xgboost_model(Xtrain_bow, Xtest_bow, y_train, y_test)
_, lgb_em_accuracy, lgb_em_report = xgboost_model(Xtrain_em, Xtest_em, y_train, y_test)

print(f" TfIdf vectorizer with lightGBM accuracy: {lgb_tf_accuracy}")
print(f" BoW vectorizer with lightGBM accuracy: {lgb_bow_accuracy}")
print(f" Emb. vectorizer with lightGBM accuracy: {lgb_em_accuracy}")

lightGBM with Empirical Parameters...
 TfIdf vectorizer with lightGBM accuracy: 0.8025
 BoW vectorizer with lightGBM accuracy: 0.8075
 Emb. vectorizer with lightGBM accuracy: 0.835
