# PROJECT 4.1 - ENSEMBLE LEARNING TECHNIQUE
## `XGBoost, GradienBoost & Adaboost test`

In [1]:
# FOR PROJECT ROOT
# Setup cell
import sys
import os
import warnings

from pandas.core.common import random_state
from xgboost.callback import EarlyStopping

warnings.filterwarnings('ignore')

# Add project root to path
if '..' not in sys.path:
    sys.path.append('../')

# Verify path
print("Current working directory:", os.getcwd())
print("Python path includes:", [p for p in sys.path if 'Project' in p])

Current working directory: /media/anhvt/DATA/10_AIO_VN/AIOVN_Main/Project 4.1_Ensemble Learning/notebooks
Python path includes: ['/media/anhvt/DATA/10_AIO_VN/AIOVN_Main/Project 4.1_Ensemble Learning', '/media/anhvt/DATA/10_AIO_VN/AIOVN_Main/Project 4.1_Ensemble Learning/notebooks']


## Load Library

In [14]:
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingRegressor
import lightgbm as lgb
import optuna

In [3]:
import pandas as pd
import numpy as np
from utils.EmbeddingVectorizer import tfidf_vectorizer, bow_vectorizer, embedding_vectorizer
from utils.text_preprocessing import category_numerical

## 1. Import Data

In [4]:
train_data = pd.read_csv("../dataset/train_data.csv")
test_data = pd.read_csv("../dataset/test_data.csv")
X_train = train_data.iloc[:, 0]
X_test = test_data.iloc[:, 0]
y_train = train_data.iloc[:, 1].map(category_numerical)
y_test = test_data.iloc[:, 1].map(category_numerical)

Check the value of `y_train, y_test` to be converted to number & ensure no blank. If blank cell is available, we need to use `mean` or `prediction` this missing value

In [5]:
check_train = y_train[y_train == 0].count()
check_test = y_train[y_train == 0].count()
if check_test == 0 and check_train == 0:
    print("✅ No blank value from train & test target/ label")
print(y_train[:10])

0    3
1    5
2    1
3    2
4    1
5    1
6    3
7    4
8    3
9    5
Name: categories, dtype: int64


## 2.Embedding Data

In [6]:
Xtrain_tfidf, Xtest_tfidf = tfidf_vectorizer(X_train=X_train, X_test=X_test)
Xtrain_bow, Xtest_bow = bow_vectorizer(X_train=X_train, X_test=X_test)
Xtrain_em, Xtest_em = embedding_vectorizer(X_train=X_train.tolist(), X_test=X_test.tolist())

EmbeddingVectorizer initialized with model: intfloat/multilingual-e5-base on device: cpu


## 3. Test Model

We try Optuna for DT & RF because low speed of local calculation in laptop. The purpose of using Optuna is to verify the function of Optuna with Tree



In [76]:
def decision_tree_model(X_train, X_test, y_train, y_test):
    """
    This model to verify the accuracy of the decision tree in classification model
    :return: prediction, accuracy of the decision tree
    """
    param_grid = {"max_depth": 10,
                  "min_samples_split": 20,
                  "min_samples_leaf": 10,
                  "criterion": "gini",
                  "max_features": "sqrt",
                  "random_state": 42,

                  }
    dtc = DecisionTreeClassifier(**param_grid)
    dtc.fit(X_train, y_train)

    y_pred = dtc.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return y_pred, score, report

In [77]:
print("Decision Tree with Empirical Parameters...")
_, dt_tf_accuracy, dt_tf_report = decision_tree_model(Xtrain_tfidf, Xtest_tfidf, y_train, y_test)
_, dt_bow_accuracy, dt_bow_report = decision_tree_model(Xtrain_bow, Xtest_bow, y_train, y_test)
_, dt_em_accuracy, dt_em_report = decision_tree_model(Xtrain_em, Xtest_em, y_train, y_test)

print(f" TfIdf vectorizer with DT accuracy: {dt_tf_accuracy}")
print(f" BoW vectorizer with DT accuracy: {dt_bow_accuracy}")
print(f" Emb. vectorizer with DT accuracy: {dt_em_accuracy}")

print(f" TfIdf vectorizer with DT report: \n{dt_tf_report}")
print(f" BoW vectorizer with DT report: \n{dt_bow_report}")
print(f" Emb. vectorizer with DT report: \n{dt_em_report}")

Decision Tree with Empirical Parameters...
 TfIdf vectorizer with DT accuracy: 0.4675
 BoW vectorizer with DT accuracy: 0.46
 Emb. vectorizer with DT accuracy: 0.5475
 TfIdf vectorizer with DT report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.35      0.31      0.33        59
           2       0.49      0.24      0.32        75
           3       0.69      0.50      0.58       113
           4       0.41      0.86      0.56       109
           5       0.00      0.00      0.00        35

    accuracy                           0.47       400
   macro avg       0.32      0.32      0.30       400
weighted avg       0.45      0.47      0.42       400

 BoW vectorizer with DT report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.33      0.37      0.35        59
           2       0.49      0.23      0.31        75
     

In [74]:
def random_forest_model(X_train, X_test, y_train, y_test):
    """
    Random Forest model shall learn the data from Vectorizers (TfIdf, BoW, EmbeddingVectorizer) then perform the trial on test set. After learning, from test set, the model will raise the error value then compare the performance of Random Forest to each method
    :return accuracy report & classification report for model with each method
    """
    para_grid = {
        "n_estimators": 100,
        "max_depth": 15,
        "min_samples_split": 10,
        "min_samples_leaf":5,
        "max_features": "sqrt",
        "oob_score": True
    }
    # first train
    rfc = RandomForestClassifier(**para_grid)
    rfc.fit(X_train, y_train)
    y_pred = rfc.predict(X_test)

    # Calculate metrics then export report:
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return y_pred, accuracy, report

In [75]:
print("Random Forest with Empirical Parameters...")
_, rf_tf_accuracy, rf_tf_report = random_forest_model(Xtrain_tfidf, Xtest_tfidf, y_train, y_test)
_, rf_bow_accuracy, rf_bow_report = random_forest_model(Xtrain_bow, Xtest_bow, y_train, y_test)
_, rf_em_accuracy, rf_em_report = random_forest_model(Xtrain_em, Xtest_em, y_train, y_test)

print(f" TfIdf vectorizer with RF accuracy: {rf_tf_accuracy}")
print(f" BoW vectorizer with RF accuracy: {rf_bow_accuracy}")
print(f" Emb. vectorizer with RF accuracy: {rf_em_accuracy}")

print(f" TfIdf vectorizer with RF report: \n{rf_tf_report}")
print(f" BoW vectorizer with RF report: \n{rf_bow_report}")
print(f" Emb. vectorizer with RF report: \n{rf_em_report}")

Random Forest with Empirical Parameters...
 TfIdf vectorizer with RF accuracy: 0.715
 BoW vectorizer with RF accuracy: 0.7125
 Emb. vectorizer with RF accuracy: 0.8075
 TfIdf vectorizer with RF report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       1.00      0.81      0.90        59
           2       0.75      0.59      0.66        75
           3       0.89      0.80      0.84       113
           4       0.54      0.95      0.69       109
           5       0.00      0.00      0.00        35

    accuracy                           0.71       400
   macro avg       0.53      0.53      0.51       400
weighted avg       0.69      0.71      0.68       400

 BoW vectorizer with RF report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.98      0.83      0.90        59
           2       0.75      0.55      0.63        75
    

In [62]:
def ada_boost_model(X_train, X_test, y_train, y_test):
    """
    This model to verify the accuracy of the ada_boost in classification model
    :return: prediction, accuracy of the adaboost in classification model
    """
    param_grid = {
        "n_estimators": 100,
        "learning_rate": 0.1,
        "algorithm": "SAMME",
        "random_state": 42
    }
    adb = AdaBoostClassifier(**param_grid)
    adb.fit(X_train, y_train)

    # fit the parameters then report
    y_pred = adb.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return y_pred, score, report

In [63]:
print("AdaBoost with Empirical Parameters...")
_, ada_tf_accuracy, ada_tf_report = ada_boost_model(Xtrain_tfidf, Xtest_tfidf, y_train, y_test)
_, ada_bow_accuracy, ada_bow_report = ada_boost_model(Xtrain_bow, Xtest_bow, y_train, y_test)
_, ada_em_accuracy, ada_em_report = ada_boost_model(Xtrain_em, Xtest_em, y_train, y_test)

print(f" TfIdf vectorizer with AdaBoost accuracy: {ada_tf_accuracy}")
print(f" BoW vectorizer with AdaBoost accuracy: {ada_bow_accuracy}")
print(f" Emb. vectorizer with AdaBoost accuracy: {ada_em_accuracy}")

print(f" TfIdf vectorizer with AdaBoost report: \n{ada_tf_report}")
print(f" BoW vectorizer with AdaBoost report: \n{ada_bow_report}")
print(f" Emb. vectorizer with AdaBoost report: \n{ada_em_report}")

AdaBoost with Empricial Parameters...
AdaBoost with Empricial Parameters...
AdaBoost with Empricial Parameters...
 TfIdf vectorizer with AdaBoost accuracy: 0.5
 BoW vectorizer with AdaBoost accuracy: 0.5
 Emb. vectorizer with AdaBoost accuracy: 0.7175
 TfIdf vectorizer with AdaBoost report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       1.00      0.37      0.54        59
           2       0.00      0.00      0.00        75
           3       0.86      0.63      0.72       113
           4       0.36      0.98      0.53       109
           5       0.00      0.00      0.00        35

    accuracy                           0.50       400
   macro avg       0.37      0.33      0.30       400
weighted avg       0.49      0.50      0.43       400

 BoW vectorizer with AdaBoost report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1    

In [71]:
def gradient_boost_model(X_train, X_test, y_train, y_test):
    """
    This model to verify the accuracy of the gradient boost in classification model
    :return: prediction, accuracy of the gradient boost in classification model
    """
    param_grid = {
        "max_depth": 3,
        "learning_rate": 0.1,
        "subsample": 0.8
        }
    gb = GradientBoostingClassifier(**param_grid)
    gb.fit(X_train, y_train)

    # fit the parameters then report
    y_pred = gb.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return y_pred, score, report

In [73]:
print("Gradient Boost with Empirical Parameters...")
_, gb_tf_accuracy, gb_tf_report = gradient_boost_model(Xtrain_tfidf, Xtest_tfidf, y_train, y_test)
_, gb_bow_accuracy, gb_bow_report = gradient_boost_model(Xtrain_bow, Xtest_bow, y_train, y_test)
_, gb_em_accuracy, gb_em_report = gradient_boost_model(Xtrain_em, Xtest_em, y_train, y_test)

print(f" TfIdf vectorizer with GradientBoost accuracy: {gb_tf_accuracy}")
print(f" BoW vectorizer with GradientBoost accuracy: {gb_bow_accuracy}")
print(f" Emb. vectorizer with GradientBoost accuracy: {gb_em_accuracy}")

print(f" TfIdf vectorizer with GradientBoost report: \n{gb_tf_report}")
print(f" BoW vectorizer with GradientBoost report: \n{gb_bow_report}")
print(f" Emb. vectorizer with GradientBoost report: \n{gb_em_report}")

Gradient Boost with Empirical Parameters...
 TfIdf vectorizer with GradientBoost accuracy: 0.7875
 BoW vectorizer with GradientBoost accuracy: 0.7675
 Emb. vectorizer with GradientBoost accuracy: 0.8225
 TfIdf vectorizer with GradientBoost report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.93      0.95      0.94        59
           2       0.75      0.76      0.75        75
           3       0.88      0.81      0.84       113
           4       0.70      0.92      0.80       109
           5       0.61      0.31      0.42        35

    accuracy                           0.79       400
   macro avg       0.65      0.62      0.63       400
weighted avg       0.77      0.79      0.77       400

 BoW vectorizer with GradientBoost report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.92      0.93      0.92        59
 

In [78]:
def xgboost_model(X_train, X_test, y_train, y_test):
    """
    This model to verify the accuracy of the XGboost in classification model
    :return: prediction, accuracy of the XGboost in classification model
    """
    param_grid = {
        "max_depth": 6,
        "learning_rate": 0.1,
        "colsample_bytree": 0.1,
        "reg_lambda": 1
        }
    xgb = XGBClassifier(**param_grid)
    xgb.fit(X_train, y_train)

    # fit the parameters then report
    y_pred = xgb.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return y_pred, score, report

In [79]:
print("XGBoost with Empirical Parameters...")
_, xgb_tf_accuracy, xgb_tf_report = xgboost_model(Xtrain_tfidf, Xtest_tfidf, y_train, y_test)
_, xgb_bow_accuracy, xgb_bow_report = xgboost_model(Xtrain_bow, Xtest_bow, y_train, y_test)
_, xgb_em_accuracy, xgb_em_report = xgboost_model(Xtrain_em, Xtest_em, y_train, y_test)

print(f" TfIdf vectorizer with XGBoost accuracy: {gb_tf_accuracy}")
print(f" BoW vectorizer with XGBoost accuracy: {gb_bow_accuracy}")
print(f" Emb. vectorizer with XGBoost accuracy: {gb_em_accuracy}")

XGBoost with Empirical Parameters...
 TfIdf vectorizer with XGBoost accuracy: 0.7875
 BoW vectorizer with XGBoost accuracy: 0.7675
 Emb. vectorizer with XGBoost accuracy: 0.8225
