# PROJECT 4.1 - ENSEMBLE LEARNING TECHNIQUE
## `XGBoost, GradienBoost & Adaboost test`

In [28]:
# FOR PROJECT ROOT
# Setup cell
import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Add project root to path
if '..' not in sys.path:
    sys.path.append('../')

# Verify path
print("Current working directory:", os.getcwd())
print("Python path includes:", [p for p in sys.path if 'Project' in p])

Current working directory: /media/anhvt/DATA/10_AIO_VN/AIOVN_Main/Project 4.1_Ensemble Learning/notebooks
Python path includes: ['/media/anhvt/DATA/10_AIO_VN/AIOVN_Main/Project 4.1_Ensemble Learning']


## Load Library

In [22]:
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingRegressor
import lightgbm as lgb

In [18]:
import pandas as pd
import numpy as np
from utils.EmbeddingVectorizer import tfidf_vectorizer, bow_vectorizer, embedding_vectorizer
from utils.text_preprocessing import category_numerical

## 1. Import Data

In [2]:
train_data = pd.read_csv("../dataset/train_data.csv")
test_data = pd.read_csv("../dataset/test_data.csv")
X_train = train_data.iloc[:, 0]
X_test = test_data.iloc[:, 0]
y_train = train_data.iloc[:, 1].map(category_numerical)
y_test = test_data.iloc[:, 1].map(category_numerical)

Check the value of `y_train, y_test` to be converted to number & ensure no blank. If blank cell is available, we need to use `mean` or `prediction` this missing value

In [3]:
check_train = y_train[y_train == 0].count()
check_test = y_train[y_train == 0].count()
if check_test == 0 and check_train == 0:
    print("✅ No blank value from train & test target/ label")
print(y_train[:10])

0    3
1    5
2    1
3    2
4    1
5    1
6    3
7    4
8    3
9    5
Name: categories, dtype: int64


## 2.Embedding Data

In [25]:
Xtrain_tfidf, Xtest_tfidf = tfidf_vectorizer(X_train=X_train, X_test=X_test)
Xtrain_bow, Xtest_bow = bow_vectorizer(X_train=X_train, X_test=X_test)
Xtrain_em, Xtest_em = embedding_vectorizer(X_train=X_train.tolist(), X_test=X_test.tolist())

EmbeddingVectorizer initialized with model: intfloat/multilingual-e5-base on device: cpu


## 3. Test Model

In [21]:
def decision_tree(X_train, X_test, y_train, y_test,
                  param_grid=None):
    """


    """
    if param_grid is None:
        param_grid = {'max_depth': [3, 5, 10, 15, None],
                      'min_samples_split': [2, 5, 10, 20],
                      'min_samples_leaf': [1, 2, 5, 10],
                      'criterion': ['gini', 'entropy'],
                      'max_features': ['sqrt', 'log2', None]
                      }
    dtc = DecisionTreeClassifier(random_state=42)
    dtc.fit(X_train, y_train)
    y_pred = dtc.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return y_pred, score, report


In [29]:
def random_forest_model(X_train, X_test, y_train, y_test,
                        n_estimators: int=50,
                        max_depth: int=5,
                        random_state: int=42):
    """
    Random Forest model shall learn the data from Vectorizers (TfIdf, BoW, EmbeddingVectorizer) then perfrom the trial on test set. After learning, from test set, the model will raise the error value then compare the performance of Random Forest to each method
    :return accuracy report & classification report for model with each method
    """
    rfc = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    rfc.fit(X_train, y_train)
    y_pred = rfc.predict(X_test)
    report = classification_report(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    return y_pred, accuracy, report

In [30]:
_, tf_accuracy, tf_report = random_forest_model(Xtrain_tfidf, Xtest_tfidf, y_train, y_test)
_, bow_accuracy, bow_report = random_forest_model(Xtrain_bow, Xtest_bow, y_train, y_test)
_, em_accuracy, em_report = random_forest_model(Xtrain_em, Xtest_em, y_train, y_test)

print(f" TfIdf vectorizer with RF accuracy: {tf_accuracy}")
print(f" Emb. vectorizer with RF accuracy: {em_accuracy}")
print(f" BoW vectorizer with RF accuracy: {bow_accuracy}")

print(f" TfIdf vectorizer with RF report: \n{tf_report}")
print(f" Emb. vectorizer with RF report: \n{em_report}")
print(f" BoW vectorizer with RF report: \n{bow_report}")

 TfIdf vectorizer with RF accuracy: 0.765
 Emb. vectorizer with RF accuracy: 0.81
 BoW vectorizer with RF accuracy: 0.74
 TfIdf vectorizer with RF report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.93      0.86      0.89        59
           2       0.73      0.73      0.73        75
           3       0.86      0.86      0.86       113
           4       0.66      0.94      0.77       109
           5       0.00      0.00      0.00        35

    accuracy                           0.77       400
   macro avg       0.53      0.57      0.54       400
weighted avg       0.70      0.77      0.72       400

 Emb. vectorizer with RF report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.89      0.95      0.92        59
           2       0.72      0.79      0.75        75
           3       0.90      0.91      0.91       

In [20]:
adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
adaboost.fit(Xtrain_tfidf, y_train)
y_pred = adaboost.predict(Xtest_tfidf)
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

    astro-ph       0.94      0.58      0.72        59
    cond-mat       0.56      0.41      0.48        75
          cs       0.81      0.58      0.68       113
        math       0.45      0.94      0.61       109
     math-ph       0.00      0.00      0.00         9
     physics       0.00      0.00      0.00        35

    accuracy                           0.58       400
   macro avg       0.46      0.42      0.41       400
weighted avg       0.60      0.58      0.55       400

0.585


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [21]:
adaboost.fit(Xtrain_bow, y_train)
y_pred = adaboost.predict(Xtest_bow)
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

    astro-ph       0.98      0.69      0.81        59
    cond-mat       0.71      0.47      0.56        75
          cs       0.76      0.73      0.75       113
        math       0.49      0.85      0.62       109
     math-ph       0.00      0.00      0.00         9
     physics       0.22      0.06      0.09        35

    accuracy                           0.64       400
   macro avg       0.53      0.47      0.47       400
weighted avg       0.65      0.64      0.61       400

0.635
