# NMF - XGBoost Pipeline

## Import necessary packages

In [1]:
import time
import optuna
import numpy as np
import xgboost as xgb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline

from dataset.dataset import Dataset
from constants import CLEANED_DATASET_PATH

## Constants

In [2]:
RANDOM_SEED = 0

## Load the Hate Speech Filipino dataset from Hugging Face

In [3]:
# Build the dataset object from the cleaned data file.
# NOTE(review): split_sizes presumably lists the train/val/test sizes in that
# order, and from_scratch=False presumably reuses an existing cleaned file --
# confirm both against the Dataset class.
dataset = Dataset(
    full_data_path=CLEANED_DATASET_PATH,
    from_scratch=False,
    split_sizes=[10000, 4232, 4232],
)
dataset.build()

# Pull features and labels for each split, one split per line.
X_train, Y_train = dataset.get_features(split_type="train"), dataset.get_labels(split_type="train")
X_val, Y_val = dataset.get_features(split_type="val"), dataset.get_labels(split_type="val")
X_test, Y_test = dataset.get_features(split_type="test"), dataset.get_labels(split_type="test")

Data loaded from dataset/cleaned_dataset.pkl


In [4]:
# X_train_np = np.array(X_train)
# Y_train_np = np.array(Y_train)
# X_val_np = np.array(X_val)
# Y_val_np = np.array(Y_val)
# X_test_np = np.array(X_test)
# Y_test_np = np.array(Y_test)

# X_train_np = np.vstack(X_train_np)
# Y_train_np = np.vstack(Y_train_np)
# X_val_np = np.vstack(X_val_np)
# Y_val_np = np.vstack(Y_val_np)
# X_test_np = np.vstack(X_test_np)
# Y_test_np = np.vstack(Y_test_np)

In [5]:
# Re-split the dataset into training, validation, and test sets
# X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.2, random_state=RANDOM_SEED)
# X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=RANDOM_SEED)

## Classifier Only Pipeline

In [6]:
# Define the objective function for Optuna
def create_objective(X_train, Y_train, X_test, Y_test, random_seed):
    """Build an Optuna objective for the CountVectorizer + XGBoost pipeline.

    Args:
        X_train: Inputs the pipeline is fitted on in every trial.
        Y_train: Labels matching ``X_train``.
        X_test: Held-out inputs every trial is scored on.
        Y_test: Labels matching ``X_test``.
        random_seed: Passed to the classifier as ``seed``.

    Returns:
        A callable that takes an Optuna trial, samples the XGBoost
        hyperparameters, fits the pipeline on ``(X_train, Y_train)`` and
        returns the accuracy of its predictions on ``(X_test, Y_test)``.
    """
    def objective(trial):
        # Search space: one entry per tuned XGBoost hyperparameter.
        sampled = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),       # number of trees
            "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-2),  # learning rate
            "max_depth": trial.suggest_int("max_depth", 3, 20),                 # depth of the trees
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),            # row subsampling
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),  # column subsampling per tree
        }

        # Classifier configured with this trial's sampled values.
        classifier = xgb.XGBClassifier(
            objective="binary:logistic",
            **sampled,
            seed=random_seed,
            use_label_encoder=False,
            eval_metric="logloss",
        )

        # A fresh pipeline per trial: token counts feed the classifier.
        steps = [
            ("vectorizer", CountVectorizer(max_df=0.95, min_df=2)),
            ("xgb", classifier),
        ]
        trial_pipeline = Pipeline(steps)
        trial_pipeline.fit(X_train, Y_train)

        # The trial's score is the accuracy on the held-out split.
        predictions = trial_pipeline.predict(X_test)
        return accuracy_score(Y_test, predictions)

    return objective

In [7]:
# Build the objective: every trial fits on the training split and is scored on
# the validation split, so the test split is not touched during tuning.
objective = create_objective(X_train, Y_train, X_val, Y_val, RANDOM_SEED)

# Seed the sampler so hyperparameter proposals are tied to RANDOM_SEED.
# NOTE: with n_jobs=-1 trials run in parallel and finish in a nondeterministic
# order, so the search is still not exactly repeatable; use n_jobs=1 if an
# identical re-run is required.
study = optuna.create_study(
    direction="maximize",
    study_name="XGB_Pipeline",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
)

# Run the search.
study.optimize(objective, n_trials=50, n_jobs=-1)

# Keep the winning hyperparameters under a dedicated name so later cells do
# not depend on `study`, which is rebound further down the notebook.
best_xgb_params = study.best_trial.params

# The objective returns accuracy on the validation split, so label it as such.
print(f"Best validation accuracy: {study.best_trial.value}")
print(f"Best validation hyperparameters: {best_xgb_params}")

[I 2024-05-03 19:35:02,371] A new study created in memory with name: XGB_Pipeline
[I 2024-05-03 19:35:13,502] Trial 0 finished with value: 0.6996691871455577 and parameters: {'n_estimators': 275, 'learning_rate': 0.009449065633517457, 'max_depth': 8, 'subsample': 0.7670255500162464, 'colsample_bytree': 0.7934570037390087}. Best is trial 0 with value: 0.6996691871455577.
[I 2024-05-03 19:35:15,220] Trial 3 finished with value: 0.6713137996219282 and parameters: {'n_estimators': 418, 'learning_rate': 0.0030662130918332323, 'max_depth': 6, 'subsample': 0.6630028343455379, 'colsample_bytree': 0.9271280578914012}. Best is trial 0 with value: 0.6996691871455577.
[I 2024-05-03 19:35:21,225] Trial 8 finished with value: 0.6980151228733459 and parameters: {'n_estimators': 295, 'learning_rate': 0.003082470685183052, 'max_depth': 15, 'subsample': 0.6475880340922875, 'colsample_bytree': 0.5893824613668761}. Best is trial 0 with value: 0.6996691871455577.
[I 2024-05-03 19:35:21,335] Trial 13 finish

Best training accuracy: 0.7377126654064272
Best training hyperparameters: {'n_estimators': 975, 'learning_rate': 0.00788488180934829, 'max_depth': 20, 'subsample': 0.6862291345349006, 'colsample_bytree': 0.7284848858899601}


In [8]:
# Rebuild the XGBoost classifier with the best hyperparameters found by the
# classifier-only study.
model = xgb.XGBClassifier(
    objective="binary:logistic",
    n_estimators=best_xgb_params["n_estimators"],    # Number of trees
    learning_rate=best_xgb_params["learning_rate"],  # Learning rate
    max_depth=best_xgb_params["max_depth"],          # Depth of the trees
    subsample=best_xgb_params["subsample"],          # Subsampling of the training instances
    colsample_bytree=best_xgb_params["colsample_bytree"],  # Subsampling of columns for each tree
    seed=RANDOM_SEED,             # Seed for reproducibility
    use_label_encoder=False,      # Disable label encoder warning
    eval_metric="logloss")

# Final pipeline: same vectorizer settings as used during tuning.
pipeline = Pipeline([
    ("vectorizer", CountVectorizer(max_df=0.95, min_df=2)),
    ("gbt", model)
])

# Fit the model on the training data
pipeline.fit(X_train, Y_train)

# Predict the labels on the test set
Y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

# Report test metrics. The hyperparameters come from `best_xgb_params` rather
# than `study`, because `study` is rebound by the later NMF search and would
# print the wrong parameters if this cell were re-run afterwards.
print(f"Best test accuracy: {accuracy}")
print(f"Best test F1-score: {f1}")
print(f"Best test hyperparameters: {best_xgb_params}")

Best test accuracy: 0.7372400756143668
Best test F1-score: 0.6973326075122482
Best test hyperparameters: {'n_estimators': 975, 'learning_rate': 0.00788488180934829, 'max_depth': 20, 'subsample': 0.6862291345349006, 'colsample_bytree': 0.7284848858899601}


## NMF-XGB Pipeline

In [9]:
# Define the objective function for Optuna
def create_objective(X_train, Y_train, X_test, Y_test, best_xgb_params, random_seed):
    """Build an Optuna objective that tunes the NMF size of the NMF-XGB pipeline.

    Args:
        X_train: Inputs the pipeline is fitted on in every trial.
        Y_train: Labels matching ``X_train``.
        X_test: Held-out inputs every trial is scored on.
        Y_test: Labels matching ``X_test``.
        best_xgb_params: Mapping with the keys ``n_estimators``,
            ``learning_rate``, ``max_depth``, ``subsample`` and
            ``colsample_bytree``; the XGBoost settings are fixed to these.
        random_seed: Passed to the classifier as ``seed`` and to NMF as
            ``random_state``.

    Returns:
        A callable that takes an Optuna trial, samples ``n_topics``, fits the
        pipeline on ``(X_train, Y_train)`` and returns the accuracy of its
        predictions on ``(X_test, Y_test)``.
    """
    xgb_param_names = (
        "n_estimators",      # number of trees
        "learning_rate",     # learning rate
        "max_depth",         # depth of the trees
        "subsample",         # row subsampling
        "colsample_bytree",  # column subsampling per tree
    )

    def objective(trial):
        # The only tuned value here is the number of NMF components.
        topic_count = trial.suggest_int("n_topics", 50, 300)

        # Classifier settings are copied from the already-tuned parameters.
        fixed = {name: best_xgb_params[name] for name in xgb_param_names}
        classifier = xgb.XGBClassifier(
            objective="binary:logistic",
            **fixed,
            seed=random_seed,
            use_label_encoder=False,
            eval_metric="logloss",
        )

        # Token counts -> NMF components -> classifier.
        steps = [
            ("vectorizer", CountVectorizer(max_df=0.95, min_df=2)),
            ("nmf", NMF(n_components=topic_count, random_state=random_seed)),
            ("gbt", classifier),
        ]
        trial_pipeline = Pipeline(steps)
        trial_pipeline.fit(X_train, Y_train)

        # The trial's score is the accuracy on the held-out split.
        predictions = trial_pipeline.predict(X_test)
        return accuracy_score(Y_test, predictions)

    return objective

In [10]:
# Build the objective: XGBoost settings are fixed to best_xgb_params and only
# the NMF size is searched; trials fit on the training split and are scored on
# the validation split.
objective = create_objective(X_train, Y_train, X_val, Y_val, best_xgb_params, RANDOM_SEED)

# Give this study its own name (it tunes the NMF-XGB pipeline, not the
# classifier-only one) and seed the sampler so proposals are tied to RANDOM_SEED.
# NOTE: with n_jobs=-1 trials run in parallel and finish in a nondeterministic
# order, so the search is still not exactly repeatable; use n_jobs=1 if an
# identical re-run is required.
study = optuna.create_study(
    direction="maximize",
    study_name="NMF_XGB_Pipeline",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
)

# Run the search.
study.optimize(objective, n_trials=50, n_jobs=-1)

# Keep the winning NMF setting under a dedicated name for the final evaluation.
best_nmf_params = study.best_trial.params

# The objective returns accuracy on the validation split, so label it as such.
print(f"Best validation accuracy: {study.best_trial.value}")
print(f"Best validation hyperparameters: {best_nmf_params}")

[I 2024-05-03 19:37:58,561] A new study created in memory with name: XGB_Pipeline
[I 2024-05-03 19:48:17,325] Trial 10 finished with value: 0.7166824196597353 and parameters: {'n_topics': 60}. Best is trial 10 with value: 0.7166824196597353.
[I 2024-05-03 19:48:37,523] Trial 1 finished with value: 0.7164461247637051 and parameters: {'n_topics': 80}. Best is trial 10 with value: 0.7166824196597353.
[I 2024-05-03 19:48:53,824] Trial 4 finished with value: 0.7299149338374291 and parameters: {'n_topics': 134}. Best is trial 4 with value: 0.7299149338374291.
[I 2024-05-03 19:49:24,205] Trial 6 finished with value: 0.7424385633270322 and parameters: {'n_topics': 147}. Best is trial 6 with value: 0.7424385633270322.
[I 2024-05-03 19:50:12,116] Trial 9 finished with value: 0.7346408317580341 and parameters: {'n_topics': 173}. Best is trial 6 with value: 0.7424385633270322.
[I 2024-05-03 19:50:29,895] Trial 11 finished with value: 0.7325141776937618 and parameters: {'n_topics': 146}. Best is tr

Best training accuracy: 0.751890359168242
Best training hyperparameters: {'n_topics': 293}


In [11]:
# Rebuild the XGBoost classifier with the best hyperparameters found by the
# classifier-only study.
model = xgb.XGBClassifier(
    objective="binary:logistic",
    n_estimators=best_xgb_params["n_estimators"],    # Number of trees
    learning_rate=best_xgb_params["learning_rate"],  # Learning rate
    max_depth=best_xgb_params["max_depth"],          # Depth of the trees
    subsample=best_xgb_params["subsample"],          # Subsampling of the training instances
    colsample_bytree=best_xgb_params["colsample_bytree"],  # Subsampling of columns for each tree
    seed=RANDOM_SEED,             # Seed for reproducibility
    use_label_encoder=False,      # Disable label encoder warning
    eval_metric="logloss")

# Final pipeline: same vectorizer settings as used during tuning, with the NMF
# size chosen by the NMF study.
pipeline = Pipeline([
    ("vectorizer", CountVectorizer(max_df=0.95, min_df=2)),
    ("nmf", NMF(n_components=best_nmf_params["n_topics"], 
                random_state=RANDOM_SEED)),
    ("gbt", model)
])

# Fit the model on the training data
pipeline.fit(X_train, Y_train)

# Predict the labels on the test set
Y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

# Report test metrics. The hyperparameters come from `best_nmf_params` rather
# than the mutable global `study`, so the printed values always match the
# pipeline that was actually fitted above.
print(f"Best test accuracy: {accuracy}")
print(f"Best test F1-score: {f1}")
print(f"Best test hyperparameters: {best_nmf_params}")



Best test accuracy: 0.7405482041587902
Best test F1-score: 0.712565445026178
Best test hyperparameters: {'n_topics': 293}


