# LDA - XGBoost Pipeline 

## Import necessary packages

In [10]:
import time
import optuna
import numpy as np
import xgboost as xgb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from datasets import load_dataset

## Constants

In [11]:
RANDOM_SEED = 0

## Load the Hate Speech Filipino dataset from Hugging Face

In [12]:
dataset = load_dataset("hate_speech_filipino")

In [13]:
train_set = dataset["train"]
validation_set = dataset["validation"]
test_set = dataset["test"]

X_train, Y_train = train_set["text"], train_set["label"]
X_val, Y_val = validation_set["text"], validation_set["label"]
X_test, Y_test = test_set["text"], test_set["label"]

X = X_train + X_val + X_test
Y = Y_train + Y_val + Y_test

In [14]:
# Re-split the dataset into training, validation, and test sets
X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.2, random_state=RANDOM_SEED)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=RANDOM_SEED)

## Vectorize the texts to be able to perform LDA

In [15]:
# Define the CountVectorizer
vectorizer = CountVectorizer(max_df=0.95, min_df=2)
X_train_vector = vectorizer.fit_transform(X_train)

## Perform Latent Dirichlet Allocation on the training set

In [7]:
N_TOPICS = 10
print(f"Performing Latent Dirichlet Allocation for {N_TOPICS} topics")
lda = LatentDirichletAllocation(n_components=N_TOPICS, random_state=RANDOM_SEED)
X_train_topics = lda.fit_transform(X_train_vector)
print(f"Done performing Latent Dirichlet Allocation for {N_TOPICS} topics")

Performing Latent Dirichlet Allocation for 10 topics
Done performing Latent Dirichlet Allocation for 10 topics


In [8]:
# Transform validation and test data using the fitted vectorizer and LDA
X_val_counts = vectorizer.transform(X_val)
X_val_topics = lda.transform(X_val_counts)

X_test_counts = vectorizer.transform(X_test)
X_test_topics = lda.transform(X_test_counts)

## Train the XGBoost model

In [9]:
# Define the objective function for Optuna
def create_objective(X_train, Y_train, X_test, Y_test):
    def objective(trial):
        # Suggest values for the hyperparameters
        n_estimators = trial.suggest_int("n_estimators", 100, 1000)
        learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2)
        max_depth = trial.suggest_int("max_depth", 3, 20)
        subsample = trial.suggest_float("subsample", 0.5, 1.0)
        colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)

        # Create an XGBoost classifier model with suggested parameters
        model = xgb.XGBClassifier(
            objective="binary:logistic",
            n_estimators=n_estimators,    # Number of trees
            learning_rate=learning_rate,  # Learning rate
            max_depth=max_depth,          # Depth of the trees
            subsample=subsample,          # Subsampling of the training instances
            colsample_bytree=colsample_bytree,  # Subsampling of columns for each tree
            seed=RANDOM_SEED,             # Seed for reproducibility
            use_label_encoder=False,      # Disable label encoder warning
            eval_metric="logloss"
        )

        # Fit the model on the training data
        model.fit(X_train, Y_train)

        # Predict the labels on the test set
        Y_pred = model.predict(X_test)

        # Calculate the accuracy
        accuracy = accuracy_score(Y_test, Y_pred)
        return accuracy
    return objective

In [10]:
# Create a study object
study = optuna.create_study(direction="maximize", study_name=f"LDA_XGBoost_Pipeline")

# Create the study objective
objective = create_objective(X_train_topics, Y_train, X_val_topics, Y_val)

# Execute an optimization
study.optimize(objective, n_trials=20, n_jobs=-1)

# Print the best trial results
print(f"Best Accuracy: {study.best_trial.value}")
print(f"Best hyperparameters: {study.best_trial.params}")

[I 2024-04-30 00:41:10,538] A new study created in memory with name: LDA_XGBoost_Pipeline
[I 2024-04-30 00:41:22,004] Trial 10 finished with value: 0.6562113082955014 and parameters: {'n_estimators': 704, 'learning_rate': 0.006805516016174171, 'max_depth': 3, 'subsample': 0.7258352694258039, 'colsample_bytree': 0.8524639746322628}. Best is trial 10 with value: 0.6562113082955014.
[I 2024-04-30 00:41:27,208] Trial 9 finished with value: 0.7853900123813454 and parameters: {'n_estimators': 166, 'learning_rate': 0.0079551214784902, 'max_depth': 19, 'subsample': 0.556533643662955, 'colsample_bytree': 0.7337060732092171}. Best is trial 9 with value: 0.7853900123813454.
[I 2024-04-30 00:41:33,829] Trial 8 finished with value: 0.682624845233182 and parameters: {'n_estimators': 585, 'learning_rate': 0.0010407333777397137, 'max_depth': 8, 'subsample': 0.7834933018393802, 'colsample_bytree': 0.5051089578872159}. Best is trial 9 with value: 0.7853900123813454.
[I 2024-04-30 00:41:33,989] Trial 1 f

Best Accuracy: 0.8312009905076352
Best hyperparameters: {'n_estimators': 502, 'learning_rate': 0.007951984560972385, 'max_depth': 20, 'subsample': 0.7491971504240673, 'colsample_bytree': 0.5616276076732628}


In [11]:
best_n_estimators = study.best_trial.params["n_estimators"]
best_learning_rate = study.best_trial.params["learning_rate"]
best_max_depth = study.best_trial.params["max_depth"]
best_subsample = study.best_trial.params["subsample"]
best_colsample_bytree = study.best_trial.params["colsample_bytree"]

model = xgb.XGBClassifier(n_estimators=best_n_estimators, 
                                 learning_rate=best_learning_rate,
                                 max_depth=best_max_depth,
                                 random_state=RANDOM_SEED)
model.fit(X_train_topics, Y_train)
Y_pred = model.predict(X_test_topics)
score = accuracy_score(Y_test, Y_pred)
print(f"Best LDA-XGBoost Model Accuracy: {score}")

Best LDA-XGBoost Model Accuracy: 0.8424092409240924


In [6]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# 
# tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
# X_train_vector = tfidf_vectorizer.fit_transform(X_train)

In [16]:
from sklearn.decomposition import NMF

N_TOPICS = 10
print(f"Performing Non-negative Matrix Factorization for {N_TOPICS} topics")
nmf = NMF(n_components=N_TOPICS, random_state=RANDOM_SEED)
X_train_topics = nmf.fit_transform(X_train_vector)
print(f"Done performing Non-negative Matrix Factorization for {N_TOPICS} topics")

Performing Non-negative Matrix Factorization for 10 topics
Done performing Non-negative Matrix Factorization for 10 topics


In [17]:
# Transform validation and test data using the fitted vectorizer and LDA
X_val_counts = vectorizer.transform(X_val)
X_val_topics = nmf.transform(X_val_counts)

X_test_counts = vectorizer.transform(X_test)
X_test_topics = nmf.transform(X_test_counts)

In [18]:
model = xgb.XGBClassifier(n_estimators=502, 
                         learning_rate=0.007951984560972385,
                         max_depth=20,
                         subsample=0.7491971504240673,
                         colsample_bytree=0.5616276076732628, 
                         random_state=RANDOM_SEED)
model.fit(X_train_topics, Y_train)
Y_pred = model.predict(X_test_topics)
score = accuracy_score(Y_test, Y_pred)
print(f"Best NMF-XGBoost Model Accuracy: {score}")

Best LDA-XGBoost Model Accuracy: 0.8432343234323433
