# Topic Model (LDA / NMF) - GBT Pipeline

## Import necessary packages

In [1]:
import time
import optuna
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

from dataset.dataset import Dataset
from constants import CLEANED_DATASET_PATH

## Constants

In [2]:
# Seed passed as `random_state` to the NMF decomposition and to every
# GradientBoostingClassifier fitted in this notebook.
RANDOM_SEED = 0

## Load the Hate Speech Filipino dataset from Hugging Face

In [3]:
# Build the project Dataset wrapper around the cleaned data file.
# NOTE(review): split_sizes presumably maps to (train, val, test) in that order — confirm
# against dataset/dataset.py.
dataset = Dataset(
    full_data_path=CLEANED_DATASET_PATH,
    from_scratch=False,
    split_sizes=[10000, 4232, 4232],
)
dataset.build()

# Pull the text features and labels for each split, one (X, Y) pair per line.
X_train, Y_train = dataset.get_features(split_type="train"), dataset.get_labels(split_type="train")
X_val, Y_val = dataset.get_features(split_type="val"), dataset.get_labels(split_type="val")
X_test, Y_test = dataset.get_features(split_type="test"), dataset.get_labels(split_type="test")

Data loaded from dataset/cleaned_dataset.pkl


In [4]:
# Disabled: optional re-split of the dataset into training, validation, and test sets
# (80% / 10% / 10% via two successive train_test_split calls).
# NOTE(review): `X` and `Y` are not defined anywhere in the visible notebook, so these
# lines cannot be re-enabled as-is; the pipeline uses the splits returned by `Dataset`.
# X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.2, random_state=RANDOM_SEED)
# X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=RANDOM_SEED)

## Vectorize the texts to be able to perform topic modeling (LDA / NMF)

In [5]:
# Bag-of-words counts. The vocabulary is learned from the training texts only
# (terms in more than 95% of documents or in fewer than 2 documents are dropped),
# then reused unchanged for the validation and test texts.
vectorizer = CountVectorizer(min_df=2, max_df=0.95)
X_train_vector = vectorizer.fit_transform(X_train)
X_val_counts, X_test_counts = (vectorizer.transform(texts) for texts in (X_val, X_test))

## Perform Latent Dirichlet Allocation on the training set (currently disabled in favor of NMF)

In [6]:
# Disabled LDA variant. It would define the same N_TOPICS, X_train_topics, X_val_topics
# and X_test_topics names that the NMF cell assigns, so only one of the two cells
# should be active at a time.
# N_TOPICS = 10
# print(f"Performing Latent Dirichlet Allocation for {N_TOPICS} topics")
# lda = LatentDirichletAllocation(n_components=N_TOPICS, random_state=RANDOM_SEED)
# X_train_topics = lda.fit_transform(X_train_vector)
# X_val_topics = lda.transform(X_val_counts)
# X_test_topics = lda.transform(X_test_counts)
# print(f"Done performing Latent Dirichlet Allocation for {N_TOPICS} topics")

## Perform Non-negative Matrix Factorization on the training set

In [7]:
# Factorize the training count matrix into N_TOPICS non-negative components, then
# project the validation and test counts onto the components learned from training.
N_TOPICS = 20
print(f"Performing Non-negative Matrix Factorization for {N_TOPICS} topics")
nmf = NMF(random_state=RANDOM_SEED, n_components=N_TOPICS)
X_train_topics = nmf.fit_transform(X_train_vector)
X_val_topics, X_test_topics = (
    nmf.transform(counts) for counts in (X_val_counts, X_test_counts)
)
print(f"Done performing Non-negative Matrix Factorization for {N_TOPICS} topics")

Performing Non-negative Matrix Factorization for 20 topics
Done performing Non-negative Matrix Factorization for 20 topics


## Train the Gradient-boosting Tree

In [8]:
# Define the objective function for Optuna
def create_objective(X_train, Y_train, X_test, Y_test):
    """Build an Optuna objective that scores one GBT hyperparameter configuration.

    The returned callable samples hyperparameters from the trial, fits a
    GradientBoostingClassifier on (X_train, Y_train) and returns its accuracy
    on (X_test, Y_test). Despite the parameter names, the second pair is just
    the evaluation split; this notebook passes the validation split.

    Args:
        X_train: Feature matrix each candidate model is fitted on.
        Y_train: Labels matching X_train.
        X_test: Feature matrix each candidate model is scored on.
        Y_test: Labels matching X_test.

    Returns:
        A function that takes an Optuna trial and returns the accuracy score.
    """
    def objective(trial):
        # Search space; sampled in the order n_estimators, learning_rate, max_depth.
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-2),
            "max_depth": trial.suggest_int("max_depth", 10, 100),
        }
        model = GradientBoostingClassifier(random_state=RANDOM_SEED, **params)
        model.fit(X_train, Y_train)
        return accuracy_score(Y_test, model.predict(X_test))

    return objective

In [9]:
# Create a study object. Optuna's default sampler (TPE) is created explicitly and seeded
# with RANDOM_SEED so the hyperparameter suggestions do not come from an unseeded RNG,
# consistent with the seeded NMF and GBT steps in this notebook.
# For the LDA variant, use study_name="LDA_GBT_Pipeline" instead.
sampler = optuna.samplers.TPESampler(seed=RANDOM_SEED)
study = optuna.create_study(direction="maximize",
                            study_name="NMF_GBT_Pipeline",
                            sampler=sampler)

# Create the study objective (candidates are fitted on the training topics and scored
# on the validation topics)
objective = create_objective(X_train_topics, Y_train, X_val_topics, Y_val)

# Execute an optimization.
# NOTE: n_jobs=-1 runs trials in parallel, so the order in which trials finish (and
# therefore the exact search trajectory) can still differ between runs even with a
# seeded sampler. Set n_jobs=1 for a fully repeatable search at the cost of wall time.
study.optimize(objective, n_trials=20, n_jobs=-1)

# Print the best trial results
print(f"Best Accuracy: {study.best_trial.value}")
print(f"Best hyperparameters: {study.best_trial.params}")

[I 2024-04-30 23:33:12,145] A new study created in memory with name: LDA_GBT_Pipeline
[I 2024-04-30 23:34:44,445] Trial 9 finished with value: 0.6141304347826086 and parameters: {'n_estimators': 100, 'learning_rate': 0.00743688608398819, 'max_depth': 72}. Best is trial 9 with value: 0.6141304347826086.
[I 2024-04-30 23:36:14,781] Trial 8 finished with value: 0.625 and parameters: {'n_estimators': 217, 'learning_rate': 0.0010303707874840464, 'max_depth': 19}. Best is trial 8 with value: 0.625.
[I 2024-04-30 23:37:20,193] Trial 4 finished with value: 0.6153119092627599 and parameters: {'n_estimators': 274, 'learning_rate': 0.00393159977077752, 'max_depth': 100}. Best is trial 8 with value: 0.625.
[I 2024-04-30 23:39:16,652] Trial 6 finished with value: 0.6143667296786389 and parameters: {'n_estimators': 405, 'learning_rate': 0.006907270597423413, 'max_depth': 92}. Best is trial 8 with value: 0.625.
[I 2024-04-30 23:40:55,122] Trial 11 finished with value: 0.6131852551984878 and parameter

Best Accuracy: 0.6450850661625709
Best hyperparameters: {'n_estimators': 944, 'learning_rate': 0.005110020139553487, 'max_depth': 18}


In [10]:
# Read the winning hyperparameters back out of the study.
best_n_estimators, best_learning_rate, best_max_depth = (
    study.best_trial.params[key]
    for key in ("n_estimators", "learning_rate", "max_depth")
)

# Refit a classifier with those hyperparameters on the training topics and report its
# accuracy on the held-out test topics.
gbt = GradientBoostingClassifier(
    n_estimators=best_n_estimators,
    learning_rate=best_learning_rate,
    max_depth=best_max_depth,
    random_state=RANDOM_SEED,
)
gbt.fit(X_train_topics, Y_train)
Y_pred = gbt.predict(X_test_topics)
score = accuracy_score(Y_test, Y_pred)
# LDA variant of the report line:
# print(f"Best LDA-GBT Model Accuracy: {score}")
print(f"Best NMF-GBT Model Accuracy: {score}")

Best NMF-GBT Model Accuracy: 0.6446124763705104
