# LDA - GBT Pipeline 

## Import necessary packages

In [1]:
import time
import optuna
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline

from dataset.dataset import Dataset
from constants import CLEANED_DATASET_PATH

## Constants

In [2]:
RANDOM_SEED = 0

## Load the Hate Speech Filipino dataset from Hugging Face

In [3]:
dataset = Dataset(full_data_path=CLEANED_DATASET_PATH,
                  from_scratch=False,
                  split_sizes=[10000, 4232, 4232])
dataset.build()

X_train = dataset.get_features(split_type="train")
Y_train = dataset.get_labels(split_type="train")
X_val = dataset.get_features(split_type="val")
Y_val = dataset.get_labels(split_type="val")
X_test = dataset.get_features(split_type="test")
Y_test = dataset.get_labels(split_type="test")

Data loaded from dataset/cleaned_dataset.pkl


In [4]:
# Re-split the dataset into training, validation, and test sets
# X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.2, random_state=RANDOM_SEED)
# X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=RANDOM_SEED)

## Classifier Only Pipeline

In [5]:
def create_objective(X_train, Y_train, X_test, Y_test, random_seed):
    def objective(trial):
        # Suggest values for the hyperparameters
        n_estimators = trial.suggest_int("n_estimators", 100, 1000)
        learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2)
        max_depth = trial.suggest_int("max_depth", 10, 100)

        # Create the pipeline within the trial
        pipeline = Pipeline([
            ("vectorizer", CountVectorizer(max_df=0.95, min_df=2)),
            ("gbt", GradientBoostingClassifier(n_estimators=n_estimators, 
                                         learning_rate=learning_rate, 
                                         max_depth=max_depth, 
                                         random_state=RANDOM_SEED))
        ])

        # Fit the pipeline
        pipeline.fit(X_train, Y_train)

        # Evaluate the pipeline
        Y_pred = pipeline.predict(X_test)
        accuracy = accuracy_score(Y_test, Y_pred)
        return accuracy

    return objective

In [6]:
# Create the objective function
objective = create_objective(X_train, Y_train, X_val, Y_val, RANDOM_SEED)

# Create an Optuna study object
study = optuna.create_study(direction="maximize", study_name=f"GBT_Pipeline")

# Execute an optimization by running trials
study.optimize(objective, n_trials=25, n_jobs=-1)

best_gbt_params = study.best_trial.params

# Best trial result
print(f"Best training accuracy: {study.best_trial.value}")
print(f"Best training hyperparameters: {best_gbt_params}")

[I 2024-05-03 21:21:08,374] A new study created in memory with name: GBT_Pipeline
[I 2024-05-03 21:22:41,663] Trial 1 finished with value: 0.7143194706994329 and parameters: {'n_estimators': 516, 'learning_rate': 0.005788813256230545, 'max_depth': 13}. Best is trial 1 with value: 0.7143194706994329.
[I 2024-05-03 21:23:36,788] Trial 0 finished with value: 0.7136105860113422 and parameters: {'n_estimators': 559, 'learning_rate': 0.004293169087471002, 'max_depth': 16}. Best is trial 1 with value: 0.7143194706994329.
[I 2024-05-03 21:23:40,379] Trial 9 finished with value: 0.7150283553875236 and parameters: {'n_estimators': 251, 'learning_rate': 0.008085571657796728, 'max_depth': 27}. Best is trial 9 with value: 0.7150283553875236.
[I 2024-05-03 21:24:12,107] Trial 8 finished with value: 0.7053402646502835 and parameters: {'n_estimators': 332, 'learning_rate': 0.0055385245233496114, 'max_depth': 25}. Best is trial 9 with value: 0.7150283553875236.
[I 2024-05-03 21:24:20,936] Trial 2 finis

Best training accuracy: 0.7384215500945179
Best training hyperparameters: {'n_estimators': 920, 'learning_rate': 0.009979937594792538, 'max_depth': 50}


In [7]:
# Create and fit the pipeline
pipeline = Pipeline([
    ("vectorizer", CountVectorizer(max_df=0.95, min_df=2)),
    ("gbt", GradientBoostingClassifier(n_estimators=best_gbt_params["n_estimators"], 
                                         learning_rate=best_gbt_params["learning_rate"], 
                                         max_depth=best_gbt_params["max_depth"], 
                                         random_state=RANDOM_SEED))
])

# Fit the pipeline
pipeline.fit(X_train, Y_train)

# Evaluate the pipeline
Y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

# Best testing result
print(f"Best test accuracy: {study.best_trial.value}")
print(f"Best test hyperparameters: {study.best_trial.params}")

Best test accuracy: 0.7384215500945179
Best test hyperparameters: {'n_estimators': 920, 'learning_rate': 0.009979937594792538, 'max_depth': 50}


## NMF-GBT Pipeline

In [8]:
def create_objective(X_train, Y_train, X_test, Y_test, best_gbt_params, random_seed):
    def objective(trial):
        # Suggest values for the hyperparameters
        n_topics = trial.suggest_int("n_topics", 50, 300)

        # Create the pipeline within the trial
        pipeline = Pipeline([
            ("vectorizer", CountVectorizer(max_df=0.95, min_df=2)),
            ("nmf", NMF(n_components=n_topics, random_state=random_seed)),
            ("gbt", GradientBoostingClassifier(n_estimators=best_gbt_params["n_estimators"], 
                                         learning_rate=best_gbt_params["learning_rate"], 
                                         max_depth=best_gbt_params["max_depth"], 
                                         random_state=RANDOM_SEED))
        ])

        # Fit the pipeline
        pipeline.fit(X_train, Y_train)

        # Evaluate the pipeline
        Y_pred = pipeline.predict(X_test)
        accuracy = accuracy_score(Y_test, Y_pred)
        return accuracy

    return objective

In [9]:
# Create the objective function
objective = create_objective(X_train, Y_train, X_val, Y_val, best_gbt_params, RANDOM_SEED)

# Create an Optuna study object
study = optuna.create_study(direction="maximize", study_name=f"NMF_GBT_Pipeline")

# Execute an optimization by running trials
study.optimize(objective, n_trials=25, n_jobs=-1)

best_nmf_params = study.best_trial.params

# Best trial result
print(f"Best training accuracy: {study.best_trial.value}")
print(f"Best training hyperparameters: {best_nmf_params}")

[I 2024-05-03 21:47:37,371] A new study created in memory with name: NMF_GBT_Pipeline
[I 2024-05-03 22:02:50,837] Trial 2 finished with value: 0.6580812854442344 and parameters: {'n_topics': 74}. Best is trial 2 with value: 0.6580812854442344.
[I 2024-05-03 22:04:03,602] Trial 6 finished with value: 0.650992438563327 and parameters: {'n_topics': 88}. Best is trial 2 with value: 0.6580812854442344.
[I 2024-05-03 22:07:53,664] Trial 8 finished with value: 0.6604442344045368 and parameters: {'n_topics': 113}. Best is trial 8 with value: 0.6604442344045368.
[I 2024-05-03 22:08:18,680] Trial 9 finished with value: 0.6651701323251418 and parameters: {'n_topics': 106}. Best is trial 9 with value: 0.6651701323251418.
[I 2024-05-03 22:17:54,701] Trial 13 finished with value: 0.6254725897920604 and parameters: {'n_topics': 51}. Best is trial 9 with value: 0.6651701323251418.
[I 2024-05-03 22:19:10,587] Trial 10 finished with value: 0.6722589792060492 and parameters: {'n_topics': 151}. Best is tr

In [None]:
# Create and fit the pipeline
pipeline = Pipeline([
    ("vectorizer", CountVectorizer(max_df=0.95, min_df=2)),
    ("nmf", NMF(n_components=best_nmf_params["n_topics"], 
                random_state=RANDOM_SEED)),
    ("rf", GradientBoostingClassifier(n_estimators=best_gbt_params["n_estimators"], 
                                         learning_rate=best_gbt_params["learning_rate"], 
                                         max_depth=best_gbt_params["max_depth"], 
                                         random_state=RANDOM_SEED))
])

# Fit the pipeline
pipeline.fit(X_train, Y_train)

# Evaluate the pipeline
Y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

# Best testing result
print(f"Best test accuracy: {study.best_trial.value}")
print(f"Best test hyperparameters: {study.best_trial.params}")