# LDA - RF Pipeline 

## Import necessary packages

In [1]:
import time
import optuna
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
# from bitermplus import BTM

from load_dataset import load_dataset

## Constants

In [2]:
RANDOM_SEED = 0
DATASET_PATH = "dataset/hate_speech_filipino_cleaned.pkl"

## Load the Hate Speech Filipino dataset from Hugging Face

In [3]:
X, Y = load_dataset(DATASET_PATH)

Data loaded from dataset/hate_speech_filipino_cleaned.pkl


In [4]:
# Re-split the dataset into training, validation, and test sets
X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.2, random_state=RANDOM_SEED)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=RANDOM_SEED)

## Vectorize the texts to be able to perform LDA

In [5]:
# Define the CountVectorizer
vectorizer = CountVectorizer(max_df=0.95, min_df=2)
X_train_vector = vectorizer.fit_transform(X_train)
X_val_counts = vectorizer.transform(X_val)
X_test_counts = vectorizer.transform(X_test)

## Perform Latent Dirichlet Allocation on the training set

In [6]:
N_TOPICS = 10
print(f"Performing Latent Dirichlet Allocation for {N_TOPICS} topics")
lda = LatentDirichletAllocation(n_components=N_TOPICS, random_state=RANDOM_SEED)
X_train_topics = lda.fit_transform(X_train_vector)
X_val_topics = lda.transform(X_val_counts)
X_test_topics = lda.transform(X_test_counts)
print(f"Done performing Latent Dirichlet Allocation for {N_TOPICS} topics")

Performing Latent Dirichlet Allocation for 10 topics
Done performing Latent Dirichlet Allocation for 10 topics


## Search for the best hyperparameters of the Random Forest model

In [7]:
# Define the objective function for Optuna
def create_objective(X_train, Y_train, X_test, Y_test):
    def objective(trial):
        n_estimators = trial.suggest_int("n_estimators", 100, 1000)
        rf = RandomForestClassifier(n_estimators=n_estimators, random_state=RANDOM_SEED)
        rf.fit(X_train, Y_train)
        Y_pred = rf.predict(X_test)
        score = accuracy_score(Y_test, Y_pred)
        return score
    return objective

In [8]:
# Create a study object
study = optuna.create_study(direction="maximize", study_name=f"LDA_RF_Pipeline")

# Create the study objective
objective = create_objective(X_train_topics, Y_train, X_val_topics, Y_val)

# Execute an optimization
study.optimize(objective, n_trials=20, n_jobs=-1)

# Print the best trial results
print(f"Best Accuracy: {study.best_trial.value}")
print(f"Best hyperparameters: {study.best_trial.params}")

[I 2024-04-30 17:47:40,976] A new study created in memory with name: LDA_RF_Pipeline
[I 2024-04-30 17:47:53,549] Trial 4 finished with value: 0.8464713165497317 and parameters: {'n_estimators': 171}. Best is trial 4 with value: 0.8464713165497317.
[I 2024-04-30 17:47:55,987] Trial 2 finished with value: 0.8460586050350805 and parameters: {'n_estimators': 205}. Best is trial 4 with value: 0.8464713165497317.
[I 2024-04-30 17:47:58,812] Trial 0 finished with value: 0.8481221626083367 and parameters: {'n_estimators': 244}. Best is trial 0 with value: 0.8481221626083367.
[I 2024-04-30 17:48:00,658] Trial 1 finished with value: 0.8448204704911267 and parameters: {'n_estimators': 270}. Best is trial 0 with value: 0.8481221626083367.
[I 2024-04-30 17:48:03,569] Trial 11 finished with value: 0.8477094510936856 and parameters: {'n_estimators': 306}. Best is trial 0 with value: 0.8481221626083367.
[I 2024-04-30 17:48:11,974] Trial 16 finished with value: 0.8456458935204292 and parameters: {'n_es

Best Accuracy: 0.8493602971522906
Best hyperparameters: {'n_estimators': 529}


In [9]:
best_n_estimators = study.best_trial.params["n_estimators"]

rf = RandomForestClassifier(n_estimators=best_n_estimators, random_state=RANDOM_SEED)
rf.fit(X_train_topics, Y_train)
Y_pred = rf.predict(X_test_topics)
score = accuracy_score(Y_test, Y_pred)
print(f"Best LDA-RF Model Accuracy: {score}")

Best LDA-RF Model Accuracy: 0.8683993399339934


In [10]:
N_TOPICS = 20
print(f"Performing Non-negative Matrix Factorization for {N_TOPICS} topics")
nmf = NMF(n_components=N_TOPICS, random_state=RANDOM_SEED)
X_train_topics = nmf.fit_transform(X_train_vector)
X_val_topics = nmf.transform(X_val_counts)
X_test_topics = nmf.transform(X_test_counts)
print(f"Done performing Non-negative Matrix Factorization for {N_TOPICS} topics")

Performing Non-negative Matrix Factorization for 20 topics
Done performing Non-negative Matrix Factorization for 20 topics


In [11]:
rf = RandomForestClassifier(n_estimators=169, random_state=RANDOM_SEED)
rf.fit(X_train_topics, Y_train)
Y_pred = rf.predict(X_test_topics)
score = accuracy_score(Y_test, Y_pred)
print(f"Best NMF-RF Model Accuracy: {score}")

Best NMF-RF Model Accuracy: 0.8704620462046204


In [12]:
topics_values = [i for i in range(10, 21)]
for topic in topics_values:
    print(f"Performing Non-negative Matrix Factorization for {topic} topics")
    nmf = NMF(n_components=topic, random_state=RANDOM_SEED)
    X_train_topics = nmf.fit_transform(X_train_vector)
    print(f"Done performing Non-negative Matrix Factorization for {topic} topics")
    
    # Transform validation and test data using the fitted vectorizer and LDA
    X_val_counts = vectorizer.transform(X_val)
    X_val_topics = nmf.transform(X_val_counts)
    
    X_test_counts = vectorizer.transform(X_test)
    X_test_topics = nmf.transform(X_test_counts)
    
    rf = RandomForestClassifier(n_estimators=514, random_state=RANDOM_SEED)
    rf.fit(X_train_topics, Y_train)
    Y_pred = rf.predict(X_test_topics)
    score = accuracy_score(Y_test, Y_pred)
    print(f"Best NMF-RF Model Accuracy for {topic} topics: {score}")

Performing Non-negative Matrix Factorization for 10 topics
Done performing Non-negative Matrix Factorization for 10 topics
Best NMF-RF Model Accuracy for 10 topics: 0.8663366336633663
Performing Non-negative Matrix Factorization for 11 topics
Done performing Non-negative Matrix Factorization for 11 topics
Best NMF-RF Model Accuracy for 11 topics: 0.8572607260726073
Performing Non-negative Matrix Factorization for 12 topics
Done performing Non-negative Matrix Factorization for 12 topics
Best NMF-RF Model Accuracy for 12 topics: 0.8696369636963697
Performing Non-negative Matrix Factorization for 13 topics
Done performing Non-negative Matrix Factorization for 13 topics
Best NMF-RF Model Accuracy for 13 topics: 0.865924092409241
Performing Non-negative Matrix Factorization for 14 topics
Done performing Non-negative Matrix Factorization for 14 topics
Best NMF-RF Model Accuracy for 14 topics: 0.8679867986798679
Performing Non-negative Matrix Factorization for 15 topics
Done performing Non-ne