# LDA - SVC Pipeline

## Import necessary packages

In [1]:
import time
import optuna
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

from datasets import load_dataset

## Constants

In [2]:
RANDOM_SEED = 0

## Load the Hate Speech Filipino dataset from Hugging Face

In [3]:
dataset = load_dataset("hate_speech_filipino")

In [4]:
train_set = dataset["train"]
validation_set = dataset["validation"]
test_set = dataset["test"]

X_train, Y_train = train_set["text"], train_set["label"]
X_val, Y_val = validation_set["text"], validation_set["label"]
X_test, Y_test = test_set["text"], test_set["label"]

X = X_train + X_val + X_test
Y = Y_train + Y_val + Y_test

In [None]:
# Re-split the dataset into training, validation, and test sets
X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.2, random_state=RANDOM_SEED)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=RANDOM_SEED)

## Vectorize the texts to be able to perform LDA

In [5]:
# Define the CountVectorizer
vectorizer = CountVectorizer(max_df=0.95, min_df=2)
X_train_vector = vectorizer.fit_transform(X_train)

## Objective function for Optuna hyperparameter tuning

In [6]:
def create_objective(N_TOPICS, X_train, Y_train, X_val, Y_val):
    def objective(trial):
        # Create a list of models for the ensemble
        estimators = []
        for i in range(N_TOPICS):
            C = trial.suggest_float(f"C_{i}", 0.1, 10.0)
            kernel = trial.suggest_categorical(f"kernel_{i}", ["rbf", "poly", "sigmoid"])
            svc = SVC(probability=True, C=C, kernel=kernel, random_state=RANDOM_SEED)
            estimators.append((f'svc_{i}', svc))

        # Create a voting classifier for the ensemble
        # Using 'soft' voting to average probabilities or 'hard' for majority vote
        ensemble = VotingClassifier(estimators, voting="soft")
        ensemble.fit(X_train, Y_train)

        # Evaluate the ensemble on the validation set
        return accuracy_score(Y_val, ensemble.predict(X_val))

    return objective

In [7]:
def run_optimization(N_TOPICS, X_counts, Y, random_seed):

    N_TOPICS = 10
    print(f"Performing Latent Dirichlet Allocation for {N_TOPICS} topics")
    lda = LatentDirichletAllocation(n_components=N_TOPICS, random_state=RANDOM_SEED)
    X_train_topics = lda.fit_transform(X_train_vector)
    print(f"Done performing Latent Dirichlet Allocation for {N_TOPICS} topics")

    # Transform validation and test data using the fitted vectorizer and LDA
    X_val_counts = vectorizer.transform(X_val)
    X_val_topics = lda.transform(X_val_counts)

    X_test_counts = vectorizer.transform(X_test)
    X_test_topics = lda.transform(X_test_counts)

    # Split the transformed data
    X_train, X_temp, Y_train, Y_temp = train_test_split(X_topics, Y, test_size=0.2, random_state=random_seed)
    X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=random_seed)

    # Create a study object
    study = optuna.create_study(direction="maximize", study_name=f"LDA_{N_TOPICS}_SVC_Ensemble")
    
    # Create the study objective
    objective = create_objective(N_TOPICS, X_train, Y_train, X_val, Y_val)

    # Execute an optimization
    study.optimize(objective, n_trials=20, n_jobs=-1)

    # Print the best trial results
    print(f"Best Accuracy: {study.best_trial.value}")
    print(f"Best hyperparameters: {study.best_trial.params}")

    return study

In [8]:
def evaluate_best_model(N_TOPICS, X_counts, Y, random_seed, best_params):
    # Perform LDA with the best number of topics
    lda = LatentDirichletAllocation(n_components=N_TOPICS, random_state=random_seed)
    X_topics = lda.fit_transform(X_counts)
    
    # Split the transformed data
    X_train, X_temp, Y_train, Y_temp = train_test_split(X_topics, Y, test_size=0.2, random_state=random_seed)
    X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=random_seed)

    # Set up the best SVC ensemble
    estimators = []
    for i in range(N_TOPICS):
        C = best_params[f"C_{i}"]
        kernel = best_params[f"kernel_{i}"]
        svc = SVC(probability=True, C=C, kernel=kernel, random_state=random_seed)
        estimators.append((f'svc_{i}', svc))

    # Create and train the Voting Classifier with the best parameters
    ensemble = VotingClassifier(estimators, voting='soft')
    ensemble.fit(X_train, Y_train)

    # Evaluate on the test set
    Y_test_pred = ensemble.predict(X_test)
    test_accuracy = accuracy_score(Y_test, Y_test_pred)
    print(f"Test Accuracy: {test_accuracy}")

# LDA + 3 SVCs

In [9]:
N_TOPICS = 3
study = run_optimization(N_TOPICS, X_counts, Y, RANDOM_SEED)
best_params = study.best_trial.params

Performing Latent Dirichlet Allocation for 3 topics


[I 2024-04-29 11:48:08,169] A new study created in memory with name: LDA_3_SVC_Ensemble


Done performing Latent Dirichlet Allocation for 3 topics


[I 2024-04-29 11:53:25,348] Trial 11 finished with value: 0.5476681799422204 and parameters: {'C_0': 9.978450928019946, 'kernel_0': 'sigmoid', 'C_1': 5.394056745213837, 'kernel_1': 'sigmoid', 'C_2': 8.006539928574718, 'kernel_2': 'sigmoid'}. Best is trial 11 with value: 0.5476681799422204.
[I 2024-04-29 11:53:48,482] Trial 7 finished with value: 0.5489063144861742 and parameters: {'C_0': 3.5620806543037404, 'kernel_0': 'sigmoid', 'C_1': 0.16774659296069133, 'kernel_1': 'sigmoid', 'C_2': 5.227069465747458, 'kernel_2': 'sigmoid'}. Best is trial 7 with value: 0.5489063144861742.
[I 2024-04-29 11:57:44,217] Trial 1 finished with value: 0.6380520016508461 and parameters: {'C_0': 4.864873052818894, 'kernel_0': 'rbf', 'C_1': 1.1544432869880796, 'kernel_1': 'sigmoid', 'C_2': 2.209081796751062, 'kernel_2': 'rbf'}. Best is trial 1 with value: 0.6380520016508461.
[I 2024-04-29 11:57:48,567] Trial 3 finished with value: 0.6397028477094511 and parameters: {'C_0': 5.8911394062180165, 'kernel_0': 'rb

In [10]:
best_params = {'C_0': 9.695951175549485, 'kernel_0': 'rbf', 'C_1': 4.8378497287129, 'kernel_1': 'poly', 'C_2': 6.963154845339166, 'kernel_2': 'sigmoid'}
evaluate_best_model(3, X_counts, Y, RANDOM_SEED, best_params)

Test Accuracy: 0.634075907590759


# LDA + 5 SVCs

In [None]:
N_TOPICS = 5
study = run_optimization(N_TOPICS, X_counts, Y, RANDOM_SEED)
best_params = study.best_trial.params

In [None]:
evaluate_best_model(N_TOPICS, X_counts, Y, RANDOM_SEED, best_params)

# LDA + 7 SVCs

In [None]:
N_TOPICS = 7
study = run_optimization(N_TOPICS, X_counts, Y, RANDOM_SEED)
best_params = study.best_trial.params

In [None]:
evaluate_best_model(N_TOPICS, X_counts, Y, RANDOM_SEED, best_params)