# LDA - SVC Pipeline

## Import necessary packages

In [1]:
import time
import optuna
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier

from dataset.dataset import Dataset
from constants import CLEANED_DATASET_PATH

## Constants

In [2]:
RANDOM_SEED = 0

## Load the Hate Speech Filipino dataset from Hugging Face

In [3]:
dataset = Dataset(full_data_path=CLEANED_DATASET_PATH,
                  from_scratch=False,
                  split_sizes=[10000, 4232, 4232])
dataset.build()

X_train = dataset.get_features(split_type="train")
Y_train = dataset.get_labels(split_type="train")
X_val = dataset.get_features(split_type="val")
Y_val = dataset.get_labels(split_type="val")
X_test = dataset.get_features(split_type="test")
Y_test = dataset.get_labels(split_type="test")

Data loaded from dataset/cleaned_dataset.pkl


## Vectorize the texts to be able to perform LDA

In [4]:
def create_objective(X_train, Y_train, X_test, Y_test, random_seed):
    def objective(trial):
        # Suggest values for the hyperparameters
        kernel = trial.suggest_categorical(f"kernel", ["rbf", "poly", "sigmoid"])
        C = trial.suggest_float(f"C", 0.1, 10.0)

        # Create the pipeline within the trial
        pipeline = Pipeline([
            ("vectorizer", CountVectorizer(max_df=0.95, min_df=2)),
            ("svc", SVC(probability=True, C=C, kernel=kernel, random_state=random_seed))
        ])

        # Fit the pipeline
        pipeline.fit(X_train, Y_train)

        # Evaluate the pipeline
        Y_pred = pipeline.predict(X_test)
        accuracy = accuracy_score(Y_test, Y_pred)
        return accuracy

    return objective

In [5]:
# Create the objective function
objective = create_objective(X_train, Y_train, X_val, Y_val, RANDOM_SEED)

# Create an Optuna study object
study = optuna.create_study(direction="maximize", study_name=f"SVC_Pipeline")

# Execute an optimization by running trials
study.optimize(objective, n_trials=30, n_jobs=-1)

best_svc_params = study.best_trial.params

# Best trial result
print(f"Best training accuracy: {study.best_trial.value}")
print(f"Best training hyperparameters: {best_svc_params}")

[I 2024-05-04 16:01:55,383] A new study created in memory with name: SVC_Pipeline
[I 2024-05-04 16:03:20,370] Trial 5 finished with value: 0.6483931947069943 and parameters: {'kernel': 'sigmoid', 'C': 6.82008628737552}. Best is trial 5 with value: 0.6483931947069943.
[I 2024-05-04 16:03:22,551] Trial 2 finished with value: 0.650992438563327 and parameters: {'kernel': 'sigmoid', 'C': 5.663324130087262}. Best is trial 2 with value: 0.650992438563327.
[I 2024-05-04 16:03:37,313] Trial 11 finished with value: 0.6552457466918714 and parameters: {'kernel': 'sigmoid', 'C': 3.9917939369747355}. Best is trial 11 with value: 0.6552457466918714.
[I 2024-05-04 16:03:37,984] Trial 8 finished with value: 0.6550094517958412 and parameters: {'kernel': 'sigmoid', 'C': 4.1782610448748025}. Best is trial 11 with value: 0.6552457466918714.
[I 2024-05-04 16:03:43,776] Trial 3 finished with value: 0.6717863894139886 and parameters: {'kernel': 'sigmoid', 'C': 3.3662508253029886}. Best is trial 3 with value: 

Best training accuracy: 0.7592155009451795
Best training hyperparameters: {'kernel': 'rbf', 'C': 4.438008982749939}


In [6]:
# Create and fit the pipeline
pipeline = Pipeline([
            ("vectorizer", CountVectorizer(max_df=0.95, min_df=2)),
            ("svc", SVC(C=best_svc_params["C"], 
                        kernel=best_svc_params["kernel"], 
                        random_state=RANDOM_SEED))
        ])

# Fit the pipeline
pipeline.fit(X_train, Y_train)

# Evaluate the pipeline
Y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

# Best testing result
print(f"Best test accuracy: {accuracy}")
print(f"Best test F1-score: {f1}")
print(f"Best test hyperparameters: {study.best_trial.params}")

Best test accuracy: 0.7511814744801513
Best test F1-score: 0.7235494880546075
Best test hyperparameters: {'kernel': 'rbf', 'C': 4.438008982749939}


## NMF-SVC Pipeline

In [7]:
def create_objective(X_train, Y_train, X_test, Y_test, best_svc_params, random_seed):
    def objective(trial):
        # Suggest values for the hyperparameters
        n_topics = trial.suggest_int("n_topics", 50, 300)

        # Create the pipeline within the trial
        pipeline = Pipeline([
            ("vectorizer", CountVectorizer(max_df=0.95, min_df=2)),
            ("svc", SVC(probability=True, 
                        C=best_svc_params["C"], 
                        kernel=best_svc_params["kernel"], 
                        random_state=random_seed))
        ])

        # Fit the pipeline
        pipeline.fit(X_train, Y_train)

        # Evaluate the pipeline
        Y_pred = pipeline.predict(X_test)
        accuracy = accuracy_score(Y_test, Y_pred)
        return accuracy

    return objective

In [8]:
# Create the objective function
objective = create_objective(X_train, Y_train, X_val, Y_val, best_svc_params, RANDOM_SEED)

# Create an Optuna study object
study = optuna.create_study(direction="maximize", study_name=f"NMF_SVC_Pipeline")

# Execute an optimization by running trials
study.optimize(objective, n_trials=30, n_jobs=-1)

best_nmf_params = study.best_trial.params

# Best trial result
print(f"Best training accuracy: {study.best_trial.value}")
print(f"Best training hyperparameters: {best_nmf_params}")

[I 2024-05-04 16:10:07,715] A new study created in memory with name: NMF_SVC_Pipeline
[I 2024-05-04 16:16:41,782] Trial 9 finished with value: 0.7592155009451795 and parameters: {'n_topics': 237}. Best is trial 9 with value: 0.7592155009451795.
[I 2024-05-04 16:16:41,922] Trial 7 finished with value: 0.7592155009451795 and parameters: {'n_topics': 300}. Best is trial 9 with value: 0.7592155009451795.
[I 2024-05-04 16:16:41,926] Trial 8 finished with value: 0.7592155009451795 and parameters: {'n_topics': 133}. Best is trial 9 with value: 0.7592155009451795.
[I 2024-05-04 16:16:42,087] Trial 0 finished with value: 0.7592155009451795 and parameters: {'n_topics': 172}. Best is trial 9 with value: 0.7592155009451795.
[I 2024-05-04 16:16:42,091] Trial 2 finished with value: 0.7592155009451795 and parameters: {'n_topics': 295}. Best is trial 9 with value: 0.7592155009451795.
[I 2024-05-04 16:16:42,153] Trial 3 finished with value: 0.7592155009451795 and parameters: {'n_topics': 159}. Best is 

In [None]:
# Create and fit the pipeline
pipeline = Pipeline([
    ("vectorizer", CountVectorizer(max_df=0.95, min_df=2)),
    ("nmf", NMF(n_components=best_nmf_params["n_topics"], 
                random_state=RANDOM_SEED)),
    ("svc", SVC(probability=True, 
                C=best_svc_params["C"], 
                kernel=best_svc_params["kernel"], 
                random_state=RANDOM_SEED))
])

# Fit the pipeline
pipeline.fit(X_train, Y_train)

# Evaluate the pipeline
Y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

# Best testing result
print(f"Best test accuracy: {accuracy}")
print(f"Best test F1-score: {f1}")
print(f"Best test hyperparameters: {study.best_trial.params}")