# Import necessary packages

In [38]:
import optuna
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

from datasets import load_dataset

# Constants

In [26]:
RANDOM_SEED = 0

# Load the Hate Speech Filipino dataset from Hugging Face

In [27]:
dataset = load_dataset("hate_speech_filipino")

In [28]:
train_set = dataset["train"]
validation_set = dataset["validation"]
test_set = dataset["test"]

X_train, Y_train = train_set["text"], train_set["label"]
X_val, Y_val = validation_set["text"], validation_set["label"]
X_test, Y_test = test_set["text"], test_set["label"]

X = X_train + X_val + X_test
Y = Y_train + Y_val + Y_test

# Vectorize the texts to be able to perform LDA

In [39]:
# Define the CountVectorizer outside the objective to ensure it's fit just once
vectorizer = CountVectorizer(max_df=0.95, min_df=2)  # Adjust stop_words as needed
X_counts = vectorizer.fit_transform(X)

In [40]:
def objective(trial):
    # Suggest the number of topics for LDA
    n_topics = trial.suggest_int('n_topics', 3, 15)  # Adjust the range based on your dataset and needs

    # Create and fit LDA model
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=RANDOM_SEED)
    X_topics = lda.fit_transform(X_counts)

    # Split the transformed data
    X_train, X_temp, Y_train, Y_temp = train_test_split(X_topics, Y, test_size=0.3, random_state=RANDOM_SEED)
    X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=RANDOM_SEED)

    # Suggest values for the SVM hyperparameters
    C = trial.suggest_float('C', 0.1, 10.0)
    kernel = trial.suggest_categorical('kernel', ['rbf', 'poly', 'sigmoid'])

    # Create the SVC model
    svc = SVC(C=C, kernel=kernel, random_state=RANDOM_SEED)
    svc.fit(X_train, Y_train)

    # Evaluate on the validation set
    Y_val_pred = svc.predict(X_val)
    return accuracy_score(Y_val, Y_val_pred)

In [41]:
# Create a study object
study = optuna.create_study(direction='maximize')

# Execute an optimization
study.optimize(objective, n_trials=50)  # Number of trials can be adjusted based on resources

# Best trial information
best_trial = study.best_trial
print(f'Best Accuracy: {best_trial.value}')
print('Best hyperparameters:', best_trial.params)

# Optional: Retrain model on full data with best parameters for final evaluation
best_n_topics = best_trial.params['n_topics']
best_lda = LatentDirichletAllocation(n_components=best_n_topics, random_state=RANDOM_SEED)
X_topics = best_lda.fit_transform(X_counts)

# Final model with all data
best_svc = SVC(**{key: best_trial.params[key] for key in ['C', 'kernel', 'gamma']}, random_state=RANDOM_SEED)
best_svc.fit(X_train, Y_train)  # Use appropriate train/test split or full dataset as needed
Y_test_pred = best_svc.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_test_pred)
print(f"Test Accuracy: {test_accuracy}")

[I 2024-04-28 12:11:12,160] A new study created in memory with name: no-name-dd5b446d-a8d7-4748-9498-b38b22365911
[I 2024-04-28 12:11:33,058] Trial 0 finished with value: 0.5447042640990372 and parameters: {'n_topics': 6, 'C': 7.449623423796225, 'kernel': 'sigmoid', 'gamma': 'scale'}. Best is trial 0 with value: 0.5447042640990372.
[W 2024-04-28 12:11:54,425] Trial 1 failed with parameters: {'n_topics': 11, 'C': 6.36488673281211, 'kernel': 'rbf', 'gamma': 'scale'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/lendluy/hate-speech-detection-ensemble-methods/lib/python3.11/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/cq/5p30h6x5741g3nvfdq8t32g40000gn/T/ipykernel_10411/2461547898.py", line 20, in objective
    svc.fit(X_train, Y_train)
  File "/Users/lendluy/hate-speech-detection-ensemble-methods/lib/python3.11/site-packages/

KeyboardInterrupt: 