# LDA - RF Pipeline 

## Import necessary packages

In [1]:
import time
import optuna
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from dataset.dataset import Dataset
from dataset.dataset_constants import split_sizes_cleaned

## Constants

In [2]:
RANDOM_SEED = 0

## Load the Hate Speech Filipino dataset from Hugging Face

In [3]:
dataset = Dataset(full_data_path="dataset/cleaned_dataset_v1.pkl",
                  from_scratch=False,
                  split_sizes=split_sizes_cleaned)
dataset.build()

X_train = dataset.get_features(split_type="train")
Y_train = dataset.get_labels(split_type="train")
X_val = dataset.get_features(split_type="val")
Y_val = dataset.get_labels(split_type="val")
X_test = dataset.get_features(split_type="test")
Y_test = dataset.get_labels(split_type="test")

Data loaded from dataset/cleaned_dataset_v1.pkl


In [4]:
# Re-split the dataset into training, validation, and test sets
# X = X_train + X_val + X_test
# Y = Y_train + Y_val + Y_test
# X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.2, random_state=RANDOM_SEED)
# X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=RANDOM_SEED)

## Vectorize the texts to be able to perform LDA

In [5]:
# Define the CountVectorizer
vectorizer = CountVectorizer(max_df=0.95, min_df=2)
X_train_vector = vectorizer.fit_transform(X_train)
X_val_counts = vectorizer.transform(X_val)
X_test_counts = vectorizer.transform(X_test)

## Search for the best hyperparameters of the Random Forest model

In [6]:
# Define the objective function for Optuna
def create_objective(X_train, Y_train, X_test, Y_test):
    def objective(trial):
        n_estimators = trial.suggest_int("n_estimators", 100, 1000)
        rf = RandomForestClassifier(n_estimators=n_estimators, 
                                    random_state=RANDOM_SEED)
        rf.fit(X_train, Y_train)
        Y_pred = rf.predict(X_test)
        score = accuracy_score(Y_test, Y_pred)
        return score
    return objective

In [7]:
# Create a study object
study = optuna.create_study(direction="maximize", study_name=f"RF_Pipeline")

# Create the study objective
objective = create_objective(X_train_vector, Y_train, X_val_counts, Y_val)

# Execute an optimization
study.optimize(objective, n_trials=20, n_jobs=-1)

# Print the best trial results
print(f"Best Accuracy: {study.best_trial.value}")
print(f"Best hyperparameters: {study.best_trial.params}")

[I 2024-06-14 15:17:46,959] A new study created in memory with name: RF_Pipeline
[I 2024-06-14 15:18:14,993] Trial 7 finished with value: 0.7326257265605256 and parameters: {'n_estimators': 422}. Best is trial 7 with value: 0.7326257265605256.
[I 2024-06-14 15:18:37,407] Trial 1 finished with value: 0.731614859742229 and parameters: {'n_estimators': 761}. Best is trial 7 with value: 0.7326257265605256.
[I 2024-06-14 15:18:39,330] Trial 0 finished with value: 0.731614859742229 and parameters: {'n_estimators': 804}. Best is trial 7 with value: 0.7326257265605256.
[I 2024-06-14 15:18:41,066] Trial 6 finished with value: 0.7323730098559514 and parameters: {'n_estimators': 813}. Best is trial 7 with value: 0.7326257265605256.
[I 2024-06-14 15:18:41,775] Trial 5 finished with value: 0.7318675764468031 and parameters: {'n_estimators': 828}. Best is trial 7 with value: 0.7326257265605256.
[I 2024-06-14 15:18:44,122] Trial 2 finished with value: 0.7321202931513773 and parameters: {'n_estimators

Best Accuracy: 0.7333838766742482
Best hyperparameters: {'n_estimators': 219}


In [9]:
best_n_estimators = study.best_trial.params["n_estimators"]

rf = RandomForestClassifier(n_estimators=best_n_estimators, random_state=RANDOM_SEED)
rf.fit(X_train_vector, Y_train)
Y_pred = rf.predict(X_test_counts)
score = accuracy_score(Y_test, Y_pred)
# print(f"Best LDA-RF Model Accuracy: {score}")
print(f"Best NMF-RF Model Accuracy: {score}")

Best NMF-RF Model Accuracy: 0.7174920712368871


In [10]:
# Define the objective function for Optuna
def create_objective(X_train, Y_train, X_test, Y_test):
    def objective(trial):
        # Suggest values for the hyperparameters
        n_estimators = trial.suggest_int("n_estimators", 100, 1000)
        learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2)
        max_depth = trial.suggest_int("max_depth", 3, 20)
        subsample = trial.suggest_float("subsample", 0.5, 1.0)
        colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)

        # Create an XGBoost classifier model with suggested parameters
        model = xgb.XGBClassifier(
            objective="binary:logistic",
            n_estimators=n_estimators,    # Number of trees
            learning_rate=learning_rate,  # Learning rate
            max_depth=max_depth,          # Depth of the trees
            subsample=subsample,          # Subsampling of the training instances
            colsample_bytree=colsample_bytree,  # Subsampling of columns for each tree
            seed=RANDOM_SEED,             # Seed for reproducibility
            use_label_encoder=False,      # Disable label encoder warning
            eval_metric="logloss")

        # Fit the model on the training data
        model.fit(X_train, Y_train)

        # Predict the labels on the test set
        Y_pred = model.predict(X_test)

        # Calculate the accuracy
        accuracy = accuracy_score(Y_test, Y_pred)
        return accuracy
    return objective

In [11]:
# Create a study object
study = optuna.create_study(direction="maximize", study_name=f"XGBoost_Pipeline")

# Create the study objective
objective = create_objective(X_train_vector, Y_train, X_val_counts, Y_val)

# Execute an optimization
study.optimize(objective, n_trials=20, n_jobs=-1)

# Print the best trial results
print(f"Best Accuracy: {study.best_trial.value}")
print(f"Best hyperparameters: {study.best_trial.params}")

[I 2024-06-14 15:22:28,991] A new study created in memory with name: XGBoost_Pipeline
[I 2024-06-14 15:22:33,513] Trial 7 finished with value: 0.6413949962092494 and parameters: {'n_estimators': 266, 'learning_rate': 0.00481807085623704, 'max_depth': 7, 'subsample': 0.7050784296785573, 'colsample_bytree': 0.9642485952958153}. Best is trial 7 with value: 0.6413949962092494.
[I 2024-06-14 15:22:33,915] Trial 4 finished with value: 0.6499873641647713 and parameters: {'n_estimators': 121, 'learning_rate': 0.002419082798583541, 'max_depth': 17, 'subsample': 0.9159316590587236, 'colsample_bytree': 0.6601483126580272}. Best is trial 4 with value: 0.6499873641647713.
[I 2024-06-14 15:22:36,452] Trial 1 finished with value: 0.6105635582512005 and parameters: {'n_estimators': 487, 'learning_rate': 0.0008392042108450673, 'max_depth': 5, 'subsample': 0.9363021485921652, 'colsample_bytree': 0.6566180999905978}. Best is trial 4 with value: 0.6499873641647713.
[I 2024-06-14 15:22:43,871] Trial 5 fini

Best Accuracy: 0.7210007581501138
Best hyperparameters: {'n_estimators': 964, 'learning_rate': 0.009929167613734993, 'max_depth': 20, 'subsample': 0.8376826351129252, 'colsample_bytree': 0.733864778451071}


In [12]:
best_n_estimators = study.best_trial.params["n_estimators"]
best_learning_rate = study.best_trial.params["learning_rate"]
best_max_depth = study.best_trial.params["max_depth"]
best_subsample = study.best_trial.params["subsample"]
best_colsample_bytree = study.best_trial.params["colsample_bytree"]

model = xgb.XGBClassifier(n_estimators=best_n_estimators, 
                                 learning_rate=best_learning_rate,
                                 max_depth=best_max_depth,
                                 random_state=RANDOM_SEED)
model.fit(X_train_vector, Y_train)
Y_pred = model.predict(X_test_counts)
score = accuracy_score(Y_test, Y_pred)
# print(f"Best LDA-XGBoost Model Accuracy: {score}")
print(f"Best NMF-XGBoost Model Accuracy: {score}")

Best NMF-XGBoost Model Accuracy: 0.7226152720175653


In [19]:
import autosklearn.classification

# Define Auto-sklearn classifier with validation set
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=30,
    include={
        "classifier": ["random_forest", "gradient_boosting", "xgboost", "liblinear_svc", "libsvm_svc"]
    },
    ensemble_size=1,
    n_jobs=-1,
    # New: Pass validation data using X_val and y_val 
    validation_split=0.0,
    resampling_strategy='holdout',
    resampling_strategy_arguments={'train_size': 0.8, 'shuffle': True},
)

# Train the models
automl.fit(X_train_vector, Y_train, X_val_counts, Y_val) # Pass validation data here

# Evaluate the best model on the test set
Y_pred = automl.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Print the best model and its hyperparameters
print("Best Model:", automl.show_models())
print("Best Hyperparameters:", automl.sprint_statistics())


ModuleNotFoundError: No module named 'autosklearn'