# XGBoost Pipeline 

## Import necessary packages

In [1]:
import time
import optuna
import numpy as np
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from dataset.dataset import Dataset
from constants import CLEANED_DATASET_PATH

  from .autonotebook import tqdm as notebook_tqdm


## Constants

In [2]:
RANDOM_SEED = 0

## Load the 2016 Dataset

In [3]:
dataset = Dataset(full_data_path='dataset/cleaned_dataset_v1.pkl',
                  from_scratch=False,
                  split_sizes=[8993, 3957, 4099])
dataset.build()

X_train = dataset.get_features(split_type="train")
Y_train = dataset.get_labels(split_type="train")
X_val = dataset.get_features(split_type="val")
Y_val = dataset.get_labels(split_type="val")
X_test = dataset.get_features(split_type="test")
Y_test = dataset.get_labels(split_type="test")

Data loaded from dataset/cleaned_dataset_v1.pkl


In [4]:
X_trainval = X_train + X_val
Y_trainval = Y_train + Y_val

vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
X_trainval_vectorized = vectorizer.fit_transform(X_trainval)
X_test_vectorized = vectorizer.transform(X_test)

In [4]:
# vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
# X_train_vectorized = vectorizer.fit_transform(X_train)
# X_val_vectorized = vectorizer.transform(X_val)
# X_test_vectorized = vectorizer.transform(X_test)

## Train the XGBoost model on the 2016 Dataset

In [5]:
# Define the objective function for Optuna
def create_objective(X_train, Y_train):
    def objective(trial):
        # Suggest values for the hyperparameters
        n_estimators = trial.suggest_int("n_estimators", 100, 1000)
        learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2)
        max_depth = trial.suggest_int("max_depth", 3, 100)
        subsample = trial.suggest_float("subsample", 0.5, 1.0)
        colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)

        # Create an XGBoost classifier model with suggested parameters
        model = xgb.XGBClassifier(
            objective="binary:logistic",
            n_estimators=n_estimators,    # Number of trees
            learning_rate=learning_rate,  # Learning rate
            max_depth=max_depth,          # Depth of the trees
            subsample=subsample,          # Subsampling of the training instances
            colsample_bytree= colsample_bytree,  # Subsampling of columns for each tree
            seed=RANDOM_SEED,             # Seed for reproducibility
            use_label_encoder=False,      # Disable label encoder warning
            eval_metric="logloss")

        # Perform cross-validation and return the accuracy
        score = cross_val_score(model, X_train, Y_train, n_jobs=-1, cv=3)
        accuracy = score.mean()
        return accuracy

    return objective

In [6]:
# Create a study object
# study = optuna.create_study(direction="maximize", study_name=f"LDA_XGBoost_Pipeline")
study = optuna.create_study(direction="maximize", study_name=f"XGBoost_Pipeline")


# Create the study objective
objective = create_objective(X_trainval_vectorized, Y_trainval)

# Execute an optimization
study.optimize(objective, n_trials=100, n_jobs=-1)

# Print the best trial results
print(f"Best Accuracy: {study.best_trial.value}")
print(f"Best hyperparameters: {study.best_trial.params}")

[I 2024-06-14 12:18:36,646] A new study created in memory with name: XGBoost_Pipeline
[I 2024-06-14 12:30:17,342] Trial 3 finished with value: 0.6778382395067343 and parameters: {'n_estimators': 239, 'learning_rate': 0.0017347244365351385, 'max_depth': 70, 'subsample': 0.6787503288481672, 'colsample_bytree': 0.9568375609285915}. Best is trial 3 with value: 0.6778382395067343.


KeyboardInterrupt: 

In [None]:
best_n_estimators = study.best_trial.params["n_estimators"]
best_learning_rate = study.best_trial.params["learning_rate"]
best_max_depth = study.best_trial.params["max_depth"]
best_subsample = study.best_trial.params["subsample"]
best_colsample_bytree = study.best_trial.params["colsample_bytree"]

model = xgb.XGBClassifier(n_estimators=best_n_estimators, 
                                 learning_rate=best_learning_rate,
                                 max_depth=best_max_depth,
                                 subsample=best_subsample,
                                 colsample_bytree = best_colsample_bytree,
                                 random_state=RANDOM_SEED)
model.fit(X_trainval_vectorized, Y_trainval)
Y_pred = model.predict(X_test_vectorized)
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

print(f"Best XGBoost Model Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Best XGBoost Model Accuracy: 0.7016683022571149
Precision: 0.8014469453376206
Recall: 0.5071210579857579
F1 Score: 0.621183800623053


# Other Classical Models

In [5]:
X_trainval = X_train + X_val
Y_trainval = Y_train + Y_val

In [6]:
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
X_trainval_vectorized = vectorizer.fit_transform(X_trainval)
X_test_vectorized = vectorizer.transform(X_test)

In [7]:
X_trainval_vectorized = X_trainval_vectorized.toarray()
X_test_vectorized = X_test_vectorized.toarray()

In [10]:
# TAKES A LONG TIME TO RUN DUE TO SPARSITY OF EMBEDDINGS
def objective(trial):
    model_name = trial.suggest_categorical('model', ['Random Forest', 'Logistic Regression', 'Naive Bayes', 'SVC', 'kNN'])
    
    if model_name == 'Random Forest':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'max_depth': trial.suggest_int('max_depth', 5, 50),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
            'max_features': trial.suggest_categorical('max_features', [None, 'sqrt', 'log2'])
        }
        model = RandomForestClassifier(**params, n_jobs=-1, random_state=42)
    
    elif model_name == 'Logistic Regression':
        params = {
            'C': trial.suggest_float('C', 1e-4, 1e2, log=True),
            'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
            'solver': trial.suggest_categorical('solver', ['liblinear', 'saga'])
        }
        model = LogisticRegression(**params, random_state=42)
    
    elif model_name == 'Naive Bayes':
        model = GaussianNB()
    
    elif model_name == 'SVC':
        params = {
            'C': trial.suggest_float('C', 1e-4, 1e2, log=True),
            'gamma': trial.suggest_float('gamma', 1e-4, 1e-1, log=True),
            'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'sigmoid'])
        }
        model = SVC(**params, random_state=42)
    
    elif model_name == 'kNN':
        params = {
            'n_neighbors': trial.suggest_int('n_neighbors', 3, 20),
            'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
            'metric': trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'minkowski'])
        }
        model = KNeighborsClassifier(**params, n_jobs = -1)
    
    score = cross_val_score(model, X_trainval_vectorized, Y_trainval, cv=10, scoring='f1', n_jobs=-1).mean()
    return score


study = optuna.create_study(direction="maximize", study_name='Hate Speech Detection')
study.optimize(objective, n_trials=100) # Adjust the number of trials as necessary

[I 2024-06-13 14:10:20,056] A new study created in memory with name: Hate Speech Detection
[W 2024-06-13 14:12:40,861] Trial 0 failed with parameters: {'model': 'Random Forest', 'n_estimators': 268, 'max_depth': 27, 'min_samples_split': 19, 'min_samples_leaf': 18, 'max_features': 'sqrt'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/cayasryan/.pyenv/versions/3.11.2/lib/python3.11/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/dk/zhw4_3kn6cb029559dwjrmt40000gn/T/ipykernel_24963/1993855732.py", line 41, in objective
    score = cross_val_score(model, X_trainval_vectorized, Y_trainval, cv=10, scoring='f1', n_jobs=-1).mean()
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/cayasryan/.pyenv/versions/3.11.2/lib/python3.11/site-packages/sklearn/utils/_param_valida

KeyboardInterrupt: 

In [None]:
# Print the best trial's details
best_trial = study.best_trial
print(f'Best model: {best_trial.params["model"]}')
print(f'Best parameters: {best_trial.params}')
print(f'Best accuracy: {best_trial.value}')

In [None]:
# Retrieve the performance of the best model
best_classifier = study.best_params["model"]
# best_model = MLPClassifier(
#         hidden_layer_sizes=best_trial.params['hidden_layer_sizes'],
#         alpha=best_trial.params['alpha'],
#         learning_rate_init=best_trial.params['learning_rate_init'],
#         max_iter=best_trial.params['max_iter'],
#         random_state=42
#     )

# Re-train the best model on the entire training set
best_model.fit(X_trainval_vectorized, Y_trainval)

Y_pred = best_model.predict(X_test_vectorized)

print(f"Best Model Parameters: {study.best_params}")
print(f"Accuracy of Best {best_classifier}: {accuracy_score(Y_test, Y_pred)}")
print(f"F1-Score of Best {best_classifier}: {f1_score(Y_test, Y_pred, average='weighted')}")

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

cfm = confusion_matrix(Y_test, Y_pred)
cm_display = ConfusionMatrixDisplay(confusion_matrix = cfm)
cm_display.plot(cmap='Blues')
plt.title('Confusion Matrix for Test Data')
plt.show()

## Load the 2016+2022 Dataset

In [18]:
dataset = Dataset(full_data_path='dataset/cleaned_2022dataset_v1.pkl',
                  from_scratch=False,
                  split_sizes=[21282,2644,2492])
dataset.build()

X_train = dataset.get_features(split_type="train")
Y_train = dataset.get_labels(split_type="train")
X_val = dataset.get_features(split_type="val")
Y_val = dataset.get_labels(split_type="val")
X_test = dataset.get_features(split_type="test")
Y_test = dataset.get_labels(split_type="test")

Data loaded from dataset/cleaned_2022dataset_v1.pkl


In [19]:
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)
X_test_vectorized = vectorizer.transform(X_test)

## Train the XGBoost model on the 2016+2022 Dataset

In [21]:
# Define the objective function for Optuna
def create_objective(X_train, Y_train, X_test, Y_test):
    def objective(trial):
        # Suggest values for the hyperparameters
        n_estimators = trial.suggest_int("n_estimators", 100, 1000)
        learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2)
        max_depth = trial.suggest_int("max_depth", 3, 20)
        subsample = trial.suggest_float("subsample", 0.5, 1.0)
        colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)

        # Create an XGBoost classifier model with suggested parameters
        model = xgb.XGBClassifier(
            objective="binary:logistic",
            n_estimators=n_estimators,    # Number of trees
            learning_rate=learning_rate,  # Learning rate
            max_depth=max_depth,          # Depth of the trees
            subsample=subsample,          # Subsampling of the training instances
            colsample_bytree=colsample_bytree,  # Subsampling of columns for each tree
            seed=RANDOM_SEED,             # Seed for reproducibility
            use_label_encoder=False,      # Disable label encoder warning
            eval_metric="logloss")

        # Fit the model on the training data
        model.fit(X_train, Y_train)

        # Predict the labels on the test set
        Y_pred = model.predict(X_test)

        # Calculate the accuracy
        accuracy = accuracy_score(Y_test, Y_pred)
        return accuracy
    return objective

In [22]:
# Create a study object
# study = optuna.create_study(direction="maximize", study_name=f"LDA_XGBoost_Pipeline")
study = optuna.create_study(direction="maximize", study_name=f"XGBoost_Pipeline")


# Create the study objective
objective = create_objective(X_train_vectorized, Y_train, X_val_vectorized, Y_val)

# Execute an optimization
study.optimize(objective, n_trials=20, n_jobs=-1)

# Print the best trial results
print(f"Best Accuracy: {study.best_trial.value}")
print(f"Best hyperparameters: {study.best_trial.params}")

[I 2024-06-09 18:06:27,338] A new study created in memory with name: XGBoost_Pipeline
[I 2024-06-09 18:07:10,994] Trial 3 finished with value: 0.7651285930408472 and parameters: {'n_estimators': 211, 'learning_rate': 0.006186040177201665, 'max_depth': 4, 'subsample': 0.8451707562957096, 'colsample_bytree': 0.5711144537871977}. Best is trial 3 with value: 0.7651285930408472.
[I 2024-06-09 18:07:36,728] Trial 5 finished with value: 0.7757186081694403 and parameters: {'n_estimators': 120, 'learning_rate': 0.007022088244742499, 'max_depth': 7, 'subsample': 0.5277965457069388, 'colsample_bytree': 0.9675031619290091}. Best is trial 5 with value: 0.7757186081694403.
[I 2024-06-09 18:11:40,340] Trial 7 finished with value: 0.8018154311649016 and parameters: {'n_estimators': 133, 'learning_rate': 0.002809967631056339, 'max_depth': 18, 'subsample': 0.5719656726304068, 'colsample_bytree': 0.8781240032230213}. Best is trial 7 with value: 0.8018154311649016.
[I 2024-06-09 18:16:06,467] Trial 1 fini

Best Accuracy: 0.8199697428139183
Best hyperparameters: {'n_estimators': 943, 'learning_rate': 0.00920796395393293, 'max_depth': 20, 'subsample': 0.976996041734845, 'colsample_bytree': 0.7861283006119602}


In [24]:
best_n_estimators = study.best_trial.params["n_estimators"]
best_learning_rate = study.best_trial.params["learning_rate"]
best_max_depth = study.best_trial.params["max_depth"]
best_subsample = study.best_trial.params["subsample"]
best_colsample_bytree = study.best_trial.params["colsample_bytree"]

model = xgb.XGBClassifier(n_estimators=best_n_estimators, 
                                 learning_rate=best_learning_rate,
                                 max_depth=best_max_depth,
                                 random_state=RANDOM_SEED)
model.fit(X_train_vectorized, Y_train)
Y_pred = model.predict(X_test_vectorized)
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

print(f"Best XGBoost Model Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Best XGBoost Model Accuracy: 0.8252818035426731
Precision: 0.813076923076923
Recall: 0.8469551282051282
F1 Score: 0.8296703296703297


Training Time: 53m 52s \
Inference Time: 6m 19s