# XGBoost Pipeline 

## Import necessary packages

In [17]:
import time
import optuna
import numpy as np
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from dataset.dataset import Dataset
from constants import CLEANED_DATASET_PATH

## Constants

In [2]:
RANDOM_SEED = 0

## Load the 2016 Dataset

In [None]:
dataset = Dataset(full_data_path='dataset/cleaned_dataset_v1.pkl',
                  from_scratch=False,
                  split_sizes=[21282,2644,2492])
dataset.build()

X_train = dataset.get_features(split_type="train")
Y_train = dataset.get_labels(split_type="train")
X_val = dataset.get_features(split_type="val")
Y_val = dataset.get_labels(split_type="val")
X_test = dataset.get_features(split_type="test")
Y_test = dataset.get_labels(split_type="test")

In [None]:
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)
X_test_vectorized = vectorizer.transform(X_test)

## Train the XGBoost model on the 2016 Dataset

In [None]:
# Define the objective function for Optuna
def create_objective(X_train, Y_train, X_test, Y_test):
    def objective(trial):
        # Suggest values for the hyperparameters
        n_estimators = trial.suggest_int("n_estimators", 100, 1000)
        learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2)
        max_depth = trial.suggest_int("max_depth", 3, 20)
        subsample = trial.suggest_float("subsample", 0.5, 1.0)
        colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)

        # Create an XGBoost classifier model with suggested parameters
        model = xgb.XGBClassifier(
            objective="binary:logistic",
            n_estimators=n_estimators,    # Number of trees
            learning_rate=learning_rate,  # Learning rate
            max_depth=max_depth,          # Depth of the trees
            subsample=subsample,          # Subsampling of the training instances
            colsample_bytree=colsample_bytree,  # Subsampling of columns for each tree
            seed=RANDOM_SEED,             # Seed for reproducibility
            use_label_encoder=False,      # Disable label encoder warning
            eval_metric="logloss")

        # Fit the model on the training data
        model.fit(X_train, Y_train)

        # Predict the labels on the test set
        Y_pred = model.predict(X_test)

        # Calculate the accuracy
        accuracy = accuracy_score(Y_test, Y_pred)
        return accuracy
    return objective

In [None]:
# Create a study object
# study = optuna.create_study(direction="maximize", study_name=f"LDA_XGBoost_Pipeline")
study = optuna.create_study(direction="maximize", study_name=f"XGBoost_Pipeline")


# Create the study objective
objective = create_objective(X_train_vectorized, Y_train, X_val_vectorized, Y_val)

# Execute an optimization
study.optimize(objective, n_trials=20, n_jobs=-1)

# Print the best trial results
print(f"Best Accuracy: {study.best_trial.value}")
print(f"Best hyperparameters: {study.best_trial.params}")

In [None]:
best_n_estimators = study.best_trial.params["n_estimators"]
best_learning_rate = study.best_trial.params["learning_rate"]
best_max_depth = study.best_trial.params["max_depth"]
best_subsample = study.best_trial.params["subsample"]
best_colsample_bytree = study.best_trial.params["colsample_bytree"]

model = xgb.XGBClassifier(n_estimators=best_n_estimators, 
                                 learning_rate=best_learning_rate,
                                 max_depth=best_max_depth,
                                 random_state=RANDOM_SEED)
model.fit(X_train_vectorized, Y_train)
Y_pred = model.predict(X_test_vectorized)
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

print(f"Best XGBoost Model Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

## Load the 2016+2022 Dataset

In [18]:
dataset = Dataset(full_data_path='dataset/cleaned_2022dataset_v1.pkl',
                  from_scratch=False,
                  split_sizes=[21282,2644,2492])
dataset.build()

X_train = dataset.get_features(split_type="train")
Y_train = dataset.get_labels(split_type="train")
X_val = dataset.get_features(split_type="val")
Y_val = dataset.get_labels(split_type="val")
X_test = dataset.get_features(split_type="test")
Y_test = dataset.get_labels(split_type="test")

Data loaded from dataset/cleaned_2022dataset_v1.pkl


In [19]:
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)
X_test_vectorized = vectorizer.transform(X_test)

## Train the XGBoost model on the 2016+2022 Dataset

In [21]:
# Define the objective function for Optuna
def create_objective(X_train, Y_train, X_test, Y_test):
    def objective(trial):
        # Suggest values for the hyperparameters
        n_estimators = trial.suggest_int("n_estimators", 100, 1000)
        learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2)
        max_depth = trial.suggest_int("max_depth", 3, 20)
        subsample = trial.suggest_float("subsample", 0.5, 1.0)
        colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)

        # Create an XGBoost classifier model with suggested parameters
        model = xgb.XGBClassifier(
            objective="binary:logistic",
            n_estimators=n_estimators,    # Number of trees
            learning_rate=learning_rate,  # Learning rate
            max_depth=max_depth,          # Depth of the trees
            subsample=subsample,          # Subsampling of the training instances
            colsample_bytree=colsample_bytree,  # Subsampling of columns for each tree
            seed=RANDOM_SEED,             # Seed for reproducibility
            use_label_encoder=False,      # Disable label encoder warning
            eval_metric="logloss")

        # Fit the model on the training data
        model.fit(X_train, Y_train)

        # Predict the labels on the test set
        Y_pred = model.predict(X_test)

        # Calculate the accuracy
        accuracy = accuracy_score(Y_test, Y_pred)
        return accuracy
    return objective

In [22]:
# Create a study object
# study = optuna.create_study(direction="maximize", study_name=f"LDA_XGBoost_Pipeline")
study = optuna.create_study(direction="maximize", study_name=f"XGBoost_Pipeline")


# Create the study objective
objective = create_objective(X_train_vectorized, Y_train, X_val_vectorized, Y_val)

# Execute an optimization
study.optimize(objective, n_trials=20, n_jobs=-1)

# Print the best trial results
print(f"Best Accuracy: {study.best_trial.value}")
print(f"Best hyperparameters: {study.best_trial.params}")

[I 2024-06-09 18:06:27,338] A new study created in memory with name: XGBoost_Pipeline
[I 2024-06-09 18:07:10,994] Trial 3 finished with value: 0.7651285930408472 and parameters: {'n_estimators': 211, 'learning_rate': 0.006186040177201665, 'max_depth': 4, 'subsample': 0.8451707562957096, 'colsample_bytree': 0.5711144537871977}. Best is trial 3 with value: 0.7651285930408472.
[I 2024-06-09 18:07:36,728] Trial 5 finished with value: 0.7757186081694403 and parameters: {'n_estimators': 120, 'learning_rate': 0.007022088244742499, 'max_depth': 7, 'subsample': 0.5277965457069388, 'colsample_bytree': 0.9675031619290091}. Best is trial 5 with value: 0.7757186081694403.
[I 2024-06-09 18:11:40,340] Trial 7 finished with value: 0.8018154311649016 and parameters: {'n_estimators': 133, 'learning_rate': 0.002809967631056339, 'max_depth': 18, 'subsample': 0.5719656726304068, 'colsample_bytree': 0.8781240032230213}. Best is trial 7 with value: 0.8018154311649016.
[I 2024-06-09 18:16:06,467] Trial 1 fini

Best Accuracy: 0.8199697428139183
Best hyperparameters: {'n_estimators': 943, 'learning_rate': 0.00920796395393293, 'max_depth': 20, 'subsample': 0.976996041734845, 'colsample_bytree': 0.7861283006119602}


In [24]:
best_n_estimators = study.best_trial.params["n_estimators"]
best_learning_rate = study.best_trial.params["learning_rate"]
best_max_depth = study.best_trial.params["max_depth"]
best_subsample = study.best_trial.params["subsample"]
best_colsample_bytree = study.best_trial.params["colsample_bytree"]

model = xgb.XGBClassifier(n_estimators=best_n_estimators, 
                                 learning_rate=best_learning_rate,
                                 max_depth=best_max_depth,
                                 random_state=RANDOM_SEED)
model.fit(X_train_vectorized, Y_train)
Y_pred = model.predict(X_test_vectorized)
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

print(f"Best XGBoost Model Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Best XGBoost Model Accuracy: 0.8252818035426731
Precision: 0.813076923076923
Recall: 0.8469551282051282
F1 Score: 0.8296703296703297


Training Time: 53m 52s \
Inference Time: 6m 19s