<center>

# **PREDICTIVE METHOD**<br>
# **XGBOOST - WITH PCA**<br>

by: Ly Nguyen

</center>


In [1]:
# Import necessary libraries for this notebook: 

# Read from SQLite database and load to a pandas dataframe
import os
import sqlite3
import pandas as pd

# For using arrays 
import numpy as np

# For ML work (data preprocessing, hyperparameter tuning, Random Forest Classifier, training & testing sets, and stratified sampling)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.utils.class_weight import compute_class_weight


# For Dimensionality Reduction
from sklearn.decomposition import PCA
import umap.umap_ as umap
from sklearn.preprocessing import StandardScaler

# For model evaluation, including explainability:  
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, balanced_accuracy_score, make_scorer
from sklearn.utils.class_weight import compute_class_weight
import statsmodels.api as sm
import shap

# For data visualization 
import matplotlib.pyplot as plt
import seaborn as sns

# For saving the model into a pkl file
import joblib



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Define the base directory for the files
base_dir = os.path.join("..", "src")

# Define file paths
file_paths = {
    "df_reduced": "df_reduced.parquet",
    "X_pca_df": "X_pca_df.csv",
    "X_train_pca" : "X_train_pca.csv", 
    "X_test_pca" : "X_test_pca.csv", 
    "y_train_pca": "y_train_pca.csv",
    "y_test_pca": "y_test_pca.csv",
    }

# Load files
df_reduced = pd.read_parquet(os.path.join(base_dir, file_paths["df_reduced"]))
X_pca_df = pd.read_csv(os.path.join(base_dir, file_paths["X_pca_df"]))
X_train_pca = pd.read_csv(os.path.join(base_dir, file_paths["X_train_pca"]))
X_test_pca = pd.read_csv(os.path.join(base_dir, file_paths["X_test_pca"]))
y_train_pca = pd.read_csv(os.path.join(base_dir, file_paths["y_train_pca"]))
y_test_pca = pd.read_csv(os.path.join(base_dir, file_paths["y_test_pca"]))


In [4]:
# Define X and y:
X = df_reduced.drop(columns=['delayType'])  # Use parentheses with the 'columns' argument
y = df_reduced['delayType']


# Apply stratified sampling, balanced class weights, and balanced accuracy score to the cross-validation for XGBoost hyperparameter tuning

In [10]:
y_train_pca = y_train_pca.squeeze()  # Convert to Series if it's a DataFrame


In [11]:
classes = np.unique(y_train_pca)  # Ensure it matches the unique values in y_train_pca


In [30]:
# Shift class labels to zero-based indexing
y_train_pca -= 1
y_test_pca -= 1

# Verify unique values
print("Unique values in y_train_pca:", np.unique(y_train_pca))
print("Unique values in y_test_pca:", np.unique(y_test_pca))

Unique values in y_train_pca: [0 1 2]
Unique values in y_test_pca: [0 1 2]


In [31]:
from sklearn.model_selection import StratifiedKFold

# Define Stratified K-Fold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [32]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Compute class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_pca),
    y=y_train_pca
)
print("Class weights:", class_weights)

# Create sample weights for each instance
sample_weights = np.array([class_weights[label] for label in y_train_pca])

Class weights: [3.12588783 0.47592945 1.72729733]


In [33]:
# Define the balanced accuracy scorer
balanced_accuracy_scorer = make_scorer(balanced_accuracy_score)

# Define the model
model = XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    random_state=42
)

# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'min_child_weight': [1, 3, 5]
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=50,
    scoring=balanced_accuracy_scorer,
    cv=cv,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Perform the hyperparameter search with sample weights
random_search.fit(X_train_pca, y_train_pca, sample_weight=sample_weights)

# Get the best parameters
print("Best parameters:", random_search.best_params_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters: {'subsample': 0.8, 'n_estimators': 100, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.2, 'colsample_bytree': 0.8}


# Hyperparameter tuning
*Using Randomized Search CV (for a faster result than Grid Search CV, although the tradeoff would be accuracy)*

# Train final model

In [34]:
best_xgb = random_search.best_estimator_
best_xgb.fit(X_train_pca, y_train_pca)


In [35]:
# Evaluate the model on the test set
y_pred_pca = best_xgb.predict(X_test_pca)

In [36]:
# Generate the classification report
class_report_xgboost_pca = classification_report(y_test_pca, y_pred_pca)
print("Classification Report:\n", class_report_xgboost_pca)

# Calculate and print the balanced accuracy score
balanced_acc_xgboost_pca = balanced_accuracy_score(y_test_pca, y_pred_pca)
print(f"\nBalanced Accuracy: {balanced_acc_xgboost_pca:.2f}")

Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.20      0.30       583
           1       0.75      0.94      0.83      3832
           2       0.61      0.26      0.36      1056

    accuracy                           0.73      5471
   macro avg       0.67      0.47      0.50      5471
weighted avg       0.71      0.73      0.69      5471


Balanced Accuracy: 0.47


### **INTERPRETATION:**

- So far, this model gives the best **accuracy** score (73%), but a poor **balanced accuracy score** (47%). 
- It performs very well for the majority class (Delay Type 2 - normal delay) across precision (75%), recall (94%) and F1 score (83%), but performs poorly at recall scores for the minority classes (20% and 26%) compared to the other models.
- Due to these reasons, the RF-PCA model is the optimal one so far.

---
---