In [22]:
# Core data-handling libraries, plus the splitter used at the end of this cell
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Read the preprocessed dataset from disk
df = pd.read_csv("../data/synthetic_finory_preprocessed.csv")

# Columns fed to the model, and the column the model has to predict
feature_cols = [
    'vendor_encoded',
    'amount_log',
    'payment_encoded',
    'day_of_week',
    'month',
]
target_col = 'category_encoded'

X, y = df[feature_cols], df[target_col]

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [24]:
# -----------------------------------------------------------------------------------
# Phase 5: Hyperparameter Tuning for XGBoost (GridSearchCV)
# -----------------------------------------------------------------------------------
# Objective:
# Systematically search all parameter combinations for XGBoost
# using GridSearchCV to find the best-performing model.
#
# Reads X_train, y_train, X_test, y_test from the train/test split cell and
# (re)binds xgb_model, grid_search, best_xgb and y_pred_best.
# -----------------------------------------------------------------------------------

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# Parameter grid for the exhaustive search: 3 x 2 x 2 = 12 candidates
param_grid = {
    'max_depth': [3, 5, 7],        # Depth of trees
    'n_estimators': [50, 100],     # Number of boosting rounds
    'learning_rate': [0.05, 0.1]   # Step size shrinkage
}

# Base model whose hyperparameters are tuned
xgb_model = XGBClassifier(
    objective='multi:softmax',
    num_class=len(np.unique(y_train)),  # One output class per distinct label in y_train
    random_state=42
)

# GridSearchCV tries every combination in param_grid with cross-validation
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,        # 3-fold cross-validation -> 12 candidates x 3 folds = 36 fits
    verbose=1    # Prints progress
)

# Fit on the training data only; the test set stays untouched until evaluation
grid_search.fit(X_train, y_train)

# Best parameters and best cross-validation accuracy
print("\nðŸŽ¯ Best Parameters:", grid_search.best_params_)
print("âœ… Best CV Accuracy:", grid_search.best_score_)

# Evaluate the tuned model on the held-out test set
best_xgb = grid_search.best_estimator_
y_pred_best = best_xgb.predict(X_test)

print("\nðŸš€ Test Accuracy with Best Params:", accuracy_score(y_test, y_pred_best))
# zero_division=0: classes the model never predicts have an undefined precision.
# The default ("warn") already reports them as 0 but also emits an
# UndefinedMetricWarning per metric; passing 0 keeps the same numbers without
# the warning noise. A 0.00 row in the report still means "never predicted".
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred_best, zero_division=0),
)

Fitting 3 folds for each of 12 candidates, totalling 36 fits

ðŸŽ¯ Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
âœ… Best CV Accuracy: 0.6072526377471429

ðŸš€ Test Accuracy with Best Params: 0.602

Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.17      0.22        12
           1       0.00      0.00      0.00        42
           2       0.33      0.02      0.04        52
           3       0.00      0.00      0.00        45
           4       0.75      0.43      0.55        95
           5       0.60      0.80      0.69       521
           6       0.66      0.68      0.67       157
           7       0.45      0.45      0.45        76

    accuracy                           0.60      1000
   macro avg       0.39      0.32      0.33      1000
weighted avg       0.54      0.60      0.55      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
# -----------------------------------------------------------------------------------
# Phase 5: Hyperparameter Tuning for XGBoost
# -----------------------------------------------------------------------------------
# Objective:
# Improve XGBoost model performance using RandomizedSearchCV to find the best parameters.
#
# Reads X_train, y_train, X_test, y_test from the train/test split cell and
# (re)binds xgb_model, random_search_xgb, best_xgb and y_pred_xgb.
# -----------------------------------------------------------------------------------

from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Parameter search space that candidates are sampled from
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 1, 5]
}

# Base XGBoost model whose hyperparameters are tuned.
# NOTE(review): both this estimator and the search below request all cores
# (n_jobs=-1). XGBoost's documentation warns that parallelising at both levels
# can cause thread contention; consider n_jobs=1 on one of them.
xgb_model = XGBClassifier(
    objective='multi:softmax',
    num_class=len(np.unique(y_train)),  # One output class per distinct label in y_train
    random_state=42,
    n_jobs=-1
)

# Randomized search with 15 combinations & 3-fold CV (15 x 3 = 45 fits)
random_search_xgb = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=15,            # randomly sample 15 combinations
    scoring='accuracy',
    cv=3,                 # 3-fold cross-validation
    verbose=1,
    random_state=42,      # fixed seed so the same candidates are sampled each run
    n_jobs=-1
)

# Fit the random search on the training data only
random_search_xgb.fit(X_train, y_train)

# Best parameters & best CV score
print("\nðŸŽ¯ Best Parameters:", random_search_xgb.best_params_)
print("âœ… Best CV Accuracy:", random_search_xgb.best_score_)

# Evaluate the tuned model on the held-out test data
best_xgb = random_search_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)

print("\nðŸš€ Tuned XGBoost Test Accuracy:", accuracy_score(y_test, y_pred_xgb))
# zero_division=0: classes the model never predicts have an undefined precision.
# The default ("warn") already reports them as 0 but also emits an
# UndefinedMetricWarning per metric; passing 0 keeps the same numbers without
# the warning noise. A 0.00 row in the report still means "never predicted".
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred_xgb, zero_division=0),
)

Fitting 3 folds for each of 15 candidates, totalling 45 fits

ðŸŽ¯ Best Parameters: {'subsample': 0.7, 'n_estimators': 100, 'min_child_weight': 3, 'max_depth': 7, 'learning_rate': 0.2, 'gamma': 5, 'colsample_bytree': 1.0}
âœ… Best CV Accuracy: 0.6087530128409164

ðŸš€ Tuned XGBoost Test Accuracy: 0.616

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.00      0.00      0.00        42
           2       0.00      0.00      0.00        52
           3       0.00      0.00      0.00        45
           4       0.75      0.43      0.55        95
           5       0.61      0.81      0.70       521
           6       0.66      0.68      0.67       157
           7       0.49      0.58      0.53        76

    accuracy                           0.62      1000
   macro avg       0.31      0.31      0.31      1000
weighted avg       0.53      0.62      0.56      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [29]:
# Load saved encoders from preprocessing step.
# joblib is imported in this cell because no earlier cell imports it; without
# this line a fresh kernel (Restart & Run All) fails here with a NameError.
import joblib

# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
label_encoders = joblib.load("../models/finory_label_encoders.joblib")
print("âœ… Loaded label encoders successfully!")

âœ… Loaded label encoders successfully!


In [33]:
# Persist the tuned XGBoost model together with the label encoders
import os
import joblib

# Create the models folder if it is not there yet
os.makedirs("../models", exist_ok=True)

# Destination files for the two artifacts
model_path = "../models/finory_baseline_xgb.joblib"
encoders_path = "../models/finory_label_encoders.joblib"

# Write the tuned model first, then the label_encoders object that is already
# in memory (it is reused as-is, not rebuilt).
for artifact, destination in (
    (best_xgb, model_path),
    (label_encoders, encoders_path),
):
    joblib.dump(artifact, destination)

# Report where each artifact was written
print(f"âœ… Baseline tuned model saved at: {model_path}")
print(f"âœ… Label encoders saved at: {encoders_path}")

âœ… Baseline tuned model saved at: ../models/finory_baseline_xgb.joblib
âœ… Label encoders saved at: ../models/finory_label_encoders.joblib
