### Experiment 3 - XGBoost

In [8]:
# import libraries

# Standard library.
# NOTE(review): `os` is not referenced in the visible cells — confirm before removing.
import os
import warnings
# Suppress every warning for the rest of the session. This runs before the
# third-party imports below, so it is already active while they load.
# NOTE(review): a blanket 'ignore' also hides pandas/sklearn warnings raised later.
warnings.filterwarnings('ignore')

# Numerical / tabular stack
import numpy as np
import pandas as pd

# Modelling: data split, feature scaling, evaluation metrics, gradient-boosted trees
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score, f1_score, precision_score, recall_score
import xgboost as xgb

# Experiment tracking (MLflow + DagsHub) and run logging
import mlflow
import dagshub
import logging

# Visual confirmation that the import cell finished without raising
print('Setup Complete')

Setup Complete


In [None]:
# Load the transaction table and report its (rows, columns) dimensions
DATA_PATH = '../data/data.csv'
df = pd.read_csv(DATA_PATH)
print(df.shape)

(6362620, 10)


In [10]:
# Stratified 80/20 train/test split on the fraud label

# Label vector first, then the feature frame with both fraud-flag columns removed
y = df['isFraud'].values
X = df.drop(columns=['isFraud', 'isFlaggedFraud'])

# Stratify on y so both partitions keep the same fraud ratio; fixed seed for repeatability
split_kwargs = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)


In [12]:
# Experiment tracking: point MLflow at the DagsHub-hosted server and pick the experiment

REPO_OWNER = 'jayast29'
REPO_NAME = 'ml-fraud-detection'
TRACKING_URI = 'https://dagshub.com/jayast29/ml-fraud-detection.mlflow'

mlflow.set_tracking_uri(TRACKING_URI)
dagshub.init(repo_owner=REPO_OWNER, repo_name=REPO_NAME, mlflow=True)

# Left as the bare last expression so the cell displays the Experiment object
mlflow.set_experiment("XGBoost")

<Experiment: artifact_location='mlflow-artifacts:/4de042f55d7345b0a59cbbb8c1d5f2e9', creation_time=1771652380797, experiment_id='2', last_update_time=1771652380797, lifecycle_stage='active', name='XGBoost', tags={}, workspace='default'>

In [None]:
# MLflow: scale features, train XGBoost, evaluate on the test split, and track the run

# Show this notebook's INFO messages; keep the chatty mlflow / urllib3 loggers quiet.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logging.getLogger("mlflow").setLevel(logging.ERROR)
logging.getLogger("urllib3").setLevel(logging.ERROR)
logger = logging.getLogger(__name__)

# Class-imbalance weight = negatives per positive in the training labels.
# Checked before the run is opened so a bad split fails fast without creating a run.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
if n_positive == 0:
    raise ValueError("y_train contains no positive (isFraud == 1) samples; cannot compute scale_pos_weight")
scale_pos_weight = n_negative / n_positive

# Single source of truth for the hyperparameters: the same dict builds the model
# and is logged to MLflow, so the tracked values cannot drift from the trained ones.
xgb_params = {
    "n_estimators": 200,
    "max_depth": 6,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "scale_pos_weight": scale_pos_weight,
    "tree_method": "hist",
    "random_state": 42,
    "n_jobs": -1,
    "eval_metric": "aucpr",
}

with mlflow.start_run(run_name="xgboost_v1"):

    logger.info("Starting MLflow run...")

    # Scale
    # Write the scaled values into copies: X_train / X_test stay raw, so re-running
    # this cell always fits the scaler on unscaled data.
    scaler = RobustScaler()
    num_cols = X_train.select_dtypes(include=[np.number]).columns
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[num_cols] = scaler.fit_transform(X_train[num_cols])
    # Test data is only transformed (never fitted) to avoid leaking test statistics.
    X_test_scaled[num_cols] = scaler.transform(X_test[num_cols])
    logger.info("Scaling complete")

    # Train
    model = xgb.XGBClassifier(**xgb_params)
    model.fit(X_train_scaled, y_train)
    logger.info("Model training complete")

    # Evaluate
    # y_pred uses the classifier's default decision rule; y_prob is the
    # positive-class probability used for the threshold-independent metrics.
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    roc = roc_auc_score(y_test, y_prob)
    ap = average_precision_score(y_test, y_prob)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    logger.info(f"ROC-AUC: {roc:.4f} | F1: {f1:.4f}")

    # Log params (every hyperparameter that was actually passed to the model)
    mlflow.log_params(xgb_params)

    # Log metrics
    mlflow.log_metrics({
        "roc_auc": roc,
        "average_precision": ap,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
    })

    # Log model
    # NOTE(review): only the booster is logged; the fitted `scaler` is not part of the
    # artifact, so consumers must apply the same scaling themselves — confirm intended.
    mlflow.xgboost.log_model(model, "model")
    logger.info("MLflow run complete")

    print(classification_report(y_test, y_pred))
    print("ROC-AUC:", roc)
    print("Average Precision:", ap)

2026-02-20 22:51:11,281 - INFO - Starting MLflow run...
2026-02-20 22:51:13,094 - INFO - Scaling complete
2026-02-20 22:51:56,075 - INFO - Model training complete
2026-02-20 22:51:57,823 - INFO - ROC-AUC: 0.9978 | F1: 0.1501
2026-02-20 22:52:17,781 - INFO - MLflow run complete


              precision    recall  f1-score   support

           0       1.00      0.99      0.99   1270881
           1       0.08      0.95      0.15      1643

    accuracy                           0.99   1272524
   macro avg       0.54      0.97      0.57   1272524
weighted avg       1.00      0.99      0.99   1272524

ROC-AUC: 0.9978275942894623
Average Precision: 0.8774283106325276
🏃 View run xgboost_v1 at: https://dagshub.com/jayast29/ml-fraud-detection.mlflow/#/experiments/2/runs/4318d3fce26f49c887797bcac0cdd18f
🧪 View experiment at: https://dagshub.com/jayast29/ml-fraud-detection.mlflow/#/experiments/2


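XGBoost delivers the best overall balance across all metrics. At the default decision threshold it achieves the highest recall of 0.95, catching 95% of the fraudulent transactions in the held-out test split of the full dataset. However, precision drops to 0.08 (F1 = 0.15): only about 8% of flagged transactions are actually fraudulent. The false positive rate itself is low (roughly 1% of legitimate transactions are flagged), but because legitimate transactions vastly outnumber fraud, those false alerts still dominate the flagged set. With ROC-AUC (0.998) and AUPRC (0.877) as the primary metrics, XGBoost is selected as the final model due to its superior fraud detection capability — in banking fraud, missing real fraud is far more costly than investigating false alerts.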