# Sentinel: Credit Card Fraud Detection Training (Local)

Train an XGBoost model locally, with MLflow experiment tracking via DagsHub.

## 1. Setup

In [None]:
# Standard-library imports.
# NOTE(review): `os` and `json` are not referenced in the visible cells —
# confirm whether they are still needed.
import os
import sys
import json
import warnings
from pathlib import Path

# Suppress every warning for the rest of the session. This also hides
# warnings raised by the libraries imported below, so disable it when debugging.
warnings.filterwarnings('ignore')

# Put <parent of the working directory>/src at the front of the import path.
# This resolves relative to the current working directory, not this file.
sys.path.insert(0, str(Path.cwd().parent / 'src'))

# Numerics, data frames and plotting.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluation helpers from scikit-learn.
# NOTE(review): `precision_recall_curve` is not referenced in the visible cells.
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    average_precision_score,
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score
)

# Model library and experiment tracking.
import xgboost as xgb
import mlflow
import dagshub

# Print the library versions in use so a run can be reproduced later.
print(f"XGBoost: {xgb.__version__}")
print(f"MLflow: {mlflow.__version__}")
print("Setup complete!")

## 2. Load Local Data

In [None]:
# Local data paths, all resolved against the parent of the working directory
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT / 'data' / 'processed'
MODELS_DIR = PROJECT_ROOT / 'models'
REPORTS_DIR = PROJECT_ROOT / 'reports'

# Echo the resolved locations, one per line.
print(
    f"Data dir: {DATA_DIR}",
    f"Models dir: {MODELS_DIR}",
    f"Reports dir: {REPORTS_DIR}",
    sep="\n",
)

In [None]:
# Load the processed train / validation / test splits from DATA_DIR
train_path = DATA_DIR / 'train.csv'
val_path = DATA_DIR / 'val.csv'
test_path = DATA_DIR / 'test.csv'
train_df, val_df, test_df = map(pd.read_csv, (train_path, val_path, test_path))

# Row counts per split, one per line.
print(
    f"Train: {len(train_df):,} rows",
    f"Val: {len(val_df):,} rows",
    f"Test: {len(test_df):,} rows",
    sep="\n",
)

# Mean of the 'Class' column expressed as a percentage.
fraud_pct = train_df['Class'].mean() * 100
print(f"\nTrain fraud ratio: {fraud_pct:.4f}%")

In [None]:
# Prepare features: every training column except the target is a feature
TARGET = 'Class'
FEATURES = [name for name in train_df.columns if name != TARGET]


def _split_xy(frame):
    """Return the (feature columns, target column) pair of one split."""
    return frame[FEATURES], frame[TARGET]


X_train, y_train = _split_xy(train_df)
X_val, y_val = _split_xy(val_df)
X_test, y_test = _split_xy(test_df)

# Calculate class imbalance weight: number of label-0 rows per label-1 row
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
scale_pos_weight = n_negative / n_positive
print(f"Scale pos weight: {scale_pos_weight:.1f}")

## 3. Initialize DagsHub MLflow

In [None]:
# Connect to DagsHub for experiment tracking
DAGSHUB_REPO = {'repo_owner': 'hammadmunir959', 'repo_name': 'my-first-repo'}
EXPERIMENT_NAME = 'sentinel-fraud-detection'

dagshub.init(**DAGSHUB_REPO, mlflow=True)
mlflow.set_experiment(EXPERIMENT_NAME)

# Show the tracking URI that MLflow reports after initialisation.
tracking_uri = mlflow.get_tracking_uri()
print(f"MLflow URI: {tracking_uri}")

## 4. Train XGBoost Model

In [None]:
# Hyperparameters passed to xgb.XGBClassifier and logged to MLflow
PARAMS = dict(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    # Class-imbalance weight computed from the training labels
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric='aucpr',
    early_stopping_rounds=20,
    n_jobs=-1,
)

In [None]:
# Helper functions
def compute_metrics(y_true, y_pred, y_prob):
    """Score a set of predictions against the true labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, used for precision, recall and F1.
        y_prob: Predicted scores, used for ROC AUC and average precision.

    Returns:
        A dict with the keys 'precision', 'recall', 'f1_score', 'roc_auc'
        and 'avg_precision', in that order.
    """
    label_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
    score_scorers = (
        ('roc_auc', roc_auc_score),
        ('avg_precision', average_precision_score),
    )

    results = {name: scorer(y_true, y_pred) for name, scorer in label_scorers}
    results.update(
        (name, scorer(y_true, y_prob)) for name, scorer in score_scorers
    )
    return results

def plot_confusion_matrix(y_true, y_pred, save_path=None):
    """Draw the Normal/Fraud confusion matrix as an annotated heatmap.

    Args:
        y_true: Ground-truth labels (0 = Normal, 1 = Fraud).
        y_pred: Predicted labels, using the same encoding.
        save_path: Optional destination for the figure; when falsy the plot
            is only shown, not written to disk.
    """
    class_names = ['Normal', 'Fraud']
    # Pin the label order so the matrix is always 2x2 and lines up with the
    # tick labels, even when one class is missing from y_true / y_pred.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.title('Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.tight_layout()
    if save_path:
        # Save before show(); NOTE(review): some backends clear the figure
        # once it has been shown — confirm before reordering.
        plt.savefig(save_path, dpi=150)
    plt.show()

def plot_feature_importance(model, features, top_n=15, save_path=None):
    """Plot the most important features of a fitted model as horizontal bars.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        features: Feature names, indexable by the positions of
            ``model.feature_importances_``.
        top_n: Maximum number of features to show. Values larger than the
            number of features are clamped; zero or negative shows none.
        save_path: Optional destination for the figure; when falsy the plot
            is only shown, not written to disk.
    """
    imp = model.feature_importances_
    # Clamp the requested count. A plain ``[-top_n:]`` would select every
    # feature for top_n == 0 and an arbitrary slice for negative values.
    n_shown = max(0, min(top_n, len(imp)))
    # argsort is ascending, so the tail holds the largest importances and the
    # most important feature ends up as the top bar.
    idx = np.argsort(imp)[len(imp) - n_shown:]
    plt.figure(figsize=(8, 6))
    plt.barh(range(len(idx)), imp[idx], color='steelblue')
    plt.yticks(range(len(idx)), [features[i] for i in idx])
    plt.xlabel('Importance')
    # Report the number of bars actually drawn, not the number requested.
    plt.title(f'Top {n_shown} Features')
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=150)
    plt.show()

In [None]:
# Train with MLflow logging: fit, evaluate on the test split, save and log artifacts
with mlflow.start_run(run_name='xgboost_local'):

    # Record the hyperparameters on the run before fitting.
    mlflow.log_params(PARAMS)

    model = xgb.XGBClassifier(**PARAMS)

    print("Training...")
    # The validation split is supplied as the evaluation set.
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=50)

    # Test-split predictions: hard labels plus column 1 of predict_proba
    y_test_pred = model.predict(X_test)
    y_test_prob = model.predict_proba(X_test)[:, 1]

    # Test metrics are printed and logged to MLflow with a 'test_' prefix.
    metrics = compute_metrics(y_test, y_test_pred, y_test_prob)

    banner = "=" * 50
    print("\n" + banner)
    print("TEST METRICS")
    print(banner)
    for metric_name, metric_value in metrics.items():
        print(f"  {metric_name}: {metric_value:.4f}")
        mlflow.log_metric(f'test_{metric_name}', metric_value)

    # Plots are written under REPORTS_DIR.
    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    cm_plot_path = REPORTS_DIR / 'confusion_matrix.png'
    importance_plot_path = REPORTS_DIR / 'feature_importance.png'

    plot_confusion_matrix(y_test, y_test_pred, cm_plot_path)
    plot_feature_importance(model, FEATURES, save_path=importance_plot_path)

    # Save model locally
    MODELS_DIR.mkdir(parents=True, exist_ok=True)
    model_path = MODELS_DIR / 'model.json'
    model.save_model(str(model_path))

    # Log the saved model file and both plots, then the model via the
    # MLflow XGBoost flavor.
    for artifact_path in (model_path, cm_plot_path, importance_plot_path):
        mlflow.log_artifact(str(artifact_path))
    mlflow.xgboost.log_model(model, 'model')

    print(f"\nModel saved to: {model_path}")
    print("Training complete!")

In [None]:
# Classification report for the test-split predictions
class_labels = ['Normal', 'Fraud']
report_text = classification_report(y_test, y_test_pred, target_names=class_labels)
print(report_text)

## 5. Done!

View experiments at: https://dagshub.com/hammadmunir959/my-first-repo.mlflow

Model saved to `../models/model.json`