# Sentinel: Credit Card Fraud Detection Training

This notebook trains an XGBoost model for fraud detection with MLflow experiment tracking via DagsHub.

**Requirements:**
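- Run on Google Colab (the notebook mounts Google Drive through `google.colab`); a GPU runtime is optional, because the XGBoost parameters defined below do not enable GPU training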
- DagsHub account for experiment tracking

## 1. Setup Environment

In [None]:
# Install dependencies
# Quiet install (-q) of the modelling, tracking and plotting libraries that the
# import cell below relies on.
!pip install -q xgboost mlflow dagshub scikit-learn pandas matplotlib seaborn

In [None]:
import os
import json
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    average_precision_score,
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score
)

import xgboost as xgb
import mlflow
import dagshub

# Report the installed library versions so a run can be reproduced later.
for _label, _module in (("XGBoost", xgb), ("MLflow", mlflow)):
    print(f"{_label} version: {_module.__version__}")

## 2. Mount Google Drive & Load Data

In [None]:
# Mount Google Drive at /content/drive; the default DATA_PATH used later in this
# notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Set your data path - UPDATE THIS to your actual path
# Option 1: Upload processed data to Google Drive
# Option 2: Download directly from Kaggle

DATA_PATH = '/content/drive/MyDrive/Sentinel/data/processed'

# If data is not in Drive, download from Kaggle and preprocess
if not os.path.exists(DATA_PATH):
    print("Data not found in Drive. Downloading from Kaggle...")
    
    # Install Kaggle
    !pip install -q kaggle
    
    # Upload your kaggle.json or set credentials
    # from google.colab import files
    # files.upload()  # Upload kaggle.json
    # !mkdir -p ~/.kaggle && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json
    
    # Download dataset
    !kaggle datasets download -d mlg-ulb/creditcardfraud -p /content/data/raw --unzip
    
    # Preprocess
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    
    df = pd.read_csv('/content/data/raw/creditcard.csv')
    
    # Scale Amount and Time
    scaler = StandardScaler()
    df[['Amount', 'Time']] = scaler.fit_transform(df[['Amount', 'Time']])
    
    # Stratified split
    train_df, temp_df = train_test_split(df, test_size=0.3, stratify=df['Class'], random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['Class'], random_state=42)
    
    # Save
    os.makedirs('/content/data/processed', exist_ok=True)
    train_df.to_csv('/content/data/processed/train.csv', index=False)
    val_df.to_csv('/content/data/processed/val.csv', index=False)
    test_df.to_csv('/content/data/processed/test.csv', index=False)
    
    DATA_PATH = '/content/data/processed'
    print(f"Data preprocessed and saved to {DATA_PATH}")

In [None]:
# Read the three processed splits from DATA_PATH.
train_df, val_df, test_df = (
    pd.read_csv(f'{DATA_PATH}/{split}.csv') for split in ('train', 'val', 'test')
)

# Row count per split.
for split_label, split_frame in (("Train", train_df), ("Val", val_df), ("Test", test_df)):
    print(f"{split_label}: {len(split_frame):,} rows")

# Share of rows with Class == 1 in the training split, shown as a percentage.
train_fraud_pct = train_df['Class'].mean() * 100
print(f"\nTrain fraud ratio: {train_fraud_pct:.4f}%")

In [None]:
# Label column, and every other column treated as a model feature.
TARGET = 'Class'
FEATURES = [name for name in train_df.columns if name != TARGET]

# Feature matrix and label vector for each split.
X_train, y_train = train_df[FEATURES], train_df[TARGET]
X_val, y_val = val_df[FEATURES], val_df[TARGET]
X_test, y_test = test_df[FEATURES], test_df[TARGET]

# Ratio of negative (0) to positive (1) training rows; passed to XGBoost as
# scale_pos_weight to compensate for the class imbalance.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
scale_pos_weight = n_negative / n_positive
print(f"Scale pos weight: {scale_pos_weight:.1f}")

## 3. Initialize DagsHub MLflow Tracking

In [None]:
# Connect to the DagsHub repository with MLflow integration enabled.
# This will prompt for authentication on first run.
dagshub.init(
    repo_owner='hammadmunir959',
    repo_name='my-first-repo',
    mlflow=True,
)

# Group every run from this notebook under a single named experiment.
mlflow.set_experiment('sentinel-fraud-detection')

tracking_uri = mlflow.get_tracking_uri()
print(f"MLflow tracking URI: {tracking_uri}")

## 4. Define Hyperparameters

In [None]:
# Hyperparameters for the XGBoost classifier. The same mapping is logged to
# MLflow and used to configure the model in the training cell.
PARAMS = dict(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=1,
    gamma=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1,
    eval_metric='aucpr',
    early_stopping_rounds=20,
)

print("Hyperparameters:")
for param_name, param_value in PARAMS.items():
    print(f"  {param_name}: {param_value}")

## 5. Train Model with MLflow Logging

In [None]:
def compute_metrics(y_true, y_pred, y_prob):
    """Return a dict of evaluation metrics for one data split.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted class labels (used for precision, recall and F1).
        y_prob: Predicted scores for the positive class (used for ROC AUC and
            average precision).

    Returns:
        Dict with keys 'precision', 'recall', 'f1_score', 'roc_auc' and
        'average_precision', in that order.
    """
    # Metrics computed from hard class predictions.
    label_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
    # Metrics computed from positive-class scores.
    score_scorers = (
        ('roc_auc', roc_auc_score),
        ('average_precision', average_precision_score),
    )

    results = {key: scorer(y_true, y_pred) for key, scorer in label_scorers}
    results.update({key: scorer(y_true, y_prob) for key, scorer in score_scorers})
    return results

def plot_confusion_matrix(y_true, y_pred, save_path=None):
    """Plot the binary confusion matrix as a heatmap and optionally save it.

    Args:
        y_true: Ground-truth labels (0 = normal, 1 = fraud).
        y_pred: Predicted class labels.
        save_path: If truthy, the figure is also written to this path at
            150 dpi before being shown.
    """
    # Pin the label order so the matrix is always 2x2 and lines up with the
    # two tick labels below, even if one class is absent from the inputs.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Normal', 'Fraud'],
                yticklabels=['Normal', 'Fraud'])
    plt.title('Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=150)
    plt.show()

def plot_precision_recall_curve(y_true, y_prob, save_path=None):
    """Draw the precision-recall curve with the average precision in the legend.

    Args:
        y_true: Ground-truth labels.
        y_prob: Predicted scores for the positive class.
        save_path: If truthy, the figure is also written to this path at
            150 dpi before being shown.
    """
    pr_precision, pr_recall, _thresholds = precision_recall_curve(y_true, y_prob)
    avg_precision = average_precision_score(y_true, y_prob)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(pr_recall, pr_precision, 'b-', linewidth=2, label=f'AP = {avg_precision:.4f}')
    # Shade the area under the curve.
    ax.fill_between(pr_recall, pr_precision, alpha=0.3)
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title('Precision-Recall Curve')
    ax.legend(loc='upper right')
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    if save_path:
        fig.savefig(save_path, dpi=150)
    plt.show()

def plot_feature_importance(model, feature_names, top_n=15, save_path=None):
    """Plot the most important features of a fitted model as a horizontal bar chart.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Sequence of feature names, indexable by the positions
            of ``model.feature_importances_``.
        top_n: Maximum number of features to show; must be at least 1. When
            the model has fewer features, all of them are shown.
        save_path: If truthy, the figure is also written to this path at
            150 dpi before being shown.

    Raises:
        ValueError: If ``top_n`` is smaller than 1.
    """
    # A slice of [-0:] would select every feature, so reject it explicitly.
    if top_n < 1:
        raise ValueError(f"top_n must be a positive integer, got {top_n}")

    importance = np.asarray(model.feature_importances_)
    # Never claim more features in the title than are actually plotted.
    n_shown = min(top_n, len(importance))
    # argsort is ascending, so the last n_shown positions are the largest
    # importances; barh then draws the most important feature at the top.
    indices = np.argsort(importance)[-n_shown:]

    plt.figure(figsize=(10, 8))
    plt.barh(range(len(indices)), importance[indices], color='steelblue')
    plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
    plt.xlabel('Feature Importance')
    plt.title(f'Top {n_shown} Feature Importances')
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=150)
    plt.show()

In [None]:
# Train the baseline model, evaluate it on the validation and test splits, and
# log parameters, metrics, plots and the model itself to a single MLflow run.
from mlflow.models.signature import infer_signature

# Start MLflow run
with mlflow.start_run(run_name='xgboost_baseline'):

    # Log parameters
    mlflow.log_params(PARAMS)
    mlflow.log_param('train_size', len(X_train))
    mlflow.log_param('val_size', len(X_val))
    mlflow.log_param('test_size', len(X_test))
    mlflow.log_param('n_features', len(FEATURES))

    # Initialize model. PARAMS is unpacked directly so the hyperparameters
    # logged above are exactly the ones the model is built with.
    # `use_label_encoder` is intentionally not passed: it is deprecated in
    # recent XGBoost releases (see the XGBClassifier API reference).
    model = xgb.XGBClassifier(**PARAMS)

    # Train model; the validation split is the eval set that early stopping
    # (early_stopping_rounds in PARAMS) monitors.
    print("Training XGBoost model...")
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=20
    )

    # Predictions: hard labels plus positive-class probabilities per split.
    y_val_pred = model.predict(X_val)
    y_val_prob = model.predict_proba(X_val)[:, 1]

    y_test_pred = model.predict(X_test)
    y_test_prob = model.predict_proba(X_test)[:, 1]

    # Compute metrics
    val_metrics = compute_metrics(y_val, y_val_pred, y_val_prob)
    test_metrics = compute_metrics(y_test, y_test_pred, y_test_prob)

    # Log validation metrics, then test metrics, prefixed by split name.
    for split_prefix, split_metrics in (('val', val_metrics), ('test', test_metrics)):
        for name, value in split_metrics.items():
            mlflow.log_metric(f'{split_prefix}_{name}', value)

    # Print results
    for heading, split_metrics in (
        ("VALIDATION METRICS", val_metrics),
        ("TEST METRICS", test_metrics),
    ):
        print("\n" + "="*60)
        print(heading)
        print("="*60)
        for name, value in split_metrics.items():
            print(f"  {name}: {value:.4f}")

    # Plot and save artifacts
    reports_dir = '/content/reports'
    os.makedirs(reports_dir, exist_ok=True)

    # Confusion matrix
    cm_path = f'{reports_dir}/confusion_matrix.png'
    plot_confusion_matrix(y_test, y_test_pred, cm_path)
    mlflow.log_artifact(cm_path)

    # Precision-Recall curve
    pr_path = f'{reports_dir}/pr_curve.png'
    plot_precision_recall_curve(y_test, y_test_prob, pr_path)
    mlflow.log_artifact(pr_path)

    # Feature importance
    fi_path = f'{reports_dir}/feature_importance.png'
    plot_feature_importance(model, FEATURES, save_path=fi_path)
    mlflow.log_artifact(fi_path)

    # Save model; later cells read it back from this exact path.
    os.makedirs('/content/models', exist_ok=True)
    model.save_model('/content/models/model.json')
    mlflow.log_artifact('/content/models/model.json')

    # Log model with signature inferred from the training inputs and outputs.
    signature = infer_signature(X_train, model.predict(X_train))
    mlflow.xgboost.log_model(model, 'model', signature=signature)

    # Save metrics as JSON
    all_metrics = {
        'validation': val_metrics,
        'test': test_metrics
    }
    metrics_path = f'{reports_dir}/metrics.json'
    with open(metrics_path, 'w') as f:
        json.dump(all_metrics, f, indent=2)
    mlflow.log_artifact(metrics_path)

    print("\n" + "="*60)
    print("Training complete! Artifacts logged to MLflow.")
    print("="*60)

## 6. Classification Report

In [None]:
# Per-class precision / recall / F1 for the test-set predictions produced in
# the training cell.
print("\nClassification Report (Test Set):")
test_report = classification_report(
    y_test,
    y_test_pred,
    target_names=['Normal', 'Fraud'],
)
print(test_report)

## 7. Download Model

In [None]:
# Option 1: Save to Google Drive
# Python file operations replace the former `!mkdir` / `!cp` shell commands:
# a failing shell command does not stop the cell, so the success message could
# be printed even though nothing was copied (e.g. the model file is missing).
# These calls raise instead.
import shutil

drive_models_dir = '/content/drive/MyDrive/Sentinel/models'
os.makedirs(drive_models_dir, exist_ok=True)
shutil.copy('/content/models/model.json', drive_models_dir)
print("Model saved to Google Drive: /Sentinel/models/model.json")

In [None]:
# Option 2: Download directly
# Passes the model file saved by the training cell to Colab's files.download
# helper (expected to trigger a browser download — confirm in your runtime).
from google.colab import files
files.download('/content/models/model.json')

## 8. View Experiments on DagsHub

Visit your DagsHub repository to see the logged experiments:
https://dagshub.com/hammadmunir959/my-first-repo/experiments