# FraudGuard - Fraud Detection Pipeline

## Contents
1. Setup & Load Data
2. Rule-Based System
3. ML-Based System
4. Comparison & Export

---
## 1. Setup & Load Data

In [1]:
import sys
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Add the parent directory to sys.path so the `src` imports below resolve.
sys.path.append('..')
from src.rules import FraudRuleEngine
from src.features import engineer_features, select_ml_features, prepare_for_ml
from src.utils import load_data, temporal_train_test_split, print_fraud_stats

# Silence every warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so warnings raised by the libraries
# used below are hidden too — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Matplotlib style sheet applied to every figure drawn in this notebook.
plt.style.use('seaborn-v0_8-darkgrid')
# IPython magic: render figures inline in the cell output.
%matplotlib inline

# Marker so a reader can see the setup cell ran to completion.
print("✓ Setup complete")

✓ Setup complete


In [2]:
# Load Data
# Read the raw training transactions into `df`, the frame every later cell
# builds on, then print the fraud statistics as a sanity check.
raw_csv_path = '../data/raw/fraudTrain.csv'
df = load_data(raw_csv_path)

print_fraud_stats(df)

Loading data from ../data/raw/fraudTrain.csv...
✓ Loaded 1,296,675 rows, 23 columns
FRAUD STATISTICS
Total Transactions: 1,296,675
Fraud Cases:        7,506
Fraud Rate:         0.58%
Legitimate Cases:   1,289,169


---
## 2. Rule-Based System

Apply Business Rules

In [3]:
# Initialise rule engine
# Constructed with no arguments, so whatever thresholds the rules use come
# from FraudRuleEngine itself.
engine = FraudRuleEngine()

# Apply rules
# Rebinds `df` to the engine's result; later cells read the
# `rule_based_prediction` column from it.
# NOTE(review): because `df` is overwritten in place, re-running this cell
# feeds already-processed data back into the engine — confirm that
# apply_all_rules is safe to call twice.
print("Applying rules...\n")
df = engine.apply_all_rules(df)

print("\n✓ Rules applied")

Applying rules...

Applying fraud detection rules...
  Rule 1 (High Frequency): 72 triggered
  Rule 2 (Night Transaction): 127,288 triggered
  Rule 3 (High Amount): 50,207 triggered
  Rule 4 (Round Amount): 147 triggered
  Rule 5 (Risky Category): 532,799 triggered
✓ Applied 5 rules

  Transactions flagged (≥2 rules): 120,994


✓ Rules applied


In [4]:
# Quick Check: Rule Performance
# Precision of the rule engine over the whole dataset: of all transactions it
# flagged, how many are labelled as fraud.
flagged_mask = df['rule_based_prediction'] == 1
rule_flagged = df['rule_based_prediction'].sum()
rule_correct = df.loc[flagged_mask, 'is_fraud'].sum()

print(f"Rules flagged: {rule_flagged:,} transactions")
print(f"Actual fraud found: {rule_correct:,}")
# Guard the ratio so an engine that flags nothing does not divide by zero.
if rule_flagged > 0:
    print(f"Precision: {rule_correct / rule_flagged:.1%}")
else:
    print("No flags")

Rules flagged: 120,994 transactions
Actual fraud found: 4,957
Precision: 4.1%


---
## 3. ML-Based System

Feature Engineering → Training → Predictions

In [5]:
# Feature Engineering
# Rebinds `df` to the engineered frame (the logged output reports time,
# aggregated and categorical features).
# NOTE(review): this runs on the full dataset BEFORE the train/test split. If
# the aggregated features use statistics from rows that end up in the test
# period, test information leaks into training — confirm in src.features.
# NOTE(review): `df` is overwritten, so re-running this cell engineers an
# already-engineered frame — confirm that is harmless.
print("Engineering features...\n")
df = engineer_features(df)

Engineering features...

Engineering features for ML...
  1. Time features...
  2. Aggregated features...
  3. Categorical features...
✓ Feature engineering complete: 46 total columns


In [None]:
# Train/Test Split (Temporal)
# Split on the transaction timestamp column with a 0.7 train ratio.
# Presumably the earliest 70% of rows become the training set — confirm in
# src.utils.temporal_train_test_split.
train_df, test_df = temporal_train_test_split(
    df,
    time_col='trans_date_trans_time',
    train_ratio=0.7,
)

# Compare the fraud rate on both sides of the split; a large gap would mean
# the two periods are not comparable.
train_fraud_rate = train_df['is_fraud'].mean()
test_fraud_rate = test_df['is_fraud'].mean()
print(f"\nTrain fraud rate: {train_fraud_rate:.2%}")
print(f"Test fraud rate:  {test_fraud_rate:.2%}")

In [None]:
# Select Features
# The feature list is derived from the full engineered frame and reused for
# both splits; it is also persisted alongside the model at the end of the
# notebook.
ml_features = select_ml_features(df)

# Prepare Data
# NOTE(review): later cells compare y_test against test_df's rule predictions
# and attach y_pred_ml to a copy of test_df positionally, so prepare_for_ml
# must keep each frame's row count and order — confirm in src.features.
X_train, y_train = prepare_for_ml(train_df, ml_features)
X_test, y_test = prepare_for_ml(test_df, ml_features)

In [None]:
# Handle Class Imbalance (SMOTE)
# Only the training split is resampled; the test split keeps its original
# class balance.
# NOTE(review): SMOTE runs here on unscaled features (scaling happens in the
# next cell). If SMOTE's neighbour search is distance-based, large-magnitude
# columns will dominate it — consider whether scaling should come first.
print("Applying SMOTE...\n")

smote = SMOTE(sampling_strategy=0.3, random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Report the fraud share before and after resampling plus the new row count.
fraud_share_before = y_train.mean()
fraud_share_after = y_train_balanced.mean()
n_balanced_samples = len(X_train_balanced)

print(f"Before SMOTE: {fraud_share_before:.2%} fraud")
print(f"After SMOTE:  {fraud_share_after:.2%} fraud")
print(f"Samples: {n_balanced_samples:,}")

In [None]:
# Feature Scaling
# Fit the scaler on the resampled training data only, then apply the same
# fitted transform to train and test so no test statistics are learned.
scaler = StandardScaler()
scaler.fit(X_train_balanced)

X_train_scaled = scaler.transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)

print("✓ Features scaled")

In [None]:
# Train XGBoost
print("Training XGBoost...\n")

# Hyperparameters are kept in one place so they are easy to read and tweak.
xgb_params = {
    'n_estimators': 300,
    'max_depth': 5,
    'learning_rate': 0.1,
    'random_state': 42,
    'eval_metric': 'logloss',
}
model = XGBClassifier(**xgb_params)

# The test split is passed as eval_set. No early stopping is configured in
# this cell, so it is used for progress logging rather than model selection.
# NOTE(review): if early stopping is ever added, switch to a separate
# validation split so the test set stays untouched.
evaluation_sets = [(X_test_scaled, y_test)]
model.fit(
    X_train_scaled,
    y_train_balanced,
    eval_set=evaluation_sets,
    verbose=50,
)

print("\n✓ Training complete")

In [None]:
# Predictions
# Hard labels from the model's own decision rule, plus the probability of the
# positive (fraud) class taken from column 1 of predict_proba.
y_pred_ml = model.predict(X_test_scaled)
class_probabilities = model.predict_proba(X_test_scaled)
y_proba_ml = class_probabilities[:, 1]

n_ml_flagged = y_pred_ml.sum()
print(f"ML flagged: {n_ml_flagged:,} transactions")

---
## 4. Comparison & Export

In [None]:
# Calculate Metrics
def _score_predictions(y_true, y_pred):
    """Return (precision, recall, F1) for one set of binary predictions.

    Both systems are scored through this helper so they are always evaluated
    with identical settings. zero_division=0 is passed to every metric, as the
    rule-based scores already did.
    """
    return (
        precision_score(y_true, y_pred, zero_division=0),
        recall_score(y_true, y_pred, zero_division=0),
        f1_score(y_true, y_pred, zero_division=0),
    )


y_true = y_test

# Rule-Based
# NOTE(review): assumes y_test is row-aligned with test_df — confirm that
# prepare_for_ml neither drops nor reorders rows.
y_pred_rules = test_df['rule_based_prediction'].values
precision_rules, recall_rules, f1_rules = _score_predictions(y_true, y_pred_rules)

# ML
precision_ml, recall_ml, f1_ml = _score_predictions(y_true, y_pred_ml)

# Comparison
# Relative F1 change in percent. Falls back to 0 when the rule-based F1 is 0,
# because the ratio is undefined; the banner below handles that case.
improvement = ((f1_ml - f1_rules) / f1_rules * 100) if f1_rules > 0 else 0

print("="*80)
print("COMPARISON: RULE-BASED vs. ML")
print("="*80)
print("\nRule-Based:")
print(f"  Precision: {precision_rules:.3f} ({precision_rules:.1%})")
print(f"  Recall:    {recall_rules:.3f} ({recall_rules:.1%})")
print(f"  F1-Score:  {f1_rules:.3f}")

print("\nML:")
print(f"  Precision: {precision_ml:.3f} ({precision_ml:.1%})")
print(f"  Recall:    {recall_ml:.3f} ({recall_ml:.1%})")
print(f"  F1-Score:  {f1_ml:.3f}")

# Only claim an improvement when there is one; previously the success banner
# was printed even when ML scored lower or the baseline F1 was 0.
if f1_rules <= 0:
    print("\n⚠️ Rule-based F1-Score is 0; relative improvement is undefined")
elif improvement >= 0:
    print(f"\n✅ ML improves F1-Score by {improvement:.1f}%")
else:
    print(f"\n❌ ML reduces F1-Score by {-improvement:.1f}%")
print("="*80)

In [None]:
# Visualization: Comparison
# Grouped bar chart: one group per metric, one bar per system.
fig, ax = plt.subplots(figsize=(10, 6))

metrics = ['Precision', 'Recall', 'F1-Score']
x = np.arange(len(metrics))
width = 0.35

# (horizontal offset, scores, legend label, colour) for each system; the two
# bars of a group sit half a bar width to either side of the tick.
bar_groups = [
    (-width / 2, [precision_rules, recall_rules, f1_rules], 'Rule-Based', 'coral'),
    (+width / 2, [precision_ml, recall_ml, f1_ml], 'ML', 'steelblue'),
]
for bar_offset, bar_scores, bar_label, bar_color in bar_groups:
    ax.bar(x + bar_offset, bar_scores, width, label=bar_label, color=bar_color)

ax.set_xticks(x)
ax.set_xticklabels(metrics)
# All three metrics live in [0, 1], so the y-axis is fixed to that range.
ax.set(
    ylabel='Score',
    title='Performance Comparison: Rule-Based vs. ML',
    ylim=(0, 1),
)
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

In [None]:
# Prepare Export DataFrame
# Work on a copy so test_df itself does not gain the ML columns.
predictions_df = test_df.copy()

# Add ML predictions
# The arrays are attached by position, so they must have one entry per row of
# test_df, in the same order.
ml_outputs = {
    'ml_prediction': y_pred_ml,
    'ml_probability': y_proba_ml,
}
for column_name, column_values in ml_outputs.items():
    predictions_df[column_name] = column_values

# Risk Level
def assign_risk_level(prob, high_threshold=0.7, medium_threshold=0.3):
    """Map a fraud probability to a coarse risk label.

    Args:
        prob: Predicted fraud probability for one transaction.
        high_threshold: Smallest probability labelled 'High'. Defaults to 0.7.
        medium_threshold: Smallest probability labelled 'Medium'. Defaults to
            0.3.

    Returns:
        'High' if prob >= high_threshold, 'Medium' if prob >= medium_threshold,
        otherwise 'Low'. A NaN probability fails both comparisons and is
        therefore labelled 'Low'.

    Raises:
        ValueError: If medium_threshold is greater than high_threshold, which
            would make the 'Medium' band unreachable.
    """
    if medium_threshold > high_threshold:
        raise ValueError(
            "medium_threshold must not exceed high_threshold "
            f"(got {medium_threshold} > {high_threshold})"
        )

    if prob >= high_threshold:
        return 'High'
    if prob >= medium_threshold:
        return 'Medium'
    return 'Low'

# Label every transaction with a risk band derived from its ML probability.
risk_levels = predictions_df['ml_probability'].apply(assign_risk_level)
predictions_df['ml_risk_level'] = risk_levels

n_predictions = len(predictions_df)
print(f"✓ Prepared {n_predictions:,} predictions")

In [None]:
# Export for Dashboard
export_path = Path('../data/processed/predictions_comparison.csv')
# Create the output directory if it is missing; to_csv does not create parent
# directories, so the export would otherwise fail on a fresh checkout.
export_path.parent.mkdir(parents=True, exist_ok=True)
# index=False leaves the DataFrame index out of the file.
predictions_df.to_csv(export_path, index=False)

print("\n✓ Exported predictions to: data/processed/predictions_comparison.csv")
print("  File ready for Streamlit Dashboard!")

In [None]:
# Save Model
# Persist the trained classifier, the fitted scaler and the selected feature
# list. `joblib` is imported in the setup cell with the other dependencies.
models_dir = Path('../models')
# Create the directory if it is missing so the dumps below do not fail on a
# fresh checkout.
models_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(model, '../models/xgboost_ml_only.pkl')
joblib.dump(scaler, '../models/scaler_ml_only.pkl')
joblib.dump(ml_features, '../models/ml_features.pkl')

print("✓ Models saved to: models/")

---
## Summary

In [None]:
# Final recap of dataset size, split sizes and the headline F1 comparison.
print("\n" + "="*80)
print("FRAUDGUARD - PIPELINE COMPLETE")
print("="*80)
print(f"\nDataset: {len(df):,} transactions ({df['is_fraud'].mean():.2%} fraud)")
print(f"Train/Test: {len(train_df):,} / {len(test_df):,}")
print("\nResults:")
print(f"  Rule-Based F1: {f1_rules:.3f}")
print(f"  ML F1:    {f1_ml:.3f}")
# The '+' format flag always prints the sign. The previous hard-coded '+'
# prefix rendered a negative change as "+-3.2%".
print(f"  Improvement:   {improvement:+.1f}%")
print("="*80)