# FraudGuard - Fraud Detection Pipeline

## Content
1. Setup & Load Data
2. Rule-Based System
3. ML-Based System
4. Comparison & Export

---
## 1. Setup & Load Data

In [None]:
import sys
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Add the parent directory to the import path so the local `src` package resolves
sys.path.append('..')
from src.rules import FraudRuleEngine
from src.features import engineer_features, select_ml_features, prepare_for_ml
from src.utils import load_data, temporal_train_test_split, print_fraud_stats

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the ML libraries.
warnings.filterwarnings('ignore')
# Apply a named matplotlib style sheet to all figures drawn below.
plt.style.use('seaborn-v0_8-darkgrid')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

print("✓ Setup complete")

In [None]:
# Load the raw training transactions and print a fraud summary.
# NOTE(review): `nrows` presumably caps how many rows are read — confirm in src/utils.py.
raw_csv_path = '../data/raw/fraudTrain.csv'
row_limit = 50000

df = load_data(raw_csv_path, nrows=row_limit)
print_fraud_stats(df)

---
## 2. Rule-Based System

In [None]:
# Build the project's rule engine and run all of its rules over the loaded transactions.
# NOTE(review): later cells read a `rule_based_prediction` column from `df`, presumably
# added by this call — confirm in src/rules.py.
# `df` is rebound to the result, so re-running this cell feeds already-processed data back in.
engine = FraudRuleEngine()
df = engine.apply_all_rules(df)
print("✓ Rules applied")

In [None]:
# Count the transactions flagged by the rules and how many of those flags were real fraud.
flagged_mask = df['rule_based_prediction'] == 1
rule_flagged = df['rule_based_prediction'].sum()
rule_correct = df.loc[flagged_mask, 'is_fraud'].sum()

print(f"Rules flagged: {rule_flagged:,} transactions")
print(f"Actual fraud found: {rule_correct:,}")

# Precision is only defined when at least one transaction was flagged.
if rule_flagged > 0:
    print(f"Precision: {rule_correct / rule_flagged:.1%}")
else:
    print("No flags")

---
## 3. ML-Based System

In [None]:
# Run the project's feature engineering step; `df` is rebound to its result.
df = engineer_features(df)

column_count = len(df.columns)
print(f"✓ Features engineered ({column_count} total columns)")

In [None]:
# Split into train and test frames with the project's temporal splitter.
# NOTE(review): test_size=0.3 presumably holds out the latest 30% of rows — confirm in src/utils.py.
train_df, test_df = temporal_train_test_split(df, test_size=0.3)

n_train, n_test = len(train_df), len(test_df)
print(f"Train: {n_train:,} | Test: {n_test:,}")

In [None]:
# Pick the model's input columns from the training frame only.
ml_features = select_ml_features(train_df)

n_features = len(ml_features)
print(f"Selected {n_features} ML features")

In [None]:
# Build feature matrices and label vectors for both splits.
X_train, y_train = prepare_for_ml(train_df, ml_features)
X_test, y_test = prepare_for_ml(test_df, ml_features)

# Fit the scaler on the training split only, then apply the same transform to
# both splits so no test-set statistics reach the model.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

train_shape = X_train_scaled.shape
print(f"✓ Data prepared: {train_shape}")

In [None]:
# Oversample the training split with SMOTE (fixed seed); the test split is left untouched.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

fraud_before = y_train.sum()
fraud_after = y_train_balanced.sum()
print(f"Before SMOTE: {fraud_before} fraud cases")
print(f"After SMOTE: {fraud_after} fraud cases")

In [None]:
# Hyper-parameters for the gradient-boosted classifier, kept together so they are easy to tune.
xgb_params = {
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.1,
    'random_state': 42,
    'eval_metric': 'logloss',
}

# Train on the SMOTE-balanced training data.
model = XGBClassifier(**xgb_params)
model.fit(X_train_balanced, y_train_balanced)
print("✓ Model trained")

In [None]:
# Score the scaled test split: column 1 of predict_proba is kept as the
# probability, and predict() supplies the hard class labels.
test_probabilities = model.predict_proba(X_test_scaled)
y_proba_ml = test_probabilities[:, 1]
y_pred_ml = model.predict(X_test_scaled)

print("✓ Predictions generated")

---
## 4. Comparison & Export

In [None]:
# Ground-truth labels and the rule engine's verdicts for the held-out test split.
y_true = test_df['is_fraud']
y_pred_rules = test_df['rule_based_prediction']

# Rule-Based metrics (zero_division=0 reports a score of 0 when a ratio is undefined)
precision_rules = precision_score(y_true, y_pred_rules, zero_division=0)
recall_rules = recall_score(y_true, y_pred_rules, zero_division=0)
f1_rules = f1_score(y_true, y_pred_rules, zero_division=0)

# ML metrics
precision_ml = precision_score(y_true, y_pred_ml, zero_division=0)
recall_ml = recall_score(y_true, y_pred_ml, zero_division=0)
f1_ml = f1_score(y_true, y_pred_ml, zero_division=0)

print("Performance Comparison:")
print(f"\nRule-Based: Precision={precision_rules:.3f}, Recall={recall_rules:.3f}, F1={f1_rules:.3f}")
print(f"ML-Only:    Precision={precision_ml:.3f}, Recall={recall_ml:.3f}, F1={f1_ml:.3f}")

# Relative F1 change of ML over the rule baseline, in percent.
# The ratio is undefined when the baseline F1 is 0; `improvement` stays 0 in that
# case so that later cells can still format it as a number.
improvement = ((f1_ml - f1_rules) / f1_rules * 100) if f1_rules > 0 else 0

# Report what actually happened instead of always claiming an improvement.
if f1_rules <= 0:
    print("\n⚠️ Rule-Based F1 is 0 - relative improvement is undefined")
elif improvement >= 0:
    print(f"\n✅ ML improves F1-Score by {improvement:.1f}%")
else:
    print(f"\n⚠️ ML lowers F1-Score by {abs(improvement):.1f}%")

In [None]:
# Grouped bar chart: one group per metric, one bar per system.
metrics = ['Precision', 'Recall', 'F1-Score']
x = np.arange(len(metrics))
width = 0.35

fig, ax = plt.subplots(figsize=(10, 6))

# (offset from the group centre, scores, legend label, bar colour) for each system
bar_specs = [
    (-width/2, [precision_rules, recall_rules, f1_rules], 'Rule-Based', 'coral'),
    (+width/2, [precision_ml, recall_ml, f1_ml], 'ML-Only', 'steelblue'),
]
for bar_offset, bar_scores, bar_label, bar_color in bar_specs:
    ax.bar(x + bar_offset, bar_scores, width, label=bar_label, color=bar_color)

ax.set_ylabel('Score')
ax.set_title('Performance Comparison: Rule-Based vs. ML-Only')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.set_ylim(0, 1)  # all three metrics live in [0, 1]
ax.grid(True, alpha=0.3, axis='y')
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# New frame: the test split plus the model's hard labels and fraud probabilities.
# `assign` returns a new frame, so `test_df` itself is left unchanged.
predictions_df = test_df.assign(
    ml_prediction=y_pred_ml,
    ml_probability=y_proba_ml,
)

def assign_risk_level(prob, high_threshold=0.7, medium_threshold=0.3):
    """Map a fraud probability to a coarse risk label.

    Args:
        prob: Predicted fraud probability for one transaction.
        high_threshold: Inclusive lower bound for the 'High' label (default 0.7).
        medium_threshold: Inclusive lower bound for the 'Medium' label
            (default 0.3). Should not exceed ``high_threshold``, otherwise
            'Medium' can never be returned.

    Returns:
        'High' if ``prob >= high_threshold``, 'Medium' if
        ``prob >= medium_threshold``, otherwise 'Low'.
    """
    # Check the highest band first so each probability lands in exactly one label.
    if prob >= high_threshold:
        return 'High'
    if prob >= medium_threshold:
        return 'Medium'
    return 'Low'

# Label every test transaction with a risk band derived from its fraud probability.
risk_levels = predictions_df['ml_probability'].map(assign_risk_level)
predictions_df['ml_risk_level'] = risk_levels

n_predictions = len(predictions_df)
print(f"✓ Prepared {n_predictions:,} predictions")

In [None]:
# Write the per-transaction predictions to CSV (the summary cell notes this file is for the dashboard).
predictions_path = Path('../data/processed/predictions_comparison.csv')

# to_csv does not create missing parent folders; make sure the output directory
# exists so the notebook also runs on a fresh checkout.
predictions_path.parent.mkdir(parents=True, exist_ok=True)

predictions_df.to_csv(predictions_path, index=False)
print("✓ Exported predictions to: data/processed/predictions_comparison.csv")

In [None]:
# Persist everything needed to score new data later: the trained model, the
# fitted scaler, and the list of feature columns.
models_dir = Path('../models')

# joblib.dump does not create missing folders; create the target directory first
# so the notebook also runs on a fresh checkout.
models_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(model, models_dir / 'xgboost_ml_only.pkl')
joblib.dump(scaler, models_dir / 'scaler_ml_only.pkl')
joblib.dump(ml_features, models_dir / 'ml_features.pkl')

print("✓ Models saved to: models/")

---
## Summary

In [None]:
# Final recap of the run: dataset size, split sizes, headline F1 scores and produced artefacts.
print("\n" + "="*80)
print("FRAUDGUARD - PIPELINE COMPLETE")
print("="*80)
print(f"\nDataset: {len(df):,} transactions ({df['is_fraud'].mean():.2%} fraud)")
print(f"Train/Test: {len(train_df):,} / {len(test_df):,}")
print(f"\nResults:")
print(f"  Rule-Based F1: {f1_rules:.3f}")
print(f"  ML-Only F1:    {f1_ml:.3f}")
# The `+` format flag lets Python choose the sign, so a regression prints as
# "-12.3%" instead of the contradictory "+-12.3%".
print(f"  Improvement:   {improvement:+.1f}%")
print("\nFiles Created:")
print("  ✓ predictions_comparison.csv (for Dashboard)")
print("  ✓ Model files (xgboost, scaler, features)")
print("\nNext Step:")
print("  → Run Streamlit Dashboard: streamlit run app.py")
print("="*80)