# Fraud Detection Solution

## Complete implementation with best practices

This notebook provides the complete solution for the fraud detection exercise.

In [None]:
# Import required libraries
import os
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    precision_recall_curve, roc_curve, auc
)
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from imblearn.over_sampling import SMOTE
import shap
import joblib
# Silence every warning for the rest of the session to keep cell output compact
warnings.filterwarnings('ignore')

# Seed NumPy's legacy global RNG, which the np.random.* calls in this notebook draw from
np.random.seed(42)
# Never truncate columns when a DataFrame is displayed
pd.set_option('display.max_columns', None)
# Apply the 'seaborn-v0_8' matplotlib style sheet to all figures
plt.style.use('seaborn-v0_8')
# IPython magic: render matplotlib figures inline in the cell output
%matplotlib inline

## 1. Data Generation

In [None]:
def generate_fraud_dataset(n_samples=100000, fraud_ratio=0.02):
    """
    Generate synthetic fraud detection dataset with realistic patterns

    Legitimate and fraudulent transactions are sampled from different
    distributions (amount, hour of day, merchant category, distances and
    velocity), concatenated, and shuffled with a fixed ``random_state=42``.

    Sampling draws from NumPy's global random state, so call
    ``np.random.seed`` beforehand for reproducible output.

    Args:
        n_samples: Total number of transactions to generate (>= 0).
        fraud_ratio: Fraction of transactions labelled as fraud, in [0, 1].
            The fraud count is ``int(n_samples * fraud_ratio)``; the
            remainder are legitimate.

    Returns:
        pandas.DataFrame with ``n_samples`` rows and the columns
        ``transaction_amount``, ``hour_of_day``, ``day_of_week``,
        ``merchant_category``, ``distance_from_home``,
        ``distance_from_last_transaction``, ``transaction_velocity`` and
        ``is_fraud`` (0 = legitimate, 1 = fraud).

    Raises:
        ValueError: If ``n_samples`` is negative or ``fraud_ratio`` is
            outside [0, 1].
    """
    if n_samples < 0:
        raise ValueError(f"n_samples must be non-negative, got {n_samples}")
    if not 0 <= fraud_ratio <= 1:
        raise ValueError(f"fraud_ratio must be within [0, 1], got {fraud_ratio}")

    def _as_probabilities(weights):
        # np.random.choice rejects `p` unless it sums to 1, so rescale the
        # relative weights into a proper probability vector.
        weights = np.asarray(weights, dtype=float)
        return weights / weights.sum()

    n_fraud = int(n_samples * fraud_ratio)
    n_legit = n_samples - n_fraud

    merchant_categories = ['retail', 'grocery', 'gas', 'restaurant', 'online']

    # Relative weight of each hour 0..23. The values as written sum to 1.08
    # (legitimate) and 1.07 (fraud), so they are normalised before use.
    legit_hour_p = _as_probabilities([
        0.01, 0.01, 0.01, 0.01, 0.01, 0.02, 0.04, 0.06,
        0.08, 0.07, 0.06, 0.07, 0.08, 0.07, 0.06, 0.05,
        0.04, 0.05, 0.06, 0.07, 0.06, 0.04, 0.03, 0.02
    ])
    fraud_hour_p = _as_probabilities([
        0.08, 0.08, 0.07, 0.06, 0.05, 0.03, 0.02, 0.02,
        0.02, 0.02, 0.03, 0.03, 0.03, 0.03, 0.03, 0.03,
        0.03, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.08
    ])  # More at night

    # Legitimate transactions
    legit_data = {
        'transaction_amount': np.random.gamma(2, 50, n_legit),
        'hour_of_day': np.random.choice(range(24), n_legit, p=legit_hour_p),
        'day_of_week': np.random.randint(0, 7, n_legit),
        'merchant_category': np.random.choice(merchant_categories, n_legit),
        'distance_from_home': np.abs(np.random.normal(5, 10, n_legit)),
        'distance_from_last_transaction': np.abs(np.random.normal(3, 5, n_legit)),
        'transaction_velocity': np.random.poisson(2, n_legit),
        'is_fraud': np.zeros(n_legit, dtype=int)
    }

    # Fraudulent transactions (different patterns)
    fraud_data = {
        'transaction_amount': np.random.gamma(5, 100, n_fraud),  # Higher amounts
        'hour_of_day': np.random.choice(range(24), n_fraud, p=fraud_hour_p),
        'day_of_week': np.random.randint(0, 7, n_fraud),
        'merchant_category': np.random.choice(
            merchant_categories,
            n_fraud,
            p=[0.15, 0.10, 0.15, 0.10, 0.50]  # More online
        ),
        'distance_from_home': np.abs(np.random.normal(50, 100, n_fraud)),  # Farther
        'distance_from_last_transaction': np.abs(np.random.normal(100, 200, n_fraud)),
        'transaction_velocity': np.random.poisson(8, n_fraud),  # Higher velocity
        'is_fraud': np.ones(n_fraud, dtype=int)
    }

    # Combine and shuffle so the two classes are interleaved
    df_legit = pd.DataFrame(legit_data)
    df_fraud = pd.DataFrame(fraud_data)
    df = pd.concat([df_legit, df_fraud], ignore_index=True)
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    return df

# Build the synthetic dataset: 100,000 transactions, 2% of them fraudulent
df = generate_fraud_dataset(100000, 0.02)

# Sanity-check the size and the class balance before going further
print("Dataset shape:", df.shape)
print("\nClass distribution:")
print(df.is_fraud.value_counts())
print("\nFraud ratio:", df.is_fraud.mean())
df.head()

## 2. Data Exploration

In [None]:
# Visualize how legitimate and fraudulent transactions differ on four features
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

legit_rows = df[df['is_fraud'] == 0]
fraud_rows = df[df['is_fraud'] == 1]

# One overlaid legit/fraud histogram per entry:
# (target axis, column, number of bins, x-axis label, title)
hist_panels = [
    (axes[0, 0], 'transaction_amount', 50,
     'Transaction Amount', 'Transaction Amount Distribution'),
    (axes[1, 0], 'distance_from_home', 50,
     'Distance from Home', 'Distance from Home Distribution'),
    (axes[1, 1], 'transaction_velocity', 20,
     'Transaction Velocity', 'Transaction Velocity Distribution'),
]
for ax, column, n_bins, x_label, panel_title in hist_panels:
    ax.hist(legit_rows[column], bins=n_bins, alpha=0.5, label='Legit')
    ax.hist(fraud_rows[column], bins=n_bins, alpha=0.5, label='Fraud')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.set_title(panel_title)

# Hour of day: one line per class, counting transactions in each hour
hourly_counts = df.groupby(['hour_of_day', 'is_fraud']).size().unstack()
hourly_counts.plot(ax=axes[0, 1])
axes[0, 1].set_xlabel('Hour of Day')
axes[0, 1].set_ylabel('Count')
axes[0, 1].set_title('Transactions by Hour')

plt.tight_layout()
plt.show()

## 3. Feature Engineering

In [None]:
# Derive calendar, time-of-day and ratio features from the raw columns
# Weekend = day_of_week 5 or 6
df['is_weekend'] = df['day_of_week'].ge(5).astype(int)
# Night = 22:00 through 06:59
df['is_night'] = (df['hour_of_day'].ge(22) | df['hour_of_day'].le(6)).astype(int)
# The +1 in each denominator avoids division by zero
df['amount_velocity_ratio'] = df['transaction_amount'].div(df['transaction_velocity'] + 1)
df['distance_ratio'] = df['distance_from_last_transaction'].div(df['distance_from_home'] + 1)

# Map each merchant category string to an integer code
le = LabelEncoder()
le.fit(df['merchant_category'])
df['merchant_category_encoded'] = le.transform(df['merchant_category'])

engineered_cols = [
    'is_weekend', 'is_night', 'amount_velocity_ratio',
    'distance_ratio', 'merchant_category_encoded',
]
print("Engineered features:")
df[engineered_cols].head()

## 4. Data Preparation

In [None]:
# Model inputs: the numeric raw columns plus every engineered feature
feature_cols = [
    'transaction_amount',
    'hour_of_day',
    'day_of_week',
    'distance_from_home',
    'distance_from_last_transaction',
    'transaction_velocity',
    'is_weekend',
    'is_night',
    'amount_velocity_ratio',
    'distance_ratio',
    'merchant_category_encoded',
]

X = df.loc[:, feature_cols]
y = df['is_fraud']

# Two-stage stratified split giving 60% train / 20% validation / 20% test.
# Stage 1: hold out 20% of all rows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
# Stage 2: 25% of the remaining 80% (= 20% overall) becomes the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.25,
    stratify=y_temp,
    random_state=42,
)

print(f"Train set: {X_train.shape}, Fraud ratio: {y_train.mean():.4f}")
print(f"Validation set: {X_val.shape}, Fraud ratio: {y_val.mean():.4f}")
print(f"Test set: {X_test.shape}, Fraud ratio: {y_test.mean():.4f}")

# Fit the scaler on the training split only, then apply the same
# transformation to all three splits (the outputs are NumPy arrays)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

## 5. Handle Imbalanced Data with SMOTE

In [None]:
# Oversample the fraud class with SMOTE on the training split only;
# the validation and test splits are left untouched
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train_scaled, y_train)
X_train_balanced, y_train_balanced = resampled

# Report the size and class balance before and after resampling
print(f"Original training set: {X_train_scaled.shape}")
print(f"Balanced training set: {X_train_balanced.shape}")
print(f"\nOriginal fraud ratio: {y_train.mean():.4f}")
print(f"Balanced fraud ratio: {y_train_balanced.mean():.4f}")

## 6. Model Training

In [None]:
# Baseline: logistic regression on the SMOTE-balanced training data
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_balanced, y_train_balanced)

# Both gradient-boosted models use the same basic hyperparameters
boosting_params = {
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.1,
    'random_state': 42,
}

# XGBoost, with AUC as its evaluation metric
xgb_model = xgb.XGBClassifier(eval_metric='auc', **boosting_params)
xgb_model.fit(X_train_balanced, y_train_balanced)

# LightGBM
lgb_model = lgb.LGBMClassifier(**boosting_params)
lgb_model.fit(X_train_balanced, y_train_balanced)

print("All models trained successfully!")

## 7. Model Evaluation

In [None]:
# Score every candidate model on the validation split
models = {
    'Logistic Regression': lr_model,
    'XGBoost': xgb_model,
    'LightGBM': lgb_model
}

# Per-model predictions, probabilities and ROC-AUC, reused by the plotting
# and cost-evaluation cells
results = {}
for name, model in models.items():
    # Second predict_proba column = probability of class 1 (fraud)
    y_pred_proba = model.predict_proba(X_val_scaled)[:, 1]
    # Hard labels at a 0.5 cut-off
    y_pred = (y_pred_proba >= 0.5).astype(int)
    roc_auc = roc_auc_score(y_val, y_pred_proba)

    results[name] = dict(
        predictions=y_pred,
        probabilities=y_pred_proba,
        roc_auc=roc_auc,
    )

    print(f"\n{name}:")
    print(f"ROC-AUC: {roc_auc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_val, y_pred))

In [None]:
# Overlay the validation ROC curve of every model on one set of axes
fig, ax = plt.subplots(figsize=(10, 8))
for name, result in results.items():
    fpr, tpr, _ = roc_curve(y_val, result['probabilities'])
    ax.plot(fpr, tpr, label=f"{name} (AUC = {result['roc_auc']:.4f})")

# Diagonal reference line: a classifier that guesses at random
ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves Comparison')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Overlay the validation precision-recall curve of every model
fig, ax = plt.subplots(figsize=(10, 8))
for name, result in results.items():
    precision, recall, _ = precision_recall_curve(y_val, result['probabilities'])
    ax.plot(recall, precision, label=name)

ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curves Comparison')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Translate each model's validation errors into a business cost
# cost_fp: cost of investigating a legitimate transaction
# cost_fn: cost of missing a fraudulent transaction
cost_fp, cost_fn = 5, 100

print("Cost-Sensitive Evaluation:")
print("="*50)
for name, result in results.items():
    # For a binary problem, ravel() flattens the 2x2 matrix into tn, fp, fn, tp
    val_confusion = confusion_matrix(y_val, result['predictions'])
    tn, fp, fn, tp = val_confusion.ravel()
    total_cost = fp * cost_fp + fn * cost_fn
    print(f"\n{name}:")
    print(f"  False Positives: {fp}, Cost: ${fp * cost_fp}")
    print(f"  False Negatives: {fn}, Cost: ${fn * cost_fn}")
    print(f"  Total Cost: ${total_cost}")

## 8. Model Explainability with SHAP

In [None]:
# The model to explain is hard-coded to XGBoost here (the best model is
# typically XGBoost or LightGBM)
best_model = xgb_model

# Build a tree explainer for the chosen model
explainer = shap.TreeExplainer(best_model)

# Only the first 1000 validation rows are explained, to keep this fast
shap_values = explainer.shap_values(X_val_scaled[:1000])

# Summary plot: SHAP values come from the scaled inputs, while the unscaled
# rows are passed as the feature values to display
shap.summary_plot(shap_values, X_val.head(1000), feature_names=feature_cols)

In [None]:
# Rank features by the model's built-in importance scores, highest first
feature_importance = (
    pd.DataFrame({
        'feature': feature_cols,
        'importance': best_model.feature_importances_,
    })
    .sort_values(by='importance', ascending=False)
)

# Horizontal bar chart of the ranked importances
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance['feature'], feature_importance['importance'])
ax.set_xlabel('Importance')
ax.set_title('Feature Importance')
fig.tight_layout()
plt.show()

print(feature_importance)

In [None]:
# Explain a single fraud prediction
# y_val keeps the index labels of the original DataFrame after the split,
# while X_val_scaled is a plain NumPy array. The row must therefore be
# located by POSITION within the validation split, not by index label.
fraud_idx = int(np.flatnonzero(y_val.to_numpy() == 1)[0])
single_pred = X_val_scaled[fraud_idx:fraud_idx + 1]

# `shap_values` only covers the first 1000 validation rows, so compute the
# SHAP values for this row directly instead of indexing into that array.
single_shap_values = explainer.shap_values(single_pred)

print(f"Prediction probability: {best_model.predict_proba(single_pred)[0, 1]:.4f}")

# Load SHAP's JavaScript so the interactive force plot renders in the notebook
shap.initjs()
shap.force_plot(
    explainer.expected_value,
    single_shap_values[0],
    X_val.iloc[fraud_idx],  # unscaled feature values of the same row, for display
    feature_names=feature_cols
)

## 9. Final Evaluation on Test Set

In [None]:
# Final check: score the chosen model on the held-out test split
# Second predict_proba column = probability of class 1 (fraud)
y_test_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]
# Hard labels at the same 0.5 cut-off used on the validation split
y_test_pred = np.greater_equal(y_test_pred_proba, 0.5).astype(int)

print("Final Test Set Performance:")
print("="*50)
print(f"ROC-AUC: {roc_auc_score(y_test, y_test_pred_proba):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

## 10. Save Model Artifacts

In [None]:
# Save model and preprocessing artifacts
# The path is relative, so it resolves against the kernel's working directory
model_dir = '../../models'
os.makedirs(model_dir, exist_ok=True)

# File name -> object to persist. Together these are the model, the fitted
# scaler and encoder, and the ordered list of feature columns.
artifacts = {
    'fraud_detection_model.pkl': best_model,
    'fraud_detection_scaler.pkl': scaler,
    'fraud_detection_encoder.pkl': le,
    'fraud_detection_features.pkl': feature_cols,
}
for file_name, artifact in artifacts.items():
    joblib.dump(artifact, os.path.join(model_dir, file_name))

print("Model artifacts saved successfully!")
print(f"Location: {model_dir}")

## Key Takeaways

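1. **Class Imbalance**: SMOTE was applied to the training split only to rebalance the classes; this notebook does not train a no-SMOTE baseline, so compare against one before concluding that SMOTE improves model performance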
2. **Model Selection**: XGBoost/LightGBM outperformed logistic regression
3. **Feature Engineering**: Temporal and distance-based features were crucial
4. **Explainability**: SHAP values help understand and trust predictions
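5. **Cost-Sensitive**: Consider business costs when choosing the decision threshold; the cost evaluation above uses a fixed 0.5 threshold, so sweep the threshold on the validation set to find the one that minimizes total cost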

## Next Steps for Production

1. Deploy to SageMaker real-time endpoint
2. Set up model monitoring for data drift
3. Implement automated retraining pipeline
4. Add feature store for real-time features
5. Set up A/B testing framework