# Fraud Detection Baseline Models

This notebook establishes baseline models for fraud detection to provide performance benchmarks and validate our preprocessing pipeline before implementing more sophisticated approaches.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    roc_auc_score, 
    average_precision_score,
    auc,
    precision_recall_curve,
    roc_curve,
    ConfusionMatrixDisplay
)
import matplotlib.pyplot as plt
import seaborn as sns

# Printed once the imports above have completed, giving the cell a visible output.
print("Starting Baseline Model Development...")

## 1. Data Loading and Preprocessing

In [None]:
# Load the dataset.
# `config` is imported from ../src, so that directory has to be on sys.path
# before the import can succeed. The membership check keeps re-running this
# cell from appending the same entry again and again.
import sys
if '../src' not in sys.path:
    sys.path.append('../src')
from config import RAW_DATA_PATH, PLOT_STYLE

# Set up plotting style (pyplot and seaborn are already imported in the first cell)
plt.style.use(PLOT_STYLE)
sns.set_palette("husl")

df = pd.read_csv(RAW_DATA_PATH)

# Fail early with an explicit message if the label column is missing.
if 'Class' not in df.columns:
    raise KeyError(f"Expected a 'Class' target column in {RAW_DATA_PATH}")

# Separate features and target
X = df.drop('Class', axis=1)
y = df['Class']

print(f"Dataset shape: {X.shape}")
print(f"Target distribution: {y.value_counts().to_dict()}")

In [None]:
# Hold out 20% of the rows as a test set. The split is seeded for
# repeatability and stratified on the label.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the partition sizes, then the label counts in each partition.
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Training target distribution: {y_train.value_counts().to_dict()}")
print(f"Test target distribution: {y_test.value_counts().to_dict()}")

In [None]:
# Standardize the features. The scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

print("Features scaled successfully")

## 2. Baseline Model 1: Logistic Regression

In [None]:
# Baseline 1: logistic regression, with 'balanced' class weights to
# compensate for the class imbalance.
lr_model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)

# Fit on the scaled training split
print("Training Logistic Regression...")
lr_model.fit(X_train_scaled, y_train)

# Score the test split: column 1 of predict_proba is kept as the
# positive-class score, alongside the hard label predictions.
lr_proba = lr_model.predict_proba(X_test_scaled)[:, 1]
lr_pred = lr_model.predict(X_test_scaled)

print("Logistic Regression training completed")

In [None]:
# Evaluate logistic regression on the held-out test split.
# All metrics are computed first, then printed as one report.
lr_auc = roc_auc_score(y_test, lr_proba)
cm_lr = confusion_matrix(y_test, lr_pred)
lr_report_text = classification_report(y_test, lr_pred, target_names=['Normal', 'Fraud'])

print("="*50)
print("LOGISTIC REGRESSION RESULTS")
print("="*50)
print(f"AUC-ROC Score: {lr_auc:.4f}")
print("\nClassification Report:")
print(lr_report_text)

# Confusion matrix as returned by sklearn's confusion_matrix(y_test, lr_pred)
print("\nConfusion Matrix:")
print(cm_lr)

In [None]:
# Visualize Logistic Regression results in three panels:
# confusion matrix, ROC curve and precision-recall curve.
fig, (ax_cm, ax_roc, ax_pr) = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: Confusion Matrix
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Normal', 'Fraud'], yticklabels=['Normal', 'Fraud'],
            ax=ax_cm)
ax_cm.set_title('Logistic Regression - Confusion Matrix', fontweight='bold')
ax_cm.set_ylabel('Actual')
ax_cm.set_xlabel('Predicted')

# Panel 2: ROC Curve (the dashed diagonal is the chance-level reference)
fpr_lr, tpr_lr, _ = roc_curve(y_test, lr_proba)
ax_roc.plot(fpr_lr, tpr_lr, color='blue', lw=2, label=f'ROC curve (AUC = {lr_auc:.3f})')
ax_roc.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('Logistic Regression - ROC Curve', fontweight='bold')
ax_roc.legend(loc="lower right")

# Panel 3: Precision-Recall Curve
# The legend reports average precision, so it is computed with
# average_precision_score rather than a trapezoidal area under the curve.
precision_lr, recall_lr, _ = precision_recall_curve(y_test, lr_proba)
lr_ap = average_precision_score(y_test, lr_proba)
ax_pr.plot(recall_lr, precision_lr, color='blue', lw=2,
           label=f'PR curve (AP = {lr_ap:.3f})')
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('Logistic Regression - Precision-Recall Curve', fontweight='bold')
ax_pr.legend()
ax_pr.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## 3. Baseline Model 2: Random Forest

In [None]:
# Baseline 2: random forest with 100 trees capped at depth 10, using
# 'balanced' class weights to compensate for the class imbalance.
rf_model = RandomForestClassifier(
    class_weight='balanced',
    max_depth=10,
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)

# Fit on the same scaled training split used for logistic regression
print("Training Random Forest...")
rf_model.fit(X_train_scaled, y_train)

# Score the test split: column 1 of predict_proba is kept as the
# positive-class score, alongside the hard label predictions.
rf_proba = rf_model.predict_proba(X_test_scaled)[:, 1]
rf_pred = rf_model.predict(X_test_scaled)

print("Random Forest training completed")

In [None]:
# Evaluate the random forest on the held-out test split.
# All metrics are computed first, then printed as one report.
rf_auc = roc_auc_score(y_test, rf_proba)
cm_rf = confusion_matrix(y_test, rf_pred)
rf_report_text = classification_report(y_test, rf_pred, target_names=['Normal', 'Fraud'])

print("="*50)
print("RANDOM FOREST RESULTS")
print("="*50)
print(f"AUC-ROC Score: {rf_auc:.4f}")
print("\nClassification Report:")
print(rf_report_text)

# Confusion matrix as returned by sklearn's confusion_matrix(y_test, rf_pred)
print("\nConfusion Matrix:")
print(cm_rf)

In [None]:
# Visualize Random Forest results in three panels:
# confusion matrix, ROC curve and precision-recall curve.
fig, (ax_cm, ax_roc, ax_pr) = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: Confusion Matrix
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Greens',
            xticklabels=['Normal', 'Fraud'], yticklabels=['Normal', 'Fraud'],
            ax=ax_cm)
ax_cm.set_title('Random Forest - Confusion Matrix', fontweight='bold')
ax_cm.set_ylabel('Actual')
ax_cm.set_xlabel('Predicted')

# Panel 2: ROC Curve (the dashed diagonal is the chance-level reference)
fpr_rf, tpr_rf, _ = roc_curve(y_test, rf_proba)
ax_roc.plot(fpr_rf, tpr_rf, color='green', lw=2, label=f'ROC curve (AUC = {rf_auc:.3f})')
ax_roc.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('Random Forest - ROC Curve', fontweight='bold')
ax_roc.legend(loc="lower right")

# Panel 3: Precision-Recall Curve
# The legend reports average precision, so it is computed with
# average_precision_score rather than a trapezoidal area under the curve.
precision_rf, recall_rf, _ = precision_recall_curve(y_test, rf_proba)
rf_ap = average_precision_score(y_test, rf_proba)
ax_pr.plot(recall_rf, precision_rf, color='green', lw=2,
           label=f'PR curve (AP = {rf_ap:.3f})')
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('Random Forest - Precision-Recall Curve', fontweight='bold')
ax_pr.legend()
ax_pr.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## 4. Feature Importance Analysis

In [None]:
# Pair each feature column name with the fitted forest's importance score
# and rank the rows from most to least important.
feature_importance = (
    pd.DataFrame({'feature': X.columns, 'importance': rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)

print("Top 15 Most Important Features:")
print(feature_importance.head(15).to_string(index=False))

In [None]:
# Horizontal bar chart of the 15 highest-ranked features
top_features = feature_importance.head(15)
bar_positions = np.arange(len(top_features))

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(bar_positions, top_features['importance'], color='skyblue')
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Feature Importance')
ax.set_title('Top 15 Feature Importances (Random Forest)', fontweight='bold')
ax.invert_yaxis()  # first (highest-ranked) row is drawn at the top

# Write each bar's value just past its right-hand end
for row, score in enumerate(top_features['importance']):
    ax.text(score + 0.001, row, f'{score:.3f}', va='center')

fig.tight_layout()
plt.show()

## 5. Cross-Validation Analysis

In [None]:
# Cross-validation for both models.
#
# Each estimator is wrapped in a pipeline with its own StandardScaler and is
# given the *unscaled* training data. cross_val_score therefore re-fits the
# scaler on the training portion of every fold. Running CV on X_train_scaled
# would leak information, because that scaler was fitted on all training rows,
# including the rows each fold holds out for validation.
print("Cross-Validation Results (5-fold):")
print("="*50)

# Fresh, unfitted estimators with the same hyperparameters as the baselines above
lr_model_cv = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        random_state=42,
        class_weight='balanced',
        max_iter=1000
    ),
)

rf_model_cv = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42,
        class_weight='balanced',
        n_jobs=-1
    ),
)

# Logistic Regression CV (the reported +/- figure is two standard deviations)
lr_cv_scores = cross_val_score(lr_model_cv, X_train, y_train, cv=5, scoring='roc_auc')
print(f"Logistic Regression AUC-ROC CV: {lr_cv_scores.mean():.4f} (+/- {lr_cv_scores.std() * 2:.4f})")

# Random Forest CV
rf_cv_scores = cross_val_score(rf_model_cv, X_train, y_train, cv=5, scoring='roc_auc')
print(f"Random Forest AUC-ROC CV: {rf_cv_scores.mean():.4f} (+/- {rf_cv_scores.std() * 2:.4f})")

## 6. Model Comparison

In [None]:
# Compare models side by side.
# The classification report is computed once per model and the fraud-class
# entry is reused for precision, recall and F1.
# NOTE(review): the '1' key assumes the fraud label is the integer 1 - confirm
# against the dataset.
lr_fraud_metrics = classification_report(y_test, lr_pred, output_dict=True)['1']
rf_fraud_metrics = classification_report(y_test, rf_pred, output_dict=True)['1']

comparison_df = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest'],
    'AUC-ROC': [lr_auc, rf_auc],
    'CV Mean': [lr_cv_scores.mean(), rf_cv_scores.mean()],
    'CV Std': [lr_cv_scores.std(), rf_cv_scores.std()],
    'Precision (Fraud)': [lr_fraud_metrics['precision'], rf_fraud_metrics['precision']],
    'Recall (Fraud)': [lr_fraud_metrics['recall'], rf_fraud_metrics['recall']],
    'F1-Score (Fraud)': [lr_fraud_metrics['f1-score'], rf_fraud_metrics['f1-score']]
})

print("BASELINE MODEL COMPARISON")
print("="*60)
print(comparison_df.round(4).to_string(index=False))

In [None]:
# Visualize model comparison: one panel per metric, one bar per model.
# An earlier grouped-bar attempt was removed: it passed four x positions with
# two bar heights to plt.bar, which cannot be broadcast together.
metrics_to_compare = ['AUC-ROC', 'Precision (Fraud)', 'Recall (Fraud)', 'F1-Score (Fraud)']
model_names = ['Logistic Regression', 'Random Forest']
scores_by_model = comparison_df.set_index('Model')

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.flatten()

for ax, metric in zip(axes, metrics_to_compare):
    values = [scores_by_model.loc[name, metric] for name in model_names]

    bars = ax.bar(model_names, values, color=['blue', 'green'], alpha=0.7)

    # Add value labels on bars
    for bar, value in zip(bars, values):
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 0.01,
                f'{value:.3f}', ha='center', va='bottom', fontweight='bold')

    ax.set_title(f'{metric}', fontweight='bold')
    # Headroom above 1.0 so labels on bars close to 1 stay inside the axes
    ax.set_ylim(0, 1.1)
    ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## 7. Baseline Summary and Recommendations

In [None]:
# Final text summary of the baseline run.
# The "best model" is decided once, by test-set AUC-ROC (ties go to Logistic
# Regression), and reused so the insights cannot contradict the reported result.
class_counts = y.value_counts()
best_model = 'Random Forest' if rf_auc > lr_auc else 'Logistic Regression'

print("="*70)
print("BASELINE MODEL SUMMARY")
print("="*70)

print(f"\n📊 DATASET CHARACTERISTICS:")
print(f"   - Total samples: {len(X):,}")
print(f"   - Features: {X.shape[1]}")
# Ratio of label-0 rows to label-1 rows (label-based lookup on the counts)
print(f"   - Class imbalance: {class_counts[0]/class_counts[1]:.1f}:1")

print(f"\n🎯 MODEL PERFORMANCE:")
print(f"   - Logistic Regression AUC-ROC: {lr_auc:.4f}")
print(f"   - Random Forest AUC-ROC: {rf_auc:.4f}")
print(f"   - Best model: {best_model}")

print(f"\n🔍 KEY INSIGHTS:")
print(f"   - Both models handle class imbalance with class_weight='balanced'")
print(f"   - Feature scaling is crucial for Logistic Regression")
print(f"   - {best_model} shows better performance on this dataset")
print(f"   - Top features: {', '.join(feature_importance.head(3)['feature'].tolist())}")

print(f"\n⚠️  LIMITATIONS:")
print(f"   - High class imbalance affects precision for fraud detection")
print(f"   - Models may overfit to majority class patterns")
print(f"   - No hyperparameter optimization performed yet")

print(f"\n🚀 NEXT STEPS FOR IMPROVEMENT:")
print(f"   1. Implement advanced sampling techniques (SMOTE, Tomek links)")
print(f"   2. Hyperparameter tuning with GridSearchCV or Optuna")
print(f"   3. Feature engineering (time-based features, interaction terms)")
print(f"   4. Ensemble methods (XGBoost, LightGBM)")
print(f"   5. Cost-sensitive learning with custom loss functions")
print(f"   6. Threshold optimization for better precision/recall trade-off")

print("\n" + "="*70)
print("BASELINE ESTABLISHED - READY FOR ADVANCED EXPERIMENTATION")
print("="*70)