# ML Model Evaluation
## Credit Card Fraud Detection

This notebook performs a comprehensive evaluation of the trained ML models using the evaluator module from the `src` folder.


In [5]:
# Import necessary libraries
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings raised by the
# libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Put the parent of the current working directory on the import path so the
# project-local `src` package resolves. The path is relative to the CWD, so
# this assumes the kernel is started from the notebooks folder — TODO confirm.
sys.path.append(str(Path('..')))
from src.data_loader import DataLoader
from src.models import FraudDetectionModels
from src.evaluator import ModelEvaluator

# Plot appearance: the 'seaborn-v0_8' matplotlib style sheet plus seaborn's
# "husl" colour palette; figures are rendered inline in the notebook.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline


## 1. Load Data and Models


In [6]:
# Load the evaluation data and rebuild the train/test split.
# The cleaned CSV (per the original note, produced by the Data Cleaning
# notebook) is preferred; if it is missing, the raw CSV is loaded and exact
# duplicate rows are dropped instead.
data_loader = DataLoader(data_dir='../data')

cleaned_csv_path = Path('../data/creditcard_cleaned.csv')
if cleaned_csv_path.exists():
    print(f"✓ Loading cleaned data from: {cleaned_csv_path.absolute()}")
    df_clean = pd.read_csv(cleaned_csv_path)
    print(f"  - Using cleaned dataset: {len(df_clean):,} transactions")
else:
    print("⚠ WARNING: Cleaned data not found!")
    print("Loading original data and removing duplicates...")
    df_original = data_loader.load_csv_data('creditcard.csv')
    df_clean = df_original.drop_duplicates(keep='first')
    # drop_duplicates(keep='first') removes exactly the rows that
    # duplicated(keep='first') flags, so the length difference is the
    # duplicate count — no second full scan of the frame is needed.
    n_removed = len(df_original) - len(df_clean)
    print(f"  - Removed {n_removed:,} duplicates")

# Split/scale with the same arguments the original cell used (the original
# author notes these match training — verify against the training notebook).
print("\n✓ Preprocessing cleaned dataset for evaluation...")
X_train, X_test, y_train, y_test, feature_cols = data_loader.preprocess_data(
    df_clean,  # Using cleaned dataset
    target_col='Class',
    test_size=0.2,
    random_state=42
)

# Cheap sanity check: features and labels must describe the same rows.
assert X_test.shape[0] == len(y_test), "X_test and y_test have different lengths"

print("\n✓ Test set prepared:")
print(f"  Test samples: {X_test.shape[0]:,}")
print(f"  Test features: {X_test.shape[1]}")
print(f"  Test fraud cases: {y_test.sum()} ({y_test.mean()*100:.2f}%)")
print(f"  Test normal cases: {len(y_test) - y_test.sum()} ({(1-y_test.mean())*100:.2f}%)")


✓ Loading cleaned data from: d:\h\Financial Fraud Detection-AI\transactions\notebooks\..\data\creditcard_cleaned.csv
  - Using cleaned dataset: 283,726 transactions

✓ Preprocessing cleaned dataset for evaluation...

Preprocessing data...
Features: 29
Feature columns: ['V1', 'V2', 'V3', 'V4', 'V5']... (showing first 5)

Train set: 226980 samples
  - Fraud: 378 (0.17%)
Test set: 56746 samples
  - Fraud: 95 (0.17%)

Scaling features...

✓ Test set prepared:
  Test samples: 56,746
  Test features: 29
  Test fraud cases: 95 (0.17%)
  Test normal cases: 56651 (99.83%)


In [7]:
# Load every trained model found in the models directory.
models_dir = '../models'
models = FraudDetectionModels(models_dir=models_dir)

# Pickles in the models directory that are skipped when collecting models
# (the original cell excluded these two stems).
non_model_stems = {'scaler', 'feature_columns'}

# Path.glob makes no ordering promise; sorting keeps the load order — and with
# it the order of `trained_models` and of every downstream evaluation — the
# same on every filesystem.
model_files = sorted(
    f for f in Path(models_dir).glob('*.pkl')
    if f.stem not in non_model_stems
)

# NOTE(review): load_model presumably unpickles these files; only point this
# at a trusted models directory — confirm against FraudDetectionModels.
trained_models = {}
for model_file in model_files:
    model_name = model_file.stem
    trained_models[model_name] = models.load_model(model_name)

print(f"Loaded {len(trained_models)} models:")
for name in trained_models:
    print(f"  - {name}")


Model loaded from ..\models\decision_tree.pkl
Model loaded from ..\models\logistic_regression.pkl
Model loaded from ..\models\random_forest.pkl
Model loaded from ..\models\xgboost.pkl
Loaded 4 models:
  - decision_tree
  - logistic_regression
  - random_forest
  - xgboost


## 2. Initialize Evaluator


In [8]:
# Create the evaluator used by every cell below. It is constructed with
# '../reports' as its reports directory (relative to the working directory).
evaluator = ModelEvaluator(reports_dir='../reports')
print("Evaluator initialized. Reports will be saved to ../reports/")


Evaluator initialized. Reports will be saved to ../reports/


## 3. Evaluate All Models


In [9]:
# Run the evaluator on every loaded model against the held-out test split.
for model_name in trained_models:
    model = trained_models[model_name]
    # Human-readable label, e.g. 'random_forest' -> 'Random Forest'
    display_name = ' '.join(model_name.split('_')).title()
    evaluator.evaluate_model(model, X_test, y_test, display_name)



Evaluating Decision Tree...

Metrics for Decision Tree:
  Accuracy:  0.9952
  Precision: 0.2277
  Recall:    0.7789
  F1-Score:  0.3524
  ROC-AUC:   0.8886
  PR-AUC:    0.4971

Confusion Matrix:
  TN: 56400, FP: 251
  FN: 21, TP: 74

Classification Report:
              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00     56651
       Fraud       0.23      0.78      0.35        95

    accuracy                           1.00     56746
   macro avg       0.61      0.89      0.67     56746
weighted avg       1.00      1.00      1.00     56746


Evaluating Logistic Regression...

Metrics for Logistic Regression:
  Accuracy:  0.9750
  Precision: 0.0558
  Recall:    0.8737
  F1-Score:  0.1049
  ROC-AUC:   0.9648
  PR-AUC:    0.6752

Confusion Matrix:
  TN: 55247, FP: 1404
  FN: 12, TP: 83

Classification Report:
              precision    recall  f1-score   support

      Normal       1.00      0.98      0.99     56651
       Fraud       0.06      0.87  

## 4. Model Comparison


In [10]:
# Build the side-by-side metrics table for every model evaluated above.
# Per the recorded output, compare_models() prints the table itself; the bare
# `comparison_df` on the last line additionally renders the returned table as
# the cell's rich output.
comparison_df = evaluator.compare_models()
comparison_df



MODEL COMPARISON
              Model  Accuracy  Precision   Recall  F1-Score  ROC-AUC   PR-AUC
      Random Forest  0.999489   0.985294 0.705263  0.822086 0.929913 0.807649
            Xgboost  0.998925   0.644068 0.800000  0.713615 0.973605 0.805578
      Decision Tree  0.995207   0.227692 0.778947  0.352381 0.888592 0.497137
Logistic Regression  0.975047   0.055817 0.873684  0.104930 0.964832 0.675156


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,ROC-AUC,PR-AUC
2,Random Forest,0.999489,0.985294,0.705263,0.822086,0.929913,0.807649
3,Xgboost,0.998925,0.644068,0.8,0.713615,0.973605,0.805578
0,Decision Tree,0.995207,0.227692,0.778947,0.352381,0.888592,0.497137
1,Logistic Regression,0.975047,0.055817,0.873684,0.10493,0.964832,0.675156


## 5. Generate Visualizations


In [11]:
# Plot ROC curves for the evaluated models and save them as 'roc_curves.png'.
# Per the recorded output, plot_roc_curves() already reports where the file
# was saved, so the hard-coded duplicate message was dropped. The trailing `;`
# keeps any return value from being echoed as cell output.
evaluator.plot_roc_curves(save_path='roc_curves.png');


ROC curves saved to ..\reports\roc_curves.png
ROC curves saved to ../reports/roc_curves.png


In [12]:
# Plot precision-recall curves for the evaluated models and save them as
# 'pr_curves.png'. Per the recorded output, plot_pr_curves() already reports
# where the file was saved, so the hard-coded duplicate message was dropped.
# The trailing `;` keeps any return value from being echoed as cell output.
evaluator.plot_pr_curves(save_path='pr_curves.png');


PR curves saved to ..\reports\pr_curves.png
Precision-Recall curves saved to ../reports/pr_curves.png


In [13]:
# Plot the confusion matrices of the evaluated models and save them as
# 'confusion_matrices.png'. Per the recorded output, plot_confusion_matrices()
# already reports where the file was saved, so the hard-coded duplicate message
# was dropped. The trailing `;` keeps any return value from being echoed.
evaluator.plot_confusion_matrices(save_path='confusion_matrices.png');


Confusion matrices saved to ..\reports\confusion_matrices.png
Confusion matrices saved to ../reports/confusion_matrices.png


In [None]:
# Plot a per-metric comparison of the evaluated models, passing
# 'metrics_comparison.png' as the save path (presumably resolved against the
# evaluator's reports directory, as the message below states — confirm).
# NOTE(review): this cell has no execution count or recorded output in the
# saved notebook; re-run the notebook top to bottom to confirm it works.
evaluator.plot_metrics_comparison(save_path='metrics_comparison.png')
print("Metrics comparison saved to ../reports/metrics_comparison.png")


## 6. Save Results


In [14]:
# Persist the evaluation metrics through the evaluator, passing
# 'evaluation_results.csv' as the file name. The value returned by
# save_results() is echoed below; in the recorded output it is the path of the
# written file.
results_path = evaluator.save_results('evaluation_results.csv')
print(f"\nResults saved to: {results_path}")



MODEL COMPARISON
              Model  Accuracy  Precision   Recall  F1-Score  ROC-AUC   PR-AUC
      Random Forest  0.999489   0.985294 0.705263  0.822086 0.929913 0.807649
            Xgboost  0.998925   0.644068 0.800000  0.713615 0.973605 0.805578
      Decision Tree  0.995207   0.227692 0.778947  0.352381 0.888592 0.497137
Logistic Regression  0.975047   0.055817 0.873684  0.104930 0.964832 0.675156
Results saved to ..\reports\evaluation_results.csv

Results saved to: ..\reports\evaluation_results.csv
