# Notebook 05: Comprehensive Evaluation Metrics
## Benchmarking SAM & MedSAM Robustness under Noisy Abdominal CT Conditions

**Author:** Hoang Le Chau        
**Date:** January 2026

---

### Objective
Calculate comprehensive evaluation metrics for all model predictions:
1. Dice Coefficient - overlap-based metric
2. IoU (Jaccard Index) - intersection over union
3. Hausdorff Distance - boundary accuracy metric
4. Precision and Recall - classification metrics
5. Stability metrics - performance variance across noise levels

These metrics enable quantitative comparison of model robustness.

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from scipy.spatial.distance import directed_hausdorff
from scipy import ndimage
import warnings
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8-paper')
%matplotlib inline

print("Libraries imported successfully")

Libraries imported successfully


### 1. Mount Drive and Setup Paths

In [2]:
from google.colab import drive
drive.mount('/content/drive')

BASE_PATH = Path('/content/drive/MyDrive/Colab Notebooks/AIMA/sam_noisy/')
OUTPUT_PATH = BASE_PATH / 'SAM_Robustness_Study'
PREDICTIONS_PATH = OUTPUT_PATH / 'predictions'
RESULTS_PATH = OUTPUT_PATH / 'results'
RESULTS_PATH.mkdir(exist_ok=True)

print(f"Predictions path: {PREDICTIONS_PATH}")
print(f"Results will be saved to: {RESULTS_PATH}")

Mounted at /content/drive
Predictions path: /content/drive/MyDrive/Colab Notebooks/AIMA/sam_noisy/SAM_Robustness_Study/predictions
Results will be saved to: /content/drive/MyDrive/Colab Notebooks/AIMA/sam_noisy/SAM_Robustness_Study/results


### 2. Evaluation Metrics Implementation

In [3]:
class SegmentationMetrics:
    """
    Class implementing comprehensive segmentation evaluation metrics.
    """

    @staticmethod
    def dice_coefficient(pred, gt):
        """
        Calculate Dice coefficient (F1 score).

        Args:
            pred: Prediction mask (binary)
            gt: Ground truth mask (binary)

        Returns:
            Dice score [0, 1]
        """
        pred_binary = (pred > 0).astype(np.float32)
        gt_binary = (gt > 0).astype(np.float32)

        intersection = (pred_binary * gt_binary).sum()
        union = pred_binary.sum() + gt_binary.sum()

        if union == 0:
            return 1.0 if intersection == 0 else 0.0

        return (2.0 * intersection) / union

    @staticmethod
    def iou(pred, gt):
        """
        Calculate Intersection over Union (Jaccard Index).

        Args:
            pred: Prediction mask (binary)
            gt: Ground truth mask (binary)

        Returns:
            IoU score [0, 1]
        """
        pred_binary = (pred > 0).astype(np.float32)
        gt_binary = (gt > 0).astype(np.float32)

        intersection = (pred_binary * gt_binary).sum()
        union = pred_binary.sum() + gt_binary.sum() - intersection

        if union == 0:
            return 1.0 if intersection == 0 else 0.0

        return intersection / union

    @staticmethod
    def precision_recall(pred, gt):
        """
        Calculate precision and recall.

        Args:
            pred: Prediction mask (binary)
            gt: Ground truth mask (binary)

        Returns:
            Tuple of (precision, recall)
        """
        pred_binary = (pred > 0).astype(np.float32)
        gt_binary = (gt > 0).astype(np.float32)

        tp = (pred_binary * gt_binary).sum()
        fp = ((pred_binary - gt_binary) > 0).sum()
        fn = ((gt_binary - pred_binary) > 0).sum()

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0

        return precision, recall

    @staticmethod
    def hausdorff_distance(pred, gt):
        """
        Calculate Hausdorff distance between boundaries.

        Args:
            pred: Prediction mask (binary)
            gt: Ground truth mask (binary)

        Returns:
            Hausdorff distance (pixels)
        """
        pred_binary = (pred > 0).astype(np.uint8)
        gt_binary = (gt > 0).astype(np.uint8)

        if pred_binary.sum() == 0 or gt_binary.sum() == 0:
            return np.inf

        pred_boundary = pred_binary - ndimage.binary_erosion(pred_binary)
        gt_boundary = gt_binary - ndimage.binary_erosion(gt_binary)

        pred_points = np.argwhere(pred_boundary > 0)
        gt_points = np.argwhere(gt_boundary > 0)

        if len(pred_points) == 0 or len(gt_points) == 0:
            return np.inf

        hd1 = directed_hausdorff(pred_points, gt_points)[0]
        hd2 = directed_hausdorff(gt_points, pred_points)[0]

        return max(hd1, hd2)

    @staticmethod
    def compute_all_metrics(pred, gt):
        """
        Compute all metrics for a single prediction-GT pair.

        Args:
            pred: Prediction mask
            gt: Ground truth mask

        Returns:
            Dictionary of metrics
        """
        dice = SegmentationMetrics.dice_coefficient(pred, gt)
        iou_score = SegmentationMetrics.iou(pred, gt)
        precision, recall = SegmentationMetrics.precision_recall(pred, gt)
        hausdorff = SegmentationMetrics.hausdorff_distance(pred, gt)

        return {
            'dice': dice,
            'iou': iou_score,
            'precision': precision,
            'recall': recall,
            'hausdorff': hausdorff
        }

print("SegmentationMetrics class defined")

SegmentationMetrics class defined


### 3. Batch Evaluation Function

In [4]:
def evaluate_predictions(pred_path, gt_path):
    """
    Evaluate all predictions in a directory.

    Args:
        pred_path: Path to predictions.npy
        gt_path: Path to ground_truth.npy

    Returns:
        DataFrame with metrics for each image
    """
    predictions = np.load(pred_path)
    ground_truths = np.load(gt_path)

    results = []

    for idx in range(len(predictions)):
        metrics = SegmentationMetrics.compute_all_metrics(
            predictions[idx],
            ground_truths[idx]
        )
        metrics['image_idx'] = idx
        results.append(metrics)

    return pd.DataFrame(results)

def get_all_prediction_paths(predictions_base_path, dataset_name):
    """
    Get all prediction paths for a dataset.

    Args:
        predictions_base_path: Base predictions directory
        dataset_name: Name of dataset

    Returns:
        List of tuples (variant_name, model_name, pred_path, gt_path)
    """
    dataset_path = predictions_base_path / dataset_name
    paths = []

    for variant_dir in sorted(dataset_path.iterdir()):
        if variant_dir.is_dir():
            for model_dir in sorted(variant_dir.iterdir()):
                if model_dir.is_dir():
                    pred_path = model_dir / 'predictions.npy'
                    gt_path = model_dir / 'ground_truth.npy'

                    if pred_path.exists() and gt_path.exists():
                        paths.append((
                            variant_dir.name,
                            model_dir.name,
                            pred_path,
                            gt_path
                        ))

    return paths

print("Evaluation functions defined")

Evaluation functions defined


### 4. Evaluate All Predictions

In [5]:
def evaluate_dataset(dataset_name, predictions_base_path, results_save_path):
    """
    Evaluate all predictions for a dataset.

    Args:
        dataset_name: Name of dataset
        predictions_base_path: Base predictions directory
        results_save_path: Path to save results

    Returns:
        DataFrame with all results
    """
    print(f"\n{'='*70}")
    print(f"Evaluating {dataset_name.upper()} Dataset")
    print(f"{'='*70}\n")

    paths = get_all_prediction_paths(predictions_base_path, dataset_name)
    print(f"Found {len(paths)} prediction sets to evaluate\n")

    all_results = []

    for variant_name, model_name, pred_path, gt_path in tqdm(paths, desc="Evaluating"):
        metrics_df = evaluate_predictions(pred_path, gt_path)

        metrics_df['variant'] = variant_name
        metrics_df['model'] = model_name.upper()
        metrics_df['dataset'] = dataset_name

        noise_type = '_'.join(variant_name.split('_')[:-1]) if variant_name != 'clean' else 'clean'
        intensity = variant_name.split('_')[-1] if variant_name != 'clean' else 'clean'

        metrics_df['noise_type'] = noise_type
        metrics_df['intensity'] = intensity

        all_results.append(metrics_df)

    full_results = pd.concat(all_results, ignore_index=True)

    save_path = results_save_path / f"{dataset_name}_detailed_metrics.csv"
    full_results.to_csv(save_path, index=False)
    print(f"\nDetailed results saved to {save_path}")

    return full_results

print("Evaluating Liver dataset...")
liver_results = evaluate_dataset('liver', PREDICTIONS_PATH, RESULTS_PATH)

print("\nEvaluating Spleen dataset...")
spleen_results = evaluate_dataset('spleen', PREDICTIONS_PATH, RESULTS_PATH)

print("\n" + "="*70)
print("EVALUATION COMPLETE")
print("="*70)

Evaluating Liver dataset...

Evaluating LIVER Dataset

Found 38 prediction sets to evaluate



Evaluating: 100%|██████████| 38/38 [01:18<00:00,  2.06s/it]



Detailed results saved to /content/drive/MyDrive/Colab Notebooks/AIMA/sam_noisy/SAM_Robustness_Study/results/liver_detailed_metrics.csv

Evaluating Spleen dataset...

Evaluating SPLEEN Dataset

Found 38 prediction sets to evaluate



Evaluating: 100%|██████████| 38/38 [01:13<00:00,  1.94s/it]


Detailed results saved to /content/drive/MyDrive/Colab Notebooks/AIMA/sam_noisy/SAM_Robustness_Study/results/spleen_detailed_metrics.csv

EVALUATION COMPLETE





### 5. Aggregate Statistics by Noise Type and Intensity

In [6]:
def compute_aggregate_statistics(results_df):
    """
    Compute aggregate statistics grouped by variant and model.

    Args:
        results_df: DataFrame with detailed results

    Returns:
        DataFrame with aggregate statistics
    """
    grouped = results_df.groupby(['dataset', 'variant', 'model', 'noise_type', 'intensity'])

    agg_stats = grouped.agg({
        'dice': ['mean', 'std', 'min', 'max'],
        'iou': ['mean', 'std', 'min', 'max'],
        'precision': ['mean', 'std'],
        'recall': ['mean', 'std'],
        'hausdorff': ['mean', 'std', 'median']
    }).reset_index()

    agg_stats.columns = ['_'.join(col).strip('_') for col in agg_stats.columns.values]

    return agg_stats

liver_agg = compute_aggregate_statistics(liver_results)
spleen_agg = compute_aggregate_statistics(spleen_results)

liver_agg.to_csv(RESULTS_PATH / 'liver_aggregate_metrics.csv', index=False)
spleen_agg.to_csv(RESULTS_PATH / 'spleen_aggregate_metrics.csv', index=False)

print("\nAggregate statistics computed and saved")
print(f"\nLiver aggregate shape: {liver_agg.shape}")
print(f"Spleen aggregate shape: {spleen_agg.shape}")


Aggregate statistics computed and saved

Liver aggregate shape: (38, 20)
Spleen aggregate shape: (38, 20)


### 6. Performance Summary Tables

In [7]:
def create_performance_summary(agg_df, dataset_name):
    """
    Create performance summary table.

    Args:
        agg_df: Aggregate statistics DataFrame
        dataset_name: Name of dataset

    Returns:
        Summary DataFrame
    """
    summary_cols = ['variant', 'model', 'dice_mean', 'iou_mean',
                   'precision_mean', 'recall_mean', 'hausdorff_mean']
    summary = agg_df[summary_cols].copy()

    summary = summary.round(4)

    return summary

print("\n" + "="*80)
print("LIVER DATASET - PERFORMANCE SUMMARY (Top 10)")
print("="*80)
liver_summary = create_performance_summary(liver_agg, 'Liver')
print(liver_summary.head(10).to_string(index=False))

print("\n" + "="*80)
print("SPLEEN DATASET - PERFORMANCE SUMMARY (Top 10)")
print("="*80)
spleen_summary = create_performance_summary(spleen_agg, 'Spleen')
print(spleen_summary.head(10).to_string(index=False))

liver_summary.to_csv(RESULTS_PATH / 'liver_summary.csv', index=False)
spleen_summary.to_csv(RESULTS_PATH / 'spleen_summary.csv', index=False)


LIVER DATASET - PERFORMANCE SUMMARY (Top 10)
                     variant  model  dice_mean  iou_mean  precision_mean  recall_mean  hausdorff_mean
                       clean MEDSAM     0.0000    0.0000          0.0000       0.0000             inf
                       clean    SAM     0.1313    0.0739          0.0739       1.0000        375.1101
               gaussian_mild MEDSAM     0.0692    0.0391          0.0500       0.1463        312.8318
               gaussian_mild    SAM     0.1394    0.0793          0.0793       1.0000        378.5870
           gaussian_moderate MEDSAM     0.0803    0.0464          0.0582       0.1495        314.5615
           gaussian_moderate    SAM     0.1343    0.0764          0.0764       1.0000        386.2715
             gaussian_severe MEDSAM     0.0394    0.0216          0.0285       0.0750        319.6416
             gaussian_severe    SAM     0.1312    0.0743          0.0743       1.0000        397.9276
intensity_inhomogeneity_mild MEDSAM 

### 7. Model Comparison (Clean vs Noisy)

In [8]:
def compare_clean_vs_noisy(results_df, dataset_name):
    """
    Compare performance on clean vs noisy data.

    Args:
        results_df: Results DataFrame
        dataset_name: Name of dataset

    Returns:
        Comparison DataFrame
    """
    clean_results = results_df[results_df['variant'] == 'clean'].groupby('model').agg({
        'dice': 'mean',
        'iou': 'mean',
        'hausdorff': 'mean'
    }).reset_index()
    clean_results.columns = ['model', 'dice_clean', 'iou_clean', 'hausdorff_clean']

    noisy_results = results_df[results_df['variant'] != 'clean'].groupby('model').agg({
        'dice': 'mean',
        'iou': 'mean',
        'hausdorff': 'mean'
    }).reset_index()
    noisy_results.columns = ['model', 'dice_noisy', 'iou_noisy', 'hausdorff_noisy']

    comparison = pd.merge(clean_results, noisy_results, on='model')

    comparison['dice_degradation'] = comparison['dice_clean'] - comparison['dice_noisy']
    comparison['iou_degradation'] = comparison['iou_clean'] - comparison['iou_noisy']
    comparison['hausdorff_increase'] = comparison['hausdorff_noisy'] - comparison['hausdorff_clean']

    return comparison.round(4)

print("\n" + "="*80)
print("CLEAN VS NOISY COMPARISON")
print("="*80)

print("\nLiver Dataset:")
liver_comparison = compare_clean_vs_noisy(liver_results, 'Liver')
print(liver_comparison.to_string(index=False))

print("\nSpleen Dataset:")
spleen_comparison = compare_clean_vs_noisy(spleen_results, 'Spleen')
print(spleen_comparison.to_string(index=False))

liver_comparison.to_csv(RESULTS_PATH / 'liver_clean_vs_noisy.csv', index=False)
spleen_comparison.to_csv(RESULTS_PATH / 'spleen_clean_vs_noisy.csv', index=False)


CLEAN VS NOISY COMPARISON

Liver Dataset:
 model  dice_clean  iou_clean  hausdorff_clean  dice_noisy  iou_noisy  hausdorff_noisy  dice_degradation  iou_degradation  hausdorff_increase
MEDSAM      0.0000     0.0000              inf      0.0586     0.0333         314.6080           -0.0586          -0.0333                -inf
   SAM      0.1313     0.0739         375.1101      0.1320     0.0745         380.7734           -0.0007          -0.0006              5.6634

Spleen Dataset:
 model  dice_clean  iou_clean  hausdorff_clean  dice_noisy  iou_noisy  hausdorff_noisy  dice_degradation  iou_degradation  hausdorff_increase
MEDSAM      0.0000     0.0000              inf      0.0597     0.0320         357.2129           -0.0597          -0.0320                -inf
   SAM      0.0396     0.0203         417.5114      0.0404     0.0207         416.5814           -0.0008          -0.0004             -0.9299


### 8. Robustness Ranking by Noise Type

In [9]:
def rank_noise_impact(results_df, dataset_name):
    """
    Rank noise types by their impact on performance.

    Args:
        results_df: Results DataFrame
        dataset_name: Name of dataset

    Returns:
        Ranked DataFrame
    """
    clean_dice = results_df[results_df['variant'] == 'clean']['dice'].mean()

    noisy_by_type = results_df[results_df['variant'] != 'clean'].groupby('noise_type').agg({
        'dice': 'mean',
        'iou': 'mean'
    }).reset_index()

    noisy_by_type['dice_drop'] = clean_dice - noisy_by_type['dice']
    noisy_by_type = noisy_by_type.sort_values('dice_drop', ascending=False)

    return noisy_by_type.round(4)

print("\n" + "="*70)
print("NOISE IMPACT RANKING (Most to Least Harmful)")
print("="*70)

print("\nLiver Dataset:")
liver_noise_rank = rank_noise_impact(liver_results, 'Liver')
print(liver_noise_rank.to_string(index=False))

print("\nSpleen Dataset:")
spleen_noise_rank = rank_noise_impact(spleen_results, 'Spleen')
print(spleen_noise_rank.to_string(index=False))

liver_noise_rank.to_csv(RESULTS_PATH / 'liver_noise_impact_ranking.csv', index=False)
spleen_noise_rank.to_csv(RESULTS_PATH / 'spleen_noise_impact_ranking.csv', index=False)


NOISE IMPACT RANKING (Most to Least Harmful)

Liver Dataset:
             noise_type   dice    iou  dice_drop
            motion_blur 0.0867 0.0490    -0.0211
                poisson 0.0907 0.0510    -0.0250
intensity_inhomogeneity 0.0944 0.0535    -0.0288
           low_contrast 0.0980 0.0552    -0.0323
               gaussian 0.0990 0.0562    -0.0334
            salt_pepper 0.1030 0.0587    -0.0373

Spleen Dataset:
             noise_type   dice    iou  dice_drop
           low_contrast 0.0462 0.0243    -0.0264
            motion_blur 0.0467 0.0247    -0.0269
                poisson 0.0480 0.0251    -0.0282
intensity_inhomogeneity 0.0490 0.0258    -0.0292
               gaussian 0.0531 0.0279    -0.0333
            salt_pepper 0.0574 0.0303    -0.0376


### 9. Final Summary Report

In [10]:
print("\n" + "="*80)
print("EVALUATION METRICS SUMMARY")
print("="*80)

summary_info = {
    'Total Evaluations': len(liver_results) + len(spleen_results),
    'Liver Samples': len(liver_results),
    'Spleen Samples': len(spleen_results),
    'Metrics Computed': ['Dice', 'IoU', 'Precision', 'Recall', 'Hausdorff Distance'],
    'Models Evaluated': liver_results['model'].unique().tolist(),
    'Noise Types Evaluated': liver_results['noise_type'].unique().tolist()
}

for key, value in summary_info.items():
    print(f"{key}: {value}")

print("\nFiles Saved:")
saved_files = [
    'liver_detailed_metrics.csv',
    'spleen_detailed_metrics.csv',
    'liver_aggregate_metrics.csv',
    'spleen_aggregate_metrics.csv',
    'liver_summary.csv',
    'spleen_summary.csv',
    'liver_clean_vs_noisy.csv',
    'spleen_clean_vs_noisy.csv',
    'liver_noise_impact_ranking.csv',
    'spleen_noise_impact_ranking.csv'
]

for filename in saved_files:
    print(f"  - {filename}")

print("\n" + "="*80)


EVALUATION METRICS SUMMARY
Total Evaluations: 3800
Liver Samples: 1900
Spleen Samples: 1900
Metrics Computed: ['Dice', 'IoU', 'Precision', 'Recall', 'Hausdorff Distance']
Models Evaluated: ['MEDSAM', 'SAM']
Noise Types Evaluated: ['clean', 'gaussian', 'intensity_inhomogeneity', 'low_contrast', 'motion_blur', 'poisson', 'salt_pepper']

Files Saved:
  - liver_detailed_metrics.csv
  - spleen_detailed_metrics.csv
  - liver_aggregate_metrics.csv
  - spleen_aggregate_metrics.csv
  - liver_summary.csv
  - spleen_summary.csv
  - liver_clean_vs_noisy.csv
  - spleen_clean_vs_noisy.csv
  - liver_noise_impact_ranking.csv
  - spleen_noise_impact_ranking.csv

