# ROC Curve Analysis for CIFAR-10 CNN Experiments

This notebook computes ROC curves and AUC scores for CIFAR-10 CNN classification experiments.
It retrieves prediction probability files as assets from the DerivaML catalog.

**Input:** List of asset RIDs for probability files (configured via `assets` parameter)

**Output:** ROC curves comparing model performance across experiments

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

from deriva_ml import DerivaML
from deriva_ml.execution import Execution, ExecutionConfiguration

## Parameters

Configure asset RIDs to analyze. These should be `prediction_probabilities.csv` files
from completed CIFAR-10 CNN executions.

In [None]:
# Parameters cell - injected by papermill or deriva-ml-run-notebook.
# Keep this cell to plain literal assignments; the values below are defaults
# used when nothing is injected.
# `host` and `catalog` are handed to DerivaML(hostname=..., catalog_id=...).
host: str = "localhost"
catalog: str = "45"
# Asset RIDs for prediction probability files; the list is passed unchanged to
# ExecutionConfiguration(assets=...).
assets: list[str] = ["3JSJ", "3KVC"]  # Default: multirun comparison probabilities

## Create Execution Context and Download Assets

In [None]:
# Initialize DerivaML connection from the `host` / `catalog` parameters.
# `ml` is reused by later cells to create the workflow, build the execution,
# and list feature values.
ml = DerivaML(hostname=host, catalog_id=catalog)
print(f"Connected to {ml.host_name}, catalog {ml.catalog_id}")

In [None]:
# Create execution context with assets - this downloads the probability files
# Describe this analysis as a workflow (name, type, description) through the
# catalog handle.
workflow = ml.create_workflow(
    name="ROC Analysis",
    workflow_type="ROC Analysis Notebook",
    description="Compute ROC curves and AUC scores for CIFAR-10 CNN experiments",
)
# Bundle the workflow with the input asset RIDs from the parameters cell.
exec_config = ExecutionConfiguration(
    workflow=workflow,
    assets=assets,
    description=f"ROC analysis of {len(assets)} experiments",
)
# `execution.asset_paths` is read as a mapping below (its keys are printed here
# and its items are iterated when loading the CSVs).
execution = Execution(configuration=exec_config, ml_object=ml)
print(f"Created execution {execution.execution_rid}")
print(f"Downloaded assets: {list(execution.asset_paths.keys())}")

In [None]:
# Build one record per downloaded probability CSV: its asset RID, its file
# name, and the parsed table.
experiments = []
for asset_list in execution.asset_paths.values():
    for asset_path in asset_list:
        # Anything that is not a probability table is ignored.
        if asset_path.file_name.name != "prediction_probabilities.csv":
            continue
        df = pd.read_csv(asset_path.file_name)
        record = {
            'asset_rid': asset_path.asset_rid,
            'file_name': asset_path.file_name.name,
            'data': df
        }
        experiments.append(record)
        print(f"Loaded {len(df)} predictions from asset {asset_path.asset_rid}")

print(f"\nSuccessfully loaded {len(experiments)} experiments")

## Get Ground Truth Labels

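Retrieve ground truth labels from the `Image_Classification` feature table.
Ground truth labels are those without confidence scores (manually assigned).
If every execution has confidence scores, the execution with the most labeled
images is used as a fallback.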

In [None]:
# Pull every Image_Classification feature value into a single DataFrame
all_feature_values = list(ml.list_feature_values("Image", "Image_Classification"))
feature_df = pd.DataFrame(all_feature_values)

# Per-execution tally: number of labeled images, and how many of those labels
# carry a confidence score (ground truth labels carry none).
exec_summary = feature_df.groupby('Execution').agg(
    num_images=('Image', 'count'),
    with_confidence=('Confidence', lambda x: x.notna().sum()),
)

# Prefer the first execution with zero confidence scores; otherwise fall back
# to the execution that labeled the most images.
gt_mask = exec_summary['with_confidence'].eq(0)
gt_execution = (
    gt_mask[gt_mask].index[0]
    if gt_mask.any()
    else exec_summary['num_images'].idxmax()
)

print(f"Ground truth execution: {gt_execution}")
print(f"Total ground truth labels: {exec_summary.loc[gt_execution, 'num_images']}")

In [None]:
# Image -> class lookup restricted to the ground truth execution
from_gt_execution = feature_df['Execution'] == gt_execution
ground_truth = feature_df.loc[from_gt_execution, ['Image', 'Image_Class']]
gt_lookup = ground_truth.set_index('Image')['Image_Class'].to_dict()

# Sorted distinct labels define the class order used everywhere below
class_names = sorted(pd.unique(ground_truth['Image_Class']))
n_classes = len(class_names)
print(f"Classes ({n_classes}): {class_names}")

## Merge Predictions with Ground Truth

In [None]:
# Attach the ground truth class to every prediction row, drop rows that have
# no ground truth, and record sample count and accuracy per experiment.
for exp in experiments:
    df = exp['data'].copy()

    # Debug aid: compare the RID formats on both sides of the join
    print(f"\nAsset {exp['asset_rid']}:")
    print(f"  Prediction Image_RIDs (first 5): {df['Image_RID'].head().tolist()}")
    print(f"  Ground truth Image keys (first 5): {list(gt_lookup.keys())[:5]}")

    df['True_Class'] = df['Image_RID'].map(gt_lookup)
    has_truth = df['True_Class'].notna()
    matched = has_truth.sum()
    print(f"  Matched: {matched} / {len(df)}")

    # Keep only images with ground truth
    df = df[has_truth]
    exp['data'] = df
    exp['n_samples'] = len(df)
    if df.empty:
        exp['accuracy'] = float('nan')
        print("  No matching samples found!")
    else:
        exp['accuracy'] = (df['Predicted_Class'] == df['True_Class']).mean() * 100
        print(f"  Accuracy: {exp['accuracy']:.1f}%")

## Compute ROC Curves

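For multi-class classification, compute ROC curves using a one-vs-rest approach: each class is scored against all the others, and micro- and macro-averaged AUC values summarize overall performance.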

In [None]:
def compute_roc_metrics(df: pd.DataFrame, class_names: list[str]) -> dict:
    """Compute one-vs-rest ROC curves and AUC scores for multi-class predictions.

    Args:
        df: DataFrame with a ``True_Class`` column and one ``prob_<class>``
            column for every entry of ``class_names``.
        class_names: Ordered list of class names. The order fixes the column
            order of both the label indicator matrix and the score matrix.

    Returns:
        Dict with keys ``'fpr'``, ``'tpr'`` and ``'roc_auc'``. ``fpr`` and
        ``tpr`` map each class name and ``'micro'`` to the curve arrays.
        ``roc_auc`` maps each class name, ``'micro'`` and ``'macro'`` to a
        score; the macro value is the unweighted mean of the per-class AUCs
        (no macro curve is produced).

    Raises:
        KeyError: If ``True_Class`` or any ``prob_<class>`` column is missing.
        ValueError: If ``df`` has no rows, or ``True_Class`` contains a label
            that is not in ``class_names``.
    """
    n_classes = len(class_names)
    prob_cols = [f"prob_{c}" for c in class_names]

    # Fail early with a readable message instead of deep inside pandas/sklearn.
    missing = [col for col in ['True_Class', *prob_cols] if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")
    if df.empty:
        raise ValueError("Cannot compute ROC metrics: DataFrame has no samples")

    # Convert labels to indices; labels outside class_names map to NaN.
    class_to_idx = {name: i for i, name in enumerate(class_names)}
    label_idx = df['True_Class'].map(class_to_idx)
    unknown = label_idx.isna()
    if unknown.any():
        bad_labels = df.loc[unknown, 'True_Class'].unique().tolist()
        raise ValueError(f"True_Class contains labels not in class_names: {bad_labels}")

    # Explicit one-hot indicator matrix of shape (n_samples, n_classes).
    # label_binarize is avoided on purpose: for two classes it returns a single
    # column, which breaks the per-class indexing and the micro-average below.
    y_true_idx = label_idx.to_numpy(dtype=int)
    y_true_bin = (y_true_idx[:, None] == np.arange(n_classes)).astype(int)

    # Probability matrix, columns in class_names order
    y_score = df[prob_cols].to_numpy()

    # Per-class (one-vs-rest) ROC
    fpr, tpr, roc_auc = {}, {}, {}
    for i, name in enumerate(class_names):
        fpr[name], tpr[name], _ = roc_curve(y_true_bin[:, i], y_score[:, i])
        roc_auc[name] = auc(fpr[name], tpr[name])

    # Micro-average: pool every (sample, class) decision into one binary problem
    fpr['micro'], tpr['micro'], _ = roc_curve(y_true_bin.ravel(), y_score.ravel())
    roc_auc['micro'] = auc(fpr['micro'], tpr['micro'])

    # Macro-average: unweighted mean of the per-class AUCs
    roc_auc['macro'] = np.mean([roc_auc[c] for c in class_names])

    return {'fpr': fpr, 'tpr': tpr, 'roc_auc': roc_auc}

In [None]:
# Attach fpr / tpr / roc_auc to every experiment record and report the headline
# numbers.
for exp in experiments:
    exp.update(compute_roc_metrics(exp['data'], class_names))

    print(f"\nAsset {exp['asset_rid']}:")
    print(f"  Accuracy: {exp['accuracy']:.2f}%")
    print(f"  Micro-AUC: {exp['roc_auc']['micro']:.4f}")
    print(f"  Macro-AUC: {exp['roc_auc']['macro']:.4f}")

In [None]:
# AUC comparison table: one row per asset, one column per class plus the
# micro and macro summaries.
if not experiments:
    print("No experiments loaded")
else:
    auc_data = [
        {
            'Asset': exp['asset_rid'],
            **{c: exp['roc_auc'][c] for c in class_names},
            'Micro': exp['roc_auc']['micro'],
            'Macro': exp['roc_auc']['macro'],
        }
        for exp in experiments
    ]

    auc_df = pd.DataFrame(auc_data).set_index('Asset')
    print("\nPer-class AUC scores:")
    display(auc_df.round(4))

## Plot ROC Curves

In [None]:
def plot_roc_curves(exp: dict, class_names: list[str]):
    """Draw the micro-average and per-class ROC curves of one experiment.

    Args:
        exp: Experiment record providing 'fpr', 'tpr', 'roc_auc', 'asset_rid'
            and 'accuracy'.
        class_names: Classes to draw, in legend order.

    Returns:
        The matplotlib Figure holding the plot.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    fpr = exp['fpr']
    tpr = exp['tpr']
    roc_auc = exp['roc_auc']

    # Micro-average goes first so it heads the legend
    ax.plot(
        fpr['micro'],
        tpr['micro'],
        color='deeppink',
        linestyle=':',
        linewidth=3,
        label=f"Micro-avg (AUC={roc_auc['micro']:.3f})",
    )

    # One curve per class, colors spread across the tab10 colormap
    palette = plt.cm.tab10(np.linspace(0, 1, len(class_names)))
    for name, color in zip(class_names, palette):
        ax.plot(fpr[name], tpr[name], color=color,
                label=f"{name} (AUC={roc_auc[name]:.3f})")

    # Chance diagonal (unlabeled, so it stays out of the legend)
    ax.plot([0, 1], [0, 1], 'k--', alpha=0.5)
    ax.set(
        xlim=[0, 1],
        ylim=[0, 1.05],
        xlabel='False Positive Rate',
        ylabel='True Positive Rate',
        title=f"ROC Curves - Asset {exp['asset_rid']} (Acc: {exp['accuracy']:.1f}%)",
    )
    ax.legend(loc='lower right', fontsize=9)
    ax.grid(True, alpha=0.3)

    return fig

In [None]:
# One ROC figure per experiment
for exp in experiments:
    fig = plot_roc_curves(exp, class_names)
    fig.tight_layout()
    plt.show()

## Experiment Comparison

In [None]:
# Overlay the micro-average ROC curve of every experiment on one set of axes
if len(experiments) <= 1:
    print("Single experiment - no comparison plot")
else:
    fig, ax = plt.subplots(figsize=(10, 8))
    colors = plt.cm.Set1(np.linspace(0, 1, len(experiments)))

    for exp, color in zip(experiments, colors):
        label = f"{exp['asset_rid']} (AUC={exp['roc_auc']['micro']:.3f}, Acc={exp['accuracy']:.1f}%)"
        ax.plot(exp['fpr']['micro'], exp['tpr']['micro'],
                color=color, linewidth=2, label=label)

    # Chance diagonal for reference
    ax.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='Random')
    ax.set(
        xlim=[0, 1],
        ylim=[0, 1.05],
        xlabel='False Positive Rate',
        ylabel='True Positive Rate',
        title='ROC Curve Comparison (Micro-Average)',
    )
    ax.legend(loc='lower right')
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

## Summary

In [None]:
# Text recap of the run: catalog, execution, ground truth source, and the
# headline metrics of every experiment.
banner = "=" * 60

print(banner)
print("CIFAR-10 ROC Analysis Summary")
print(banner)
print(f"Catalog: {ml.host_name}:{ml.catalog_id}")
print(f"Execution: {execution.execution_rid}")
print(f"Ground truth: Execution {gt_execution} ({len(gt_lookup)} labels)")
print(f"Classes: {n_classes}")
print(f"Experiments analyzed: {len(experiments)}")

for exp in experiments:
    print(f"\n  Asset {exp['asset_rid']}:")
    print(f"    Samples: {exp['n_samples']}")
    print(f"    Accuracy: {exp['accuracy']:.2f}%")
    print(f"    Micro-AUC: {exp['roc_auc']['micro']:.4f}")
    print(f"    Macro-AUC: {exp['roc_auc']['macro']:.4f}")

print("\n" + banner)

In [None]:
# Complete execution and upload outputs
# NOTE(review): the cells above display figures and tables but none of them
# visibly writes a file - confirm which outputs this execution is expected to
# upload (e.g. saved ROC plots or the AUC table).
execution.upload_execution_outputs()
print(f"\nExecution completed: {execution.execution_rid}")