# ROC Curve Analysis for CIFAR-10 CNN Experiments

This notebook computes ROC (Receiver Operating Characteristic) curves and AUC (Area Under Curve) scores for CIFAR-10 CNN classification experiments. It compares model performance across different training configurations.

## Workflow

1. Load prediction probability files from DerivaML catalog as assets
2. Retrieve ground truth labels from the Image_Classification feature table
3. Compute per-class and micro-averaged ROC curves, plus micro- and macro-averaged AUC scores
4. Generate comparison visualizations

## Requirements

- Prediction probability CSV files with columns: `Image_RID`, `Predicted_Class`, `prob_<classname>` for each class
- Ground truth labels stored in the Image_Classification feature (from a labeling execution with no confidence scores)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

from deriva_ml import Experiment
from deriva_ml.execution import run_notebook

## Initialize Notebook

Initialize the notebook with DerivaML execution context. This single call:
1. Loads all configuration modules
2. Resolves the hydra-zen configuration
3. Creates the DerivaML connection
4. Creates a workflow and execution context
5. Downloads any specified assets

Override configuration at runtime:
```python
ml, execution, config = run_notebook(
    "roc_analysis",
    overrides=["assets=roc_quick_probabilities"],  # Analyze single experiment
)
```

Available asset configurations (see `src/configs/assets.py`):
- `roc_quick_probabilities` - cifar10_quick experiment only
- `roc_extended_probabilities` - cifar10_extended experiment only  
- `roc_comparison_probabilities` - both experiments (default)

In [None]:
# Bootstrap the notebook: a single run_notebook call returns the DerivaML
# handle, the execution context, and the resolved configuration.
ml, execution, config = run_notebook(
    "roc_analysis",
    workflow_type="ROC Analysis Notebook",
)

# Echo the resolved context so the run can be traced from the notebook output.
print(f"Connected to {ml.host_name}, catalog {ml.catalog_id}")
print(f"Execution: {execution.execution_rid}")
print(f"Assets: {config.assets}")
print(f"Show per-class curves: {config.show_per_class}")
print(f"Confidence threshold: {config.confidence_threshold}")
print(f"Downloaded: {list(execution.asset_paths.keys())}")

## Load Probability Files from Assets

Load the prediction probability CSV files from the downloaded assets.

In [None]:
# Collect every downloaded prediction CSV, whichever asset table it came from.
# NOTE(review): the following cell reassigns `experiments` from scratch, so the
# list built here only feeds this cell's own printout.
experiments = []
for asset_list in execution.asset_paths.values():
    for asset_path in asset_list:
        csv_path = asset_path.file_name
        if csv_path.name != "prediction_probabilities.csv":
            # Other downloaded assets are not prediction tables.
            continue

        df = pd.read_csv(csv_path)
        experiments.append({
            'asset_rid': asset_path.asset_rid,
            'file_name': csv_path.name,
            'data': df
        })
        print(f"Loaded {len(df)} predictions from asset {asset_path.asset_rid}")

print(f"\nSuccessfully loaded {len(experiments)} experiments")

In [None]:
from IPython.display import display, Markdown, HTML

# Build one record per prediction CSV, enriched with the Experiment that
# produced it. Only files named prediction_probabilities.csv are used; any
# other downloaded asset in the 'Execution_Asset' table is ignored.
experiments = []

for asset_path in execution.asset_paths.get('Execution_Asset', []):
    if asset_path.file_name.name != "prediction_probabilities.csv":
        continue

    # Find the source execution that produced this asset
    asset = ml.lookup_asset(asset_path.asset_rid)
    asset_executions = asset.list_executions(asset_role='Output')
    if not asset_executions:
        # Without a producing execution no Experiment can be built; report it
        # instead of dropping the asset silently.
        print(f"Skipping asset {asset_path.asset_rid}: no source execution found")
        continue

    exec_rid = asset_executions[0]['Execution']
    exp = Experiment(ml, exec_rid)

    # Load prediction data
    df = pd.read_csv(asset_path.file_name)

    experiments.append({
        'experiment': exp,
        'asset_rid': asset_path.asset_rid,
        'data': df,
        'name': exp.name,
        'config_choices': exp.config_choices,
        'model_config': exp.model_config,
        # The plotting and summary cells look these two keys up to link each
        # result back to the execution that produced the predictions.
        'source_execution': exec_rid,
        'execution_url': exp.get_chaise_url(),
    })
    print(f"Loaded {len(df)} predictions from {exp.name} (execution {exec_rid})")

print(f"\nSuccessfully loaded {len(experiments)} experiments")

# Display experiment configurations using Experiment class
display(Markdown("## Experiment Configurations"))

for exp_data in experiments:
    exp = exp_data['experiment']

    # Header with execution link
    display(HTML(f"<hr/><h3>{exp.name} (<a href='{exp.get_chaise_url()}' target='_blank'>{exp.execution_rid}</a>)</h3>"))

    # Description
    if exp.description:
        display(Markdown(f"**Description:** {exp.description}"))

    # Config choices
    if exp.config_choices:
        display(Markdown("**Configuration Choices:**"))
        choices_str = ", ".join(f"{k}={v}" for k, v in sorted(exp.config_choices.items()))
        display(HTML(f"<code>{choices_str}</code>"))

    # Model configuration (keys starting with '_' are hidden from the table).
    # `or {}` keeps an absent model config from raising, mirroring the
    # truthiness checks used for the other optional fields in this loop.
    model_cfg = {k: v for k, v in (exp.model_config or {}).items() if not k.startswith('_')}
    if model_cfg:
        config_df = pd.DataFrame([{'Parameter': k, 'Value': v} for k, v in sorted(model_cfg.items())]).set_index('Parameter')
        display(Markdown("**Model Configuration:**"))
        display(config_df)

    # Input datasets
    if exp.input_datasets:
        display(Markdown("**Input Datasets:**"))
        for ds in exp.input_datasets:
            url = f"https://{ml.host_name}/chaise/record/#{ml.catalog_id}/deriva-ml:Dataset/RID={ds.dataset_rid}"
            types_str = f" [{', '.join(ds.dataset_types)}]" if ds.dataset_types else ""
            display(HTML(f"• <a href='{url}' target='_blank'>{ds.dataset_rid}</a> v{ds.current_version}{types_str}"))

    # Input assets
    if exp.input_assets:
        display(Markdown("**Input Assets:**"))
        for asset in exp.input_assets:
            display(HTML(f"• <a href='{asset.get_chaise_url()}' target='_blank'>{asset.asset_rid}</a> — {asset.filename}"))

display(Markdown("---"))

## Get Ground Truth Labels

Retrieve ground truth labels from the `Image_Classification` feature table. This feature stores classification labels for images, potentially from multiple sources (executions).

**Identifying ground truth:**
- Ground truth labels are manually assigned and have **no confidence score** (NULL)
- Model predictions have confidence scores from softmax probabilities
- We identify the ground truth execution by finding the execution whose labels carry no confidence value at all (every `Confidence` entry is NULL)

In [None]:
# Get ground truth labels from the feature table
all_feature_values = list(ml.list_feature_values("Image", "Image_Classification"))
feature_df = pd.DataFrame(all_feature_values)

# Ground truth labels have no confidence score (manually labeled)
# Group by execution to identify which has ground truth
exec_summary = feature_df.groupby('Execution').agg({
    'Image': 'count',
    'Confidence': lambda x: x.notna().sum()
}).rename(columns={'Image': 'num_images', 'Confidence': 'with_confidence'})

# Find execution with no confidence scores (ground truth)
gt_mask = exec_summary['with_confidence'] == 0
if gt_mask.any():
    gt_execution = exec_summary[gt_mask].index[0]
else:
    gt_execution = exec_summary['num_images'].idxmax()
    
print(f"Ground truth execution: {gt_execution}")
print(f"Total ground truth labels: {exec_summary.loc[gt_execution, 'num_images']}")

In [None]:
# Keep only the rows written by the ground truth execution and turn them into
# an Image -> Image_Class lookup dictionary.
from_gt_execution = feature_df['Execution'] == gt_execution
ground_truth = feature_df.loc[from_gt_execution, ['Image', 'Image_Class']]
gt_lookup = dict(zip(ground_truth['Image'], ground_truth['Image_Class']))

# The sorted distinct labels fix the class order used by the rest of the notebook.
class_names = sorted(ground_truth['Image_Class'].unique())
n_classes = len(class_names)
print(f"Classes ({n_classes}): {class_names}")

## Merge Predictions with Ground Truth

Join prediction data with ground truth labels using `Image_RID` as the key. Only images that have both predictions and ground truth labels will be included in the ROC analysis.

In [None]:
# Attach the ground truth label to every prediction row, experiment by experiment
for exp in experiments:
    df = exp['data'].copy()

    # Debug aid: compare a few prediction RIDs with a few ground truth keys
    print(f"\nAsset {exp['asset_rid']}:")
    print(f"  Prediction Image_RIDs (first 5): {df['Image_RID'].head().tolist()}")
    print(f"  Ground truth Image keys (first 5): {list(gt_lookup.keys())[:5]}")

    df['True_Class'] = df['Image_RID'].map(gt_lookup)
    has_truth = df['True_Class'].notna()
    matched = has_truth.sum()
    print(f"  Matched: {matched} / {len(df)}")

    # Rows without a ground truth label are removed before any metric is computed
    df = df[has_truth]
    exp['data'] = df
    exp['n_samples'] = len(df)
    if df.empty:
        exp['accuracy'] = float('nan')
        print("  No matching samples found!")
    else:
        exp['accuracy'] = (df['Predicted_Class'] == df['True_Class']).mean() * 100
        print(f"  Accuracy: {exp['accuracy']:.1f}%")

## Compute ROC Curves

For multi-class classification, we use the **one-vs-rest (OvR)** approach:
- Each class gets its own ROC curve treating it as positive vs. all others
- **Micro-average**: Aggregate all classes, treating each prediction as independent
- **Macro-average**: Simple mean of per-class AUC scores (equal weight to each class)

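AUC (Area Under ROC Curve) ranges from 0.0 to 1.0: 0.5 corresponds to random guessing and 1.0 to perfect discrimination, while values below 0.5 indicate a classifier that ranks worse than chance.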

In [None]:
def compute_roc_metrics(df: pd.DataFrame, class_names: list[str]) -> dict:
    """Compute one-vs-rest ROC curves and AUC scores for multi-class predictions.

    Args:
        df: DataFrame with a ``True_Class`` column and one ``prob_<class>``
            column for every entry of ``class_names``.
        class_names: Ordered list of class names; the order fixes the column
            order of the label and score matrices.

    Returns:
        Dict with three entries:
        ``fpr`` and ``tpr`` map each class name and ``'micro'`` to the curve
        coordinates; ``roc_auc`` maps each class name, ``'micro'`` and
        ``'macro'`` to an AUC value. The macro value is the plain mean of the
        per-class AUCs. For an empty ``df`` the curves are empty arrays and
        every AUC is NaN.

    Raises:
        ValueError: If ``True_Class`` holds a label that is not in
            ``class_names``.
        KeyError: If a ``prob_<class>`` column is missing from ``df``.
    """
    n_classes = len(class_names)
    class_to_idx = {name: i for i, name in enumerate(class_names)}

    # No rows (e.g. no prediction matched a ground truth label): there is
    # nothing to rank, so report undefined metrics instead of failing.
    if len(df) == 0:
        curve_keys = [*class_names, 'micro']
        return {
            'fpr': {key: np.array([], dtype=float) for key in curve_keys},
            'tpr': {key: np.array([], dtype=float) for key in curve_keys},
            'roc_auc': {key: float('nan') for key in [*curve_keys, 'macro']},
        }

    # Convert labels to indices, rejecting labels outside class_names up front
    label_idx = df['True_Class'].map(class_to_idx)
    unknown = label_idx.isna()
    if unknown.any():
        bad_labels = sorted(set(df.loc[unknown, 'True_Class'].astype(str)))
        raise ValueError(f"True_Class contains labels not in class_names: {bad_labels}")
    y_true_idx = label_idx.to_numpy(dtype=int)

    # One-hot encode with one column per class. This is built by hand because
    # label_binarize collapses the two-class case to a single column, which
    # would break the per-class column indexing below.
    y_true_bin = (y_true_idx[:, np.newaxis] == np.arange(n_classes)).astype(int)

    # Get probability matrix (columns in class_names order)
    prob_cols = [f"prob_{c}" for c in class_names]
    y_score = df[prob_cols].values

    # Compute per-class ROC
    # NOTE(review): a class with no positive sample in df presumably yields an
    # undefined AUC that propagates into the macro mean; confirm if that matters.
    fpr, tpr, roc_auc = {}, {}, {}
    for i, name in enumerate(class_names):
        fpr[name], tpr[name], _ = roc_curve(y_true_bin[:, i], y_score[:, i])
        roc_auc[name] = auc(fpr[name], tpr[name])

    # Micro-average: pool every (sample, class) pair into one binary problem
    fpr['micro'], tpr['micro'], _ = roc_curve(y_true_bin.ravel(), y_score.ravel())
    roc_auc['micro'] = auc(fpr['micro'], tpr['micro'])

    # Macro-average: unweighted mean of the per-class AUCs
    roc_auc['macro'] = np.mean([roc_auc[c] for c in class_names])

    return {'fpr': fpr, 'tpr': tpr, 'roc_auc': roc_auc}

In [None]:
# Run the ROC computation for every experiment and store the resulting
# fpr / tpr / roc_auc entries directly on the experiment dict.
for exp in experiments:
    exp.update(compute_roc_metrics(exp['data'], class_names))

    # Headline numbers for this experiment
    print(f"\nAsset {exp['asset_rid']}:")
    print(f"  Accuracy: {exp['accuracy']:.2f}%")
    print(f"  Micro-AUC: {exp['roc_auc']['micro']:.4f}")
    print(f"  Macro-AUC: {exp['roc_auc']['macro']:.4f}")

In [None]:
# Tabulate the AUC scores: one row per experiment, one column per class plus
# the micro and macro averages.
if not experiments:
    print("No experiments loaded")
else:
    auc_data = [
        {
            'Experiment': exp.get('name', exp['asset_rid']),
            **{c: exp['roc_auc'][c] for c in class_names},
            'Micro': exp['roc_auc']['micro'],
            'Macro': exp['roc_auc']['macro'],
        }
        for exp in experiments
    ]

    auc_df = pd.DataFrame(auc_data).set_index('Experiment')
    print("\nPer-class AUC scores:")
    display(auc_df.round(4))

## Plot ROC Curves

In [None]:
def plot_roc_curves(exp: dict, class_names: list[str], show_per_class: bool = True):
    """Draw the ROC curves of a single experiment on a new figure.

    Args:
        exp: Experiment dict holding 'fpr', 'tpr', 'roc_auc', 'accuracy' and
            'asset_rid' (plus an optional 'name' used in the title).
        class_names: Class names whose curves are drawn when requested.
        show_per_class: When True, one curve per class is added next to the
            micro-average; when False only the micro-average is drawn.

    Returns:
        The matplotlib figure that was created.
    """
    fig, ax = plt.subplots(figsize=(10, 8))

    rates_x = exp['fpr']
    rates_y = exp['tpr']
    areas = exp['roc_auc']

    # The micro-average curve is drawn unconditionally
    ax.plot(
        rates_x['micro'],
        rates_y['micro'],
        label=f"Micro-avg (AUC={areas['micro']:.3f})",
        color='deeppink',
        linestyle=':',
        linewidth=3,
    )

    # One curve per class, each with its own tab10 colour
    if show_per_class:
        palette = plt.cm.tab10(np.linspace(0, 1, len(class_names)))
        for name, shade in zip(class_names, palette):
            ax.plot(rates_x[name], rates_y[name], color=shade,
                    label=f"{name} (AUC={areas[name]:.3f})")

    # Dashed diagonal as the chance-level reference
    ax.plot([0, 1], [0, 1], 'k--', alpha=0.5)

    # Fall back to the asset RID when the experiment has no name
    exp_name = exp.get('name', exp['asset_rid'])
    ax.set(
        xlim=[0, 1],
        ylim=[0, 1.05],
        xlabel='False Positive Rate',
        ylabel='True Positive Rate',
        title=f"ROC Curves: {exp_name} (Acc: {exp['accuracy']:.1f}%)",
    )

    ax.legend(loc='lower right', fontsize=9)
    ax.grid(True, alpha=0.3)

    return fig

In [None]:
# One figure per experiment; config.show_per_class decides whether the
# individual class curves are drawn next to the micro-average.
for exp in experiments:
    exp_name = exp.get('name', exp['asset_rid'])

    # Heading for the figure, with a link to the source execution when the
    # experiment dict carries one.
    if 'execution_url' in exp:
        header_html = f"<h3>{exp_name}</h3><p>Source: <a href='{exp['execution_url']}' target='_blank'>Execution {exp['source_execution']}</a></p>"
    else:
        header_html = f"<h3>{exp_name}</h3>"
    display(HTML(header_html))

    fig = plot_roc_curves(exp, class_names, show_per_class=config.show_per_class)
    plt.tight_layout()
    plt.show()

## Experiment Comparison

Compare micro-averaged ROC curves across all experiments. This visualization shows how different model configurations perform relative to each other:
- Curves closer to the top-left corner indicate better performance
- The diagonal dashed line represents random classification (AUC = 0.5)

In [None]:
# Overlay the micro-average ROC curve of every experiment on a single figure
if len(experiments) <= 1:
    print("Single experiment - no comparison plot")
else:
    fig, ax = plt.subplots(figsize=(10, 8))
    colors = plt.cm.Set1(np.linspace(0, 1, len(experiments)))

    for exp, curve_color in zip(experiments, colors):
        exp_name = exp.get('name', exp['asset_rid'])
        label = f"{exp_name} (AUC={exp['roc_auc']['micro']:.3f}, Acc={exp['accuracy']:.1f}%)"
        ax.plot(
            exp['fpr']['micro'],
            exp['tpr']['micro'],
            color=curve_color,
            linewidth=2,
            label=label,
        )

    # Chance-level reference line
    ax.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='Random')
    ax.set(
        xlim=[0, 1],
        ylim=[0, 1.05],
        xlabel='False Positive Rate',
        ylabel='True Positive Rate',
        title='ROC Curve Comparison (Micro-Average)',
    )
    ax.legend(loc='lower right')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

## Summary

In [None]:
# Text recap of the whole analysis: run context first, then one section per experiment
rule = "=" * 60
print(rule)
print("CIFAR-10 ROC Analysis Summary")
print(rule)
print(f"Catalog: {ml.host_name}:{ml.catalog_id}")
print(f"Execution: {execution.execution_rid}")
print(f"Ground truth: Execution {gt_execution} ({len(gt_lookup)} labels)")
print(f"Classes: {n_classes}")
print(f"Experiments analyzed: {len(experiments)}")

for exp in experiments:
    # Fall back to the asset RID when the experiment has no name
    exp_name = exp.get('name', exp['asset_rid'])
    print(f"\n  {exp_name}:")
    # The source execution line is printed only when that key is present
    if 'source_execution' in exp:
        print(f"    Source Execution: {exp['source_execution']}")
    print(f"    Samples: {exp['n_samples']}")
    print(f"    Accuracy: {exp['accuracy']:.2f}%")
    print(f"    Micro-AUC: {exp['roc_auc']['micro']:.4f}")
    print(f"    Macro-AUC: {exp['roc_auc']['macro']:.4f}")

print("\n" + rule)

In [None]:
# Complete execution and upload outputs
# This is the last step of the notebook: the upload call runs first, then the
# RID of this notebook's own execution is printed as confirmation.
execution.upload_execution_outputs()
print(f"\nExecution completed: {execution.execution_rid}")