# Coffea-Casa Processor-Based Workflow Test with Metrics

This notebook demonstrates the UnifiedProcessor workflow with coffea.processor.Runner on Coffea-Casa, including skimming, analysis, histogramming, statistics steps, and **comprehensive performance metrics collection**.

## Workflow Overview

1. Setup Python path for intccms package
2. Install dependencies and register modules with cloudpickle
3. Acquire Dask client from Coffea-Casa environment
4. Configure analysis parameters (including metrics)
5. Run metadata extraction
6. Initialize UnifiedProcessor
7. Run processor with coffea.processor.Runner
8. **Collect and display performance metrics**
9. Save histograms
10. Run statistical analysis (if enabled)

In [None]:
# Setup Python path to include intccms package
import sys
import time
from pathlib import Path

# Make <cwd>/src and <cwd> itself importable. Everything is resolved from the
# current working directory, so the notebook is expected to be started from
# the repository root.
repo_root = Path.cwd()
src_dir = repo_root / "src"
examples_dir = repo_root
for search_dir in (src_dir, examples_dir):
    # Prepend (rather than append) and skip entries that are already present
    # so re-running the cell does not keep growing sys.path.
    if str(search_dir) not in sys.path:
        sys.path.insert(0, str(search_dir))
print(f"‚úÖ Added {src_dir} to Python path")
print(f"‚úÖ Added {examples_dir} to Python path")

In [None]:
COFFEA_VERSION = "2025.10.3.dev17+g2cde65fb6" # 2025.10.2
COFFEA_PIP = "git+https://github.com/scikit-hep/coffea@master"
try:
    import omegaconf
except ImportError:
    print("‚ö†Ô∏è omegaconf not found, installing...")
    ! pip install omegaconf

try:
    import coffea
    print("Coffea version: ", coffea.__version__)
    # assert coffea.__version__ == "2025.10.3.dev9+g41c84f7a9"
except (ImportError, AssertionError):
    print("‚ö†Ô∏è coffea not found or incorrect version, installing...")
    ! pip install $COFFEA_PIP
print("‚úÖ All dependencies are installed.")

In [None]:
# Imports and cloudpickle registration
import copy
import os

# Overwrite both AWS credential variables with empty strings for this kernel
# process, before the Dask/coffea imports below. Credentials already present
# in the environment are replaced.
# NOTE(review): presumably this forces anonymous (unauthenticated) S3 access —
# confirm the intent.
os.environ['AWS_ACCESS_KEY_ID'] = ""
os.environ['AWS_SECRET_ACCESS_KEY'] = ""

from dask.distributed import Client, PipInstall
from coffea.processor import DaskExecutor
from coffea.nanoevents import NanoAODSchema

import cloudpickle
import intccms
import example_cms

# Ask cloudpickle to serialize functions and classes from these two local
# packages by value (code included) instead of by import reference.
# NOTE(review): presumably needed because the Dask workers do not have
# intccms / example_cms installed — confirm.
cloudpickle.register_pickle_by_value(intccms)
cloudpickle.register_pickle_by_value(example_cms)

from example_cms.configs.configuration import config as original_config
from intccms.schema import Config, load_config_with_restricted_cli
from intccms.utils.output import OutputDirectoryManager
from intccms.metadata_extractor import DatasetMetadataManager
from intccms.datasets import DatasetManager
from intccms.analysis import run_processor_workflow

## Acquire Dask Client

Coffea-Casa provides a shared scheduler. Connect to it and register dependencies.

In [None]:
def acquire_client(scheduler_address="tls://localhost:8786", packages=None):
    """Connect to a Dask scheduler and register pip dependencies on it.

    Args:
        scheduler_address: Address passed to ``dask.distributed.Client``.
            The default is the address this notebook uses on Coffea-Casa.
        packages: pip requirement strings installed through the
            ``PipInstall`` plugin. ``None`` (the default) means
            ``[COFFEA_PIP]``; an empty sequence skips plugin registration.

    Returns:
        A ``(client, cluster)`` tuple. ``cluster`` is always ``None`` because
        no local cluster is created in this mode; it is returned so callers
        can keep unpacking two values.
    """
    if packages is None:
        packages = [COFFEA_PIP]  # alternative pin: ["coffea==2025.10.2"]
    client = Client(scheduler_address)
    if packages:
        client.register_plugin(PipInstall(packages=list(packages)))
    return client, None

## Configuration Setup

Configure analysis parameters including which processes to run, output settings, and **metrics collection**.

In [None]:
# Build the run configuration on a private deep copy, so the imported
# configuration object is never modified.
config = copy.deepcopy(original_config)

# Per-dataset file cap.
# NOTE(review): None presumably means "no limit" — set an integer for quick tests.
config["datasets"]["max_files"] = None

# Overrides for the "general" section, applied one key at a time in the order
# listed here.
general_overrides = {
    # Use local output directory
    "output_dir": "example_cms/outputs/",
    # Configuration flags
    "read_from_cache": False,
    "run_metadata_generation": False,
    "run_processor": True,  # Set to False to skip processor and load saved histograms
    "save_skimmed_output": False,  # Set to True to save filtered events to disk
    "run_analysis": True,
    "run_histogramming": True,
    "run_systematics": True,
    "run_statistics": False,
    # ===== ENABLE METRICS COLLECTION =====
    "metrics": {
        "enable": True,                    # Master switch
        "track_workers": True,             # Enable scheduler-based tracking
        "save_measurements": True,         # Save to disk
    },
}
for general_key, general_value in general_overrides.items():
    config["general"][general_key] = general_value

# Uncomment to restrict the run to selected processes
#config["general"]["processes"] = ["data"]

# No command-line overrides in the notebook; the merged result is validated
# by constructing the Config schema object.
cli_args = []
full_config = load_config_with_restricted_cli(config, cli_args)
validated_config = Config(**full_config)

# Echo the effective settings read back from the validated configuration.
print(f"‚úÖ Configuration loaded with max_files={validated_config.datasets.max_files}")
print(f"   - run_processor: {validated_config.general.run_processor}")
print(f"   - save_skimmed_output: {validated_config.general.save_skimmed_output}")
print(f"   - run_analysis: {validated_config.general.run_analysis}")
print(f"   - run_histogramming: {validated_config.general.run_histogramming}")
print(f"   - run_systematics: {validated_config.general.run_systematics}")
print(f"   - run_statistics: {validated_config.general.run_statistics}")
print(f"   - metrics.enable: {validated_config.general.metrics.enable}")
print(f"   - metrics.track_workers: {validated_config.general.metrics.track_workers}")

## Run Complete Workflow

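Execute the full processor workflow inside a try/finally block. The Dask client is deliberately left open afterwards because the metrics cells below still need it; it is closed in the final Cleanup cell.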

In [None]:
# Set to True only when the try body runs to its end, so the finally block can
# report the real outcome instead of always announcing success.
workflow_succeeded = False
try:
    # Connect first: both executors below submit their work through this client.
    client, cluster = acquire_client()
    print(f"‚úÖ Connected to Dask scheduler")
    print(f"üìä Dashboard: {client.dashboard_link}")

    # Output Manager Setup
    output_manager = OutputDirectoryManager(
        root_output_dir=validated_config.general.output_dir,
        cache_dir=validated_config.general.cache_dir,
        metadata_dir=validated_config.general.metadata_dir,
        skimmed_dir=validated_config.general.skimmed_dir
    )
    print(f"‚úÖ Output directory: {output_manager.root_output_dir}")

    # Step 1: Metadata Extraction
    print("\nüìã Extracting metadata...")
    dataset_manager = DatasetManager(validated_config.datasets)
    metadata_generator = DatasetMetadataManager(
        dataset_manager=dataset_manager,
        output_manager=output_manager,
        executor=DaskExecutor(client=client),
        config=validated_config,
    )
    metadata_generator.run(
        generate_metadata=validated_config.general.run_metadata_generation,
        # No filter (None) when the configuration has no "processes" attribute.
        processes_filter=getattr(validated_config.general, "processes", None),
    )

    metadata_lookup = metadata_generator.build_metadata_lookup()
    workitems = metadata_generator.workitems

    print(f"‚úÖ Generated {len(workitems)} workitems")

    # Show first few workitems
    print("\nüîç Workitem Details (first 5):")
    for i, wi in enumerate(workitems[:5]):
        print(f"  {i}: dataset='{wi.dataset}' process='{wi.usermeta.get('process', 'N/A')}'")
    if len(workitems) > 5:
        print(f"  ... and {len(workitems) - 5} more")

    # Step 2: Run Processor Workflow (or load saved histograms)
    # t0/t1 bracket the call; the manual-verification cell uses t1 - t0 as the
    # wall time.
    print("\nüöÄ Running processor workflow...")
    t0 = time.perf_counter()
    output, report, metrics = run_processor_workflow(
        config=validated_config,
        output_manager=output_manager,
        metadata_lookup=metadata_lookup,
        workitems=workitems[:],  # pass a shallow copy of the list
        executor=DaskExecutor(client=client, treereduction=6, retries=0),
        schema=NanoAODSchema,
    )
    t1 = time.perf_counter()
    print("‚úÖ Processor workflow complete!")

    # Step 3: Display Results
    print("\n" + "=" * 60)
    print("üìä Results:")
    print("=" * 60)

    # Guard against a missing output object, matching the checks used for the
    # histogram and statistics steps below.
    if validated_config.general.run_processor and output is not None:
        print(f"üìä Total events processed: {output.get('processed_events', 0):,}")
        if 'skimmed_events' in output:
            print(f"‚úÇÔ∏è  Events after skim: {output.get('skimmed_events', 0):,}")

    # Histograms are auto-saved by processor
    if output and "histograms" in output:
        num_histograms = sum(len(hists) for hists in output["histograms"].values())
        print(f"üìà Total histograms: {num_histograms}")
        print(f"üìà Channels: {list(output['histograms'].keys())}")
        print(f"‚úÖ Histograms auto-saved to: {output_manager.histograms_dir}")
        print(f"   - processor_histograms.pkl (for loading with run_processor=False)")
        print(f"   - histograms.root (for downstream tools)")
    else:
        print("\n‚ö†Ô∏è  No histograms produced (run_histogramming may be disabled)")

    # Step 4: Run Statistical Analysis
    if validated_config.general.run_statistics and output and "histograms" in output:
        print("\nüìä Running statistical analysis...")

        # Create analysis instance for statistics
        from intccms.analysis.nondiff import NonDiffAnalysis

        analysis = NonDiffAnalysis(validated_config, output_manager)
        # Set histograms from processor output
        analysis.nD_hists_per_region = output["histograms"]

        # Resolve the cabinetry config path. A missing "statistics" section, a
        # missing attribute and an explicit None are all treated as "not
        # specified" rather than being passed to Path().
        statistics_config = getattr(validated_config, "statistics", None)
        cabinetry_config_path = getattr(statistics_config, "cabinetry_config", None)

        if cabinetry_config_path is None:
            print(f"‚ö†Ô∏è  No cabinetry_config specified in configuration")
            print(f"   Skipping statistics step")
        elif Path(cabinetry_config_path).exists():
            print(f"‚úÖ Using cabinetry config: {cabinetry_config_path}")
            analysis.run_statistics(cabinetry_config_path)
            print(f"‚úÖ Statistical analysis complete!")
            print(f"üìä Plots saved to: {output_manager.statistics_dir}")
        else:
            print(f"‚ö†Ô∏è  Cabinetry config not found: {cabinetry_config_path}")
            print(f"   Skipping statistics step")
    else:
        print("\n‚ö†Ô∏è  Statistics step skipped (disabled or no histograms)")

    # Summary
    print("\n" + "=" * 60)
    print("‚úÖ Complete processor workflow finished!")
    print("=" * 60)
    workflow_succeeded = True

finally:
    # Cleanup
    print("\nüßπ Cleaning up...")
    # Note: Don't close client yet - we need it for metrics display
    if workflow_succeeded:
        print("‚úÖ Workflow complete!")
    else:
        # The exception itself still propagates once this block finishes.
        print("Workflow did not complete - see the error reported for this cell.")

## Performance Metrics

Display comprehensive performance metrics collected during processing.

In [None]:
# Display the report object returned by run_processor_workflow in the workflow
# cell; as the cell's last expression, the notebook renders it as the output.
report

In [None]:
# Import Rich for beautiful table display
from rich.console import Console

# Rich console shared with the metrics cell below, which prints its tables
# through it.
console = Console()

# Section banner: blank line, 60-character rule, title, rule.
banner_rule = "=" * 60
print("\n" + banner_rule)
print("üìä Processing Metrics")
print(banner_rule)

In [None]:
# Render the collected metrics as four tables, or explain why there are none.
if not metrics:
    print("‚ö†Ô∏è  Metrics collection was disabled (set config.general.metrics.enable=True)")
else:
    from intccms.metrics import (
        format_throughput_table,
        format_event_processing_table,
        format_resources_table,
        format_timing_table,
    )

    # (heading, table builder) pairs, shown in this order.
    metric_sections = (
        ("\nüìà Throughput Metrics", format_throughput_table),
        ("\n‚ö° Event Processing Metrics", format_event_processing_table),
        ("\nüñ•Ô∏è  Resource Utilization", format_resources_table),
        ("\n‚è±Ô∏è  Timing Breakdown", format_timing_table),
    )
    for section_heading, build_table in metric_sections:
        print(section_heading)
        console.print(build_table(metrics))

## Manual Calculations for Verification

Compare automated metrics to manual calculations from the coffea report.

In [None]:
# Recompute headline rates by hand from the coffea report, to cross-check the
# automated metrics tables. Sizes use decimal units (1000**3 bytes per GB) and
# data rates are in gigabits per second (bytes * 8).
verification_rule = "=" * 60
print("\n" + verification_rule)
print("üîç Manual Calculations (for verification)")
print(verification_rule)

if not report:
    print("No report available")
else:
    print(f"data read: {report['bytesread'] / 1000**3:.2f} GB in {report['chunks']} chunks")
    print()
    # Rates normalised by the report's 'processtime' value.
    print(f"core-average event rate using 'processtime': {report['entries'] / 1000 / report['processtime']:.2f} kHz")
    print(f"core-average data rate using 'processtime': {report['bytesread'] / 1000**3 * 8 / report['processtime']:.2f} Gbps")
    print()
    # Rates normalised by the wall time measured around run_processor_workflow
    # (t0 and t1 are set in the workflow cell).
    print(f"average event rate using walltime: {report['entries'] / 1000 / (t1 - t0):.2f} kHz")
    print(f"average data rate using walltime: {report['bytesread'] / 1000**3 * 8 / (t1 - t0):.2f} Gbps")
    print()
    print(f"Number of branches read: {len(report['columns'])}")

    print("\n‚úÖ Compare manual calculations to metrics tables above!")
    print("   - Wall-clock rates should match 'Event Rate (Wall Clock)' and 'Data Rate'")
    print("   - Processtime rates should match 'Event Rate (Aggregated)'")

## Dask Performance Report

Link to the detailed Dask performance report HTML file.

In [None]:
# Point the user at the Dask performance report, which is expected under
# <benchmarks_dir>/latest/ when metrics were collected with worker tracking on.
worker_tracking_active = metrics and validated_config.general.metrics.track_workers
if not worker_tracking_active:
    print("\n‚ö†Ô∏è  Performance report not generated (metrics.track_workers=False)")
else:
    perf_report_path = output_manager.benchmarks_dir / "latest" / "dask_performance.html"
    print(f"\nüìä Dask Performance Report: {perf_report_path}")
    print("   Download this file and open in a browser for detailed task timeline visualization")
    print("   Includes task execution timeline, worker utilization, and communication patterns")

## Performance Visualizations

Generate plots to visualize worker scaling, memory/CPU utilization, and throughput over time.

In [None]:
# Generate visualizations if worker tracking was enabled
if metrics and validated_config.general.metrics.track_workers:
    # The plot_* helpers that are not called here are used by the optional
    # individual-plots cell that follows.
    from intccms.metrics import (
        load_worker_timeline,
        plot_summary_dashboard,
        plot_worker_count_timeline,
        plot_memory_utilization_timeline,
        plot_cpu_utilization_timeline,
        plot_scaling_efficiency,
    )

    # Find the latest benchmark directory. Only directories are considered: a
    # stray file in benchmarks_dir must not be picked as the measurement
    # directory, because it is used as a directory below.
    # NOTE(review): the ordering is by name, descending; this assumes directory
    # names sort chronologically. An entry named "latest" (the performance
    # report cell reads from benchmarks_dir / "latest") would sort ahead of
    # digit-prefixed names and be selected here — confirm that is intended.
    benchmarks_dir = output_manager.benchmarks_dir
    latest_dirs = sorted(
        (entry for entry in benchmarks_dir.glob("*") if entry.is_dir()),
        key=lambda entry: entry.name,
        reverse=True,
    )

    if latest_dirs:
        measurement_path = latest_dirs[0]  # First entry in descending name order
        print(f"üìä Loading worker tracking data from: {measurement_path}")

        # Best effort: a plotting failure is reported but does not abort the
        # notebook.
        try:
            # Load tracking data
            tracking_data = load_worker_timeline(measurement_path)

            print(f"   - Tracking duration: {len(tracking_data['worker_counts'])} samples")
            print(f"   - Workers tracked: {len(tracking_data['worker_memory'])}")

            # Generate summary dashboard
            print("\nüìä Generating summary dashboard...")
            fig = plot_summary_dashboard(
                tracking_data,
                metrics,
                output_path=measurement_path / "summary_dashboard.png"
            )

            print(f"‚úÖ Dashboard saved to: {measurement_path / 'summary_dashboard.png'}")

        except Exception as e:
            print(f"‚ö†Ô∏è  Failed to generate visualizations: {e}")
    else:
        print("‚ö†Ô∏è  No benchmark directories found")
else:
    print("‚ö†Ô∏è  Worker tracking was disabled - no visualizations to generate")

In [None]:
# Optional: Generate individual plots for detailed analysis
# Uncomment the plots you want to generate
#
# This cell relies on names created by the visualization cell above:
# latest_dirs, load_worker_timeline and the plot_* helpers. The condition
# short-circuits, so latest_dirs is only evaluated when metrics and worker
# tracking are both enabled.

if metrics and validated_config.general.metrics.track_workers and latest_dirs:
    # Same directory the summary dashboard cell used (first entry of latest_dirs).
    measurement_path = latest_dirs[0]

    # Any failure is printed rather than raised.
    try:
        tracking_data = load_worker_timeline(measurement_path)

        # Worker count timeline
        # fig, ax = plot_worker_count_timeline(
        #     tracking_data, 
        #     output_path=measurement_path / "worker_count.png"
        # )

        # Memory utilization timeline
        # fig, ax = plot_memory_utilization_timeline(
        #     tracking_data,
        #     output_path=measurement_path / "memory_utilization.png"
        # )

        # CPU utilization timeline
        # fig, ax = plot_cpu_utilization_timeline(
        #     tracking_data,
        #     output_path=measurement_path / "cpu_utilization.png"
        # )

        # Scaling efficiency
        # fig, ax = plot_scaling_efficiency(
        #     tracking_data,
        #     metrics,
        #     output_path=measurement_path / "scaling_efficiency.png"
        # )

        print("üí° Tip: Uncomment the plots above to generate individual visualizations")

    except Exception as e:
        print(f"‚ö†Ô∏è  Error: {e}")

## Cleanup

Close the Dask client connection.

In [None]:
# Close the Dask client that the workflow cell left open for the metrics
# cells. Requires that the workflow cell got far enough to create `client`.
client.close()
print("‚úÖ Dask client closed")