# Coffea-Casa Processor-Based Workflow Test

This notebook demonstrates the UnifiedProcessor workflow with coffea.processor.Runner on Coffea-Casa, including skimming, analysis, histogramming, and statistics steps.

## Workflow Overview

1. Setup Python path for intccms package
2. Install missing dependencies and register the local modules for pickle-by-value with cloudpickle
3. Acquire Dask client from Coffea-Casa environment
4. Configure analysis parameters
5. Run metadata extraction
6. Initialize UnifiedProcessor
7. Run processor with coffea.processor.Runner
8. Save histograms
9. Run statistical analysis (if enabled)

In [1]:
# Setup Python path to include intccms package
import sys
import time
from pathlib import Path

# Make the repository root and its src/ directory importable.
# Each entry is pushed to the front of sys.path only if it is not there yet.
repo_root = Path.cwd()
src_dir = repo_root / "src"
examples_dir = repo_root
for import_root in (src_dir, examples_dir):
    if str(import_root) not in sys.path:
        sys.path.insert(0, str(import_root))
    print(f"‚úÖ Added {import_root} to Python path")

‚úÖ Added /home/cms-jovyan/integration-challenge/cms/src to Python path
‚úÖ Added /home/cms-jovyan/integration-challenge/cms to Python path


In [2]:
# Packages installed on the Dask workers via PipInstall(packages=DEPS).
# Keep the version bare: COFFEA_PIP supplies the "==" operator itself, so a
# leading "=" here would produce "coffea===<version>" (pip's arbitrary-equality
# operator) instead of the intended exact-version pin.
COFFEA_VERSION = "2025.11.0"
COFFEA_PIP = f"coffea=={COFFEA_VERSION}"
DEPS = [COFFEA_PIP]
try:
    import omegaconf
except ImportError:
    print("‚ö†Ô∏è omegaconf not found, installing...")
    ! pip install omegaconf

In [3]:
# Imports and cloudpickle registration
import copy
import os

# Set both AWS credential variables to the empty string for this process.
# NOTE(review): presumably this prevents inherited AWS keys from being picked up
# by storage clients — confirm the intent.
os.environ.update(AWS_ACCESS_KEY_ID="", AWS_SECRET_ACCESS_KEY="")

from dask.distributed import Client, PipInstall
from coffea.processor import DaskExecutor, IterativeExecutor
from coffea.nanoevents import NanoAODSchema

import cloudpickle
import intccms
import example_cms

# Have cloudpickle serialize the two local packages by value rather than by reference.
for local_package in (intccms, example_cms):
    cloudpickle.register_pickle_by_value(local_package)

from example_cms.configs.configuration import config as original_config
from intccms.schema import Config, load_config_with_restricted_cli
from intccms.utils.output import OutputDirectoryManager
from intccms.metadata_extractor import DatasetMetadataManager
from intccms.datasets import DatasetManager
from intccms.analysis import run_processor_workflow

## Acquire Dask Client

Coffea-Casa provides a shared scheduler. Connect to it and register dependencies.

In [9]:
from dask.distributed import WorkerPlugin
from contextlib import contextmanager

class RedirectStderrToStdout(WorkerPlugin):
    """Dask worker plugin that rebinds ``sys.stderr`` to ``sys.stdout``."""

    def setup(self, worker):
        """Point ``sys.stderr`` at the current ``sys.stdout``; ``worker`` is not used."""
        # crude but effective: route stderr to stdout
        # (only the Python-level sys.stderr object is replaced here)
        sys.stderr = sys.stdout

@contextmanager
def acquire_client(af="gateway", num_workers=350):
    """Yield a ``(client, cluster)`` pair for a Coffea-Casa Dask scheduler.

    Parameters
    ----------
    af : str
        ``"condor"`` connects directly to ``tls://localhost:8786``;
        ``"gateway"`` attaches to the first cluster listed by Dask Gateway.
    num_workers : int
        Gateway only: number of workers to scale to and wait for.

    Yields
    ------
    tuple
        ``(client, cluster)``; ``cluster`` is ``None`` for ``"condor"``.

    Raises
    ------
    ValueError
        If ``af`` is neither ``"condor"`` nor ``"gateway"``.
    RuntimeError
        If Dask Gateway reports no cluster to connect to.

    Notes
    -----
    The client is closed when the ``with`` block exits, including on error.
    The cluster object is not shut down or scaled back here.
    """
    # Fail early with a clear message instead of dereferencing a None client below.
    if af not in ("condor", "gateway"):
        raise ValueError(
            f"Unknown analysis facility {af!r}; expected 'condor' or 'gateway'"
        )

    client = None
    cluster = None
    try:
        if af == "condor":
            client = Client("tls://localhost:8786")
            client.register_plugin(PipInstall(packages=DEPS))

        else:  # af == "gateway"
            def set_env(dask_worker):
                # Executed on each worker: expose the uploaded token through
                # BEARER_TOKEN_FILE and restrict its permissions to the owner.
                # NOTE(review): assumes upload_file places 'access_token' in the
                # worker's local_directory — confirm.
                config_path = str(Path(dask_worker.local_directory) / 'access_token')
                os.environ["BEARER_TOKEN_FILE"] = config_path
                os.chmod(config_path, 0o600)
                os.chmod("/etc/grid-security/certificates", 0o755)

            # Imported lazily: only the gateway branch needs dask_gateway.
            from dask_gateway import Gateway
            gateway = Gateway()
            clusters = gateway.list_clusters()
            if not clusters:
                raise RuntimeError("No running Dask Gateway cluster found to connect to")
            cluster = gateway.connect(clusters[0].name)
            client = cluster.get_client()
            cluster.scale(num_workers)
            client.wait_for_workers(num_workers)
            client.upload_file("/etc/cmsaf-secrets-chown/access_token")
            client.register_worker_callbacks(setup=set_env)
            client.register_plugin(PipInstall(packages=DEPS))
            # Registered after the workers have been waited for.
            client.register_plugin(RedirectStderrToStdout(), name="redirect-stderr")

        print(f"‚úÖ Connected to Dask scheduler")
        print(f"üìä Dashboard: {client.dashboard_link}")

        yield client, cluster
    finally:
        if client is not None:
            client.close()

## Configuration Setup

Configure analysis parameters including which processes to run and output settings.

In [10]:
# Work on a deep copy so the imported configuration object is never modified.
config = copy.deepcopy(original_config)

# NOTE(review): presumably None means "no per-dataset file limit"; set an
# integer here for a quick test run — confirm against the datasets schema.
config["datasets"]["max_files"] = None

# Overrides for the "general" section, applied in the order listed.
general_overrides = {
    # Output location, relative to the working directory
    "output_dir": "example_cms/outputs/",
    # Workflow switches
    "read_from_cache": False,
    "run_metadata_generation": True,
    "run_processor": True,          # False: skip the processor and load saved histograms
    "save_skimmed_output": False,   # True: save filtered events to disk
    "run_analysis": True,
    "run_histogramming": True,
    "run_systematics": True,
    "run_statistics": False,
}
for option_name, option_value in general_overrides.items():
    config["general"][option_name] = option_value

# To restrict the run to selected processes, set for example:
#   config["general"]["processes"] = ["data"]

# No command-line overrides inside the notebook; validate the merged result.
cli_args = []
full_config = load_config_with_restricted_cli(config, cli_args)
validated_config = Config(**full_config)

## Run Complete Workflow

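Execute the full processor workflow. Each distributed step runs inside the `acquire_client` context manager, which closes the Dask client in a `finally` block even if the step fails.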

In [11]:
# Create the output directory manager from the paths in the validated "general" section.
general_settings = validated_config.general
output_manager = OutputDirectoryManager(
    root_output_dir=general_settings.output_dir,
    cache_dir=general_settings.cache_dir,
    metadata_dir=general_settings.metadata_dir,
    skimmed_dir=general_settings.skimmed_dir,
)

In [12]:
# Build the dataset manager from the validated `datasets` configuration section.
# It is passed to DatasetMetadataManager in the metadata-extraction cell.
# TODO: update method for redirector?
dataset_manager = DatasetManager(validated_config.datasets)

In [13]:
# Run metadata extraction on the gateway cluster; the client is closed when the block exits.
with acquire_client(af="gateway") as (client, cluster):
    metadata_generator = DatasetMetadataManager(
        dataset_manager=dataset_manager,
        output_manager=output_manager,
        executor=DaskExecutor(client=client),
        config=validated_config,
    )
    # Optional process filter: None when the config has no `processes` attribute.
    selected_processes = getattr(validated_config.general, 'processes', None)
    metadata_generator.run(
        generate_metadata=validated_config.general.run_metadata_generation,
        processes_filter=selected_processes,
    )

‚úÖ Connected to Dask scheduler
üìä Dashboard: /services/dask-gateway/clusters/cmsaf-dev.70b76ebddab44bd8890f0ab9631b10ee/status


Output()

In [14]:
# Derive the metadata lookup table and fetch the work items produced by the extraction step.
metadata_lookup = metadata_generator.build_metadata_lookup()
workitems = metadata_generator.workitems
n_workitems = len(workitems)
print(f"‚úÖ Generated {n_workitems} workitems")

‚úÖ Generated 39603 workitems


In [15]:
# Execute the processor workflow on the gateway cluster and time the call.
# t0/t1 stay at notebook scope: the throughput cell below reads them.
with acquire_client(af="gateway") as (client, cluster):
    print("\nüöÄ Running processor workflow...")
    t0 = time.perf_counter()
    dask_executor = DaskExecutor(client=client, treereduction=8, retries=0)
    output, report, metrics = run_processor_workflow(
        config=validated_config,
        output_manager=output_manager,
        metadata_lookup=metadata_lookup,
        workitems=workitems,
        executor=dask_executor,
        schema=NanoAODSchema,
    )
    t1 = time.perf_counter()
    print(f"‚úÖ Processor workflow complete in {t1-t0} seconds!")

# Short event-count summary (only meaningful when the processor actually ran)
if validated_config.general.run_processor:
    n_processed = output.get('processed_events', 0)
    print(f"üìä Total events processed: {n_processed:,}")
    if 'skimmed_events' in output:
        n_skimmed = output['skimmed_events']
        print(f"‚úÇÔ∏è  Events after skim: {n_skimmed:,}")

‚úÖ Connected to Dask scheduler
üìä Dashboard: /services/dask-gateway/clusters/cmsaf-dev.70b76ebddab44bd8890f0ab9631b10ee/status

üöÄ Running processor workflow...


Output()

‚úÖ Processor workflow complete in 256.25230254698545 seconds!
üìä Total events processed: 6,827,277,103
‚úÇÔ∏è  Events after skim: 297,458,382


In [None]:
# Throughput summary from the `report` returned by run_processor_workflow.
# Rates are given both per accumulated 'processtime' and per wall-clock second
# (t1 - t0 from the processor cell).
# Subscripts use single quotes so these f-strings also parse on Python < 3.12,
# where reusing the enclosing quote inside a replacement field is a SyntaxError.
print(f"data read: {report['bytesread'] / 1000**3:.2f} GB in {report['chunks']} chunks")

print(f"core-average event rate using 'processtime': {report['entries'] / 1000 / report['processtime']:.2f} kHz")
print(f"core-average data rate using 'processtime': {report['bytesread'] / 1000**3 * 8 / report['processtime']:.2f} Gbps")

print(f"average event rate using walltime: {report['entries'] / 1000 / (t1 - t0):.2f} kHz")
print(f"average data rate using walltime: {report['bytesread'] / 1000**3 * 8 / (t1 - t0):.2f} Gbps")

print(f"Number of branches read: {len(report['columns'])}")

In [None]:
# Rich console used to render the metric tables in the next cell
from rich.console import Console

console = Console()

# Section banner: blank line, 60-character rule, title, rule
banner_rule = "=" * 60
print("\n" + banner_rule)
print("üìä Processing Metrics")
print(banner_rule)

In [None]:
# Render one table per metric family when metrics were collected.
if not metrics:
    print("‚ö†Ô∏è  Metrics collection was disabled (set config.general.metrics.enable=True)")
else:
    from intccms.metrics import (
        format_throughput_table,
        format_event_processing_table,
        format_resources_table,
        format_timing_table,
    )

    # (heading, table builder) pairs, shown in this order
    metric_sections = (
        ("\nüìà Throughput Metrics", format_throughput_table),
        ("\n‚ö° Event Processing Metrics", format_event_processing_table),
        ("\nüñ•Ô∏è  Resource Utilization", format_resources_table),
        ("\n‚è±Ô∏è  Timing Breakdown", format_timing_table),
    )
    for heading, build_table in metric_sections:
        print(heading)
        console.print(build_table(metrics))

In [None]:
# Statistical analysis: runs only when enabled in the config and the processor
# output contains histograms.
stats_requested = validated_config.general.run_statistics and output and "histograms" in output
if not stats_requested:
    print("\n‚ö†Ô∏è  Statistics step skipped (disabled or no histograms)")
else:
    print("\nüìä Running statistical analysis...")

    # Analysis object that drives the statistics step
    from intccms.analysis.nondiff import NonDiffAnalysis

    analysis = NonDiffAnalysis(validated_config, output_manager)
    # Hand the processor histograms to the analysis object
    analysis.nD_hists_per_region = output["histograms"]

    # A cabinetry config path must be declared in the configuration ...
    has_cabinetry_config = hasattr(validated_config, 'statistics') and hasattr(
        validated_config.statistics, 'cabinetry_config'
    )
    if not has_cabinetry_config:
        print("‚ö†Ô∏è  No cabinetry_config specified in configuration")
        print("   Skipping statistics step")
    else:
        cabinetry_config_path = validated_config.statistics.cabinetry_config

        # ... and the file it points to must exist on disk
        if not Path(cabinetry_config_path).exists():
            print(f"‚ö†Ô∏è  Cabinetry config not found: {cabinetry_config_path}")
            print("   Skipping statistics step")
        else:
            print(f"‚úÖ Using cabinetry config: {cabinetry_config_path}")
            analysis.run_statistics(cabinetry_config_path)
            print("‚úÖ Statistical analysis complete!")
            print(f"üìä Plots saved to: {output_manager.statistics_dir}")
