# Coffea-Casa Processor-Based Workflow Test

This notebook demonstrates the UnifiedProcessor workflow with coffea.processor.Runner on Coffea-Casa, including skimming, analysis, histogramming, and statistics steps.

## Workflow Overview

1. Setup Python path for intccms package
2. Install dependencies and register the local modules with cloudpickle (pickle by value)
3. Acquire Dask client from Coffea-Casa environment
4. Configure analysis parameters
5. Run metadata extraction
6. Initialize UnifiedProcessor
7. Run processor with coffea.processor.Runner
8. Save histograms (written automatically by the processor workflow)
9. Run statistical analysis (if enabled)

In [None]:
# Setup Python path to include intccms package
import sys
import time
from pathlib import Path

# The repository root is taken to be the current working directory, so the
# notebook must be started from the top of the checkout for `src/` to resolve.
repo_root = Path.cwd()
src_dir = repo_root / "src"
if str(src_dir) not in sys.path:
    # Prepend so the checkout is searched before any other location on the path.
    sys.path.insert(0, str(src_dir))

if src_dir.is_dir():
    print(f"✅ Added {src_dir} to Python path")
else:
    # Surface a wrong working directory here instead of as an ImportError later.
    print(f"⚠️ {src_dir} does not exist; start the notebook from the repository root")

In [4]:
COFFEA_VERSION = "2025.10.3.dev17+g2cde65fb6" # 2025.10.2
COFFEA_PIP = "git+https://github.com/scikit-hep/coffea@ikrommyd/choose-errors-in-skipbadfiles"
try:
    import omegaconf
except ImportError:
    print("‚ö†Ô∏è omegaconf not found, installing...")
    ! pip install omegaconf

try:
    import coffea
    assert coffea.__version__ == "2025.10.3.dev17+g2cde65fb6"
except (ImportError, AssertionError):
    print("‚ö†Ô∏è coffea not found or incorrect version, installing...")
    ! pip install $COFFEA_PIP
print("‚úÖ All dependencies are installed.")

⚠️ coffea not found or incorrect version, installing...
Collecting git+https://github.com/scikit-hep/coffea@ikrommyd/choose-errors-in-skipbadfiles
  Cloning https://github.com/scikit-hep/coffea (to revision ikrommyd/choose-errors-in-skipbadfiles) to /private/var/folders/0v/cvfdg7wn2k59f7d0pp1wdsy80000gn/T/pip-req-build-fgic9q1z
  Running command git clone --filter=blob:none --quiet https://github.com/scikit-hep/coffea /private/var/folders/0v/cvfdg7wn2k59f7d0pp1wdsy80000gn/T/pip-req-build-fgic9q1z
  Running command git checkout -b ikrommyd/choose-errors-in-skipbadfiles --track origin/ikrommyd/choose-errors-in-skipbadfiles
  Switched to a new branch 'ikrommyd/choose-errors-in-skipbadfiles'
  branch 'ikrommyd/choose-errors-in-skipbadfiles' set up to track 'origin/ikrommyd/choose-errors-in-skipbadfiles'.
  Resolved https://github.com/scikit-hep/coffea to commit 2cde65fb6a7584c83bfbfa9c5f9acae42740d916
  Installing build dependencies ... done
  Getting requirements to build

In [None]:
# Imports and by-value cloudpickle registration
import copy
import os

# Blank both AWS credential variables before the libraries below are imported.
# NOTE(review): presumably this forces unauthenticated storage access - confirm.
os.environ.update({"AWS_ACCESS_KEY_ID": "", "AWS_SECRET_ACCESS_KEY": ""})

from dask.distributed import Client, PipInstall
from coffea.processor import DaskExecutor
from coffea.nanoevents import NanoAODSchema

import cloudpickle
import intccms
import example_cms

# Register both local packages so cloudpickle serialises them by value.
for _local_package in (intccms, example_cms):
    cloudpickle.register_pickle_by_value(_local_package)

from example_cms.configs.configuration import config as original_config
from intccms.utils.schema import Config, load_config_with_restricted_cli
from intccms.utils.output_manager import OutputDirectoryManager
from intccms.metadata_extractor import DatasetMetadataManager
from intccms.utils.datasets import DatasetManager
from intccms.analysis import run_processor_workflow

## Acquire Dask Client

Coffea-Casa provides a shared scheduler. Connect to it and register dependencies.

In [None]:
def acquire_client(scheduler_address="tls://localhost:8786", dependencies=None):
    """Connect to a Dask scheduler and register pip dependencies as a plugin.

    Args:
        scheduler_address: Address handed to ``Client``. Defaults to
            ``"tls://localhost:8786"``, the value this notebook has always used.
        dependencies: Iterable of pip requirement strings passed to
            ``PipInstall``. Defaults to ``[COFFEA_PIP]``, the coffea build
            installed by this notebook.

    Returns:
        A ``(client, cluster)`` tuple. ``cluster`` is always ``None`` because
        this function only connects to an existing scheduler and never creates
        a local cluster.
    """
    if dependencies is None:
        # Resolved at call time so a later change to COFFEA_PIP is picked up.
        dependencies = [COFFEA_PIP]
    client = Client(scheduler_address)
    client.register_plugin(PipInstall(packages=list(dependencies)))
    cluster = None  # no local cluster in this mode
    return client, cluster

## Configuration Setup

Configure analysis parameters including which processes to run and output settings.

In [None]:
# Configuration setup: work on a deep copy so the imported config object is not modified.
config = copy.deepcopy(original_config)

# max_files is left as None here; set an integer to limit files for testing.
# NOTE(review): None presumably means "no limit" - confirm against the schema.
config["datasets"]["max_files"] = None

# Use local output directory
config["general"]["output_dir"] = "example_cms/outputs/"

# Configuration flags
config["general"]["read_from_cache"] = False
config["general"]["run_metadata_generation"] = False
config["general"]["run_processor"] = True  # Set to False to skip processor and load saved histograms
config["general"]["save_skimmed_output"] = False  # Set to True to save filtered events to disk
config["general"]["run_analysis"] = True
config["general"]["run_histogramming"] = True
config["general"]["run_systematics"] = True
config["general"]["run_statistics"] = True

# Uncomment to restrict the run to selected processes (here: only "data").
#config["general"]["processes"] = ["data"]

# No command-line overrides are supplied when running from the notebook.
cli_args = []
full_config = load_config_with_restricted_cli(config, cli_args)
validated_config = Config(**full_config)

print(f"✅ Configuration loaded with max_files={validated_config.datasets.max_files}")
# Echo the workflow switches as seen on the validated config object.
for flag_name in (
    "run_processor",
    "save_skimmed_output",
    "run_analysis",
    "run_histogramming",
    "run_systematics",
    "run_statistics",
):
    print(f"   - {flag_name}: {getattr(validated_config.general, flag_name)}")

## Run Complete Workflow

Execute the full processor workflow with proper cleanup in a try/finally block.

In [None]:
# Run the whole workflow inside try/finally so the Dask client is closed even on
# failure. `client` starts as None so that the cleanup step can tell whether a
# connection was ever opened (acquire_client() itself may raise).
client = None
try:
    client, cluster = acquire_client()
    print("✅ Connected to Dask scheduler")
    print(f"📊 Dashboard: {client.dashboard_link}")

    # Output Manager Setup
    output_manager = OutputDirectoryManager(
        root_output_dir=validated_config.general.output_dir,
        cache_dir=validated_config.general.cache_dir,
        metadata_dir=validated_config.general.metadata_dir,
        skimmed_dir=validated_config.general.skimmed_dir,
    )
    print(f"✅ Output directory: {output_manager.root_output_dir}")

    # Step 1: Metadata Extraction
    print("\n📋 Extracting metadata...")
    # DatasetManager is the class this notebook imports from
    # intccms.utils.datasets; the name used here before was never imported.
    dataset_manager = DatasetManager(validated_config.datasets)
    metadata_generator = DatasetMetadataManager(
        dataset_manager=dataset_manager,
        output_manager=output_manager,
        executor=DaskExecutor(client=client),
    )
    metadata_generator.run(
        generate_metadata=validated_config.general.run_metadata_generation,
        # None when the config carries no `processes` attribute.
        processes_filter=getattr(validated_config.general, "processes", None),
    )

    metadata_lookup = metadata_generator.build_metadata_lookup()
    workitems = metadata_generator.workitems

    print(f"✅ Generated {len(workitems)} workitems")

    # Show first few workitems
    print("\n🔍 Workitem Details (first 5):")
    for i, wi in enumerate(workitems[:5]):
        print(f"  {i}: dataset='{wi.dataset}' process='{wi.usermeta.get('process', 'N/A')}'")
    if len(workitems) > 5:
        print(f"  ... and {len(workitems) - 5} more")

    # Step 2: Run Processor Workflow (or load saved histograms)
    print("\n🚀 Running processor workflow...")
    t0 = time.perf_counter()
    output, report = run_processor_workflow(
        config=validated_config,
        output_manager=output_manager,
        metadata_lookup=metadata_lookup,
        workitems=workitems,
        executor=DaskExecutor(client=client),
        schema=NanoAODSchema,
    )
    t1 = time.perf_counter()
    print("✅ Processor workflow complete!")
    # Report the wall time that was being measured but never shown.
    print(f"   Wall time: {t1 - t0:.1f} s")

    # Step 3: Display Results
    print("\n" + "=" * 60)
    print("📊 Results:")
    print("=" * 60)

    # `output` may be empty/None (the checks below already allow for that), so
    # guard the event counters the same way.
    if validated_config.general.run_processor and output:
        print(f"📊 Total events processed: {output.get('processed_events', 0):,}")
        if 'skimmed_events' in output:
            print(f"✂️  Events after skim: {output.get('skimmed_events', 0):,}")

    has_histograms = bool(output) and "histograms" in output

    # Histograms are auto-saved by processor
    if has_histograms:
        num_histograms = sum(len(hists) for hists in output["histograms"].values())
        print(f"📈 Total histograms: {num_histograms}")
        print(f"📈 Channels: {list(output['histograms'].keys())}")
        print(f"✅ Histograms auto-saved to: {output_manager.get_histograms_dir()}")
        print("   - processor_histograms.pkl (for loading with run_processor=False)")
        print("   - histograms.root (for downstream tools)")
    else:
        print("\n⚠️  No histograms produced (run_histogramming may be disabled)")

    # Step 4: Run Statistical Analysis
    if validated_config.general.run_statistics and has_histograms:
        print("\n📊 Running statistical analysis...")

        # Imported here so the module is only loaded when statistics are requested.
        from intccms.analysis.nondiff import NonDiffAnalysis

        analysis = NonDiffAnalysis(validated_config, output_manager)
        # Set histograms from processor output
        analysis.nD_hists_per_region = output["histograms"]

        # Resolve the cabinetry config path; a missing `statistics` section, a
        # missing attribute, and an empty/None value all count as "not specified".
        statistics_config = getattr(validated_config, "statistics", None)
        cabinetry_config_path = getattr(statistics_config, "cabinetry_config", None)

        if not cabinetry_config_path:
            print("⚠️  No cabinetry_config specified in configuration")
            print("   Skipping statistics step")
        elif Path(cabinetry_config_path).exists():
            print(f"✅ Using cabinetry config: {cabinetry_config_path}")
            analysis.run_statistics(cabinetry_config_path)
            print("✅ Statistical analysis complete!")
            print(f"📊 Plots saved to: {output_manager.get_statistics_dir()}")
        else:
            print(f"⚠️  Cabinetry config not found: {cabinetry_config_path}")
            print("   Skipping statistics step")
    else:
        print("\n⚠️  Statistics step skipped (disabled or no histograms)")

    # Summary
    print("\n" + "=" * 60)
    print("✅ Complete processor workflow finished!")
    print("=" * 60)

finally:
    # Cleanup
    print("\n🧹 Cleaning up...")
    # Only close a client that was actually created, so a failed connection
    # attempt surfaces its own error instead of a NameError from this clause.
    if client is not None:
        client.close()
    print("✅ Done!")
    print("‚úÖ Done!")