# CMS Z' single-lepton: Skimming workflows

This notebook demonstrates the four skimming workflow modes supported by the processor,
with performance metrics collected via [roastcoffea](https://github.com/MoAly98/roastcoffea).

| Mode | `save_skimmed_output` | `run_analysis` | Description |
|------|-----------------------|----------------|-------------|
| **1. Skim + Analysis** | `True` | `True` | Skim events to disk **and** run histogramming in one pass |
| **2. Analysis only** | `False` | `True` | Apply skim filter on-the-fly, no files saved |
| **3. Skim only** | `True` | `False` | Save skimmed files to disk, skip histogramming |
| **4. Analysis on skimmed** | `False` | `True` | Read previously skimmed files as input (`use_skimmed_input=True`) |

Each mode cell is independent — run any subset in any order (Mode 4 requires skimmed files from Mode 1 or 3).

## Setup

### AF flag
Set the analysis facility. Each AF has its own Dask client setup.

In [None]:
# Analysis facility identifier; passed as the first argument to acquire_client()
# in the cells below.
AF = "coffeacasa-condor"  # options: [coffeacasa-condor, coffeacasa-gateway, purdue-af]
# Forwarded as close_after= to every acquire_client() call in this notebook.
# NOTE(review): presumably controls whether the Dask client is shut down when
# the with-block exits -- confirm against intccms.utils.dask_client.
AUTO_CLOSE_CLIENT = False

### Imports and dependencies

In [None]:
# Make the intccms package (under src/) and the example modules importable.
import sys
from pathlib import Path

repo_root = Path.cwd()
src_dir = repo_root / "src"
examples_dir = repo_root

# Prepend each location unless it is already on sys.path. Entries are handled
# in this order, so a later one is inserted in front of an earlier one.
for extra_path in map(str, (src_dir, examples_dir)):
    if extra_path not in sys.path:
        sys.path.insert(0, extra_path)

In [None]:
try:
    import omegaconf
except ImportError:
    print("omegaconf not found, installing...")
    ! pip install omegaconf;

try:
    import roastcoffea
except ImportError:
    print("roastcoffea not found, installing...")
    ! pip install roastcoffea;
    import roastcoffea

In [None]:
COFFEA_VERSION = "2025.12.0"
COFFEA_PIP = f"coffea=={COFFEA_VERSION}" if "git" not in COFFEA_VERSION else COFFEA_VERSION

! pip install $COFFEA_PIP ;

WORKER_DEPENDENCIES = [COFFEA_PIP, "roastcoffea==0.1.2"]

In [None]:
import cloudpickle
import copy
import os
import time

import fsspec
from coffea.processor import DaskExecutor
from coffea.nanoevents import NanoAODSchema

from intccms.schema import Config, load_config_with_restricted_cli
from intccms.utils.output import OutputDirectoryManager
from intccms.utils.tools import load_dotenv
from intccms.utils.dask_client import acquire_client
from intccms.metadata_extractor import DatasetMetadataManager
from intccms.datasets import DatasetManager
from intccms.analysis import run_processor_workflow
from intccms.analysis.processor import UnifiedProcessor

from roastcoffea import MetricsCollector
from roastcoffea.export.reporter import (
    format_throughput_table,
    format_event_processing_table,
    format_timing_table,
)
from roastcoffea.visualization.plots import (
    plot_worker_count_timeline,
    plot_throughput_timeline,
    plot_runtime_distribution,
)
from rich.console import Console

In [None]:
import intccms
import example_cms

# Register the local packages (and roastcoffea) with cloudpickle so they are
# pickled by value.
for local_module in (intccms, example_cms, roastcoffea):
    cloudpickle.register_pickle_by_value(local_module)

### Base configuration

In [None]:
from example_cms.configs.configuration import config as original_config

# Work on a deep copy so the imported configuration object stays untouched.
config = copy.deepcopy(original_config)

config["datasets"]["max_files"] = None

# Settings for the "general" section. The workflow switches after
# run_processor are defaults -- each mode cell overrides them.
general_overrides = {
    "output_dir": "example_cms/outputs/",
    "run_metadata_generation": False,
    "run_processor": True,
    "run_analysis": False,
    "save_skimmed_output": False,
    "use_skimmed_input": False,
    "run_histogramming": False,
    "run_systematics": False,
    "run_statistics": False,
}
for option_name, option_value in general_overrides.items():
    config["general"][option_name] = option_value

# No command-line overrides are applied in the notebook.
cli_args = []
full_config = load_config_with_restricted_cli(config, cli_args)
base_config = Config(**full_config)

### Skimming output configuration

Configure the output format and destination for skimmed files.
Secrets (AWS keys) are loaded from an untracked `.env` file via `load_dotenv()`.

In [None]:
# Uncomment one of the three presets below to select the output backend.
# Each preset sets OUTPUT_FORMAT and OUTPUT_DIR; the rest is derived automatically.

# --- Parquet on S3 ---
OUTPUT_FORMAT = "parquet"
OUTPUT_DIR = "s3:///skim_out"
S3_ENDPOINT = "https://red-s3.unl.edu/cmsaf-test-oshadura"

# --- ROOT TTree on XRootD ---
# OUTPUT_FORMAT = "ttree"
# OUTPUT_DIR = "root://xrootd-local.unl.edu:1094//store/user/maly/skim_ttree/"

# --- ROOT RNTuple on XRootD ---
# OUTPUT_FORMAT = "rntuple"
# OUTPUT_DIR = "root://xrootd-local.unl.edu:1094//store/user/maly/skim_rntuple/"

# --- Derived configuration (no need to edit below) ---

# Defaults for non-S3 destinations: no extra writer/reader options, no AWS env.
to_kwargs, from_kwargs = {}, {}
PROPAGATE_AWS = False

is_s3_target = OUTPUT_DIR.startswith("s3://")
if is_s3_target:
    # Credentials are read from the environment after load_dotenv() has run.
    load_dotenv()
    storage_options = dict(
        key=os.environ["AWS_ACCESS_KEY_ID"],
        secret=os.environ["AWS_SECRET_ACCESS_KEY"],
        client_kwargs=dict(endpoint_url=S3_ENDPOINT),
    )
    # Writer and reader options share the very same storage_options dict.
    to_kwargs.update(storage_options=storage_options, compression="zstd")
    from_kwargs.update(storage_options=storage_options)
    PROPAGATE_AWS = True

# Note: when S3 is used, the secret key ends up inside these kwargs on the config.
skim_output = base_config.preprocess.skimming.output
skim_output.format = OUTPUT_FORMAT
skim_output.output_dir = OUTPUT_DIR
skim_output.to_kwargs = to_kwargs
skim_output.from_kwargs = from_kwargs

print(f"Output format: {OUTPUT_FORMAT}")
print(f"Output dir:    {OUTPUT_DIR}")
if PROPAGATE_AWS:
    print("AWS credentials will be propagated to workers")

### Data redirector

In [None]:
# Redirector applied uniformly to every configured dataset.
REDIRECTOR = "root://xcache/"

for dataset_cfg in base_config.datasets.datasets:
    setattr(dataset_cfg, "redirector", REDIRECTOR)

print(f"Redirector set to: {REDIRECTOR}")

### Output manager, dataset manager, and metadata

In [None]:
# Directory layout for outputs, built from the paths in the general config section.
output_manager = OutputDirectoryManager(
    root_output_dir=base_config.general.output_dir,
    cache_dir=base_config.general.cache_dir,
    metadata_dir=base_config.general.metadata_dir,
    skimmed_dir=base_config.general.skimmed_dir,
)

dataset_manager = DatasetManager(base_config.datasets)

metadata_generator = DatasetMetadataManager(
    dataset_manager=dataset_manager,
    output_manager=output_manager,
    config=base_config,
)

# A Dask client is only acquired when the manager reports generate_metadata;
# otherwise run() is called without an executor.
# NOTE(review): presumably the executor-less call loads existing metadata -- confirm.
if metadata_generator.generate_metadata:
    with acquire_client(AF, close_after=AUTO_CLOSE_CLIENT, pip_packages=WORKER_DEPENDENCIES) as (client, cluster):
        metadata_generator.run(executor=DaskExecutor(client=client))
else:
    metadata_generator.run()

# metadata_lookup and workitems are consumed by the mode cells below.
metadata_lookup = metadata_generator.build_metadata_lookup()
workitems = metadata_generator.workitems
print(f"Generated {len(workitems)} workitems")

### Helpers

In [None]:
# Shared results dict for cross-mode comparison: each mode cell stores its
# summary under its own key, and the comparison cells iterate over all entries.
results = {}
# rich Console used to render the roastcoffea tables in the comparison section.
console = Console()


def measure_skimmed_size(output, skim_output_config):
    """Sum the sizes of the skimmed files listed in the processor output.

    Args:
        output: Mapping returned by the workflow. Its ``"manifest_entries"``
            value is read as a list of entries, each with an
            ``"output_file"`` path.
        skim_output_config: Object whose ``from_kwargs`` attribute supplies
            the options used to open the files through fsspec.

    Returns:
        Total size in bytes of the files that could be stat-ed, or 0 when the
        output carries no manifest entries. Files that cannot be stat-ed are
        reported on stdout and skipped.
    """
    manifest = output.get("manifest_entries", [])
    if not manifest:
        return 0

    # Use the nested "storage_options" when present; otherwise the whole
    # from_kwargs mapping is forwarded to fsspec as-is.
    read_kwargs = skim_output_config.from_kwargs
    fs_kwargs = dict(read_kwargs) if read_kwargs else {}
    open_kwargs = fs_kwargs.get("storage_options", fs_kwargs)

    size_sum = 0
    for record in manifest:
        target = record["output_file"]
        try:
            handle = fsspec.open(target, **open_kwargs)
            size_sum += handle.fs.info(handle.path).get("size", 0)
        except Exception as err:
            # Best effort: one unreadable file must not abort the measurement.
            print(f"  Could not stat {target}: {err}")
    return size_sum

---
## Mode 1: Skim + Analysis

Skim events to disk **and** run the full analysis (histogramming + systematics) in a single pass.

In [None]:
# Mode 1: save skimmed files and run the analysis in the same pass.
cfg = copy.deepcopy(base_config)
mode_flags = {
    "save_skimmed_output": True,
    "run_analysis": True,
    "run_histogramming": True,
    "run_systematics": True,
    "use_skimmed_input": False,
}
for flag_name, flag_value in mode_flags.items():
    setattr(cfg.general, flag_name, flag_value)

with acquire_client(
    AF,
    close_after=AUTO_CLOSE_CLIENT,
    pip_packages=WORKER_DEPENDENCIES,
    propagate_aws_env=PROPAGATE_AWS,
) as (client, cluster):
    processor = UnifiedProcessor(
        config=cfg,
        output_manager=output_manager,
        metadata_lookup=metadata_lookup,
    )
    with MetricsCollector(
        client=client,
        processor_instance=processor,
        track_workers=True,
        worker_tracking_interval=1.0,
    ) as collector:
        # Wall time covers only the workflow call (including executor construction).
        t0 = time.perf_counter()
        output, report = run_processor_workflow(
            config=cfg,
            output_manager=output_manager,
            metadata_lookup=metadata_lookup,
            workitems=workitems,
            executor=DaskExecutor(client=client, treereduction=8, retries=0),
            schema=NanoAODSchema,
        )
        t1 = time.perf_counter()
        collector.extract_metrics_from_output(output)
        collector.set_coffea_report(report)

    # Metrics are read after the collector context has exited.
    metrics = collector.get_metrics()
    skimmed_bytes = measure_skimmed_size(output, cfg.preprocess.skimming.output)
    save_section = metrics.get("sections", {}).get("save_skimmed", {})
    save_time = save_section.get("total_duration", 0)

    wall_time = t1 - t0
    n_processed = output.get("processed_events", 0)
    n_skimmed = output.get("skimmed_events", 0)
    results["skim_and_analysis"] = {
        "metrics": metrics,
        "tracking_data": collector.tracking_data,
        "wall_time": wall_time,
        "processed_events": n_processed,
        "skimmed_events": n_skimmed,
        "skimmed_bytes": skimmed_bytes,
        "save_skimmed_time": save_time,
    }

print(f"Mode 1 complete in {wall_time:.1f}s")
print(f"  Processed: {n_processed:,}")
print(f"  Skimmed:   {n_skimmed:,}")
print(f"  Save time: {save_time:.1f}s")
print(f"  File size: {skimmed_bytes / 1e6:.1f} MB")

---
## Mode 2: Analysis only (no skim)

Apply the skim filter on-the-fly and run the analysis. No skimmed files are saved to disk.

In [None]:
# Mode 2: run the analysis without saving skimmed files.
cfg = copy.deepcopy(base_config)
mode_flags = {
    "save_skimmed_output": False,
    "run_analysis": True,
    "run_histogramming": True,
    "run_systematics": True,
    "use_skimmed_input": False,
}
for flag_name, flag_value in mode_flags.items():
    setattr(cfg.general, flag_name, flag_value)

with acquire_client(
    AF,
    close_after=AUTO_CLOSE_CLIENT,
    pip_packages=WORKER_DEPENDENCIES,
    propagate_aws_env=PROPAGATE_AWS,
) as (client, cluster):
    processor = UnifiedProcessor(
        config=cfg,
        output_manager=output_manager,
        metadata_lookup=metadata_lookup,
    )
    with MetricsCollector(
        client=client,
        processor_instance=processor,
        track_workers=True,
        worker_tracking_interval=1.0,
    ) as collector:
        # Wall time covers only the workflow call (including executor construction).
        t0 = time.perf_counter()
        output, report = run_processor_workflow(
            config=cfg,
            output_manager=output_manager,
            metadata_lookup=metadata_lookup,
            workitems=workitems,
            executor=DaskExecutor(client=client, treereduction=8, retries=0),
            schema=NanoAODSchema,
        )
        t1 = time.perf_counter()
        collector.extract_metrics_from_output(output)
        collector.set_coffea_report(report)

    wall_time = t1 - t0
    n_processed = output.get("processed_events", 0)
    n_skimmed = output.get("skimmed_events", 0)
    # Nothing is written in this mode, so size and save time are recorded as 0.
    results["analysis_only"] = {
        "metrics": collector.get_metrics(),
        "tracking_data": collector.tracking_data,
        "wall_time": wall_time,
        "processed_events": n_processed,
        "skimmed_events": n_skimmed,
        "skimmed_bytes": 0,
        "save_skimmed_time": 0,
    }

print(f"Mode 2 complete in {wall_time:.1f}s")
print(f"  Processed: {n_processed:,}")
print(f"  Skimmed:   {n_skimmed:,}")

---
## Mode 3: Skim only (no analysis)

Save skimmed files to disk without running histogramming or systematics.

In [None]:
# Mode 3: save skimmed files only; analysis, histogramming and systematics are off.
cfg = copy.deepcopy(base_config)
mode_flags = {
    "save_skimmed_output": True,
    "run_analysis": False,
    "run_histogramming": False,
    "run_systematics": False,
    "use_skimmed_input": False,
}
for flag_name, flag_value in mode_flags.items():
    setattr(cfg.general, flag_name, flag_value)

with acquire_client(
    AF,
    close_after=AUTO_CLOSE_CLIENT,
    pip_packages=WORKER_DEPENDENCIES,
    propagate_aws_env=PROPAGATE_AWS,
) as (client, cluster):
    processor = UnifiedProcessor(
        config=cfg,
        output_manager=output_manager,
        metadata_lookup=metadata_lookup,
    )
    with MetricsCollector(
        client=client,
        processor_instance=processor,
        track_workers=True,
        worker_tracking_interval=1.0,
    ) as collector:
        # Wall time covers only the workflow call (including executor construction).
        t0 = time.perf_counter()
        output, report = run_processor_workflow(
            config=cfg,
            output_manager=output_manager,
            metadata_lookup=metadata_lookup,
            workitems=workitems,
            executor=DaskExecutor(client=client, treereduction=8, retries=0),
            schema=NanoAODSchema,
        )
        t1 = time.perf_counter()
        collector.extract_metrics_from_output(output)
        collector.set_coffea_report(report)

    # Metrics are read after the collector context has exited.
    metrics = collector.get_metrics()
    skimmed_bytes = measure_skimmed_size(output, cfg.preprocess.skimming.output)
    save_section = metrics.get("sections", {}).get("save_skimmed", {})
    save_time = save_section.get("total_duration", 0)

    wall_time = t1 - t0
    n_processed = output.get("processed_events", 0)
    n_skimmed = output.get("skimmed_events", 0)
    results["skim_only"] = {
        "metrics": metrics,
        "tracking_data": collector.tracking_data,
        "wall_time": wall_time,
        "processed_events": n_processed,
        "skimmed_events": n_skimmed,
        "skimmed_bytes": skimmed_bytes,
        "save_skimmed_time": save_time,
    }

print(f"Mode 3 complete in {wall_time:.1f}s")
print(f"  Processed: {n_processed:,}")
print(f"  Skimmed:   {n_skimmed:,}")
print(f"  Save time: {save_time:.1f}s")
print(f"  File size: {skimmed_bytes / 1e6:.1f} MB")

---
## Mode 4: Analysis on skimmed files

Read previously skimmed files (from Mode 1 or 3) and run the analysis on them.
Requires that skimmed files exist at the configured output location.

In [None]:
# Mode 4: run the analysis with use_skimmed_input enabled (no new skim is saved).
cfg = copy.deepcopy(base_config)
mode_flags = {
    "save_skimmed_output": False,
    "run_analysis": True,
    "run_histogramming": True,
    "run_systematics": True,
    "use_skimmed_input": True,
}
for flag_name, flag_value in mode_flags.items():
    setattr(cfg.general, flag_name, flag_value)

with acquire_client(
    AF,
    close_after=AUTO_CLOSE_CLIENT,
    pip_packages=WORKER_DEPENDENCIES,
    propagate_aws_env=PROPAGATE_AWS,
) as (client, cluster):
    processor = UnifiedProcessor(
        config=cfg,
        output_manager=output_manager,
        metadata_lookup=metadata_lookup,
    )
    with MetricsCollector(
        client=client,
        processor_instance=processor,
        track_workers=True,
        worker_tracking_interval=1.0,
    ) as collector:
        # Wall time covers only the workflow call (including executor construction).
        t0 = time.perf_counter()
        # Unlike the other modes, no workitems are passed here.
        output, report = run_processor_workflow(
            config=cfg,
            output_manager=output_manager,
            metadata_lookup=metadata_lookup,
            workitems=None,
            executor=DaskExecutor(client=client, treereduction=8, retries=0),
            schema=NanoAODSchema,
        )
        t1 = time.perf_counter()
        collector.extract_metrics_from_output(output)
        collector.set_coffea_report(report)

    wall_time = t1 - t0
    n_processed = output.get("processed_events", 0)
    # Nothing is written in this mode, so size and save time are recorded as 0.
    results["analysis_on_skimmed"] = {
        "metrics": collector.get_metrics(),
        "tracking_data": collector.tracking_data,
        "wall_time": wall_time,
        "processed_events": n_processed,
        "skimmed_events": output.get("skimmed_events", 0),
        "skimmed_bytes": 0,
        "save_skimmed_time": 0,
    }

print(f"Mode 4 complete in {wall_time:.1f}s")
print(f"  Processed: {n_processed:,}")

---
## Metrics comparison

Compare performance across all workflow modes that were executed.

### Wall-clock timing

In [None]:
# Fixed-width summary with one row per mode that has been run so far.
header = f"{'Mode':<25} {'Wall (s)':>10} {'Save (s)':>10} {'Processed':>12} {'Skimmed':>10} {'Size (MB)':>10}"
print(header)
# Size the rule from the header itself so it always spans the full table width
# (the header is 82 characters wide; a hard-coded 81 left it one short).
print("-" * len(header))
for mode_name, r in results.items():
    # Modes that write no files store 0 bytes; show them as 0 MB.
    size_mb = r["skimmed_bytes"] / 1e6 if r["skimmed_bytes"] else 0
    print(
        f"{mode_name:<25} {r['wall_time']:>10.1f} {r['save_skimmed_time']:>10.1f} "
        f"{r['processed_events']:>12,} {r['skimmed_events']:>10,} {size_mb:>10.1f}"
    )

### Throughput and timing tables (roastcoffea)

In [None]:
# For every executed mode, render the three roastcoffea summary tables.
banner = "=" * 60
table_sections = (
    ("\nThroughput", format_throughput_table),
    ("\nEvent Processing", format_event_processing_table),
    ("\nTiming Breakdown", format_timing_table),
)

for mode_name, r in results.items():
    metrics = r["metrics"]
    print(f"\n{banner}")
    print(f"  {mode_name}")
    print(f"{banner}")

    for heading, build_table in table_sections:
        print(heading)
        console.print(build_table(metrics))

### Performance visualizations

In [None]:
import matplotlib.pyplot as plt

# Draw up to three plots per executed mode. Each plot is attempted on its own,
# so a failure is reported and the remaining plots are still produced.
for mode_name, r in results.items():
    metrics = r["metrics"]
    tracking_data = r["tracking_data"]

    print(f"\n--- {mode_name} ---")

    # (title suffix, label used in the failure message, deferred plot call)
    plot_specs = (
        (
            "Worker Count",
            "Worker count",
            lambda: plot_worker_count_timeline(tracking_data),
        ),
        (
            "Throughput",
            "Throughput",
            lambda: plot_throughput_timeline(metrics["chunk_info"], tracking_data),
        ),
        (
            "Runtime Distribution",
            "Runtime distribution",
            lambda: plot_runtime_distribution(metrics.get("raw_chunk_metrics")),
        ),
    )
    for title_suffix, label, make_plot in plot_specs:
        try:
            fig, ax = make_plot()
            ax.set_title(f"{mode_name}: {title_suffix}")
            plt.show()
        except Exception as e:
            print(f"  {label} plot unavailable: {e}")