# Skimming to S3 (Parquet)
Tests the intccms skimming infrastructure by writing Parquet files to S3 from a coffea processor.

In [2]:
import sys
from pathlib import Path

# Make the repository root and its src/ directory importable. The notebook is
# expected to be launched from a direct subdirectory of the repository, so the
# parent of the current working directory is taken as the root.
repo_root = Path.cwd().parent
src_dir = repo_root / "src"
for extra_path in (str(src_dir), str(repo_root)):
    # Same net effect as two sequential insert(0, ...) calls: repo_root ends
    # up ahead of src_dir on sys.path, and existing entries are not duplicated.
    if extra_path not in sys.path:
        sys.path.insert(0, extra_path)

try:
    import omegaconf  # noqa: F401  (availability probe only)
except ImportError:
    # Install with the interpreter that runs this kernel, so the package lands
    # in the environment it will actually be imported from.
    import importlib
    import subprocess

    subprocess.check_call([sys.executable, "-m", "pip", "install", "omegaconf"])
    # Let the import system notice the freshly installed package.
    importlib.invalidate_caches()

In [None]:
import os

from intccms.utils.tools import load_dotenv

# Load environment variables (including the AWS credentials read below) from
# the .env file one directory above the notebook.
load_dotenv("../.env")

INPUT_FILE = "root://xcache//store/mc/RunIISummer20UL18NanoAODv9/TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v1/120000/0E9EA19A-AE0E-3149-88C3-D733240FF5AB.root"
DATASET_NAME = "ttbar"
TREE_NAME = "Events"
CHUNK_SIZE = 100_000

# S3 configuration
# A missing variable raises KeyError here, so a misconfigured environment is
# caught before any job is submitted.
AWS_ID = os.environ["AWS_ACCESS_KEY_ID"]
AWS_SECRET = os.environ["AWS_SECRET_ACCESS_KEY"]
# Never print the credentials themselves: notebook output is saved and shared.
print("AWS credentials loaded:", bool(AWS_ID and AWS_SECRET))
S3_ENDPOINT = "https://red-s3.unl.edu/cmsaf-test-oshadura"
# NOTE(review): the bucket component of this URL is empty; the bucket name
# appears to be carried by S3_ENDPOINT instead - confirm before reusing with
# another endpoint.
OUTPUT_DIR = "s3:///skim_test_s3_out"

# Options handed to the Parquet writer/reader for S3 access.
STORAGE_OPTIONS = {
    "key": AWS_ID,
    "secret": AWS_SECRET,
    "client_kwargs": {"endpoint_url": S3_ENDPOINT},
}

# AF options: [iterative, coffeacasa-condor, coffeacasa-gateway, purdue-af]
AF = "coffeacasa-condor"
NUM_WORKERS = 10
# When False, the Dask client is left open and closed in the notebook's last cell.
AUTO_CLOSE_CLIENT = False

# Collections and the fields of each to keep in the skim.
BRANCHES = {
    "event": ["run", "luminosityBlock", "event"],
    "Muon": ["pt", "eta", "phi", "mass"],
    "PuppiMET": ["pt", "phi"],
}

In [4]:
import hashlib
import time

import awkward as ak
import cloudpickle
from coffea.analysis_tools import PackedSelection
from coffea.nanoevents import NanoAODSchema
from coffea.processor import ProcessorABC, Runner, IterativeExecutor, DaskExecutor

import intccms
from intccms.schema import SkimmingConfig
from intccms.skimming.io.writers import ParquetWriter
from intccms.skimming.pipeline.stages import build_column_list, extract_columns, save_events
from intccms.utils.dask_client import acquire_client, live_prints
from intccms.utils.functors import SelectionExecutor

# Ask cloudpickle to serialize the intccms package by value rather than by
# reference - presumably so remote Dask workers can run the processor without
# having intccms installed (TODO confirm against the worker image).
cloudpickle.register_pickle_by_value(intccms)

In [5]:
def skim_selection(puppimet, hlt):
    """Build the skim selection from the MET and trigger collections.

    Three named masks are registered on a ``PackedSelection``:

    * ``"trigger"`` - the ``Mu50`` field of ``hlt``;
    * ``"met_cut"`` - ``puppimet.pt > 50``;
    * ``"skim"``    - the logical AND of the two masks above.

    Args:
        puppimet: Collection exposing a ``pt`` field.
        hlt: Collection exposing a ``Mu50`` field.

    Returns:
        The populated ``PackedSelection``.
    """
    cuts = PackedSelection()

    fired_mu50 = hlt.Mu50
    cuts.add("trigger", fired_mu50)

    high_met = puppimet.pt > 50
    cuts.add("met_cut", high_met)

    # The combined mask is stored under its own name alongside its inputs.
    combined = cuts.all("trigger", "met_cut")
    cuts.add("skim", combined)
    return cuts

In [6]:
class SkimProcessor(ProcessorABC):
    """Coffea processor that skims each chunk and writes the survivors to Parquet.

    For every chunk the configured selection is evaluated, the passing events
    are reduced to the requested columns, and the result is saved to
    ``<output_dir>/<dataset>/<hash>_<entrystart>_<entrystop>.parquet``.
    Chunks with no passing events write nothing.
    """

    def __init__(self, skim_config, branches, output_dir, storage_options=None):
        """Store the configuration and resolve which columns to keep.

        Args:
            skim_config: Selection configuration handed to ``SelectionExecutor``.
            branches: Branch specification passed to ``build_column_list``.
            output_dir: Base location (string) under which files are written.
            storage_options: Optional options forwarded to the writer; a falsy
                value is replaced by an empty dict.
        """
        self.skim_config = skim_config
        self.output_dir = output_dir
        self.storage_options = storage_options if storage_options else {}
        # build_column_list returns a pair; only its first element is used.
        kept_columns, _unused = build_column_list(branches)
        self.columns_to_keep = kept_columns
        self.writer = ParquetWriter()

    @property
    def accumulator(self):
        """Return a fresh, zeroed event-count accumulator."""
        return dict(total_in=0, total_out=0)

    def _build_output_path(self, events):
        """Return the destination path for the chunk described by ``events.metadata``.

        The file name combines the first 8 hex digits of the MD5 of the input
        file name with the chunk's entry range.
        """
        meta = events.metadata
        digest = hashlib.md5(meta["filename"].encode()).hexdigest()
        short_hash = digest[:8]
        first = meta["entrystart"]
        last = meta["entrystop"]
        dataset = meta["dataset"]
        base = self.output_dir.rstrip('/')
        return f"{base}/{dataset}/{short_hash}_{first}_{last}.parquet"

    def process(self, events):
        """Skim one chunk, write it if non-empty, and return its event counts.

        Returns:
            dict with ``"total_in"`` (events in the chunk) and ``"total_out"``
            (events that passed the selection).
        """
        summary = self.accumulator
        dataset = events.metadata["dataset"]
        n_in = len(events)

        passing = SelectionExecutor(self.skim_config).execute(events)
        selected = events[passing]
        n_out = len(selected)

        # Skip the write entirely when nothing survived the selection.
        if n_out > 0:
            save_events(
                self.writer,
                extract_columns(selected, self.columns_to_keep),
                self._build_output_path(selected),
                compression="zstd",
                storage_options=self.storage_options,
            )

        summary.update(total_in=n_in, total_out=n_out)
        first = selected.metadata['entrystart']
        last = selected.metadata['entrystop']
        print(f"  {dataset} [{first}:{last}]: {n_out}/{n_in} events passed")
        return summary

    def postprocess(self, accumulator):
        """Return the merged accumulator unchanged."""
        return accumulator

In [7]:
# Single-dataset, single-file fileset handed to the coffea Runner below.
fileset = {DATASET_NAME: {"files": [INPUT_FILE], "treename": TREE_NAME}}

# NOTE(review): `use` presumably names the event collections passed, in order,
# to skim_selection(puppimet, hlt) - confirm against SkimmingConfig.
skim_config = SkimmingConfig(
    function=skim_selection,
    use=[("PuppiMET", None), ("HLT", None)],
)

processor = SkimProcessor(
    skim_config=skim_config,
    branches=BRANCHES,
    output_dir=OUTPUT_DIR,
    storage_options=STORAGE_OPTIONS,
)


def _make_runner(executor):
    """Build a coffea Runner with this notebook's schema and chunk size."""
    return Runner(
        executor=executor,
        schema=NanoAODSchema,
        chunksize=CHUNK_SIZE,
    )


t0 = time.perf_counter()

if AF == "iterative":
    runner = _make_runner(IterativeExecutor())
    output = runner(fileset, treename=TREE_NAME, processor_instance=processor)
else:
    with acquire_client(AF, num_workers=NUM_WORKERS, close_after=AUTO_CLOSE_CLIENT, propagate_aws_env=True) as (client, cluster):
        # Optional debugging aid, disabled by default: re-enable both
        # live_prints lines together.
        #stop = live_prints(client)
        runner = _make_runner(DaskExecutor(client=client))
        output = runner(fileset, treename=TREE_NAME, processor_instance=processor)
        #stop.set()

t1 = time.perf_counter()
total_in = output["total_in"]
total_out = output["total_out"]
# Guard the percentage: a run that processed no events must still report
# cleanly instead of raising ZeroDivisionError.
kept_pct = 100 * total_out / total_in if total_in else 0.0
print(f"\nDone in {t1-t0:.1f}s: {total_out:,}/{total_in:,} events kept ({kept_pct:.1f}%)")

Output()

Output()


Done in 43.7s: 95,634/1,290,000 events kept (7.4%)


In [8]:
from intccms.skimming.io.readers import get_reader

# Read one skimmed chunk back from the output location as a round-trip check.
# The path mirrors the naming used by the processor: first 8 hex digits of the
# MD5 of the input file name, followed by the chunk's entry range.
file_hash = hashlib.md5(INPUT_FILE.encode()).hexdigest()[:8]
# NOTE(review): the 0-99231 entry range is hard-coded for the first chunk of
# this particular input file; it must be updated if the chunking changes.
chunk_name = f"{file_hash}_0_99231.parquet"
test_file = f"{OUTPUT_DIR.rstrip('/')}/{DATASET_NAME}/{chunk_name}"
print(test_file)

reader = get_reader("parquet")
events = reader.read(test_file, tree_name=None, storage_options=STORAGE_OPTIONS)
n_read = len(events)
print(f"Read back {test_file.rsplit('/', 1)[-1]}: {n_read} events, fields={events.fields}")

s3:///skim_test_s3_out/ttbar/39ccc721_0_99231.parquet
Read back 39ccc721_0_99231.parquet: 7492 events, fields=['run', 'luminosityBlock', 'event', 'Muon', 'PuppiMET']


In [9]:
# Close the Dask client by hand only when one was created (non-iterative run)
# and it was not configured to close automatically.
if not (AF == "iterative" or AUTO_CLOSE_CLIENT):
    client.close()