In [1]:
from pathlib import Path

import awkward as ak
import dask
import dask_awkward as dak
import hist.dask
import coffea
import numpy as np
import uproot
from dask.distributed import Client

from coffea.nanoevents import NanoEventsFactory, PHYSLITESchema
from coffea.analysis_tools import PackedSelection
from coffea import dataset_tools

import time
import warnings

# Silence every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# local: single thread, single worker
# NOTE(review): Client is already imported in the import block at the top of this
# cell; this second import only adds LocalCluster and progress (progress is not
# used in the visible cells) and could be merged into that block.
from dask.distributed import LocalCluster, Client, progress
cluster = LocalCluster(
    n_workers=1,
    processes=False,
    threads_per_worker=1,
)
client = Client(cluster)

# for UChicago
# client = Client("tcp://dask-alheld-a76c9434-b.af-jupyter:8786")

# Report the versions of the main packages, one "name: version" line each.
for pkg_label, pkg in (
    ("awkward", ak),
    ("dask-awkward", dak),
    ("uproot", uproot),
    ("hist", hist),
    ("coffea", coffea),
):
    print(f"{pkg_label}: {pkg.__version__}")

awkward: 2.6.2
dask-awkward: 2024.3.0
uproot: 5.3.1
hist: 2.7.2
coffea: 2024.3.0


### interactive coffea for debugging

In [2]:
# Tree to read, and the single local ttbar PHYSLITE file it is read from.
treename = "CollectionTree"
fname = "/data/alheld/200gbps-atlas/mc23_13p6TeV.601229.PhPy8EG_A14_ttbar_hdamp258p75_SingleLep.deriv.DAOD_PHYSLITE.e8514_s4162_r14622_p6026/DAOD_PHYSLITE.37223155._000001.pool.root.1"

# Build the events object for interactive exploration with the PHYSLITE schema.
events = NanoEventsFactory.from_root(
    {fname: treename},
    schemaclass=PHYSLITESchema,
).events()

### distributed coffea

In [3]:
def materialize_branches(events):
    """Touch a fixed set of jet fields and reduce them to two small summaries.

    Each listed ``events.Jets`` field is passed through ``ak.count_nonzero`` and
    the resulting integers are added up, so the columns have to be read while
    only a single aggregated number is kept per call.

    Args:
        events: event collection exposing a ``Jets`` record that provides every
            field named in ``jet_fields`` below.

    Returns:
        dict with two entries: ``"nevts"`` is ``ak.num(events, axis=0)`` and
        ``"_counter"`` is the sum of the per-field non-zero counts.
    """
    # Fields to materialize, in the order they are accumulated.
    # this will read around 10% of the file
    jet_fields = (
        "EnergyPerSampling",
        "SumPtTrkPt500",
        "TrackWidthPt1000",
        "NumTrkPt500",
        "NumTrkPt1000",
        "SumPtChargedPFOPt500",
        "Timing",
        "JetConstitScaleMomentum_eta",
        "ActiveArea4vec_eta",
        "DetectorEta",
        "eta",
        "JetConstitScaleMomentum_phi",
        "ActiveArea4vec_phi",
        "phi",
        "JetConstitScaleMomentum_m",
        "JetConstitScaleMomentum_pt",
        "Width",
        "EMFrac",
        "pt",
        "m",
        "ActiveArea4vec_m",
        "ActiveArea4vec_pt",
        "DFCommonJets_QGTagger_TracksWidth",
        "JVFCorr",
        "DFCommonJets_QGTagger_TracksC1",
        "PSFrac",
        "DFCommonJets_QGTagger_NTracks",
        "DFCommonJets_fJvt",
        "PartonTruthLabelID",
        "HadronConeExclExtendedTruthLabelID",
        "ConeTruthLabelID",
        "HadronConeExclTruthLabelID",
    )

    # track number of events
    num_events = ak.num(events, axis=0)

    # materialize branches, just derive integers from them that will be aggregated to avoid memory issues
    _counter = 0
    for field in jet_fields:
        # getattr(events.Jets, "pt") is the same lookup as events.Jets.pt
        _counter += ak.count_nonzero(getattr(events.Jets, field))

    return {"nevts": num_events, "_counter": _counter}

Run over a single local ttbar file here as an example.

In [4]:
# One dataset ("ttbar") made of a single file, mapped to the tree to read from it.
fileset = {
    "ttbar": {
        "files": {fname: treename},
    },
}
fileset

{'ttbar': {'files': {'/data/alheld/200gbps-atlas/mc23_13p6TeV.601229.PhPy8EG_A14_ttbar_hdamp258p75_SingleLep.deriv.DAOD_PHYSLITE.e8514_s4162_r14622_p6026/DAOD_PHYSLITE.37223155._000001.pool.root.1': 'CollectionTree'}}}

In [5]:
%%time
# pre-process
# preprocess returns two values; only the first is kept (as `samples`) and is
# later handed to apply_to_fileset, the second is discarded.
# NOTE(review): step_size=500_000 is presumably the target number of events per
# chunk — confirm against the coffea documentation.
# NOTE(review): binding `_` in a notebook shadows IPython's last-output
# variable — consider a named throwaway if `_` is used interactively.
samples, _ = dataset_tools.preprocess(fileset, step_size=500_000)

CPU times: user 1.44 s, sys: 23.8 ms, total: 1.46 s
Wall time: 1.45 s


In [6]:
# Branch names accepted by filter_name. Built once as a frozenset so that each
# lookup is a hash probe instead of rebuilding and scanning a list per call.
# NOTE(review): the last ten suffixes (truthParticleLink ... topoetcone40) are
# not read by materialize_branches in this notebook — confirm they are intended.
_ALLOWED_JET_BRANCHES = frozenset(
    "AnalysisJetsAuxDyn." + suffix
    for suffix in (
        "EnergyPerSampling",
        "SumPtTrkPt500",
        "TrackWidthPt1000",
        "NumTrkPt500",
        "NumTrkPt1000",
        "SumPtChargedPFOPt500",
        "Timing",
        "JetConstitScaleMomentum_eta",
        "ActiveArea4vec_eta",
        "DetectorEta",
        "eta",
        "JetConstitScaleMomentum_phi",
        "ActiveArea4vec_phi",
        "phi",
        "JetConstitScaleMomentum_m",
        "JetConstitScaleMomentum_pt",
        "Width",
        "EMFrac",
        "pt",
        "m",
        "ActiveArea4vec_m",
        "ActiveArea4vec_pt",
        "DFCommonJets_QGTagger_TracksWidth",
        "JVFCorr",
        "DFCommonJets_QGTagger_TracksC1",
        "PSFrac",
        "DFCommonJets_QGTagger_NTracks",
        "DFCommonJets_fJvt",
        "PartonTruthLabelID",
        "HadronConeExclExtendedTruthLabelID",
        "ConeTruthLabelID",
        "HadronConeExclTruthLabelID",
        "truthParticleLink",
        "firstEgMotherTruthParticleLink",
        "momentumBalanceSignificance",
        "topoetcone20_CloseByCorr",
        "scatteringCurvatureSignificance",
        "neflowisol20_CloseByCorr",
        "scatteringNeighbourSignificance",
        "topoetcone20",
        "topoetcone30",
        "topoetcone40",
    )
)


def filter_name(name):
    """Tell whether a branch name is on the fixed allow-list of jet branches.

    Args:
        name: full branch name as a string, e.g. ``"AnalysisJetsAuxDyn.pt"``.

    Returns:
        ``True`` if ``name`` exactly matches one of the allowed
        ``AnalysisJetsAuxDyn.*`` branch names (case-sensitive), else ``False``.
    """
    return name in _ALLOWED_JET_BRANCHES

In [7]:
%%time
# create the task graph
# filter_name seems to not do anything here in terms of performance
tasks = dataset_tools.apply_to_fileset(materialize_branches, samples, uproot_options={"allow_read_errors_with_report": True, "filter_name": filter_name}, schemaclass=PHYSLITESchema)

CPU times: user 1.67 s, sys: 44.7 ms, total: 1.72 s
Wall time: 1.7 s


Execute the task graph.

In [8]:
%%time
# execute
t0 = time.perf_counter()
((out, report),) = dask.compute(tasks)  # feels strange that this is a tuple-of-tuple
t1 = time.perf_counter()

print(f"total time spent in uproot reading data: {ak.sum([v['duration'] for v in report.values()]):.2f} s")
print(f"wall time: {t1-t0:.2f}s")

total time spent in uproot reading data: 16.83 s
wall time: 17.58s
CPU times: user 12.4 s, sys: 14.6 s, total: 27 s
Wall time: 17.6 s


In [9]:
# Display the computed results: one entry per dataset key, each holding
# the "nevts" and "_counter" values returned by materialize_branches.
out

{'ttbar': {'nevts': 30000, '_counter': 20461220}}

In [10]:
# Events processed per second of wall time; t0 and t1 bracket the dask.compute call.
event_rate = out["ttbar"]["nevts"] / (t1-t0)
# Divide by 1000 to report the rate in kHz instead of Hz.
print(f"event rate: {event_rate / 1_000:.2f} kHz")

event rate: 1.71 kHz


In [11]:
# Display the read report collected for the "ttbar" dataset (its "duration"
# entries are what the timing summary above adds up).
report["ttbar"]  # need latest uproot to get more performance stats