In [None]:
# Copy zstd_files.json from the XRootD server into the working directory;
# a later cell opens this file to build the fileset.
! xrdcp -f root://xrootd-local.unl.edu//store/user/IDAP/zstd_files.json zstd_files.json

In [None]:
from pathlib import Path

import awkward as ak
import dask
import dask_awkward as dak
import hist.dask
import coffea
import numpy as np
import uproot
from dask.distributed import Client

from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
from coffea.analysis_tools import PackedSelection
from coffea import dataset_tools

import time
import warnings

# import utils
# utils.plotting.set_style()

# Keep the notebook output quiet: ignore all Python warnings, and turn off the
# NanoAODSchema warning about cross-references to branches that are not used here.
warnings.filterwarnings("ignore")
NanoAODSchema.warn_missing_crossrefs = False

from dask.distributed import LocalCluster, Client, progress

# Local alternative (single worker, single thread, in-process):
# cluster = LocalCluster(n_workers=1, processes=False, threads_per_worker=1)
# client = Client(cluster)

# coffea-casa: connect to the scheduler at this TLS address
client = Client("tls://localhost:8786")

# Report the versions of the main packages, one "<name>: <version>" line each.
for package_name, package in (
    ("awkward", ak),
    ("dask-awkward", dak),
    ("uproot", uproot),
    ("hist", hist),
    ("coffea", coffea),
):
    print(f"{package_name}: {package.__version__}")

In [None]:
def task(events):
    """Read a fixed list of branches and reduce each one to an integer.

    Every listed branch is accessed as ``events.<collection>.<field>`` and
    passed through ``ak.count_nonzero``; the results are summed into a single
    counter, so only integers (not the branch contents) need to be aggregated.

    Parameters
    ----------
    events
        Event collection exposing the collections and fields listed below as
        attributes (presumably NanoAOD events built by coffea; confirm with caller).

    Returns
    -------
    dict
        ``"nevts"``: ``ak.num(events, axis=0)``;
        ``"_counter"``: sum of ``ak.count_nonzero`` over all listed branches.
    """
    # (collection, field) pairs, in the order in which they are read
    branches = (
        ("GenPart", "pt"),
        ("GenPart", "eta"),
        ("GenPart", "phi"),
        ("CorrT1METJet", "phi"),
        ("GenJet", "pt"),
        ("CorrT1METJet", "eta"),
        ("SoftActivityJet", "pt"),
        ("Jet", "eta"),
        ("Jet", "phi"),
        ("SoftActivityJet", "eta"),
        ("SoftActivityJet", "phi"),
        ("LHEPart", "eta"),
        ("LHEPart", "phi"),
        ("CorrT1METJet", "rawPt"),
        ("Jet", "btagDeepFlavB"),
        ("GenJet", "eta"),
        ("GenPart", "mass"),
        ("GenJet", "phi"),
        ("Jet", "puIdDisc"),
        ("CorrT1METJet", "muonSubtrFactor"),
        ("Jet", "btagDeepFlavCvL"),
        ("LHEPart", "mass"),
        ("LHEPart", "pt"),
        ("Jet", "btagDeepFlavQG"),
        ("Jet", "mass"),
        ("Jet", "pt"),
        ("GenPart", "pdgId"),
        ("Jet", "btagDeepFlavCvB"),
        ("Jet", "cRegCorr"),
        ("LHEPart", "incomingpz"),
    )

    # number of events along the outermost axis
    num_events = ak.num(events, axis=0)

    # touch every branch, keeping only an integer per branch
    _counter = 0
    for collection, field in branches:
        values = getattr(getattr(events, collection), field)
        _counter += ak.count_nonzero(values)

    return {"nevts": num_events, "_counter": _counter}

Just run over a single DY file here as an example (read through XCache rather than from a local copy).

In [None]:
# Build a one-file fileset from the JSON file list: the file at position 1 of the
# first dataset. A fileset pointing at a local file would instead look like:
# fileset = {"DY": {"files": {"0263846E-B57D-7E48-A80F-458F8445E6C6.root": "Events"}}}
import json

fname = "zstd_files.json"
with open(fname, "r") as fp:
    datasets = json.load(fp)

fileset = {}
# only the first dataset listed in the JSON is used
for dataset_name, file_list in list(datasets.items())[:1]:
    selected_files = {}
    # only the file at position 1 is used; nothing is selected if the list is shorter
    for dataset_fpath in list(file_list)[1:2]:
        # read via the xcache endpoint (direct alternative: root://xrootd-local.unl.edu/)
        xrd_fpath = f"root://xcache/{dataset_fpath}"
        # print the shell command that copies this file to /dev/null
        print(f"! xrdcp -f {xrd_fpath} /dev/null")
        # map the file URL to the "Events" object path
        selected_files[xrd_fpath] = "Events"
    fileset[dataset_name] = {"files": selected_files}
fileset

In [None]:
%%time
# pre-process
samples, _ = dataset_tools.preprocess(fileset, step_size=500_000,uproot_options={"allow_read_errors_with_report": True})

In [None]:
%%time
# create the task graph
tasks = dataset_tools.apply_to_fileset(task, samples, uproot_options={"allow_read_errors_with_report": True})

Execute the task graph.

In [None]:
%%time
# execute
t0 = time.perf_counter()
((out, report),) = dask.compute(tasks)  # feels strange that this is a tuple-of-tuple
t1 = time.perf_counter()

print(f"total time spent in uproot reading data: {ak.sum([v['duration'] for v in report.values()]):.2f} s")
print(f"wall time: {t1-t0:.2f}s")

In [None]:
# Display the computed results; the next cell indexes them by dataset name and then "nevts".
out

In [None]:
# Event rate = total number of processed events / wall time of the compute step.
# The dataset names in `out` come from zstd_files.json, so sum "nevts" over every
# dataset present instead of indexing a fixed "DY" key (which raises KeyError when
# the dataset is named differently). With a single "DY" dataset the value is unchanged.
total_events = sum(dataset_out["nevts"] for dataset_out in out.values())
event_rate = total_events / (t1 - t0)
print(f"event rate: {event_rate / 1_000:.2f} kHz")