In [None]:
# Fetch the JSON file list from the xrootd-local.unl.edu XRootD server into the
# working directory (-f forces overwriting an existing local copy). A later cell
# parses it as a mapping of dataset name -> list of file paths.
! xrdcp -f root://xrootd-local.unl.edu//store/user/IDAP/zstd_files.json zstd_files.json

In [None]:
from pathlib import Path

import awkward as ak
import dask
import dask_awkward as dak
import hist.dask
import coffea
import numpy as np
import uproot
from dask.distributed import Client, performance_report

from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
from coffea.analysis_tools import PackedSelection
from coffea import dataset_tools

from functools import partial
import os
import time
import warnings
    
# Execution backend used by the rest of the notebook: "dask" or "taskvine".
executor = "dask"

# Keep the output quiet: disable the NanoAODSchema notice about cross-references
# to branches we will not use here, and ignore all Python warnings.
NanoAODSchema.warn_missing_crossrefs = False
warnings.filterwarnings("ignore")

# Record the versions of the core libraries in the notebook output.
for lib_label, lib_module in (
    ("awkward", ak),
    ("dask-awkward", dak),
    ("uproot", uproot),
    ("hist", hist),
    ("coffea", coffea),
):
    print(f"{lib_label}: {lib_module.__version__}")

In [None]:
# Extra keyword arguments splatted into the coffea/dask calls further down.
# Stays empty for the plain dask executor; the taskvine branch adds a
# "scheduler" entry.
scheduler_options = {}

# for coffea-casa
if executor == "taskvine":
    from ndcctools.taskvine import DaskVine, Task


    # TaskVine manager; its name is derived from $USER ("noname" if unset).
    manager = DaskVine(port=8788, ssl=True, name=f"{os.environ.get('USER', 'noname')}-coffea-casa")
    manager.disable_peer_transfers()  # disable xfers between workers until figuring out docker routing

    # Defaults used when no access token is found on this machine.
    extra_files = {}
    env_vars = {}

    token_path = "/etc/cmsaf-secrets/access_token"
    if Path(token_path).is_file():
        local_token = manager.declare_file(token_path, cache=True)

        # Manually change mode of access token, as it needs to be 0600
        # Alternatively, we could copy it to execution directory and change the mode there.
        # The chmod runs as a mini-task whose output file "access_token" is what
        # gets attached to every task below.
        do_chmod = Task("chmod 0600 access_token")
        do_chmod.add_input(local_token, "access_token")
        token_file = manager.declare_minitask(do_chmod, "access_token", cache=True, peer_transfer=True)
        extra_files = {token_file: "access_token"}
        # Tasks locate the token through this environment variable.
        env_vars = {"BEARER_TOKEN_FILE": "access_token"}

    # Pre-bind the per-task settings so the result can be passed wherever a
    # dask `scheduler=` callable is accepted.
    vine_scheduler = partial(manager.get,
                             resources={"cores": 1, "disk": 2000},  # 1 core and a disk allowance of 2000 per task (presumably MB, i.e. ~2 GB; an earlier note said 5GB, which does not match this value -- TODO confirm)
                             extra_files=extra_files,
                             env_vars=env_vars,
                             submit_per_cycle=1000,
                             #  resources_mode=None,   # set to "fixed" to kill tasks on resources
                            )
    # change default scheduler
    scheduler_options['scheduler'] = vine_scheduler
else:
    # by default use dask
    # NOTE(review): Client is already imported in the imports cell; LocalCluster
    # is only needed by the commented-out local setup below, and progress is not
    # used in this cell.
    from dask.distributed import LocalCluster, Client, progress

    # Local alternative (single thread, single worker):
    # cluster = LocalCluster(n_workers=1, processes=False, threads_per_worker=1)
    # client = Client(cluster)
    # Connect to the scheduler listening on tls://localhost:8786.
    client = Client("tls://localhost:8786")

In [None]:
def task(events):
    """Touch a fixed set of branches and reduce each of them to an integer.

    Every listed branch is read and collapsed with ``ak.count_nonzero``; the
    counts are added into one running total so that only small integers are
    aggregated (avoids memory issues).

    Parameters
    ----------
    events
        Event collection exposing the collections and fields listed below as
        attributes.

    Returns
    -------
    dict
        ``{"nevts": <number of events along axis 0>, "_counter": <sum of the
        per-branch non-zero counts>}``.
    """
    # (collection, field) pairs, in the order in which they are accessed.
    branches = (
        ("GenPart", "pt"),
        ("GenPart", "eta"),
        ("GenPart", "phi"),
        ("CorrT1METJet", "phi"),
        ("GenJet", "pt"),
        ("CorrT1METJet", "eta"),
        ("SoftActivityJet", "pt"),
        ("Jet", "eta"),
        ("Jet", "phi"),
        ("SoftActivityJet", "eta"),
        ("SoftActivityJet", "phi"),
        ("LHEPart", "eta"),
        ("LHEPart", "phi"),
        ("CorrT1METJet", "rawPt"),
        ("Jet", "btagDeepFlavB"),
        ("GenJet", "eta"),
        ("GenPart", "mass"),
        ("GenJet", "phi"),
        ("Jet", "puIdDisc"),
        ("CorrT1METJet", "muonSubtrFactor"),
        ("Jet", "btagDeepFlavCvL"),
        ("LHEPart", "mass"),
        ("LHEPart", "pt"),
        ("Jet", "btagDeepFlavQG"),
        ("Jet", "mass"),
        ("Jet", "pt"),
        ("GenPart", "pdgId"),
        ("Jet", "btagDeepFlavCvB"),
        ("Jet", "cRegCorr"),
        ("LHEPart", "incomingpz"),
    )

    # track number of events
    num_events = ak.num(events, axis=0)

    # Plain attribute access (via getattr) is kept so that each branch is
    # resolved exactly as `events.<collection>.<field>` would be.
    _counter = 0
    for collection_name, field_name in branches:
        branch = getattr(getattr(events, collection_name), field_name)
        _counter += ak.count_nonzero(branch)

    return {"nevts": num_events, "_counter": _counter}

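Just run over a single file here as an example: the cell below picks one file from the first dataset listed in `zstd_files.json` and reads it through XCache (the commented-out `fileset` line shows the alternative of using a local DY file).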

In [None]:
import json

# Alternative: use a single local file instead of the remote file list, e.g.
# fileset = {"DY": {"files": {"0263846E-B57D-7E48-A80F-458F8445E6C6.root": "Events"}}}

fname = "zstd_files.json"
with open(fname, "r") as fp:
    datasets = json.load(fp)

# Keep only the first dataset and, within it, only the file at index 1.
fileset = {}
for dataset_name, file_list in list(datasets.items())[:1]:
    selected_files = {}
    for dataset_fpath in list(file_list)[1:2]:
        # Direct access would be f"root://xrootd-local.unl.edu/{dataset_fpath}";
        # the file is addressed through the xcache host instead.
        xrd_fpath = f"root://xcache/{dataset_fpath}"
        # Print a ready-to-paste command that copies the file to /dev/null.
        print(f"! xrdcp -f {xrd_fpath} /dev/null")
        # Long form of the value would be {"object_path": "Events"}.
        selected_files[xrd_fpath] = "Events"
    fileset[dataset_name] = {"files": selected_files}
fileset

In [None]:
%%time
# Pre-process the fileset. `samples` feeds the task-graph construction and
# `report` is inspected in the next cell for files that could not be read.
samples, report = dataset_tools.preprocess(
    fileset,
    skip_bad_files=True,
    uproot_options={"allow_read_errors_with_report": True},
    **scheduler_options,
)

In [None]:
# Flag every file for which pre-processing produced no steps, i.e. access failed.
for process, process_info in report.items():
    unreadable = [
        file_path
        for file_path, file_info in process_info["files"].items()
        if file_info["steps"] is None
    ]
    for file_path in unreadable:
        print(f"could not read {file_path}")

In [None]:
%%time
# Build the (lazy) task graph: apply `task` to every pre-processed sample.
# Read errors of the listed exception types are collected into a report
# instead of being raised.
read_error_types = (OSError, TypeError, KeyError)
tasks = dataset_tools.apply_to_fileset(
    task,
    samples,
    uproot_options={"allow_read_errors_with_report": read_error_types},
)

Execute the task graph.

In [None]:
%%time
# Execute the task graph and measure the wall time of dask.compute alone.
#
# dask.compute returns one result per positional argument, so the single
# `tasks` argument (an (out, report) pair) comes back wrapped in a 1-tuple.
#
# NOTE: this rebinds `report`; from here on it is the per-file read report of
# the computation, no longer the pre-processing report of the same name.
if executor == "taskvine":
    t0 = time.perf_counter()
    ((out, report),) = dask.compute(tasks, **scheduler_options)
    t1 = time.perf_counter()
else:
    with performance_report(filename="dask-report.html"):
        t0 = time.perf_counter()
        ((out, report),) = dask.compute(tasks, **scheduler_options)
        # Stop the clock before leaving the context: on exit performance_report
        # gathers the profile and writes dask-report.html, which is not part of
        # the workload and would otherwise inflate the measured wall time.
        t1 = time.perf_counter()

print(f"total time spent in uproot reading data: {ak.sum([v['duration'] for v in report.values()]):.2f} s")
print(f"wall time: {t1-t0:.2f}s")

In [None]:
# Wall-clock window of the compute step, taken from the execution cell.
elapsed = t1 - t0

# Event throughput: total processed events divided by the wall time.
event_rate = sum(result["nevts"] for result in out.values()) / elapsed
print(f"event rate: {event_rate / 1_000:.2f} kHz")

# need uproot>=5.3.2 to get these useful performance stats
requested_bytes = ak.sum([v['performance_counters']['num_requested_bytes'] for v in report.values()])
read_MB = requested_bytes / 1_000**2
rate_Mbs = read_MB / elapsed
# 200 Gbps = 200/8 GB/s = 25000 MB/s; the factor says how far the measured
# rate is from that target.
scale_to_target = 200 / 8 / rate_Mbs * 1000
print(f" - read {read_MB:.2f} MB in {elapsed:.2f} s -> {rate_Mbs:.2f} MBps (need to scale by x{scale_to_target:.0f} to reach 200 Gbps)")

In [None]:
# List every file whose read raised an exception, followed by the message.
for process, process_report in report.items():
    for file_report in process_report:
        if file_report.exception is None:
            continue
        # First call argument of the failed read, with surrounding quotes removed.
        print(file_report.args[0].strip("\'"))
        print(file_report.message + "\n")