In [133]:
# Fetch the JSON list of input files over XRootD; -f overwrites any existing local copy.
! xrdcp -f root://xrootd-local.unl.edu//store/user/IDAP/zstd_files.json zstd_files.json

240419 19:49:53 1059 cryptossl_X509CreateProxy: Your identity: /DC=ch/DC=cern/OU=Organic Units/OU=Users/CN=clundst/CN=514102/CN=Carl Lundstedt


In [153]:
from pathlib import Path

import awkward as ak
import dask
import dask_awkward as dak
import hist.dask
import coffea
import numpy as np
import uproot
from dask.distributed import Client, performance_report

from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
from coffea.analysis_tools import PackedSelection
from coffea import dataset_tools

from functools import partial
import os
import time
import warnings
    
# Which backend runs the task graph: "dask", "taskvine" or "dask_gateway".
executor = "dask_gateway"

# Silence all Python warnings, plus the NanoAODSchema notices about
# cross-references to branches that this notebook never uses.
warnings.filterwarnings("ignore")
NanoAODSchema.warn_missing_crossrefs = False

# Record the versions of the key libraries next to the results.
for label, module in (
    ("awkward", ak),
    ("dask-awkward", dak),
    ("uproot", uproot),
    ("hist", hist),
    ("coffea", coffea),
):
    print(f"{label}: {module.__version__}")

awkward: 2.6.3
dask-awkward: 2024.3.0
uproot: 5.3.2
hist: 2.7.2
coffea: 2024.4.0


In [154]:
# Extra keyword arguments forwarded to dataset_tools.preprocess / dask.compute.
# Only the TaskVine branch fills this in (with a custom scheduler callable).
scheduler_options = {}

# for coffea-casa
if executor == "taskvine":
    from ndcctools.taskvine import DaskVine, Task

    manager = DaskVine(port=8788, ssl=True, name=f"{os.environ.get('USER', 'noname')}-coffea-casa")
    manager.disable_peer_transfers()  # disable xfers between workers until figuring out docker routing

    extra_files = {}
    env_vars = {}

    token_path = "/etc/cmsaf-secrets/access_token"
    if Path(token_path).is_file():
        local_token = manager.declare_file(token_path, cache=True)

        # Manually change mode of access token, as it needs to be 0600
        # Alternatively, we could copy it to execution directory and change the mode there.
        do_chmod = Task("chmod 0600 access_token")
        do_chmod.add_input(local_token, "access_token")
        token_file = manager.declare_minitask(do_chmod, "access_token", cache=True, peer_transfer=True)
        extra_files = {token_file: "access_token"}
        env_vars = {"BEARER_TOKEN_FILE": "access_token"}

    vine_scheduler = partial(manager.get,
                             resources={"cores": 1, "disk": 2000},  # max 1 core and 2000 MB (~2 GB) of disk per task
                             extra_files=extra_files,
                             env_vars=env_vars,
                             submit_per_cycle=1000,
                             #  resources_mode=None,   # set to "fixed" to kill tasks on resources
                            )
    # change default scheduler
    scheduler_options['scheduler'] = vine_scheduler
elif executor == "dask_gateway":
    # Gateway was used without ever being imported, which raises NameError on
    # a fresh kernel; import it here, next to the other branch-local imports.
    from dask_gateway import Gateway
    from dask.distributed import LocalCluster, Client, progress

    num_workers = 200   # number of workers desired

    gateway = Gateway()
    clusters = gateway.list_clusters()
    if not clusters:
        # Fail with an actionable message instead of an IndexError on clusters[0].
        raise RuntimeError("no running dask-gateway cluster found; start one before running this cell")
    cluster = gateway.connect(clusters[0].name)
    client = cluster.get_client()
    cluster.scale(num_workers)

    def set_env(dask_worker):
        """Expose the uploaded access token to code running on one worker.

        Sets BEARER_TOKEN_FILE to ``<worker local directory>/access_token``
        and restricts that file to mode 0600. Executed on every worker through
        ``client.run``; presumably ``client.upload_file`` places the token in
        that directory (confirm against the deployment).
        """
        # Only the name ``Path`` is imported in this notebook, not ``pathlib``.
        path = str(Path(dask_worker.local_directory) / 'access_token')
        os.environ["BEARER_TOKEN_FILE"] = path
        os.chmod(path, 0o600)

    def try_xrdcp():
        """Copy the file list through xcache; not called in this cell (looks like a manual connectivity check)."""
        import subprocess
        subprocess.run(["xrdcp", "-f", "root://xcache//store/user/IDAP/zstd_files.json", "zstd_files.json"])

    # Block until every requested worker is up, so that the token upload and
    # the environment setup reach all of them.
    client.wait_for_workers(num_workers)
    client.upload_file("/etc/cmsaf-secrets/access_token")
    client.run(set_env)

else:
    # by default use dask
    # local: single thread, single worker
    from dask.distributed import LocalCluster, Client, progress

    # cluster = LocalCluster(n_workers=1, processes=False, threads_per_worker=1)
    # client = Client(cluster)
    client = Client("tls://localhost:8786")

In [155]:
def task(events):
    """Touch a fixed list of branches and reduce them to two summary values.

    Parameters
    ----------
    events
        Event collection exposing the GenPart, GenJet, Jet, CorrT1METJet,
        SoftActivityJet and LHEPart collections as attributes.

    Returns
    -------
    dict
        ``"nevts"``: ``ak.num(events, axis=0)``.
        ``"_counter"``: count of non-zero entries of the per-event sum of
        non-zero counts over all listed branches.
    """
    # track number of events
    n_events = ak.num(events, axis=0)

    branches = (
        events.GenPart.pt,
        events.GenPart.eta,
        events.GenPart.phi,
        events.CorrT1METJet.phi,
        events.GenJet.pt,
        events.CorrT1METJet.eta,
        events.SoftActivityJet.pt,
        events.Jet.eta,
        events.Jet.phi,
        events.SoftActivityJet.eta,
        events.SoftActivityJet.phi,
        events.LHEPart.eta,
        events.LHEPart.phi,
        events.CorrT1METJet.rawPt,
        events.Jet.btagDeepFlavB,
        events.GenJet.eta,
        events.GenPart.mass,
        events.GenJet.phi,
        events.Jet.puIdDisc,
        events.CorrT1METJet.muonSubtrFactor,
        events.Jet.btagDeepFlavCvL,
        events.LHEPart.mass,
        events.LHEPart.pt,
        events.Jet.btagDeepFlavQG,
        events.Jet.mass,
        events.Jet.pt,
        events.GenPart.pdgId,
        events.Jet.btagDeepFlavCvB,
        events.Jet.cRegCorr,
        events.LHEPart.incomingpz,
    )

    # Every branch is collapsed to integer counts before it is accumulated,
    # so that the branch contents themselves never have to be kept around.
    running_total = 0
    for branch in branches:
        per_event = ak.count_nonzero(branch, axis=1)

        # branches with more than two dimensions (per event) are reduced further
        for _ in range(per_event.ndim - 1):
            per_event = ak.count_nonzero(per_event, axis=-1)

        # add the one-dimensional array built from this branch to the total
        running_total = running_total + per_event

    # final reduction to a single integer
    n_nonzero = ak.count_nonzero(running_total, axis=0)

    return {"nevts": n_events, "_counter": n_nonzero}

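Build the fileset from the JSON file list, with every file read through the XCache redirector. (A single local DY file can be used instead as a quick example.)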

In [156]:
# Single-file alternative for quick local tests:
# fileset = {"DY": {"files": {"0263846E-B57D-7E48-A80F-458F8445E6C6.root": "Events"}}}
import json

fname = "zstd_files_smallish.json"
with open(fname, 'r') as fp:
    files_by_dataset = json.load(fp)

# Map every dataset name to {"files": {<xrootd url>: "Events"}}.
fileset = {}
for dataset_name, file_list in files_by_dataset.items():
    dataset_files = {}
    for dataset_fpath in file_list:
        # Read through the xcache redirector; the direct endpoint would be
        # f"root://xrootd-local.unl.edu/{dataset_fpath}".
        xrd_fpath = f"root://xcache/{dataset_fpath}"
        # Print a ready-to-paste shell command that copies the file to /dev/null.
        print(f"! xrdcp -f {xrd_fpath} /dev/null")
        dataset_files[xrd_fpath] = "Events"
    fileset[dataset_name] = {"files": dataset_files}
fileset

! xrdcp -f root://xcache//store/user/IDAP/RunIISummer20UL17NanoAODv9/TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8/NANOAODSIM/20UL17JMENano_106X_mc2017_realistic_v9-v1/230000/D237194D-005E-2745-A5BC-C9E410C2A8EB.root /dev/null
! xrdcp -f root://xcache//store/user/IDAP/RunIISummer20UL17NanoAODv9/TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8/NANOAODSIM/20UL17JMENano_106X_mc2017_realistic_v9-v1/2530000/6C68E715-D429-9F49-8592-A3776F9157B6.root /dev/null
! xrdcp -f root://xcache//store/user/IDAP/RunIISummer20UL17NanoAODv9/TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8/NANOAODSIM/20UL17JMENano_106X_mc2017_realistic_v9-v1/2530000/023B10FE-BAFD-D347-BC33-040D9BE19822.root /dev/null
! xrdcp -f root://xcache//store/user/IDAP/RunIISummer20UL17NanoAODv9/TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8/NANOAODSIM/20UL17JMENano_106X_mc2017_realistic_v9-v1/2530000/58328AB5-3955-7C45-B174-C66DB804CE5F.root /dev/null
! xrdcp -f root://xcache//store/user/IDAP/RunIISummer20UL17NanoAODv9/TTToSemiLeptonic_Tun

{'/TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-20UL18JMENano_106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM': {'files': {'root://xcache//store/user/IDAP/RunIISummer20UL17NanoAODv9/TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8/NANOAODSIM/20UL17JMENano_106X_mc2017_realistic_v9-v1/230000/D237194D-005E-2745-A5BC-C9E410C2A8EB.root': 'Events',
   'root://xcache//store/user/IDAP/RunIISummer20UL17NanoAODv9/TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8/NANOAODSIM/20UL17JMENano_106X_mc2017_realistic_v9-v1/2530000/6C68E715-D429-9F49-8592-A3776F9157B6.root': 'Events',
   'root://xcache//store/user/IDAP/RunIISummer20UL17NanoAODv9/TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8/NANOAODSIM/20UL17JMENano_106X_mc2017_realistic_v9-v1/2530000/023B10FE-BAFD-D347-BC33-040D9BE19822.root': 'Events',
   'root://xcache//store/user/IDAP/RunIISummer20UL17NanoAODv9/TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8/NANOAODSIM/20UL17JMENano_106X_mc2017_realistic_v9-v1/2530000/58328AB5-3955

In [157]:
%%time
# Pre-process the fileset: bad files are skipped and read errors are
# reported rather than raised.
samples, report = dataset_tools.preprocess(
    fileset,
    skip_bad_files=True,
    uproot_options={"allow_read_errors_with_report": True},
    **scheduler_options,
)

CPU times: user 932 ms, sys: 21.4 ms, total: 953 ms
Wall time: 27.9 s


In [158]:
# List every file for which pre-processing produced no steps, i.e. access did not work.
for process_report in report.values():
    for file_path, file_info in process_report["files"].items():
        if file_info["steps"] is None:
            print(f"could not read {file_path}")

In [162]:
%%time
# Create the task graph: apply `task` to every pre-processed sample.
# OSError, TypeError and KeyError raised while reading are collected in a
# report instead of aborting the computation.
tasks = dataset_tools.apply_to_fileset(
    task,
    samples,
    uproot_options={"allow_read_errors_with_report": (OSError, TypeError, KeyError)},
)

CPU times: user 2.25 s, sys: 23.1 ms, total: 2.27 s
Wall time: 3.68 s


Execute the task graph.

In [None]:
%%time
# Execute the task graph and measure the wall time around it.
t0 = time.perf_counter()

if executor != "taskvine":
    # Distributed run: also write a dask performance report to disk.
    with performance_report(filename="dask-report.html"):
        # dask.compute returns a 1-tuple that holds the (out, report) pair
        ((out, report),) = dask.compute(tasks, **scheduler_options)
else:
    ((out, report),) = dask.compute(tasks, **scheduler_options)
t1 = time.perf_counter()

# Sum of the per-file read durations recorded in the report.
uproot_read_seconds = ak.sum([v['duration'] for v in report.values()])
print(f"total time spent in uproot reading data: {uproot_read_seconds:.2f} s")
print(f"wall time: {t1-t0:.2f}s")

In [None]:
# Throughput summary for the run timed between t0 and t1.
elapsed_s = t1 - t0

total_events = sum(result["nevts"] for result in out.values())
event_rate = total_events / elapsed_s
print(f"event rate: {event_rate / 1_000:.2f} kHz")

# need uproot>=5.3.2 to get these useful performance stats
requested_bytes = ak.sum([v['performance_counters']['num_requested_bytes'] for v in report.values()])
read_MB = requested_bytes / 1_000**2
rate_Mbs = read_MB / elapsed_s  # megabytes per second, despite the name
# 200 Gbps / 8 = 25 GB/s = 25000 MB/s; the scale factor is that target over the measured rate
print(f" - read {read_MB:.2f} MB in {elapsed_s:.2f} s -> {rate_Mbs:.2f} MBps (need to scale by x{200/8/rate_Mbs*1000:.0f} to reach 200 Gbps)")

In [None]:
# Print the file and error message for every read that raised an exception.
for process, process_report in report.items():
    n_files = len(process_report.exception)
    for i_file in range(n_files):
        file_report = process_report[i_file]
        if file_report.exception is None:
            continue  # this file was read without problems
        # the first argument is the file name; strip its surrounding quotes
        print(file_report.args[0].strip("\'"))
        print(file_report.message + "\n")