In [None]:
import datetime
import glob
import json
import os
from collections import defaultdict
from pathlib import Path
import traceback
import time
import warnings
import copy
import pathlib

import awkward as ak
import dask
import dask_awkward as dak
import hist.dask
import coffea
import numpy as np
import uproot
from dask.distributed import Client
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.style.use("ggplot")

from coffea.nanoevents import NanoEventsFactory, PHYSLITESchema
from coffea import dataset_tools

import utils
import warnings

warnings.filterwarnings("ignore")

import input_files.utils

from dask.distributed import LocalCluster, Client, progress, performance_report

# local: single thread, single worker
# cluster = LocalCluster(n_workers=1, processes=False, threads_per_worker=1)
# client = Client(cluster)

# for UChicago
# update this to point to your own client!
client = Client("tcp://dask-alheld-fa8ea830-2.af-jupyter:8786")

# create a folder for output tracking of uproot.open setup
# (one timestamped sub-folder of "measurements" per run of this cell)
MEASUREMENT_PATH = pathlib.Path("measurements") / datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
MEASUREMENT_PATH.mkdir(parents=True)

# print the versions of the core libraries used in this notebook
for _label, _module in (
    ("awkward", ak),
    ("dask-awkward", dak),
    ("uproot", uproot),
    ("hist", hist),
    ("coffea", coffea),
):
    print(f"{_label}: {_module.__version__}")

In [None]:
# -------------------
# INPUT CONFIGURATION
# -------------------
# modify this to change how many files are being processed
# top-level processes determine containers/DSIDs, which each have some number of files
# full list is list(find_containers.container_dict.keys()) + ["data15_13TeV", "data16_13TeV", "data17_13TeV", "data18_13TeV"]

PROCESSES_TO_USE = ["ttbar"]  # 6.7 TB
# PROCESSES_TO_USE = ["db", "zjets", "wjets", "ttV", "othertop", "ttbar"]  # all simulation, 48.4 TB
# PROCESSES_TO_USE = ["db", "zjets", "wjets", "ttV", "othertop", "ttbar", "data15_13TeV", "data16_13TeV", "data17_13TeV", "data18_13TeV"]  # 191 TB
# PROCESSES_TO_USE = ["db", "zjets", "wjets", "ttV", "ttbar", "data15_13TeV", "data16_13TeV", "data17_13TeV", "data18_13TeV"]  # 187 TB

# build the fileset for the selected processes; the max_* arguments cap how much is picked up
# (None = no cap)
fileset = input_files.utils.get_fileset(
    PROCESSES_TO_USE,
    max_files_per_container=1,
    max_containers_per_dsid=None,
    max_dsid_per_process=None,
)

# example for how to veto files
# files_to_veto = [("root://192.170.240.148//root://fax.mwt2.org:1094//pnfs/uchicago.edu/atlaslocalgroupdisk/rucio/mc20_13TeV/f5/99/DAOD_PHYSLITE.37230013._001196.pool.root.1", "CollectionTree")]
# fileset = dataset_tools.filter_files(fileset, lambda x: x in files_to_veto)

# duplicate each entry in the fileset
# print("DUPLICATING FILESET CONTENT")
# fileset_with_duplicates = copy.deepcopy(fileset)
# for k, v in fileset.items():
#     fileset_with_duplicates.update({f"{k}-duplicate": dict(v)})
# fileset = fileset_with_duplicates

# store the fileset next to the measurement and report how many files it holds
utils.save_fileset(fileset, MEASUREMENT_PATH)
num_files_total = sum(len(entry['files']) for entry in fileset.values())
print(f"total number of files (including duplicates): {num_files_total}")

In [None]:
# check for files not yet replicated to MWT2
# (a file counts as "at MWT2" when its name contains the substring "mwt2")
file_names = [file for process in fileset.keys() for file in fileset[process]["files"]]
files_at_mwt2 = sum(1 for file in file_names if "mwt2" in file)
files_elsewhere = len(file_names) - files_at_mwt2

print(f"files at MWT2: {files_at_mwt2}, elsewhere: {files_elsewhere}")

## Dask distributing `uproot.open`

In [None]:
# turn fileset into simple list of files to run over
# (concatenates the "files" entries of all processes, in fileset order)
all_files = [fname for process in fileset for fname in fileset[process]["files"]]

# define work to be done
def uproot_open_materialize(fname):
    """Open one file with uproot, fully read a fixed list of branches and report I/O metrics.

    Every branch in BRANCH_LIST is read from the "CollectionTree" tree via ``.array()``.
    Errors are not raised: they are captured and returned as a formatted traceback, so a
    single bad file does not abort the whole run.

    Args:
        fname: path / URL of the file handed to ``uproot.open``.

    Returns:
        dict with the keys
        "fname" (the input), "read" (``num_requested_bytes`` of the file source),
        "uncompressed" (sum of ``uncompressed_bytes`` over the branches read),
        "num_entries" (entries in CollectionTree), "runtime" (seconds, measured with
        ``time.perf_counter``), "time_finished" (``datetime.datetime.now()`` at the end) and
        "exception" (None on success, traceback string on failure). On failure "read",
        "uncompressed" and "num_entries" are all 0.
    """
    # ~9%, around 400 Mbps single core, ~150 Mbps with 100 workers
    # BRANCH_LIST = [
    #     'PrimaryVerticesAuxDyn.z',
    #     'PrimaryVerticesAuxDyn.x',
    #     'PrimaryVerticesAuxDyn.y',
    #     'AnalysisJetsAuxDyn.Timing',
    #     'AnalysisJetsAuxDyn.JetConstitScaleMomentum_phi',
    #     'AnalysisJetsAuxDyn.DetectorEta',
    #     'AnalysisJetsAuxDyn.ActiveArea4vec_eta',
    #     'AnalysisJetsAuxDyn.JetConstitScaleMomentum_eta',
    #     'AnalysisJetsAuxDyn.phi',
    #     'AnalysisJetsAuxDyn.m',
    #     'AnalysisJetsAuxDyn.JetConstitScaleMomentum_pt',
    #     'AnalysisJetsAuxDyn.ActiveArea4vec_phi',
    #     'AnalysisJetsAuxDyn.JetConstitScaleMomentum_m',
    #     'AnalysisJetsAuxDyn.ActiveArea4vec_m',
    #     'AnalysisJetsAuxDyn.pt',
    #     'AnalysisJetsAuxDyn.Width',
    #     'AnalysisJetsAuxDyn.EMFrac',
    #     'AnalysisJetsAuxDyn.ActiveArea4vec_pt',
    #     'AnalysisJetsAuxDyn.PSFrac'
    # ]

    # ~15%, around 300 Mbps single core, ~130 Mbps with 100 workers
    BRANCH_LIST = [
         'PrimaryVerticesAuxDyn.z',
         'PrimaryVerticesAuxDyn.x',
         'PrimaryVerticesAuxDyn.y',
         'AnalysisJetsAuxDyn.Timing',
         'AnalysisJetsAuxDyn.JetConstitScaleMomentum_phi',
         'AnalysisJetsAuxDyn.DetectorEta',
         'AnalysisJetsAuxDyn.ActiveArea4vec_eta',
         'AnalysisJetsAuxDyn.JetConstitScaleMomentum_eta',
         'AnalysisJetsAuxDyn.phi',
         'AnalysisJetsAuxDyn.m',
         'AnalysisJetsAuxDyn.JetConstitScaleMomentum_pt',
         'AnalysisJetsAuxDyn.ActiveArea4vec_phi',
         'AnalysisJetsAuxDyn.JetConstitScaleMomentum_m',
         'AnalysisJetsAuxDyn.ActiveArea4vec_m',
         'AnalysisJetsAuxDyn.pt',
         'AnalysisJetsAuxDyn.Width',
         'AnalysisJetsAuxDyn.EMFrac',
         'AnalysisJetsAuxDyn.ActiveArea4vec_pt',
         'AnalysisJetsAuxDyn.PSFrac',
         'AnalysisJetsAuxDyn.JVFCorr',
         'AnalysisJetsAuxDyn.DFCommonJets_QGTagger_TracksC1',
         'AnalysisJetsAuxDyn.eta',
         'AnalysisPhotonsAuxDyn.topoetcone40_CloseByCorr',
         'AnalysisPhotonsAuxDyn.topoetcone40',
         'AnalysisPhotonsAuxDyn.eta',
         'AnalysisJetsAuxDyn.DFCommonJets_fJvt',
         'AnalysisPhotonsAuxDyn.phi',
         'AnalysisPhotonsAuxDyn.topoetcone20_CloseByCorr',
         'AnalysisPhotonsAuxDyn.topoetcone40ptCorrection',
         'AnalysisPhotonsAuxDyn.topoetcone20ptCorrection',
         'AnalysisPhotonsAuxDyn.pt',
         'AnalysisJetsAuxDyn.DFCommonJets_QGTagger_NTracks',
         'AnalysisJetsAuxDyn.DFCommonJets_QGTagger_TracksWidth',
         'AnalysisJetsAuxDyn.GhostMuonSegmentCount',
         'AnalysisPhotonsAuxDyn.topoetcone20',
         'AnalysisPhotonsAuxDyn.f1',
         'AnalysisPhotonsAuxDyn.DFCommonPhotonsIsEMTightIsEMValue',
         'AnalysisPhotonsAuxDyn.ptcone20_CloseByCorr',
         'AnalysisPhotonsAuxDyn.OQ',
         'AnalysisPhotonsAuxDyn.ptcone20',
         'AnalysisTauJetsAuxDyn.RNNJetScore',
         'AnalysisTauJetsAuxDyn.JetDeepSetScore',
         'AnalysisTauJetsAuxDyn.etaTauEnergyScale',
         'AnalysisTauJetsAuxDyn.etaFinalCalib',
         'AnalysisTauJetsAuxDyn.RNNEleScoreSigTrans_v1'
    ]

    # membership test against a set instead of scanning the list for every candidate name
    branches_to_keep = frozenset(BRANCH_LIST)
    filter_name = lambda x: x in branches_to_keep

    size_uncompressed = 0
    t0 = time.perf_counter()
    try:
        with uproot.open(fname, filter_name=filter_name) as f:
            tree = f["CollectionTree"]  # look the tree up once and reuse it below
            num_entries = tree.num_entries

            # iterate approach
            # for _ in tree.iterate(expressions=BRANCH_LIST):
            #     pass

            # branch loop approach
            for b in BRANCH_LIST:
                branch = tree[b]
                branch.array()  # result is discarded, only the read itself matters here
                size_uncompressed += branch.uncompressed_bytes

            size_read = f.file.source.num_requested_bytes
        exception = None

    except Exception:
        # Exception rather than a bare except: KeyboardInterrupt / SystemExit / cancellation
        # must still propagate instead of being reported as a "failed file"
        num_entries = 0
        size_read = 0
        size_uncompressed = 0
        exception = traceback.format_exc()

    t1 = time.perf_counter()
    time_finished = datetime.datetime.now()
    return {"fname": fname, "read": size_read, "uncompressed": size_uncompressed, "num_entries": num_entries,
            "runtime": t1-t0, "time_finished": time_finished, "exception": exception}

In [None]:
# perform computation
print(f"running with {len(all_files)} files")
# scattered_data = client.scatter([f for f in all_files])  # instead of submitting (possibly big) object directly

utils.start_tracking_workers(client, MEASUREMENT_PATH)  # track worker count in background
try:
    with performance_report(filename=MEASUREMENT_PATH/"dask-report-plain-uproot.html"):
        # futures = client.map(uproot_open_materialize, scattered_data)
        # out = ak.Array([r for r in client.gather(iter(futures))])

        tasks = [dask.delayed(uproot_open_materialize)(f) for f in all_files]  # create tasks
        t0 = time.perf_counter()
        out = ak.Array(dask.compute(*tasks))  # perform computations
        t1 = time.perf_counter()
finally:
    # stop the background tracking even when the computation fails or is interrupted,
    # otherwise it would keep running after an aborted cell
    utils.stop_tracking_workers()

# only reached on success: t0 / t1 bracket the dask.compute call alone
print(f"wall clock time: {t1-t0:.2f}s")
utils.save_measurement(out, t0, t1, MEASUREMENT_PATH)

While waiting, check out the XCache output: https://grafana.mwt2.org/d/EKefjM-Sz/af-network-200gbps-challenge?orgId=1&viewPanel=205&from=now-30m&to=now

In [None]:
# when repeating plots for old measurements, set appropriate MEASUREMENT_PATH here
# MEASUREMENT_PATH = pathlib.Path("measurements/2024-05-09_20-39-29")

# read back worker count info and the saved task results / timing
timestamps, nworkers, avg_num_workers = utils.get_timestamps_and_counts(MEASUREMENT_PATH)
out, t0, t1 = utils.load_measurement(MEASUREMENT_PATH)

wall_time = t1 - t0  # wall clock duration of the compute call

# data volume (decimal GB) and average rate
read_GB = sum(out['read']) / 1000**3
uncompressed_GB = sum(out['uncompressed']) / 1000**3
print(f"total read (compressed): {read_GB:.2f} GB")
print(f"total read (uncompressed): {uncompressed_GB:.2f} GB")

rate_Gbps = read_GB*8/wall_time
scale_to_200 = 200/rate_Gbps
print(f"average data rate: {rate_Gbps:.2f} Gbps (need to scale by x{scale_to_200:.1f} to reach 200 Gbps)")

# event throughput relative to wall clock time
n_evts = sum(out["num_entries"])
print(f"total event rate (wall clock time): {n_evts / wall_time / 1000:.2f} kHz (processed {n_evts} events total)")

# time spent inside the task function, summed over all tasks
total_runtime = sum(out["runtime"])
runtime_over_wall = total_runtime / wall_time
print(f"total aggregated runtime in function: {total_runtime:.2f} s")
print(
    f"ratio total runtime / wall clock time: {runtime_over_wall:.2f} "
    "(should match # cores without overhead / scheduling issues)"
)
print(f"time-averaged number of workers: {avg_num_workers:.1f}")
print(f"\"efficiency\" (ratio of two numbers above): {runtime_over_wall / avg_num_workers:.1%}")
print(f"event rate (aggregated time spent in function): {n_evts / total_runtime / 1000:.2f} kHz")

In [None]:
# get arrays for starting time, runtime and end time of all tasks
# (a start time is the recorded end time minus the measured runtime)
ends = out["time_finished"].to_numpy()
runtimes = np.asarray([datetime.timedelta(seconds=t) for t in out["runtime"]], dtype=np.timedelta64)
starts = ends - runtimes

# calculate instantaneous rates for given timestamp
times_for_rates = list(timestamps[::30])  # only calculate every 30 seconds
instantaneous_rates = []
for t in times_for_rates:
    # tasks running at this timestamp
    running = out[(starts <= t) & (t <= ends)]
    # each running task contributes its own average rate (bytes read / runtime), in Gbps
    instantaneous_rates.append(sum(running['read']*8 / 1000**3 / running["runtime"]))

utils.plot_worker_count(timestamps, nworkers, avg_num_workers, times_for_rates, instantaneous_rates, MEASUREMENT_PATH)

In [None]:
# a task failed when it reported an exception (traceback) instead of None
num_failed = sum(exc is not None for exc in out['exception'])
print(f"{num_failed} files failed\n")

# use below to get full list with details
# for report in out:
    # if report["exception"] is not None:
        # print(f"{report['fname']} failed in {report['runtime']:.2f} s\n{report['exception']}\n")

In [None]:
# runtime distribution for all files
fig, ax = plt.subplots()

# 100 bin edges from zero to slightly above the longest runtime
longest_runtime = max(out["runtime"])
bins = np.linspace(0, longest_runtime*1.01, 100)
ax.hist(out["runtime"], bins=bins)

ax.set_xlabel("runtime [s]")
ax.set_ylabel("count")
_, x_upper = ax.get_xlim()
ax.set_xlim([0, x_upper])  # start the axis at zero, keep the current upper limit
ax.semilogy()
fig.savefig(MEASUREMENT_PATH / "runtime_distribution.pdf")

In [None]:
# runtime vs number of events in file
fig, ax = plt.subplots()
n_events_per_file = out["num_entries"]
runtime_per_file = out["runtime"]
ax.scatter(n_events_per_file, runtime_per_file, marker="x")
ax.set(xlabel="number of events", ylabel="runtime [s]")
fig.savefig(MEASUREMENT_PATH / "runtime_vs_nevts.pdf")

## Using coffea 2024

In [None]:
# Branches to materialize in the coffea-based benchmark.
# Each entry has the form "<container>.<property>"; materialize_branches splits on the "."
# and strips "Analysis" / "AuxDyn" from the container part to get the field names to access.
# The same names are also used for the uproot filter_name option when building the task graph.
BRANCH_LIST = [
    'PrimaryVerticesAuxDyn.z',
    'PrimaryVerticesAuxDyn.x',
    'PrimaryVerticesAuxDyn.y',
    'AnalysisJetsAuxDyn.Timing',
    'AnalysisJetsAuxDyn.JetConstitScaleMomentum_phi',
    'AnalysisJetsAuxDyn.DetectorEta',
    'AnalysisJetsAuxDyn.ActiveArea4vec_eta',
    'AnalysisJetsAuxDyn.JetConstitScaleMomentum_eta',
    'AnalysisJetsAuxDyn.phi',
    'AnalysisJetsAuxDyn.m',
    'AnalysisJetsAuxDyn.JetConstitScaleMomentum_pt',
    'AnalysisJetsAuxDyn.ActiveArea4vec_phi',
    'AnalysisJetsAuxDyn.JetConstitScaleMomentum_m',
    'AnalysisJetsAuxDyn.ActiveArea4vec_m',
    'AnalysisJetsAuxDyn.pt',
    'AnalysisJetsAuxDyn.Width',
    'AnalysisJetsAuxDyn.EMFrac',
    'AnalysisJetsAuxDyn.ActiveArea4vec_pt',
    'AnalysisJetsAuxDyn.PSFrac',
    'AnalysisJetsAuxDyn.JVFCorr',
    'AnalysisJetsAuxDyn.DFCommonJets_QGTagger_TracksC1',
    'AnalysisJetsAuxDyn.eta',
    'AnalysisPhotonsAuxDyn.topoetcone40_CloseByCorr',
    'AnalysisPhotonsAuxDyn.topoetcone40',
    'AnalysisPhotonsAuxDyn.eta',
    'AnalysisJetsAuxDyn.DFCommonJets_fJvt',
    'AnalysisPhotonsAuxDyn.phi',
    'AnalysisPhotonsAuxDyn.topoetcone20_CloseByCorr',
    'AnalysisPhotonsAuxDyn.topoetcone40ptCorrection',
    'AnalysisPhotonsAuxDyn.topoetcone20ptCorrection',
    'AnalysisPhotonsAuxDyn.pt',
    'AnalysisJetsAuxDyn.DFCommonJets_QGTagger_NTracks',
    'AnalysisJetsAuxDyn.DFCommonJets_QGTagger_TracksWidth',
    'AnalysisJetsAuxDyn.GhostMuonSegmentCount',
    'AnalysisPhotonsAuxDyn.topoetcone20',
    'AnalysisPhotonsAuxDyn.f1',
    'AnalysisPhotonsAuxDyn.DFCommonPhotonsIsEMTightIsEMValue',
    'AnalysisPhotonsAuxDyn.ptcone20_CloseByCorr',
    'AnalysisPhotonsAuxDyn.OQ',
    'AnalysisPhotonsAuxDyn.ptcone20',
    'AnalysisTauJetsAuxDyn.RNNJetScore',
    'AnalysisTauJetsAuxDyn.JetDeepSetScore',
    'AnalysisTauJetsAuxDyn.etaTauEnergyScale',
    'AnalysisTauJetsAuxDyn.etaFinalCalib',
    'AnalysisTauJetsAuxDyn.RNNEleScoreSigTrans_v1'
]


def materialize_branches(events):
    """Touch every branch in BRANCH_LIST and reduce the result to two small numbers.

    Each branch is reduced with ``ak.count_nonzero`` so that only integers need to be
    aggregated (avoids memory issues); the values themselves have no physics meaning.
    According to the original note this reads around 25% of the data files.

    Args:
        events: event array from which the branches are accessed as ``events[object, property]``.

    Returns:
        dict with "nevts" (``ak.num(events, axis=0)``) and "_counter" (the reduced
        per-branch counts, collapsed with a final ``ak.count_nonzero(..., axis=0)``).
    """
    num_events = ak.num(events, axis=0)  # track number of events

    running_total = 0
    for branch in BRANCH_LIST:
        # "AnalysisJetsAuxDyn.pt" -> object "Jets", property "pt"
        container, obj_prop = branch.split(".")
        obj_name = container.replace("Analysis", "").replace("AuxDyn", "")

        branch_data = events[obj_name, obj_prop]
        if "Link" in obj_prop:
            # link-type properties: use their "m_persIndex" field
            branch_data = branch_data["m_persIndex"]

        # reduce the innermost axis first ...
        per_event = ak.count_nonzero(branch_data, axis=-1)
        # ... then keep reducing until one value per event is left
        n_extra_reductions = per_event.ndim - 1
        for _ in range(n_extra_reductions):
            per_event = ak.count_nonzero(per_event, axis=-1)

        # sum 1-dim array built from new branch
        running_total = running_total + per_event

    # reduce to int
    return {"nevts": num_events, "_counter": ak.count_nonzero(running_total, axis=0)}

In [None]:
%%time
# pre-process
with performance_report(filename="dask-report-preprocess.html"):
    samples, report = dataset_tools.preprocess(
        fileset,
        skip_bad_files=True,
        uproot_options={"allow_read_errors_with_report": True},
    )

# find issues where access did not work
# (files whose "steps" entry in the report is None)
for process in report:
    for fname, file_info in report[process]["files"].items():
        if file_info["steps"] is not None:
            continue
        print(f"could not read {fname}")

# convert to dict (no defaultdict left)
samples = {name: dict(info) for name, info in samples.items()}

# save pre-processing output to json
with open("preprocessed_files.json", "w") as f:
    json.dump(samples, f, sort_keys=True, indent=4)

In [None]:
# load the pre-processing results: useful to skip repeated pre-processing
# (reads the JSON file written by the pre-processing cell)
samples = json.loads(pathlib.Path("preprocessed_files.json").read_text())

In [None]:
%%time
# create the task graph
# filter_name seems to not do anything here in terms of performance
filter_name = lambda name: name in BRANCH_LIST

# options handed to uproot: report these error types instead of raising, restrict branches
uproot_options = {
    "allow_read_errors_with_report": (OSError, TypeError, KeyError),
    "filter_name": filter_name,
}
tasks = dataset_tools.apply_to_fileset(
    materialize_branches,
    samples,
    uproot_options=uproot_options,
    schemaclass=PHYSLITESchema,
)

In [None]:
%%time
# execute task graph
utils.start_tracking_workers(client, MEASUREMENT_PATH)  # track worker count in background
t0 = time.perf_counter()
try:
    with performance_report(filename=MEASUREMENT_PATH/"dask-report-compute.html"):
        ((out, report),) = dask.compute(tasks)  # feels strange that this is a tuple-of-tuple
    t1 = time.perf_counter()
finally:
    # stop the background tracking even when the computation fails or is interrupted,
    # otherwise it would keep running after an aborted cell
    utils.stop_tracking_workers()

# sum of the per-process "duration" entries of the report
time_uproot = ak.sum([v['duration'] for v in report.values()])
print(f"total time spent in uproot reading data: {time_uproot:.2f} s")
print(f"wall time: {t1-t0:.2f}s")

timestamps, nworkers, avg_num_workers = utils.get_timestamps_and_counts(MEASUREMENT_PATH)  # worker count info

In [None]:
print(f"output: {out}")

print("\nperformance metrics:")

# events per second of wall time, summed over all processes
total_events = sum(out[process]["nevts"] for process in out)
event_rate = total_events / (t1-t0)
print(f" - event rate: {event_rate / 1_000:.2f} kHz")

# bytes requested according to the per-process performance counters in the report
requested_bytes_per_process = [
    report[process]["performance_counters"]["num_requested_bytes"] for process in out.keys()
]
num_bytes = ak.sum(requested_bytes_per_process)
read_MB = num_bytes / 1_000**2
rate_Mbs = read_MB / (t1-t0)  # MB per second
print(f" - read {read_MB:.2f} MB in {t1-t0:.2f} s -> {rate_Mbs/1000*8:.2f} Gbps (need to scale by x{200/8/rate_Mbs*1000:.1f} to reach 200 Gbps)")
print(f" - time-averaged number of workers: {avg_num_workers:.1f}")

efficiency = time_uproot / (t1-t0) / avg_num_workers
print(f" - spent {time_uproot:.1f} s reading data with wall time {t1-t0:.2f} and {avg_num_workers:.1f} cores on average -> \"efficiency\": {efficiency:.1%}")

# worker count plot without instantaneous rates, written to the current directory
utils.plot_worker_count(timestamps, nworkers, avg_num_workers, [], [], pathlib.Path("."))

In [None]:
# report problematic files that caused exceptions
for process in report.keys():
    process_report = report[process]
    for i_file in range(len(process_report.exception)):
        file_report = process_report[i_file]
        if file_report.exception is None:
            continue  # this entry carries no exception
        # first argument with surrounding single quotes stripped, followed by the message
        print(file_report.args[0].strip("\'"))
        print(file_report.message + "\n")

In [None]:
# sanity check that the right columns are being touched
# dak.report_necessary_columns(tasks)

In [None]:
# if issues with files exist, paste in path and reproduce
# fname = "root://192.170.240.148//root://fax.mwt2.org:1094//pnfs/uchicago.edu/atlaslocalgroupdisk/rucio/mc20_13TeV/f5/99/DAOD_PHYSLITE.37230013._001196.pool.root.1"
# treename = "CollectionTree"
# events = NanoEventsFactory.from_root({fname: treename}, schemaclass=PHYSLITESchema).events()
# task = materialize_branches(events)
# task["_counter"].compute()

## Dask distributing `xrdcp`

In [None]:
RUN_XRDCP = False  # off by default

# distribute `xrdcp` with Dask
def run_xrdcp(fname):
    """Copy one file to /dev/null with `xrdcp` and time the transfer.

    Args:
        fname: path or URL handed to `xrdcp` as the copy source.

    Returns:
        dict with "runtime" (seconds spent in the copy, measured with ``time.perf_counter``),
        "time_finished" (``datetime.datetime.now()`` right after the copy) and "returncode"
        (exit status of `xrdcp`, or 127 if the executable could not be started), so that
        failed transfers can be told apart from successful ones.
    """
    import subprocess  # local import: subprocess is not imported at the top of the notebook

    t0 = time.perf_counter()
    try:
        # argument list instead of a shell command string: fname is passed to xrdcp verbatim,
        # so characters like "?", "&" or ";" in it are not interpreted by a shell
        returncode = subprocess.run(["xrdcp", str(fname), "/dev/null", "-f"], check=False).returncode
    except OSError:
        # xrdcp not installed / not executable: report as a failure instead of raising
        returncode = 127
    t1 = time.perf_counter()
    time_finished = datetime.datetime.now()
    return {"runtime": t1-t0, "time_finished": time_finished, "returncode": returncode}

# flat list of all input files across processes
all_files = [fname for process in fileset for fname in fileset[process]["files"]]

if RUN_XRDCP:
    # perform computation
    print(f"running with {len(all_files)} files")

    t0 = time.perf_counter()
    with performance_report(filename="dask-report-xrdcp.html"):
        futures = client.map(run_xrdcp, all_files)
        out = ak.Array(list(client.gather(iter(futures))))
    t1 = time.perf_counter()

    n_workers_now = len(client.scheduler_info()['workers'])
    print(f"wall clock time: {t1-t0:.2f}s")
    print(f"current number of workers: {n_workers_now}")

    # time spent inside run_xrdcp, summed over all files
    total_runtime = sum(out["runtime"])
    print(f"total aggregated runtime in function: {total_runtime:.2f} s")
    print(
        f"ratio total runtime / wall clock time: {total_runtime / (t1-t0):.2f} "
        "(should match # cores without overhead / scheduling issues)"
    )