In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
from pathlib import Path
import datetime
import traceback

import dask
import hist.dask
import awkward as ak
import coffea
import numpy as np
import uproot

from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
NanoAODSchema.warn_missing_crossrefs = False # silences warnings about branches we will not use here

from coffea.analysis_tools import PackedSelection
from coffea import dataset_tools

from functools import partial
import cloudpickle
from collections import defaultdict
import os
import time

import ndcctools.taskvine as vine

In [None]:
# work for coffea
def do_stuff(events):
    """Read a few GenPart branches of `events` and return bookkeeping counters.

    Parameters
    ----------
    events
        Events collection providing `GenPart.pt`, `GenPart.eta` and
        `GenPart.phi`; it is also passed to `ak.num(..., axis=0)`.

    Returns
    -------
    dict
        "chunks": always 1,
        "num_entries": `ak.num(events, axis=0)`,
        "_counter": sum of `ak.count_nonzero(branch, axis=None)` over the branches read,
        "runtime": `time.time()` difference between entering the function and
        building the return value.
    """
    # imported locally, presumably so the function is self-contained
    # when it is shipped to a worker -- TODO confirm
    import awkward as ak
    import time

    started_at = time.time()

    # track number of events
    num_entries = ak.num(events, axis=0)

    # each branch is collapsed into one integer instead of being kept around,
    # to avoid memory issues
    branches_to_read = (
        events.GenPart.pt,
        events.GenPart.eta,
        events.GenPart.phi,
        # other branches that can be enabled:
        # events.CorrT1METJet.phi,
        # events.GenJet.pt,
        # events.CorrT1METJet.eta,
        # events.SoftActivityJet.pt,
        # events.Jet.eta,
        # events.Jet.phi,
    )

    _counter = 0
    for branch in branches_to_read:
        _counter += ak.count_nonzero(branch, axis=None)

    return {
        "chunks": 1,
        "num_entries": num_entries,
        "_counter": _counter,
        "runtime": time.time() - started_at,
    }

In [None]:
# define the main taskvine scheduler specialized on executing dask graphs
# the manager name is "<USER>-coffea-casa", with "noname" when USER is not set
manager_name = f"{os.environ.get('USER', 'noname')}-coffea-casa"

vine_scheduler = vine.DaskVine(
    port=8786,
    ssl=True,
    name=manager_name,
    run_info_path="/mnt/data/btovar-logs/",
)

In [None]:
# declare the token files and their environment variables
token_acc_path = "/etc/cmsaf-secrets-chown/access_token"
token_xch_path = "/etc/cmsaf-secrets-chown/xcache_token"

extra_files = {}
env_vars = {}

# one row per token: (path on this host, name stored in extra_files,
# environment variable that is set to that name)
token_table = (
    (token_acc_path, "access_token", "BEARER_TOKEN_FILE"),
    (token_xch_path, "xcache_token", "XCACHE_FILE"),
)

for token_path, token_name, token_env in token_table:
    if not Path(token_path).is_file():
        # token not present on this host: nothing is declared for it
        continue
    extra_files[vine_scheduler.declare_file(token_path, cache=True)] = token_name
    env_vars[token_env] = token_name

In [None]:
# usually we put all these options directly into dask calls,
# but coffea preprocessing only allows one argument to set the scheduler,
# thus we create a partial of manager.get, which is the function that takes
# a dask graph and executes it.
vine_get_options = dict(
    # 1 core per task (only cores are requested here, no disk or memory)
    resources={"cores": 1},
    # set to "fixed" to kill tasks on resources
    resources_mode='fixed',
    extra_files=extra_files,
    env_vars=env_vars,
    # keep partials at workers
    worker_transfers=True,
    # use one interpreter per worker
    task_mode="function-calls",
    # resources a single interpreter can run
    lib_resources={"cores": 8, "slots": 8},
    # environment="env.tar.gz",
    #   for task_mode="tasks" if the taskvine version at the worker is behind:
    #       poncho_package_create $CONDA_PREFIX env.tar.gz,
    #   or if more modules are needed at the execution site.
)
vine_get = partial(vine_scheduler.get, **vine_get_options)

# given to coffea and dask functions as **scheduler_options to make taskvine the scheduler
scheduler_options = {'scheduler': vine_get}

In [None]:
# read datasets
import json

fname = "zstd_files.json"

# number of files still to be added; counted down while the fileset is built
files_to_add = 10

fileset = {}
with open(fname, 'r') as fp:
    datasets = json.load(fp)

# walk the datasets in file order and stop right after the file that
# brings files_to_add below 1
enough_files = False
for dataset_name, file_list in datasets.items():
    dataset_files = {}
    fileset[dataset_name] = {"files": dataset_files}
    for dataset_fpath in file_list:
        xrd_fpath = f"root://xcache.cmsaf-dev.flatiron.hollandhpc.org:1094/{dataset_fpath}"
        # every file is mapped to the "Events" entry
        dataset_files[xrd_fpath] = "Events"
        files_to_add -= 1
        if files_to_add < 1:
            enough_files = True
            break
    if enough_files:
        break

In [None]:
# preprocess
# other step sizes that were tried: 50_000, 100_000, 500_000, 5_000_000
step_size = 250_000
pre_filename = f"preprocessed_{step_size}_demo_day.pkl"

# NOTE(review): the cache is read from pre_filename + "never" but written to
# pre_filename, so the file saved below is not the one looked up here. This
# looks like a deliberate switch to always re-run the preprocessing -- confirm.
try:
    # do not re-preprocess if we don't have to...
    with open(pre_filename + "never", "rb") as cache_file:
        samples = cloudpickle.load(cache_file)
except Exception:
    # any failure while reading the cache falls back to a fresh preprocessing
    preprocess_uproot_options = {"allow_read_errors_with_report": True}
    samples, report = dataset_tools.preprocess(
        fileset,
        step_size=step_size,
        skip_bad_files=True,
        uproot_options=preprocess_uproot_options,
        **scheduler_options
    )
    with open(pre_filename, "wb") as cache_file:
        cloudpickle.dump(samples, cache_file)

# summary: number of files and total number of "steps" entries over all files
total_files = sum(len(sample["files"]) for sample in samples.values())
total_chunks = sum(
    len(file_info["steps"])
    for sample in samples.values()
    for file_info in sample["files"].values()
)
print(f"nfiles: {total_files} chunks: {total_chunks}")

In [None]:
# regular coffea
t0 = time.perf_counter()

# exception types handed to uproot as "allow_read_errors_with_report"
read_errors_with_report = (OSError, TypeError, KeyError)

tasks = dataset_tools.apply_to_fileset(
    do_stuff,
    samples,
    uproot_options={"allow_read_errors_with_report": read_errors_with_report},
)

# scheduler_options replaces the default dask scheduler with vine_get
# alternative call with a progress label:
# (out, report) = dask.compute(tasks, **scheduler_options, progress_label="[green]process")
(computed,) = dask.compute(tasks, **scheduler_options)
out, report = computed
t1 = time.perf_counter()

print(f"wall time: {t1 - t0:.2f}s")

In [None]:
# wall time of the compute cell, reused by every rate below
elapsed = t1 - t0

uproot_seconds = ak.sum([entry['duration'] for entry in report.values()])
print(f"total time spent in uproot reading data: {uproot_seconds:.2f} s")
print(f"wall time: {elapsed:.2f}s")

# events summed over every entry of `out`
n_events = sum(dataset_out["num_entries"] for dataset_out in out.values())
print(f"events: {n_events}")

event_rate = n_events / elapsed
print(f"event rate: {event_rate / 1_000:.2f} kHz")

# requested bytes from the report's performance counters, in units of 10^9 bytes
requested_bytes = ak.sum(
    [entry['performance_counters']['num_requested_bytes'] for entry in report.values()]
)
read_GB = requested_bytes / 1_000**3
rate_Gbs = read_GB / elapsed
print(f" - read {read_GB:.2f} GB in {elapsed:.2f} s -> {rate_Gbs:.2f} GBps")

# keep the raw results next to the notebook
with open("outs.pkl", "wb") as outs_file:
    cloudpickle.dump((out, report), outs_file)



In [None]:
# stand-alone dask graph example
# function calls are tuples whose first element is a Callable
# keys are any hashable that is not a function call
# values computed are keys, function calls, lists, tuples. Other values are taken as they are.
# arguments in tuple function calls are interpreted as keys if needed.

graph = {
    "bases":           list(range(0, 10)),
    "exponents":       list(range(0, 10)),
    "even_exponents":  (lambda exps: [e for e in exps if e % 2 == 0], "exponents"),
    # "keyp" takes this key as its second argument; without this entry the
    # string "odd_exponents" would not name any key of the graph
    "odd_exponents":   (lambda exps: [e for e in exps if e % 2 != 0], "exponents"),
    # NOTE(review): built from "exponents" (all of them), not "even_exponents",
    # despite the name -- confirm this is intended
    "even_powers":     (lambda bs, exps: { b: [ b ** e for e in exps] for b in bs }, "bases", "exponents"),
    "keyp":            (lambda e, o: {"even": e, "odd": o}, "even_exponents", "odd_exponents")
}

In [None]:
# run the graph through vine_get and ask for these two keys; the call is the
# last expression of the cell, so its result is shown as the cell output
requested_keys = ["even_powers", "keyp"]
vine_get(graph, requested_keys)