In [None]:
import awkward as ak
import dask
import hist
import hist.dask
import json
from coffea import processor
from coffea.nanoevents import BaseSchema, NanoAODSchema 
from coffea.dataset_tools import apply_to_dataset, apply_to_fileset, preprocess, rucio_utils
from coffea.dataset_tools import max_chunks, max_files, slice_chunks, slice_files
import corrections
import matplotlib.pyplot as plt


class MyZPeak(processor.ProcessorABC):
    """Coffea processor that histograms the invariant mass of opposite-charge muon pairs.

    The processor keeps a running cutflow (event counts after each selection
    step), the sum of generator weights, and a weighted dimuon-mass histogram
    with 120 bins between 0 and 120 GeV.
    """

    def __init__(self, mode="virtual"):
        """Store the execution mode.

        Args:
            mode: one of ``"eager"``, ``"virtual"`` or ``"dask"``. Only the
                ``"dask"`` value changes behavior inside this class: it
                selects ``hist.dask.Hist`` and an output without the
                per-dataset wrapper.
        """
        assert mode in ("eager", "virtual", "dask")
        self._mode = mode

    def process(self, events):
        """Select dimuon events and fill the mass histogram.

        Args:
            events: event array exposing ``metadata['dataset']``, ``fields``,
                and the ``Muon``, ``HLT``, ``event``, ``run``,
                ``luminosityBlock`` (and, for simulation, ``genWeight``)
                branches read below.

        Returns:
            A dict with keys ``"entries"``, ``"sumw"``, ``"cutflow"`` and
            ``"mass"``. In ``"dask"`` mode that dict is returned directly;
            otherwise it is nested under the dataset name.
        """
        dataset = events.metadata['dataset']

        # Events without a genWeight branch are treated as real data.
        is_data = "genWeight" not in events.fields
        if is_data:
            sumw = 0.
        else:
            sumw = ak.sum(events.genWeight, axis=0)
        cutflow = {"start": ak.num(events, axis=0)}

        if is_data:
            # NOTE(review): corrections.lumimask is project-local; presumably it
            # flags certified luminosity sections -- confirm in corrections.
            lumi_ok = corrections.lumimask(events.run, events.luminosityBlock)
            events = events[lumi_ok]
            cutflow["lumimask"] = ak.num(events, axis=0)

        # Muon quality selection: pt of at least 20 and the tight ID flag.
        muons = events.Muon
        events["goodmuons"] = muons[(muons.pt >= 20.) & muons.tightId]

        # Keep events with exactly two good muons whose charges sum to zero.
        good = events.goodmuons
        exactly_two = ak.num(good) == 2
        zero_net_charge = ak.sum(good.charge, axis=1) == 0
        events = events[exactly_two & zero_net_charge]
        cutflow["ossf"] = ak.num(events, axis=0)

        # Candidate = sum of the first and second good muon in every event.
        first_mu = events.goodmuons[:, 0]
        second_mu = events.goodmuons[:, 1]
        events["zcand"] = first_mu + second_mu

        # Trigger requirement, see
        # https://twiki.cern.ch/twiki/bin/view/CMS/MuonHLT2018
        fired = events.HLT.Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ_Mass3p8
        events = events[fired]

        # The weights are taken after the trigger cut so that they line up
        # with the surviving events: unity for data, genWeight otherwise.
        if is_data:
            weight = 1 * ak.ones_like(events.event)
        else:
            weight = events.genWeight
        cutflow["trigger"] = ak.num(events, axis=0)

        use_dask = self._mode == "dask"
        hist_class = hist.dask.Hist if use_dask else hist.Hist
        mass_hist = hist_class.new.Reg(120, 0., 120., label=r"$m_{\mu\mu}$ [GeV]").Weight()

        output = {
            "entries": ak.num(events, axis=0),
            "sumw": sumw,
            "cutflow": cutflow,
            "mass": mass_hist.fill(events.zcand.mass, weight=weight),
        }

        # Only the non-dask modes key the result by dataset name here.
        return output if use_dask else {dataset: output}

    def postprocess(self, accumulator):
        """Return the accumulated output unchanged."""
        return accumulator

In [None]:
from dask.distributed import Client

# Connect to the Dask scheduler at this hard-coded TLS address. The resulting
# `client` handle is used by later cells (file upload and the DaskExecutor).
client = Client("tls://localhost:8786")
# Bare expression so the notebook displays the client's repr as cell output.
client

In [None]:
import shutil

# Zip the local "corrections" directory into corrections.zip in the current
# working directory; the returned archive name is shown as the cell output.
shutil.make_archive(base_name="corrections", format="zip", base_dir="corrections")

In [None]:
# Send the corrections.zip archive to the cluster through the Dask client.
# NOTE(review): presumably this is what makes `import corrections` resolvable
# on the workers -- confirm against the dask.distributed documentation.
client.upload_file("corrections.zip")

In [None]:
# Load the fileset description from disk. The encoding is pinned to UTF-8 so
# that decoding does not depend on the platform's locale default.
with open("fileset.json", "rt", encoding="utf-8") as file:
    initial_fileset = json.load(file)

# Scaling in Virtual mode

In [None]:
# Runner that hands work to the Dask cluster behind `client`.
run = processor.Runner(
    executor=processor.DaskExecutor(client=client, compression=None),
    schema=NanoAODSchema,
    # Chunking of the input.
    # NOTE(review): presumably maxchunks caps the chunks processed per dataset -- confirm.
    chunksize=100_000,
    maxchunks=7,
    # Error handling and bookkeeping.
    skipbadfiles=True,
    savemetrics=True,
)

# Two values are unpacked from the call: the processor output and a report.
small_result, small_report = run(
    initial_fileset,
    processor_instance=MyZPeak(mode="virtual"),
)

# Scaling in Dask mode

In [None]:
# Preprocess the fileset; the call yields two filesets, bound here as the
# "available" and the "total" one.
preprocessed_available, preprocessed_total = preprocess(
    initial_fileset,
    # Chunk layout.
    step_size=100_000,
    step_size_safety_factor=0.5,
    align_clusters=None,
    recalculate_steps=False,
    # File access and error handling.
    files_per_batch=1,
    skip_bad_files=True,
    file_exceptions=(OSError,),
    uproot_options={},
    # Extra information stored with the result.
    save_form=True,
)

In [None]:
# pickle is not used in this cell and json is also imported at the top of the
# notebook; both imports are kept so the notebook namespace is unchanged.
import gzip
import pickle
import json

output_file = "scaleout_fileset"

# Write the "available" fileset as gzip-compressed, indented JSON.
with gzip.open(f"{output_file}_available.json.gz", "wt") as available_out:
    json.dump(preprocessed_available, available_out, indent=2)
    print(f"Saved available fileset chunks to {output_file}_available.json.gz")

# Write the complete fileset the same way.
with gzip.open(f"{output_file}_all.json.gz", "wt") as total_out:
    json.dump(preprocessed_total, total_out, indent=2)
    print(f"Saved complete fileset chunks to {output_file}_all.json.gz")

In [None]:
# Build a reduced fileset for a quick test run: max_files(..., 5) is applied
# to the available fileset first, then max_chunks(..., 7) to its result.
# NOTE(review): presumably both limits apply per dataset -- confirm against
# the coffea.dataset_tools documentation.
test_preprocessed_files = max_files(preprocessed_available, 5)
test_preprocessed = max_chunks(test_preprocessed_files, 7)

In [None]:
# Apply the processor in "dask" mode to the reduced fileset. Two values are
# unpacked: the result and a report, both computed in a later cell.
small_tg, small_rep = apply_to_fileset(
    data_manipulation=MyZPeak(mode="dask"),
    fileset=test_preprocessed,
    schemaclass=NanoAODSchema,
    uproot_options={"allow_read_errors_with_report": (OSError, ValueError)},
)

In [None]:
# Evaluate the result and the report together in a single dask.compute call.
# NOTE: this rebinds small_result / small_report, which the Runner cell in the
# "virtual mode" section also assigns.
small_result, small_report = dask.compute(small_tg, small_rep)