In [None]:
import awkward as ak
from coffea import processor
import hist
import json
import numpy as np

import utils


class Processor(processor.ProcessorABC):
    """Coffea processor that histograms the mass of the per-event jet sum.

    Events are kept when they contain more than two b-tagged jets among the
    jets with pT > 25 GeV. The histogram has a regular "mass" axis plus two
    growing string-category axes ("category" and "variation") and uses
    weighted storage.

    Parameters
    ----------
    lumi : float, optional
        Integrated luminosity in /pb used to normalize simulated samples.
        Defaults to 3378, the value that used to be hard-coded in ``process``.
    """

    def __init__(self, lumi=3378):
        # Kept on the instance so the normalization can be changed without
        # editing the processing code.
        self.lumi = lumi  # /pb

        num_bins = 10
        bin_low = 0
        bin_high = 500
        name = "mass"
        label = "m [GeV]"
        # Empty histogram used as a template; ``process`` fills a copy of it.
        self.hist = (
            hist.Hist.new.Reg(num_bins, bin_low, bin_high, name=name, label=label)
            .StrCat([], name="category", label="Category", growth=True)
            .StrCat([], name="variation", label="Systematic variation", growth=True)
            .Weight()
        )

    def process(self, events):
        """Select events in one chunk and fill a fresh copy of the histogram.

        Parameters
        ----------
        events
            Chunk of events. ``events.metadata`` must provide "dataset" and
            "variation"; for every dataset other than "data" it must also
            provide "xsec" and "nevts".

        Returns
        -------
        dict
            ``{"nevents": {dataset: number of events in the chunk before any
            selection}, "hist": filled histogram}``.
        """
        histogram = self.hist.copy()

        category = events.metadata["dataset"]  # "ttbar" etc.
        variation = events.metadata["variation"]  # "nominal" etc.

        # normalization for MC; data enters with unit weight, so its
        # cross-section metadata (None for data) is never touched
        if category != "data":
            x_sec = events.metadata["xsec"]
            nevts_total = events.metadata["nevts"]
            xsec_weight = x_sec * self.lumi / nevts_total
        else:
            xsec_weight = 1

        selected_jets = events.jet[events.jet.pt > 25]  # pT > 25 GeV for jets
        cut_btag = (
            ak.sum(selected_jets.btag > 0.2, axis=1) > 2
        )  # more than two btags ("tag" means > 0.2 score)

        selected_events = events[cut_btag]

        # NOTE(review): the sum below runs over all jets of the selected
        # events, not only the pT-filtered ``selected_jets`` used for the
        # b-tag count above -- confirm that this is intended.
        summed_jets = ak.sum(selected_events.jet, axis=-1)

        histogram.fill(
            mass=summed_jets.mass,
            category=category,
            variation=variation,
            weight=xsec_weight,
        )

        output = {"nevents": {category: len(events)}, "hist": histogram}

        return output

    def postprocess(self, accumulator):
        """Return the merged accumulator unchanged."""
        return accumulator

In [None]:
# Cross sections in pb, taken from
# https://atlas-groupdata.web.cern.ch/atlas-groupdata/dev/AnalysisTop/TopDataPreparation/XSection-MC15-13TeV.data
xsec_info = {
    "ttbar": 396.87 + 332.97,  # nonallhad + allhad, keep same x-sec for all
    "single_top_s_chan": 2.0268 + 1.2676,
    "single_top_t_chan": 36.993 + 22.175,
    "single_top_tW": 37.936 + 37.906,
    "wjets": 61457 * 0.252,  # e/mu+nu final states
    "data": None,
}

# Load the listing of input files per process and variation.
with open("inputs/one_file.json") as f:
    file_info = json.load(f)

# Condense the listing into the "fileset" mapping: for every process the file
# paths plus the metadata (variation, total event count, cross section).
variation = "nominal"
fileset = {}
for process, per_variation in file_info.items():
    file_list = per_variation[variation]["files"]
    file_paths = [entry["path"] for entry in file_list]
    nevts_total = sum(entry["nevts"] for entry in file_list)
    metadata = {"variation": variation, "nevts": nevts_total, "xsec": xsec_info[process]}
    fileset[process] = {"files": file_paths, "metadata": metadata}

In [None]:
# Switch between the Dask executor and the local iterative executor.
USE_DASK = True

if not USE_DASK:
    executor = processor.IterativeExecutor()
else:
    # Imported only on this branch, so dask is needed only when it is used.
    from dask.distributed import Client

    client = Client("tls://localhost:8786")
    executor = processor.DaskExecutor(client=client)

run = processor.Runner(executor=executor, schema=utils.AGCSchema, savemetrics=True)

# Process the "events" tree of every file in the fileset; metrics are returned
# alongside the output because savemetrics=True.
output, metrics = run(fileset, "events", processor_instance=Processor())

In [None]:
import matplotlib.pyplot as plt

# Simulated samples in drawing order: (category in the histogram, legend label).
mc_samples = [
    ("ttbar", "ttbar"),
    ("wjets", "wjets"),
    ("single_top_s_chan", "s-chan"),
    ("single_top_t_chan", "t-chan"),
    ("single_top_tW", "tW"),
]

# NOTE(review): every call plots a single histogram, so stack=True presumably
# does not stack the samples on top of each other -- confirm the intended look.
for category_name, legend_label in mc_samples:
    output["hist"][:, category_name, "nominal"].plot(stack=True, label=legend_label)

# Data is drawn last, without the stack option.
output["hist"][:, "data", "nominal"].plot(label="data")

fig = plt.gcf()
fig.legend()