In [None]:
import pickle
import numpy as np
import awkward
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

import uproot
import boost_histogram as bh
import mplhep
import glob
import os
import vector
import shutil

mplhep.style.use("CMS")

In [None]:
import sys

# Extend the module search path with two directories of the repository so that the
# helper modules below can be imported without installing a package.
# NOTE(review): the paths are relative, so they only resolve when the kernel's working
# directory is this notebook's directory — confirm before running from elsewhere.
sys.path += ["../../mlpf/plotting/"]
sys.path += ["../../mlpf/"]

import plot_utils
import jet_utils

In [None]:
# ev = uproot.open("/local/joosep/mlpf/results/cms/CMSSW_14_1_0_74d149_btvnano/TTbar_noPU_mlpf/step3_NANO_jme_1.root").get("Events")
# for br in ev.branches:
#     brname = br.name
#     if "FatJet" in brname:
#         print(brname)

In [None]:
def to_bh(data, bins, cumulative=False):
    """Histogram ``data`` into a 1D boost-histogram with variable-width bins.

    Args:
        data: values to fill into the histogram.
        bins: bin edges passed to ``bh.axis.Variable``.
        cumulative: when true, each bin content is replaced by the total in-range
            count minus the running sum up to and including that bin.

    Returns:
        The filled ``bh.Histogram``.
    """
    hist = bh.Histogram(bh.axis.Variable(bins))
    hist.fill(data)
    if not cumulative:
        return hist

    total = np.sum(hist.values())
    hist[:] = total - np.cumsum(hist)
    return hist

def load_nano(fn):
    """Read the jet and MET branches used in this notebook from one ROOT file.

    The branches are read from the ``Events`` object of the file at ``fn``.

    Args:
        fn: path of the ROOT file to open.

    Returns:
        A one-element list holding a dict that maps each branch name to the array
        read for it. The list wrapper lets callers concatenate the results of
        several files.
    """
    print(fn)
    branches = [
        "Jet_pt", "Jet_genJetIdx", "Jet_rawFactor",
        "JetCHS_pt", "JetCHS_genJetIdx", "JetCHS_rawFactor",
        "FatJet_pt", "FatJet_genJetAK8Idx", "FatJet_rawFactor",
        "GenJet_pt",
        "GenJetAK8_pt",
        "GenMET_pt", "GenMET_phi",
        "PFMET_pt", "PFMET_phi",
        "RawPFMET_pt", "RawPFMET_phi"
    ]
    ret = {}
    # The context manager closes the file once all branches are in memory; without it
    # one file handle per input file stays open for the lifetime of the kernel.
    with uproot.open(fn) as fi:
        tt = fi.get("Events")
        for k in branches:
            ret[k] = tt.arrays(k)[k]
    return [ret, ]

def varbins(*args):
    """Join several arrays of bin edges into one array of edges.

    The last edge of every array except the final one is dropped before
    concatenation, so a boundary shared by two consecutive ranges appears once.

    Args:
        *args: sequences of bin edges, in the order they should be joined.

    Returns:
        A single numpy array with the concatenated edges.
    """
    pieces = [edges[:-1] for edges in args[:-1]]
    pieces.append(args[-1])
    return np.concatenate(pieces)

def get_hist_and_merge(files, histname):
    """Load the histogram ``histname`` from every file and return their sum.

    Args:
        files: iterable of ROOT file paths.
        histname: key of the histogram inside each file.

    Returns:
        The sum of the per-file histograms, each converted with ``to_boost()``.

    Raises:
        IndexError: if ``files`` is empty.
    """
    hists = []
    for fn in files:
        # Close each file as soon as its histogram has been converted.
        with uproot.open(fn) as fi:
            hists.append(fi[histname].to_boost())
    if not hists:
        raise IndexError("get_hist_and_merge: no input files given for histogram {!r}".format(histname))
    return sum(hists[1:], hists[0])

from scipy.optimize import curve_fit

def Gauss(x, a, x0, sigma):
    """Evaluate an unnormalised Gaussian with amplitude ``a``, mean ``x0`` and width ``sigma`` at ``x``."""
    squared_offset = (x - x0) ** 2
    two_variance = 2 * sigma**2
    return a * np.exp(-squared_offset / two_variance)

def fit_response(hist2d, bin_range):
    """Fit a Gaussian to the second-axis distribution in selected first-axis bins.

    For every index in ``bin_range`` the row ``hist2d.values()[ibin]`` is fitted with
    ``Gauss`` as a function of the second-axis bin centres. A new figure showing the
    points with error bars and the fitted curve is created per bin and left open.

    Args:
        hist2d: 2D histogram providing ``axes[i].centers``, ``axes[i].edges`` and
            ``values()``.
        bin_range: iterable of bin indices along the first axis.

    Returns:
        Tuple of numpy arrays ``(centers, means, means_unc, sigmas, sigmas_unc)`` with
        one entry per fitted bin: the first-axis bin centre, the fitted mean and width,
        and their uncertainties from the diagonal of the fit covariance matrix.
    """
    centers = []
    means = []
    means_unc = []

    sigmas = []
    sigmas_unc = []

    for ibin in bin_range:

        print(ibin)
        plt.figure()
        xvals = hist2d.axes[1].centers
        vals = hist2d.values()[ibin]
        # Square-root-of-content uncertainties; empty bins get a unit uncertainty so
        # that no point enters the fit with a zero error.
        errs = np.sqrt(vals)
        errs[vals == 0] = 1.0

        parameters1, covariances1 = curve_fit(
            Gauss,
            xvals,
            vals,
            p0=[1.0, 0.0, 1.0],
            sigma=errs,
            maxfev=1000000,
            method="dogbox",
            # Parameter order is (a, x0, sigma): amplitude free, mean in [-10, 10],
            # width in [0, 50].
            bounds=[(-np.inf, -10, 0), (np.inf, 10, 50)],
        )
        plt.errorbar(xvals, vals, errs)
        plt.plot(xvals, Gauss(xvals, *parameters1))
        # Raw string: "\D" is not a valid escape sequence in a normal string literal.
        plt.xlabel(r"$\Delta E_T / E_T$")
        plt.title("${} < E_T < {}$".format(hist2d.axes[0].edges[ibin], hist2d.axes[0].edges[ibin + 1]))

        means.append(parameters1[1])
        means_unc.append(np.sqrt(covariances1[1, 1]))
        sigmas.append(parameters1[2])
        sigmas_unc.append(np.sqrt(covariances1[2, 2]))

        centers.append(hist2d.axes[0].centers[ibin])

    centers = np.array(centers)
    means = np.array(means)
    means_unc = np.array(means_unc)

    sigmas = np.array(sigmas)
    sigmas_unc = np.array(sigmas_unc)

    return centers, means, means_unc, sigmas, sigmas_unc

In [None]:
from plot_utils import ELEM_LABELS_CMS, ELEM_NAMES_CMS
from plot_utils import CLASS_LABELS_CMS, CLASS_NAMES_CMS
from plot_utils import cms_label, sample_label

In [None]:
# Sample to process. `folder` selects the input directories and the output path;
# `physics_process` is the key passed to sample_label for the plot annotation.
# NOTE(review): the two are independent literals and must be kept in sync by hand.
folder = "QCD_noPU"
physics_process = "cms_pf_qcd_nopu"

# Sample-dependent binning for jet pT and MET.
if folder in ("QCD_noPU", "QCD_PU"):
    jet_bins = varbins(np.linspace(10, 100, 21), np.linspace(100, 200, 5), np.linspace(200, 1000, 5))
    met_bins = varbins(np.linspace(0, 150, 21), np.linspace(150, 500, 5))
elif folder in ("TTbar_PU", "TTbar_noPU"):
    jet_bins = varbins(np.linspace(10, 100, 21), np.linspace(100, 250, 5))
    met_bins = varbins(np.linspace(0, 150, 21), np.linspace(150, 250, 5))
else:
    # Fail here rather than leaving jet_bins / met_bins undefined, or silently reusing
    # values left in the kernel from a previous run.
    raise ValueError("No binning defined for folder {!r}".format(folder))

# Output directory for the figures. Any previous content is deleted first.
outpath = "cmssw/{}".format(folder)
shutil.rmtree(outpath, ignore_errors=True)
os.makedirs(outpath)

In [None]:
# Collect the NANO output files of the PF and the MLPF runs for the chosen sample.
pf_files = glob.glob("/local/joosep/mlpf/results/cms/CMSSW_14_1_0_74d149_btvnano/{}_pf/step3_NANO_jme_*.root".format(folder))
mlpf_new_files = glob.glob("/local/joosep/mlpf/results/cms/CMSSW_14_1_0_74d149_btvnano/{}_mlpf/step3_NANO_jme_*.root".format(folder))

# Index the full paths by file name so that the two runs can be matched file by file.
pf_files_d = dict(zip(map(os.path.basename, pf_files), pf_files))
mlpf_new_files_d = dict(zip(map(os.path.basename, mlpf_new_files), mlpf_new_files))

In [None]:
# Number of files found for the PF and the MLPF run, respectively.
len(pf_files), len(mlpf_new_files)

In [None]:
# Keep only file names present in both runs. The result is sorted because set
# iteration order is not stable between kernel sessions, which would otherwise make
# the load order of the files differ from run to run.
common_files = sorted(set(pf_files_d.keys()).intersection(set(mlpf_new_files_d.keys())))
len(common_files)

In [None]:
# Read every common file for both runs. load_nano returns a one-element list per
# file, so the nested comprehension yields one record per file.
data_baseline = awkward.Array([rec for fn in common_files for rec in load_nano(pf_files_d[fn])])
data_mlpf_new = awkward.Array([rec for fn in common_files for rec in load_nano(mlpf_new_files_d[fn])])

# Merge the outermost (per-file) level into the next one for every branch.
data_baseline = awkward.Array(
    {field: awkward.flatten(data_baseline[field], axis=1) for field in data_baseline.fields}
)
data_mlpf_new = awkward.Array(
    {field: awkward.flatten(data_mlpf_new[field], axis=1) for field in data_mlpf_new.fields}
)

## Jets

### Jet $p_T$ spectrum

In [None]:
# Add a "*_pt_raw" field, pt * (1 - rawFactor), for the CHS jets and the fat jets
# of both runs.
for _arr in (data_baseline, data_mlpf_new):
    for _pt, _raw_factor, _pt_raw in (
        ("JetCHS_pt", "JetCHS_rawFactor", "JetCHS_pt_raw"),
        ("FatJet_pt", "FatJet_rawFactor", "FatJet_pt_raw"),
    ):
        _arr[_pt_raw] = _arr[_pt] * (1.0 - _arr[_raw_factor])

In [None]:
# AK4 CHS jet pT spectrum: distributions in the upper panel, ratio to gen in the lower.
f, (a0, a1) = plt.subplots(2, 1, gridspec_kw={"height_ratios": [3, 1]}, sharex=True)

# The gen-level histogram is taken from the baseline (PF) files only.
h0 = to_bh(awkward.flatten(data_baseline["GenJet_pt"]), jet_bins)
h1 = to_bh(awkward.flatten(data_baseline["JetCHS_pt_raw"]), jet_bins)
# h2 = to_bh(awkward.flatten(mlpf_old_jets.pt), jet_bins)
h3 = to_bh(awkward.flatten(data_mlpf_new["JetCHS_pt_raw"]), jet_bins)

plt.sca(a0)
# binwnorm=1.0 divides each bin by its width, hence the "/ GeV" in the y label.
x0 = mplhep.histplot(h0, histtype="step", lw=2, label="gen", binwnorm=1.0, ls="--")
x1 = mplhep.histplot(h1, histtype="step", lw=2, label="PF", binwnorm=1.0, ls="-")
# x2 = mplhep.histplot(h2, histtype="step", lw=2, label="MLPF old", binwnorm=1.0, ls="-")
x3 = mplhep.histplot(h3, histtype="step", lw=2, label="MLPF", binwnorm=1.0, ls="-")

# plt.xscale("log")
plt.yscale("log")
cms_label(a0)
sample_label(a0, physics_process, x=0.01, y=0.9)
a0.text(0.01, 0.92, "AK4 CHS jets", transform=a0.transAxes)
# Only the labels are taken from the axes; the legend handles are replaced by the
# stairs artists returned by histplot.
handles, labels = a0.get_legend_handles_labels()
handles = [x0[0].stairs, x1[0].stairs, x3[0].stairs]
a0.legend(handles, labels, loc=1)
plt.ylim(10, 10**6)
plt.ylabel("Number of jets / GeV")

# Ratio panel: every histogram divided by the gen histogram (gen/gen is the reference line).
plt.sca(a1)
mplhep.histplot(h0 / h0, histtype="step", lw=2, ls="--")
mplhep.histplot(h1 / h0, histtype="step", lw=2, ls="-")
# mplhep.histplot(h2 / h0, histtype="step", lw=2, ls="-")
mplhep.histplot(h3 / h0, histtype="step", lw=2, ls="-")
plt.ylim(0,5)
plt.ylabel("reco / gen")
plt.xlabel("jet $p_T$ [GeV]")

# The x axis is shared between the panels (sharex=True above).
plt.xscale("log")

plt.xlim(min(jet_bins), max(jet_bins))
plt.savefig("{}/ak4_chs_jet_pt.pdf".format(outpath))

In [None]:
# pT binning for the AK8 (fat) jet plots: 100-200 and 200-1000, four bins each.
jet_bins_fat = varbins(np.linspace(100, 200, 5), np.linspace(200, 1000, 5))

In [None]:
# AK8 jet pT spectrum: distributions in the upper panel, ratio to gen in the lower.
f, (a0, a1) = plt.subplots(2, 1, gridspec_kw={"height_ratios": [3, 1]}, sharex=True)

# The gen-level histogram is taken from the baseline (PF) files only.
h0 = to_bh(awkward.flatten(data_baseline["GenJetAK8_pt"]), jet_bins_fat)
h1 = to_bh(awkward.flatten(data_baseline["FatJet_pt_raw"]), jet_bins_fat)
h3 = to_bh(awkward.flatten(data_mlpf_new["FatJet_pt_raw"]), jet_bins_fat)

plt.sca(a0)
# binwnorm=1.0 divides each bin by its width, hence the "/ GeV" in the y label.
x0 = mplhep.histplot(h0, histtype="step", lw=2, label="gen", binwnorm=1.0, ls="--")
x1 = mplhep.histplot(h1, histtype="step", lw=2, label="PF", binwnorm=1.0, ls="-")
x3 = mplhep.histplot(h3, histtype="step", lw=2, label="MLPF", binwnorm=1.0, ls="-")

# plt.xscale("log")
plt.yscale("log")
cms_label(a0)
sample_label(a0, physics_process, x=0.01, y=0.9)
a0.text(0.01, 0.92, "AK8 jets", transform=a0.transAxes)
# Only the labels are taken from the axes; the legend handles are replaced by the
# stairs artists returned by histplot.
handles, labels = a0.get_legend_handles_labels()
handles = [x0[0].stairs, x1[0].stairs, x3[0].stairs]
a0.legend(handles, labels, loc=1)
plt.ylim(1, 10**6)
plt.ylabel("Number of jets / GeV")

# Ratio panel: every histogram divided by the gen histogram (gen/gen is the reference line).
plt.sca(a1)
mplhep.histplot(h0 / h0, histtype="step", lw=2, ls="--")
mplhep.histplot(h1 / h0, histtype="step", lw=2, ls="-")
# mplhep.histplot(h2 / h0, histtype="step", lw=2, ls="-")
mplhep.histplot(h3 / h0, histtype="step", lw=2, ls="-")
plt.ylim(0,5)
plt.ylabel("reco / gen")
plt.xlabel("jet $p_T$ [GeV]")

# The x axis is shared between the panels (sharex=True above).
plt.xscale("log")

plt.xlim(min(jet_bins_fat), max(jet_bins_fat))
plt.savefig("{}/ak8_jet_pt.pdf".format(outpath))

### Jet response, matching to gen-jets

In [None]:
# Inclusive response (reco pT / matched gen pT) of AK4 CHS jets for PF and MLPF.
plt.figure()
ax = plt.axes()

# Response binning: 200 edges between 0 and 2.
b = np.linspace(0,2,200)

cms_label(ax)
sample_label(ax, physics_process, x=0.02, y=0.92)
ax.text(0.02, 0.94, "AK4 CHS jets", transform=ax.transAxes)

# A gen-jet index of -1 is treated as "no match". The gen pT is looked up by index
# first (where -1 picks the last gen jet of the event) and those entries are then
# removed by the same mask that is applied to the reco jets.
msk = data_baseline["JetCHS_genJetIdx"]!=-1
jet_response_pf = awkward.flatten(data_baseline["JetCHS_pt_raw"][msk] / data_baseline["GenJet_pt"][data_baseline["JetCHS_genJetIdx"]][msk])
msk = data_mlpf_new["JetCHS_genJetIdx"]!=-1
jet_response_mlpf = awkward.flatten(data_mlpf_new["JetCHS_pt_raw"][msk] / data_mlpf_new["GenJet_pt"][data_mlpf_new["JetCHS_genJetIdx"]][msk])

h0 = to_bh(jet_response_pf, b)
h1 = to_bh(jet_response_mlpf, b)

# Empty plot consumes the first colour of the cycle — presumably so PF / MLPF get
# the same colours as in the spectrum plots, where "gen" is drawn first.
plt.plot([], [])
x0 = mplhep.histplot(h0, histtype="step", lw=2, label="PF");
x1 = mplhep.histplot(h1, histtype="step", lw=2, label="MLPF");

# Only the labels are taken from the axes; the legend handles are the stairs artists.
handles, labels = ax.get_legend_handles_labels()
handles = [x0[0].stairs, x1[0].stairs]
ax.legend(handles, labels, loc=1)
plt.xlabel("Matched reco / gen jet $p_T$")
plt.ylabel("Matched jets / bin")
plt.savefig("{}/ak4_chs_jet_pt_ratio.pdf".format(outpath))

In [None]:
# Inclusive response (reco pT / matched gen pT) of AK8 jets for PF and MLPF.
plt.figure()
ax = plt.axes()

# Response binning: 200 edges between 0 and 2.
b = np.linspace(0,2,200)

cms_label(ax)
sample_label(ax, physics_process, x=0.02, y=0.92)
ax.text(0.02, 0.94, "AK8 jets", transform=ax.transAxes)

# A gen-jet index of -1 is treated as "no match". The gen pT is looked up by index
# first (where -1 picks the last gen jet of the event) and those entries are then
# removed by the same mask that is applied to the reco jets.
msk = data_baseline["FatJet_genJetAK8Idx"]!=-1
jet_response_pf = awkward.flatten(data_baseline["FatJet_pt_raw"][msk] / data_baseline["GenJetAK8_pt"][data_baseline["FatJet_genJetAK8Idx"]][msk])
msk = data_mlpf_new["FatJet_genJetAK8Idx"]!=-1
jet_response_mlpf = awkward.flatten(data_mlpf_new["FatJet_pt_raw"][msk] / data_mlpf_new["GenJetAK8_pt"][data_mlpf_new["FatJet_genJetAK8Idx"]][msk])

h0 = to_bh(jet_response_pf, b)
h1 = to_bh(jet_response_mlpf, b)

# Empty plot consumes the first colour of the cycle — presumably so PF / MLPF get
# the same colours as in the spectrum plots, where "gen" is drawn first.
plt.plot([], [])
x0 = mplhep.histplot(h0, histtype="step", lw=2, label="PF");
x1 = mplhep.histplot(h1, histtype="step", lw=2, label="MLPF");

# Only the labels are taken from the axes; the legend handles are the stairs artists.
handles, labels = ax.get_legend_handles_labels()
handles = [x0[0].stairs, x1[0].stairs]
ax.legend(handles, labels, loc=1)
plt.xlabel("Matched reco / gen jet $p_T$")
plt.ylabel("Matched jets / bin")
plt.savefig("{}/ak8_jet_pt_ratio.pdf".format(outpath))

In [None]:
def compute_iqr(data):
    """Return the interquartile range of ``data`` (75th minus 25th percentile)."""
    upper, lower = np.percentile(data, [75, 25])
    return upper - lower

In [None]:
def get_response_in_bins(
    label_gjidx="JetCHS_genJetIdx", label_rjpt="JetCHS_pt_raw", label_gjpt="GenJet_pt",
    jet_bins=jet_bins,
    fn="ak4_chs",
    title="AK4 CHS",
    ):
    """Compute and plot the jet pT response of PF and MLPF in bins of gen-jet pT.

    Reads the module-level ``data_baseline`` (PF) and ``data_mlpf_new`` (MLPF) arrays.
    For every bin one PDF with both response histograms is written to ``outpath``.

    Args:
        label_gjidx: field with the index of the matched gen jet (-1 means unmatched).
        label_rjpt: field with the reconstructed jet pT.
        label_gjpt: field with the gen-jet pT.
        jet_bins: gen-jet pT bin edges. The default is bound to the value of the
            module-level ``jet_bins`` at the time this function is defined.
        fn: prefix of the output file names.
        title: jet collection name shown in the legend title.

    Returns:
        Two tuples ``(medians, iqrs, match_fractions)``, first for PF and then for
        MLPF, each holding one list entry per bin. The match fraction is the number
        of matched reco jets in the bin divided by the number of gen jets in the bin
        (NaN if the bin holds no gen jets).
    """
    response_bins = np.linspace(0, 2, 100)

    def _match(data):
        """Return (response, matched gen pT, all gen pT) for one reconstruction."""
        msk = data[label_gjidx] != -1
        # Index first, then mask: -1 picks the last gen jet, which the mask removes.
        matched_gj = data[label_gjpt][data[label_gjidx]][msk]
        return data[label_rjpt][msk] / matched_gj, matched_gj, data[label_gjpt]

    def _select(response, matched_gj, all_gj_pt, min_pt, max_pt):
        """Return the flat responses in [min_pt, max_pt) and their match fraction."""
        in_bin = awkward.flatten(response[(matched_gj >= min_pt) & (matched_gj < max_pt)])
        n_gen = awkward.count(all_gj_pt[(all_gj_pt >= min_pt) & (all_gj_pt < max_pt)])
        frac = awkward.count(in_bin) / n_gen if n_gen > 0 else float("nan")
        return in_bin, frac

    # The matching does not depend on the pT bin, so it is done once per reconstruction.
    matched_pf = _match(data_baseline)
    matched_mlpf = _match(data_mlpf_new)

    med_vals_pf = []
    iqr_vals_pf = []
    match_vals_pf = []

    med_vals_mlpf = []
    iqr_vals_mlpf = []
    match_vals_mlpf = []

    for ibin in range(len(jet_bins)-1):
        min_pt = jet_bins[ibin]
        max_pt = jet_bins[ibin+1]

        jet_response_pf, frac_pf = _select(*matched_pf, min_pt, max_pt)
        match_vals_pf.append(frac_pf)

        jet_response_mlpf, frac_mlpf = _select(*matched_mlpf, min_pt, max_pt)
        match_vals_mlpf.append(frac_mlpf)

        med, iqr = plot_utils.med_iqr(jet_response_pf)
        med_vals_pf.append(med)
        iqr_vals_pf.append(iqr)

        fig = plt.figure()
        ax = plt.axes()
        # Empty plot consumes the first colour of the cycle.
        plt.plot([], [])
        plt.hist(
            jet_response_pf,
            bins=response_bins,
            histtype="step", lw=2,
            # Raw string: "\p" is not a valid escape sequence in a normal literal.
            label=r"PF: ${:.2f}\pm{:.2f}$".format(med, iqr)
        );

        med, iqr = plot_utils.med_iqr(jet_response_mlpf)
        med_vals_mlpf.append(med)
        iqr_vals_mlpf.append(iqr)

        plt.hist(
            jet_response_mlpf,
            bins=response_bins,
            histtype="step", lw=2,
            label=r"MLPF: ${:.2f}\pm{:.2f}$".format(med, iqr)
        );

        plt.legend(loc=1, title="{} jets, ${} < p_T < {}$".format(title, min_pt, max_pt))
        # Double the y range to leave room for the legend.
        plt.ylim(0, 2*ax.get_ylim()[1])
        cms_label(ax)
        plt.xlabel("Jet $p_T$ response, $r=p_{T,reco}/p_{T,gen}$")
        plt.savefig("{}/{}_jet_response_bin_{}.pdf".format(outpath, fn, ibin))
        # Keep only the first bin's figure open for display; close the others once
        # saved so that open figures do not accumulate, one per bin.
        if ibin>0:
            plt.close(fig)
    return (med_vals_pf, iqr_vals_pf, match_vals_pf), (med_vals_mlpf, iqr_vals_mlpf, match_vals_mlpf)

In [None]:
# Per-bin (median, IQR, match fraction) lists for PF and MLPF, using the function's
# defaults (AK4 CHS jets).
stats_pf, stats_mlpf = get_response_in_bins()

In [None]:
# Median AK4 CHS jet response per gen-jet pT bin, drawn at the lower edge of each bin.
fig = plt.figure()
ax = plt.axes()
plt.plot([], [])
plt.plot(jet_bins[:-1], np.array(stats_pf[0]), marker="o", label="PF")
plt.plot(jet_bins[:-1], np.array(stats_mlpf[0]), marker="^", label="MLPF")
plt.xscale("log")
plt.legend()
# The bins are defined on the pT of the matched gen jet (see get_response_in_bins).
plt.xlabel("gen-jet $p_T$ [GeV]")
plt.ylabel("jet $p_T$ response median")
cms_label(ax)
sample_label(ax, physics_process, x=0.01, y=0.94)
ax.text(0.01, 0.95, "AK4 CHS jets", transform=ax.transAxes)
plt.axhline(1.0, color="black", ls="--")
plt.ylim(0.5, 1.5)
plt.savefig("{}/ak4_chs_jet_response_median.pdf".format(outpath))

In [None]:
# Relative width (IQR / median) of the AK4 CHS jet response per gen-jet pT bin,
# drawn at the lower edge of each bin.
fig = plt.figure()
ax = plt.axes()
plt.plot([], [])

plt.plot(
    jet_bins[:-1],
    np.array(stats_pf[1])/np.array(stats_pf[0]),
    label="PF", marker="o")

plt.plot(
    jet_bins[:-1],
    np.array(stats_mlpf[1])/np.array(stats_mlpf[0]),
    label="MLPF", marker="^")

plt.xscale("log")
cms_label(ax)
sample_label(ax, physics_process, x=0.01, y=0.94)
plt.legend()
# The bins are defined on the pT of the matched gen jet (see get_response_in_bins).
plt.xlabel("gen-jet $p_T$ [GeV]")
plt.ylabel("jet $p_T$ response IQR / median")
ax.text(0.01, 0.95, "AK4 CHS jets", transform=ax.transAxes)
plt.ylim(0, 1.0)
plt.savefig("{}/ak4_chs_jet_response_iqr_over_median.pdf".format(outpath))

In [None]:
# Matched reco jets divided by gen jets per gen-jet pT bin for AK4 CHS jets,
# drawn at the lower edge of each bin.
fig = plt.figure()
ax = plt.axes()
plt.plot([], [])

plt.plot(
    jet_bins[:-1],
    np.array(stats_pf[2]),
    label="PF", marker="o")

plt.plot(
    jet_bins[:-1],
    np.array(stats_mlpf[2]),
    label="MLPF", marker="^")

plt.xscale("log")
cms_label(ax)
sample_label(ax, physics_process, x=0.01, y=0.94)
plt.legend()
# The bins are defined on gen-jet pT (see get_response_in_bins).
plt.xlabel("gen-jet $p_T$ [GeV]")
plt.ylabel("fraction of jets matched to gen")
ax.text(0.01, 0.95, "AK4 CHS jets", transform=ax.transAxes)
plt.ylim(0.8, 1.2)
plt.savefig("{}/ak4_chs_jet_match_frac.pdf".format(outpath))

## Fat jets

In [None]:
# Same per-bin statistics for AK8 jets. This rebinds stats_pf / stats_mlpf, so the
# AK4 summary cells above must not be re-run after this cell without first
# re-running the AK4 call.
stats_pf, stats_mlpf = get_response_in_bins(
    label_gjidx="FatJet_genJetAK8Idx", label_rjpt="FatJet_pt_raw", label_gjpt="GenJetAK8_pt",
    jet_bins=jet_bins_fat,
    fn="ak8",
    title="AK8",
)

In [None]:
# Median AK8 jet response per gen-jet pT bin, drawn at the lower edge of each bin.
fig = plt.figure()
ax = plt.axes()
plt.plot([], [])
plt.plot(jet_bins_fat[:-1], np.array(stats_pf[0]), marker="o", label="PF")
plt.plot(jet_bins_fat[:-1], np.array(stats_mlpf[0]), marker="^", label="MLPF")
plt.xscale("log")
plt.legend()
# The bins are defined on the pT of the matched gen jet (see get_response_in_bins).
plt.xlabel("gen-jet $p_T$ [GeV]")
plt.ylabel("jet $p_T$ response median")
cms_label(ax)
sample_label(ax, physics_process, x=0.01, y=0.94)
ax.text(0.01, 0.95, "AK8 jets", transform=ax.transAxes)
plt.axhline(1.0, color="black", ls="--")
plt.ylim(0.5, 1.5)
plt.savefig("{}/ak8_jet_response_median.pdf".format(outpath))

In [None]:
# Relative width (IQR / median) of the AK8 jet response per gen-jet pT bin,
# drawn at the lower edge of each bin.
fig = plt.figure()
ax = plt.axes()
plt.plot([], [])

plt.plot(
    jet_bins_fat[:-1],
    np.array(stats_pf[1])/np.array(stats_pf[0]),
    label="PF", marker="o")

plt.plot(
    jet_bins_fat[:-1],
    np.array(stats_mlpf[1])/np.array(stats_mlpf[0]),
    label="MLPF", marker="^")

plt.xscale("log")
cms_label(ax)
sample_label(ax, physics_process, x=0.01, y=0.94)
plt.legend()
# The bins are defined on the pT of the matched gen jet (see get_response_in_bins).
plt.xlabel("gen-jet $p_T$ [GeV]")
plt.ylabel("jet $p_T$ response IQR / median")
ax.text(0.01, 0.95, "AK8 jets", transform=ax.transAxes)
plt.ylim(0, 0.4)
plt.savefig("{}/ak8_jet_response_iqr_over_median.pdf".format(outpath))

In [None]:
# Matched reco jets divided by gen jets per gen-jet pT bin for AK8 jets,
# drawn at the lower edge of each bin.
fig = plt.figure()
ax = plt.axes()
plt.plot([], [])

plt.plot(
    jet_bins_fat[:-1],
    np.array(stats_pf[2]),
    label="PF", marker="o")

plt.plot(
    jet_bins_fat[:-1],
    np.array(stats_mlpf[2]),
    label="MLPF", marker="^")

plt.xscale("log")
cms_label(ax)
sample_label(ax, physics_process, x=0.01, y=0.94)
plt.legend()
# The bins are defined on gen-jet pT (see get_response_in_bins).
plt.xlabel("gen-jet $p_T$ [GeV]")
plt.ylabel("fraction of jets matched to gen")
ax.text(0.01, 0.95, "AK8 jets", transform=ax.transAxes)
plt.ylim(0, 2)
plt.savefig("{}/ak8_jet_match_frac.pdf".format(outpath))

## MET

In [None]:
# MET spectrum: distributions in the upper panel, ratio to gen in the lower.
f, (a0, a1) = plt.subplots(2, 1, gridspec_kw={"height_ratios": [3, 1]}, sharex=True)

# The gen-level histogram is taken from the baseline (PF) files only.
h0 = to_bh(data_baseline["GenMET_pt"], met_bins)
h1 = to_bh(data_baseline["RawPFMET_pt"], met_bins)
h3 = to_bh(data_mlpf_new["RawPFMET_pt"], met_bins)

plt.sca(a0)
# binwnorm=1.0 divides each bin content by the bin width.
x0 = mplhep.histplot(h0, histtype="step", lw=2, label="gen", binwnorm=1.0, ls="--")
x1 = mplhep.histplot(h1, histtype="step", lw=2, label="PF", binwnorm=1.0, ls="-")
x3 = mplhep.histplot(h3, histtype="step", lw=2, label="MLPF", binwnorm=1.0, ls="-")

# plt.xscale("log")
plt.yscale("log")
cms_label(a0)
sample_label(a0, physics_process, x=0.01, y=0.94)
# Only the labels are taken from the axes; the legend handles are the stairs artists.
handles, labels = a0.get_legend_handles_labels()
handles = [x0[0].stairs, x1[0].stairs, x3[0].stairs]

a0.legend(handles, labels, loc=1)
plt.ylim(1, 10**5)
# The histograms are bin-width normalised and met_bins has variable widths, so the
# unit is events per GeV (as in the jet pT plots), not events per bin.
plt.ylabel("Number of events / GeV")

# Ratio panel: every histogram divided by the gen histogram (gen/gen is the reference line).
plt.sca(a1)
mplhep.histplot(h0 / h0, histtype="step", lw=2, ls="--")
mplhep.histplot(h1 / h0, histtype="step", lw=2, ls="-")
mplhep.histplot(h3 / h0, histtype="step", lw=2, ls="-")
# Fixed ratio ranges for the pile-up samples only; other samples keep the automatic range.
if folder == "QCD_PU":
    plt.ylim(-5,5)
elif folder == "TTbar_PU":
    plt.ylim(-2,5)

plt.ylabel("reco / gen")
plt.xlabel("MET [GeV]")
plt.xlim(min(met_bins), max(met_bins))

plt.savefig("{}/met.pdf".format(outpath))