In [None]:
import bz2
import glob
import os
import pickle
from functools import reduce

import awkward
import fastjet
import matplotlib
import matplotlib.pyplot as plt
import mplhep
import numpy as np
import pandas
import sklearn
import sklearn.metrics
import tqdm
import uproot
import vector

# Use the "CMS" matplotlib style shipped with mplhep for every figure in this notebook.
mplhep.style.use("CMS")

import sys
# Extend the import path (relative to the notebook's working directory) so that
# the project-local module imported next can be found — presumably jet_utils lives in ../../mlpf/.
sys.path += ["../../mlpf/"]

import jet_utils
# Same for the plotting helpers imported below — presumably plot_utils lives in ../../mlpf/plotting/.
sys.path += ["../../mlpf/plotting/"]

from plot_utils import ELEM_LABELS_CMS, ELEM_NAMES_CMS
from plot_utils import CLASS_LABELS_CMS, CLASS_NAMES_CMS
from plot_utils import cms_label, sample_label
from plot_utils import pid_to_text

In [None]:
def load_tree(ttree):
    """Read the generator-level branch groups of one tree into a single awkward array.

    Args:
        ttree: object exposing ``arrays(list_of_branch_names)``; in this notebook it is
            the ``pfana/pftree`` entry of a file opened with uproot.

    Returns:
        awkward.Array with the fields ``pythia``, ``cp``, ``genjet`` and ``genmet``,
        each holding the result of ``ttree.arrays`` for the corresponding branch list.
    """
    # One ttree.arrays() call per group, issued in the order listed here.
    branch_groups = {
        "pythia": ["gen_pt", "gen_eta", "gen_phi", "gen_energy", "gen_pdgid", "gen_status", "gen_daughters"],
        "cp": ["caloparticle_pt", "caloparticle_eta", "caloparticle_phi", "caloparticle_energy", "caloparticle_pid"],
        "genjet": ["genjet_pt", "genjet_eta", "genjet_phi", "genjet_energy"],
        "genmet": ["genmet_pt"],
    }
    return awkward.Array({group: ttree.arrays(names) for group, names in branch_groups.items()})

def med_iqr(arr):
    """Return the median and interquartile range of a 1D collection of numbers.

    Args:
        arr: sized, array-like collection of values.

    Returns:
        Tuple ``(median, iqr)`` where ``iqr`` is the 75th minus the 25th percentile.
        An empty input yields ``(0.0, 0.0)``.
    """
    # Guard first: np.percentile is not meaningful on an empty input.
    if len(arr) == 0:
        return 0.0, 0.0
    lower_quartile, median, upper_quartile = np.percentile(arr, [25, 50, 75])
    return median, upper_quartile - lower_quartile

In [None]:
#Download from https://jpata.web.cern.ch/jpata/mlpf/cms/20240823_simcluster/nopu/

# Alternative sample (swap with the block below to use it):
# sample_name = "cms_pf_qcd"
# sample_folder = "QCDForPF_14TeV_TuneCUETP8M1_cfi"
# pu_config = "pu55to75"

sample_name = "cms_pf_ztt"
sample_folder = "ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi"
pu_config = "pu55to75"

# Upper limit on the number of file pairs that are read.
max_files = 100

# Local copy of the dataset; point data_dir at your own download location.
data_dir = "/local/joosep/mlpf/cms/20240823_simcluster"
sample_dir = os.path.join(data_dir, pu_config, sample_folder)

rootfiles = sorted(glob.glob(os.path.join(sample_dir, "root", "pfntuple_*.root")))
pklfiles = sorted(glob.glob(os.path.join(sample_dir, "raw", "pfntuple_*.pkl.bz2")))

# Index both file lists by the file name up to its first dot (e.g. "pfntuple_1"),
# so that the ROOT and pickle versions of the same ntuple can be paired.
rootfiles_d = {fn.split("/")[-1].split(".")[0]: fn for fn in rootfiles}
pklfiles_d = {fn.split("/")[-1].split(".")[0]: fn for fn in pklfiles}

# Only ntuples available in both formats are used.
common_keys = sorted(rootfiles_d.keys() & pklfiles_d.keys())
if not common_keys:
    # Fail early with a readable message instead of an obscure error from concatenating nothing.
    raise FileNotFoundError("no matching ROOT/pickle file pairs found under {}".format(sample_dir))

# load_tree reads the branches eagerly, so each file can be closed right after it is read.
trees_per_file = []
for key in common_keys[:max_files]:
    with uproot.open(rootfiles_d[key]) as rootfile:
        trees_per_file.append(load_tree(rootfile["pfana/pftree"]))
tts = awkward.concatenate(trees_per_file, axis=0)

In [None]:
# Short aliases for the two particle groups read from the ROOT files;
# the cell output shows the length of each.
particles_pythia, particles_cp = tts["pythia"], tts["cp"]
(len(particles_pythia), len(particles_cp))

In [None]:
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
# Every pickle file holds a list of records; collect them into one flat list.
# extend() keeps this linear in the number of records (sum(lists, []) is quadratic),
# and the context manager closes each compressed file as soon as it has been read.
pickle_data = []
for key in common_keys[:max_files]:
    with bz2.BZ2File(pklfiles_d[key], "r") as pklfile:
        pickle_data.extend(pickle.load(pklfile))

# Convert the target/candidate tables to DataFrames and add the azimuthal angle,
# reconstructed from its stored sine and cosine.
for record in pickle_data:
    for coll in ["ytarget", "ycand"]:
        frame = pandas.DataFrame(record[coll])
        frame["phi"] = np.arctan2(frame["sin_phi"], frame["cos_phi"])
        record[coll] = frame

def _stack_records(per_record):
    """Join a list of 1D arrays into one jagged awkward array with one sub-list per input array."""
    return awkward.unflatten(awkward.concatenate(per_record), [len(a) for a in per_record])

#get awkward and flat arrays from the data
# arrs_awk keeps only rows whose id column is non-zero; arrs_flat keeps every row.
arrs_awk = {}
arrs_flat = {}

# (collection, id column used for the non-zero selection, features to extract)
collections = [
    #tracks and clusters
    ("Xelem", "typ", ["typ", "pt", "eta", "phi", "energy"]),
    #MLPF targets and PF reco
    ("ytarget", "pid", ["pid", "pt", "eta", "phi", "energy", "ispu"]),
    ("ycand", "pid", ["pid", "pt", "eta", "phi", "energy", "ispu"]),
]

for coll, id_col, feats in collections:
    arrs_awk[coll] = {}
    arrs_flat[coll] = {}
    for feat in feats:
        arrs_awk[coll][feat] = _stack_records([np.array(p[coll][feat][p[coll][id_col] != 0]) for p in pickle_data])
        arrs_flat[coll][feat] = _stack_records([np.array(p[coll][feat]) for p in pickle_data])

#pythia generator level particles
# Column i of each record's "pythia" table is stored under the i-th name below.
pythia_columns = ["pid", "pt", "eta", "phi", "energy"]
arrs_awk["pythia"] = {
    feat: awkward.from_regular([np.array(p["pythia"][:, icol]) for p in pickle_data])
    for icol, feat in enumerate(pythia_columns)
}

#genMet, genJets from CMSSW (should be the same as computed from Pythia)
genmet_cmssw = np.array([p["genmet"][0, 0] for p in pickle_data])

# The last axis of the "genjet" table is indexed as (pt, eta, phi, energy).
genjet_table = awkward.from_regular([p["genjet"] for p in pickle_data])
genjet_cmssw = vector.awk(
    awkward.zip(
        {name: genjet_table[:, :, icol] for icol, name in enumerate(["pt", "eta", "phi", "energy"])}
    )
)

def _met_from_particles(pt, phi):
    """Magnitude of the vector sum of the particles' transverse momenta, computed per event.

    Args:
        pt: jagged array of particle transverse momenta (one sub-list per event).
        phi: jagged array of particle azimuthal angles, same structure as ``pt``.

    Returns:
        One value per event: sqrt((sum px)^2 + (sum py)^2).
    """
    # The components have to be summed over the particles BEFORE squaring;
    # summing px^2 + py^2 per particle would only give sqrt(sum pT^2).
    sum_px = awkward.sum(pt * np.cos(phi), axis=1)
    sum_py = awkward.sum(pt * np.sin(phi), axis=1)
    return np.sqrt(sum_px**2 + sum_py**2)

#MET from MLPF targets and from PF particles
ytarget_met = _met_from_particles(arrs_awk["ytarget"]["pt"], arrs_awk["ytarget"]["phi"])
ycand_met = _met_from_particles(arrs_awk["ycand"]["pt"], arrs_awk["ycand"]["phi"])

In [None]:
# Particle pT spectra: generator particles, CaloParticles and MLPF targets (with and without PU).
b = np.logspace(-4,4,100)
fig = plt.figure()
ax = plt.axes()

# Selection of generator particles: status 1, excluding |pdgid| 12, 14 and 16.
# (A disabled alternative additionally accepted status-2 particles without daughters.)
abs_pid = np.abs(particles_pythia["gen_pdgid"])
mask_pythia_nonu = (
    (particles_pythia["gen_status"]==1) & (abs_pid!=12) & (abs_pid!=14) & (abs_pid!=16)
)
# Despite its name, pu_mask is True for targets NOT flagged as pileup (ispu < 0.5).
pu_mask = arrs_awk["ytarget"]["ispu"]<0.5
mask_cp = np.abs(particles_cp["caloparticle_eta"])<5

plt.hist(awkward.flatten(particles_pythia[mask_pythia_nonu]["gen_pt"]), bins=b, label="Pythia", histtype="step")
plt.hist(awkward.flatten(particles_cp[mask_cp]["caloparticle_pt"]), bins=b, label="CaloParticle", histtype="step")
plt.hist(awkward.flatten(arrs_awk["ytarget"]["pt"]), bins=b, label="MLPF target", histtype="step")
plt.hist(awkward.flatten(arrs_awk["ytarget"]["pt"][pu_mask]), bins=b, label="MLPF target, no PU", histtype="step")

plt.xscale("log")
plt.yscale("log")
plt.xlabel("Particle $p_T$ [GeV]")
plt.legend(loc=1, fontsize=12)
plt.ylim(1, 1e7)

cms_label(ax)
sample_label(ax, sample_name)

# The file name is built from the sample NAME; formatting the sample_label function
# would put its repr ("<function sample_label at 0x...>") into the file name.
plt.savefig("{}_particle_pt.pdf".format(sample_name))

In [None]:
# Particle pT spectra split by |pdgid|, one panel per particle type found among the
# selected generator particles.
b = np.logspace(-4,4,100)

pid1 = np.abs(particles_pythia["gen_pdgid"])
pid2 = np.abs(particles_cp["caloparticle_pid"])
pid3 = np.abs(arrs_awk["ytarget"]["pid"])

uniq_pid = np.unique(awkward.flatten(pid1[mask_pythia_nonu]))

# Keep the 4x4 layout, but add rows when there are more than 16 particle types
# so that indexing the axes cannot run past the end of the grid.
n_cols = 4
n_rows = max(4, -(-len(uniq_pid) // n_cols))
fig, axs = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows))
axs = axs.flatten()

# Every value in uniq_pid occurs in pid1 by construction, so no per-pid emptiness check is needed.
for pid_ax, pid in zip(axs, uniq_pid):
    plt.sca(pid_ax)
    plt.hist(awkward.flatten(particles_pythia[mask_pythia_nonu & (pid1==pid)]["gen_pt"]), bins=b, label="Pythia", histtype="step")
    plt.hist(awkward.flatten(particles_cp[mask_cp & (pid2==pid)]["caloparticle_pt"]), bins=b, label="CaloParticle", histtype="step")
    plt.hist(awkward.flatten(arrs_awk["ytarget"]["pt"][pid3==pid]), bins=b, label="MLPF target", histtype="step")
    plt.hist(awkward.flatten(arrs_awk["ytarget"]["pt"][(pid3==pid) & pu_mask]), bins=b, label="MLPF target, no PU", histtype="step")

    plt.xscale("log")
    plt.yscale("log")
    plt.xlabel("Particle $p_T$ [GeV]")
    plt.legend(loc=1, fontsize=8)
    plt.title(pid, fontsize=12)
    plt.ylim(1, 1e6)
plt.tight_layout()
# Use the sample name (not the sample_label function) in the output file name.
plt.savefig("{}_particle_pt_separate.pdf".format(sample_name))

In [None]:
# Cluster jets from each particle collection with the same jet definition
# (anti-kt, second argument 0.4) and keep jets returned by inclusive_jets(min_pt=3).
jets_coll = {}
jetdef = fastjet.JetDefinition(fastjet.antikt_algorithm, 0.4)

kinematics = ["pt", "eta", "phi", "energy"]
gen_selected = particles_pythia[mask_pythia_nonu]
cp_selected = particles_cp[mask_cp]

# collection name -> (pt, eta, phi, energy) arrays of the particles to cluster
jet_inputs = {
    "pythia_nonu": tuple(gen_selected["gen_" + feat] for feat in kinematics),
    "cp": tuple(cp_selected["caloparticle_" + feat] for feat in kinematics),
    "ytarget": tuple(arrs_awk["ytarget"][feat] for feat in kinematics),
    "ycand": tuple(arrs_awk["ycand"][feat] for feat in kinematics),
    "ytarget_nopu": tuple(arrs_awk["ytarget"][feat][pu_mask] for feat in kinematics),
}

for jet_name, particle_kin in jet_inputs.items():
    vec = vector.awk(awkward.zip(dict(zip(kinematics, particle_kin))))
    cluster = fastjet.ClusterSequence(vec.to_xyzt(), jetdef)
    jets_coll[jet_name] = cluster.inclusive_jets(min_pt=3)

# Gen-jets read from the pickle files are stored as-is, without re-clustering.
jets_coll["genjet"] = genjet_cmssw

In [None]:
# Jet pT spectra of the clustered collections on log-log axes.
bins = np.logspace(0, 4, 100)
fig = plt.figure()
ax = plt.axes()

jet_curves = [
    ("pythia_nonu", "Pythia"),
    ("cp", "CaloParticle"),
    ("ytarget", "MLPF target"),
    ("ytarget_nopu", "MLPF target, no PU"),
]
for jet_key, legend_name in jet_curves:
    plt.hist(awkward.flatten(jets_coll[jet_key].pt), histtype="step", bins=bins, label=legend_name)

plt.xscale("log")
plt.yscale("log")
plt.legend()
sample_label(ax, sample_name)
cms_label(ax)
plt.xlabel("jet $p_T$ [GeV]")
plt.ylim(1, 1e6)

In [None]:
# Jet eta distributions of the clustered collections.
bins = np.linspace(-5, 5, 100)

fig = plt.figure()
ax = plt.axes()

plt.hist(awkward.flatten(jets_coll["pythia_nonu"].eta), histtype="step", bins=bins, label="Pythia")
plt.hist(awkward.flatten(jets_coll["cp"].eta), histtype="step", bins=bins, label="CaloParticle")
plt.hist(awkward.flatten(jets_coll["ytarget"].eta), histtype="step", bins=bins, label="MLPF target")
plt.hist(awkward.flatten(jets_coll["ytarget_nopu"].eta), histtype="step", bins=bins, label="MLPF target, no PU")

# The curves carry labels, so draw the legend; without it they cannot be told apart.
plt.legend()
sample_label(ax, sample_name)
cms_label(ax)
# Raw string: "\e" is not a valid escape sequence in a normal string literal.
plt.xlabel(r"jet $\eta$")

In [None]:
def _match_to_pythia(other_coll):
    """Match the "pythia_nonu" jets to ``other_coll`` in jets_coll, always passing 0.1 as the last argument."""
    return jet_utils.match_two_jet_collections(jets_coll, "pythia_nonu", other_coll, 0.1)

pythia_to_cp = _match_to_pythia("cp")
pythia_to_ytarget = _match_to_pythia("ytarget")
pythia_to_ytarget_nopu = _match_to_pythia("ytarget_nopu")
pythia_to_ycand = _match_to_pythia("ycand")

In [None]:
# Fraction of Pythia jets that were matched to each of the other collections.
n_pythia_jets = np.sum(awkward.num(jets_coll["pythia_nonu"], axis=1))

def _matched_fraction(matched):
    """Number of matched "pythia_nonu" entries divided by the total number of Pythia jets."""
    return np.sum(awkward.num(matched["pythia_nonu"])) / n_pythia_jets

fm_cp = _matched_fraction(pythia_to_cp)
fm_tg = _matched_fraction(pythia_to_ytarget)
fm_tg_nopu = _matched_fraction(pythia_to_ytarget_nopu)
fm_pf = _matched_fraction(pythia_to_ycand)

In [None]:
# Jet response (matched jet pT / Pythia jet pT) on a linear scale, zoomed to 0.5-1.5.
plt.figure()
ax = plt.axes()
b = np.linspace(0.5,1.5,101)

# (legend name, key in jets_coll, matching result, matched fraction, histogram type)
response_curves = [
    ("CaloParticle", "cp", pythia_to_cp, fm_cp, "bar"),
    ("MLPF target", "ytarget", pythia_to_ytarget, fm_tg, "step"),
    # the no-PU curve reports its own matched fraction, fm_tg_nopu
    ("MLPF target, no PU", "ytarget_nopu", pythia_to_ytarget_nopu, fm_tg_nopu, "step"),
    ("PF", "ycand", pythia_to_ycand, fm_pf, "step"),
]

for legend_name, jet_key, matched, frac, histtype in response_curves:
    matched_pt = jets_coll[jet_key][matched[jet_key]].pt
    pythia_pt = jets_coll["pythia_nonu"][matched["pythia_nonu"]].pt
    ratio = awkward.flatten(matched_pt / pythia_pt)
    med, iqr = med_iqr(ratio)
    plt.hist(
        ratio, bins=b, histtype=histtype, lw=1,
        label="{} (M={:.2f}, IQR={:.2f}, f={:.2f})".format(legend_name, med, iqr, frac),
    )

plt.xlabel("jet $p_T$ / gen-jet $p_T$")
plt.legend(loc=1, fontsize=10)
cms_label(ax)
sample_label(ax, sample_name)
# Use the sample name (not the sample_label function) in the output file name.
plt.savefig("{}_truth_target_jets.pdf".format(sample_name))

In [None]:
# Jet response (matched jet pT / Pythia jet pT) over the wide range 0-5 with a log y-axis,
# to show the tails of the distributions.
plt.figure()
ax = plt.axes()
b = np.linspace(0,5,101)

# (legend name, key in jets_coll, matching result, matched fraction, histogram type)
response_curves = [
    ("CaloParticle", "cp", pythia_to_cp, fm_cp, "bar"),
    ("MLPF target", "ytarget", pythia_to_ytarget, fm_tg, "step"),
    # the no-PU curve reports its own matched fraction, fm_tg_nopu
    ("MLPF target, no PU", "ytarget_nopu", pythia_to_ytarget_nopu, fm_tg_nopu, "step"),
    ("PF", "ycand", pythia_to_ycand, fm_pf, "step"),
]

for legend_name, jet_key, matched, frac, histtype in response_curves:
    matched_pt = jets_coll[jet_key][matched[jet_key]].pt
    pythia_pt = jets_coll["pythia_nonu"][matched["pythia_nonu"]].pt
    ratio = awkward.flatten(matched_pt / pythia_pt)
    med, iqr = med_iqr(ratio)
    plt.hist(
        ratio, bins=b, histtype=histtype, lw=1,
        label="{} (M={:.2f}, IQR={:.2f}, f={:.2f})".format(legend_name, med, iqr, frac),
    )

plt.yscale("log")
plt.xlabel("jet $p_T$ / gen-jet $p_T$")
plt.legend(loc=(0.55, 0.8), fontsize=10)
cms_label(ax)
sample_label(ax, sample_name)
# Distinct "_log" file name so this figure does not overwrite the linear-scale
# "<sample>_truth_target_jets.pdf"; also uses sample_name, not the sample_label function.
plt.savefig("{}_truth_target_jets_log.pdf".format(sample_name))

In [None]:
import pickle

In [None]:
def add_results(d0, d1):
    """Combine two result dictionaries into a new one.

    Keys present in both inputs map to ``d0[key] + d1[key]``; keys present in only
    one input are carried over with their value unchanged. Neither input is modified.

    Args:
        d0: first dictionary.
        d1: second dictionary.

    Returns:
        A new dictionary holding the union of the keys.
    """
    # Start from a shallow copy of d1, then fold in d0 entry by entry.
    combined = dict(d1)
    for key, value in d0.items():
        combined[key] = value + d1[key] if key in d1 else value
    return combined

In [None]:
# Load every result file matching the pattern and merge them key by key with add_results.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
files = []
for fn in sorted(glob.glob("../../out8.pkl")):
    # Context manager so that each file handle is closed right after loading.
    with open(fn, "rb") as fi:
        files.append(pickle.load(fi))

# reduce comes from functools (imported at the top of the notebook).
ret = reduce(add_results, files, {})
for k in sorted(ret.keys()):
    print(k)

In [None]:
import mplhep

In [None]:
# For each sample, overlay the stored "jets_pt_*" histograms from `ret` in one figure.
jet_pt_kinds = ["pythia", "cand", "caloparticle", "target", "target_pumask"]
for sample in ["TTbar_14TeV_TuneCUETP8M1_cfi", "ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi"]:
    plt.figure()
    for kind in jet_pt_kinds:
        mplhep.histplot(ret[f"{sample}/jets_pt_{kind}"])
    plt.xscale("log")

In [None]:
# For each sample, overlay the stored "jets_pt_ratio_*" histograms from `ret`,
# each indexed with bh.rebin(rebin) before plotting, on a log y-axis.
# NOTE(review): `bh` is not defined in any visible cell, so this cell raises NameError
# on a fresh kernel. It is presumably boost_histogram (`import boost_histogram as bh`) —
# confirm and add the import to the top import cell.
rebin = 10
for sample in ["TTbar_14TeV_TuneCUETP8M1_cfi", "ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi"]:
    plt.figure()
    # The first histogram drawn is the "cand" ratio multiplied by 0.0;
    # presumably a placeholder to shift the colour cycle — TODO confirm intent.
    mplhep.histplot(0.0*ret[f"{sample}/jets_pt_ratio_cand"][bh.rebin(rebin)])
    mplhep.histplot(ret[f"{sample}/jets_pt_ratio_cand"][bh.rebin(rebin)])
    mplhep.histplot(ret[f"{sample}/jets_pt_ratio_caloparticle"][bh.rebin(rebin)])
    mplhep.histplot(ret[f"{sample}/jets_pt_ratio_target"][bh.rebin(rebin)])
    mplhep.histplot(ret[f"{sample}/jets_pt_ratio_target_pumask"][bh.rebin(rebin)])
    plt.yscale("log")