In [None]:
import sklearn
import sklearn.metrics
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import pandas
import mplhep
import pickle
import awkward
import glob
import bz2
import os
import tqdm
import fastjet
import vector
from pathlib import Path

# Use the "CMS" style shipped with mplhep for the figures created in this notebook.
mplhep.style.use("CMS")

In [None]:
import sys
# jet_utils is imported from this directory, given relative to the notebook location.
sys.path += ["../../mlpf/"]

import jet_utils

# plot_utils is imported from the plotting subdirectory, which is added separately.
sys.path += ["../../mlpf/plotting/"]

from plot_utils import ELEM_LABELS_CMS, ELEM_NAMES_CMS
from plot_utils import CLASS_LABELS_CMS, CLASS_NAMES_CMS
from plot_utils import cms_label, sample_label
from plot_utils import pid_to_text
from plot_utils import save_img

In [None]:
!ls /media/joosep/data/20240823_simcluster/pu55to75/

In [None]:
# Web location of raw files from this production (note: this URL points to the no-PU TTbar
# sample, not to the sample loaded below):
#https://jpata.web.cern.ch/jpata/mlpf/cms/20240823_simcluster/nopu/TTbar_14TeV_TuneCUETP8M1_cfi/raw

sample = "QCDForPF_14TeV_TuneCUETP8M1_cfi"
sample_name = "cms_pf_qcd"
sample_pid = 11

# Only the first 100 files (in sorted order) are read.
input_files = sorted(glob.glob("/media/joosep/data/20240823_simcluster/pu55to75/{}/raw/*.pkl.bz2".format(sample)))[:100]
if len(input_files) == 0:
    # fail here with a clear message instead of with an obscure error in a later cell
    raise FileNotFoundError("no *.pkl.bz2 input files found for sample {}".format(sample))

# Every file stores a list of per-event records; they are collected into one list.
# NOTE(review): pickle.load can execute arbitrary code, only read files from a trusted source.
pickle_data = []
for f in tqdm.tqdm(input_files):
    # the context manager closes the bz2 handle as soon as the file has been read
    with bz2.BZ2File(f, "r") as fi:
        pickle_data.extend(pickle.load(fi))

# compute phi from sin_phi, cos_phi for the ytarget and ycand
for event in pickle_data:
    for coll in ["ytarget", "ycand"]:
        df = pandas.DataFrame(event[coll])
        df["phi"] = np.arctan2(df["sin_phi"], df["cos_phi"])
        event[coll] = df

In [None]:
# Build two views of every collection from the per-event data:
#   arrs_awk  - only the entries whose typ (Xelem) or pid (ytarget, ycand) is non-zero
#   arrs_flat - all entries of each event, without any filtering
arrs_awk = {}
arrs_flat = {}

# (collection, feature that is zero for entries dropped from arrs_awk, features to extract)
collection_specs = [
    ("Xelem", "typ", ["typ", "pt", "eta", "phi", "energy"]),  # tracks and clusters
    ("ytarget", "pid", ["pid", "pt", "eta", "phi", "energy", "ispu"]),  # MLPF targets
    ("ycand", "pid", ["pid", "pt", "eta", "phi", "energy", "ispu"]),  # PF reco
]

for coll, id_feat, feats in collection_specs:
    arrs_awk[coll] = {}
    arrs_flat[coll] = {}
    for feat in feats:
        selected = [np.array(p[coll][feat][p[coll][id_feat] != 0]) for p in pickle_data]
        arrs_awk[coll][feat] = awkward.unflatten(awkward.concatenate(selected), [len(a) for a in selected])

        unfiltered = [np.array(p[coll][feat]) for p in pickle_data]
        arrs_flat[coll][feat] = awkward.unflatten(awkward.concatenate(unfiltered), [len(a) for a in unfiltered])

#pythia generator level particles: column i of the per-event "pythia" array holds feature i
arrs_awk["pythia"] = {}
for icol, feat in enumerate(["pid", "pt", "eta", "phi", "energy"]):
    per_event = [np.array(p["pythia"][:, icol]) for p in pickle_data]
    arrs_awk["pythia"][feat] = awkward.from_regular(per_event)

#genMet, genJets from CMSSW (should be the same as computed from Pythia)
genmet_cmssw = np.array([p["genmet"][0, 0] for p in pickle_data])

# the last axis of the per-event "genjet" array is read as (pt, eta, phi, energy)
genjet_arr = awkward.from_regular([p["genjet"] for p in pickle_data])
genjet_fields = {
    name: genjet_arr[:, :, icol]
    for icol, name in enumerate(["pt", "eta", "phi", "energy"])
}
genjet_cmssw = vector.awk(awkward.zip(genjet_fields))

#MET from MLPF targets and from PF particles
# MET is the magnitude of the vector sum of the particle transverse momenta, so px and py are
# summed over the particles of each event *before* squaring. Squaring per particle first
# would give sqrt(sum pT^2), which is not the MET.
ytarget_px = arrs_awk["ytarget"]["pt"] * np.cos(arrs_awk["ytarget"]["phi"])
ytarget_py = arrs_awk["ytarget"]["pt"] * np.sin(arrs_awk["ytarget"]["phi"])
ytarget_met = np.sqrt(awkward.sum(ytarget_px, axis=1)**2 + awkward.sum(ytarget_py, axis=1)**2)

ycand_px = arrs_awk["ycand"]["pt"] * np.cos(arrs_awk["ycand"]["phi"])
ycand_py = arrs_awk["ycand"]["pt"] * np.sin(arrs_awk["ycand"]["phi"])
ycand_met = np.sqrt(awkward.sum(ycand_px, axis=1)**2 + awkward.sum(ycand_py, axis=1)**2)

In [None]:
# arrs_awk holds jagged (awkward) arrays: for Xelem the entries with typ == 0 and for
# ytarget/ycand the entries with pid == 0 were dropped when it was filled, so each event can
# contain a different number of Xelem, ytarget, ycand.
arrs_awk

In [None]:
# arrs_flat keeps every entry of each event (nothing is filtered out). Within an event the
# Xelem, ytarget and ycand arrays are expected to have the same length, so you can match
# e.g. Xelem to ytarget or ycand by indices.
arrs_flat

In [None]:
# Distinct pid values of the Pythia particles over all events, and how often each one occurs.
np.unique(awkward.flatten(arrs_awk["pythia"]["pid"]), return_counts=True)

In [None]:
#Cluster MLPF target jets, PF jets, and Pythia genjets
jets_coll = {"cmssw": genjet_cmssw}

#Mask that removes target particles from PU
msk_nopu = arrs_awk["ytarget"]["ispu"] < 0.5

# Particle kinematics per collection to be clustered; "ytarget_nopu" is "ytarget" with the
# PU mask applied.
kin_feats = ["pt", "eta", "phi", "energy"]
particles_to_cluster = {
    name: {feat: arrs_awk[name][feat] for feat in kin_feats}
    for name in ["ytarget", "ycand", "pythia"]
}
particles_to_cluster["ytarget_nopu"] = {feat: arrs_awk["ytarget"][feat][msk_nopu] for feat in kin_feats}

# anti-kt clustering with parameter 0.4, keeping jets returned by inclusive_jets(min_pt=3)
for coll, kinematics in particles_to_cluster.items():
    vec = vector.awk(awkward.zip(kinematics))
    jetdef = fastjet.JetDefinition(fastjet.antikt_algorithm, 0.4)
    cluster = fastjet.ClusterSequence(vec.to_xyzt(), jetdef)
    jets_coll[coll] = cluster.inclusive_jets(min_pt=3)

# Match jet collections pairwise; 0.1 is the matching threshold passed to jet_utils
# (presumably a deltaR cut, see jet_utils.match_two_jet_collections).
cmssw_to_ytarget_nopu = jet_utils.match_two_jet_collections(jets_coll, "cmssw", "ytarget_nopu", 0.1)
cmssw_to_ytarget = jet_utils.match_two_jet_collections(jets_coll, "cmssw", "ytarget", 0.1)
cmssw_to_ycand = jet_utils.match_two_jet_collections(jets_coll, "cmssw", "ycand", 0.1)

pythia_to_ytarget = jet_utils.match_two_jet_collections(jets_coll, "pythia", "ytarget", 0.1)
pythia_to_ycand = jet_utils.match_two_jet_collections(jets_coll, "pythia", "ycand", 0.1)

pythia_to_cmssw = jet_utils.match_two_jet_collections(jets_coll, "pythia", "cmssw", 0.1)

In [None]:
# Distribution of the isPU value of all MLPF target particles
plt.figure()
ax = plt.axes()
ispu_values = awkward.flatten(arrs_awk["ytarget"]["ispu"])
ax.hist(ispu_values, bins=np.linspace(0, 1, 101), histtype="step")
ax.set_yscale("log")
ax.set_xlabel("PU fraction")
cms_label(ax)
sample_label(ax, sample_name)
save_img("{}_pu_frac.png".format(sample), cp_dir=Path("./"))

In [None]:
# Fraction of MLPF target particles with isPU > 0.5 in bins of particle pT.
ptbins = np.logspace(-3,3,100)

target_pt = awkward.to_numpy(awkward.flatten(arrs_awk["ytarget"]["pt"]))
target_is_pu = awkward.to_numpy(awkward.flatten(arrs_awk["ytarget"]["ispu"])) > 0.5

# Histogram all particles and the PU-flagged ones once, instead of re-masking the full
# arrays for every bin. (np.histogram includes the right edge of the last bin.)
n_all = np.histogram(target_pt, bins=ptbins)[0]
n_pu = np.histogram(target_pt[target_is_pu], bins=ptbins)[0]

# Bins without any particle have no defined fraction: they stay NaN and are not drawn.
ispu_fracs = np.full(len(n_all), np.nan)
np.divide(n_pu, n_all, out=ispu_fracs, where=n_all > 0)

plt.figure(figsize=(5,5))
# points are drawn at the lower edge of each bin
plt.plot(ptbins[:-1], ispu_fracs, marker=".")
plt.xscale("log")
plt.ylim(0.0, 1.2)
plt.axhline(1.0, color="black", ls="--")
# no plt.legend() here: nothing on this figure carries a label
plt.xlabel("particle $p_T$ [GeV]")
plt.ylabel("fraction with isPU>0.5")

In [None]:
# Fraction of MLPF target particles with isPU > 0.5 in bins of particle eta.
etabins = np.linspace(-5,5,100)

target_eta = awkward.to_numpy(awkward.flatten(arrs_awk["ytarget"]["eta"]))
target_is_pu = awkward.to_numpy(awkward.flatten(arrs_awk["ytarget"]["ispu"])) > 0.5

# Histogram all particles and the PU-flagged ones once, instead of re-masking the full
# arrays for every bin. (np.histogram includes the right edge of the last bin.)
n_all = np.histogram(target_eta, bins=etabins)[0]
n_pu = np.histogram(target_eta[target_is_pu], bins=etabins)[0]

# Bins without any particle have no defined fraction: they stay NaN and are not drawn.
ispu_fracs = np.full(len(n_all), np.nan)
np.divide(n_pu, n_all, out=ispu_fracs, where=n_all > 0)

plt.figure(figsize=(5,5))
# points are drawn at the lower edge of each bin
plt.plot(etabins[:-1], ispu_fracs, marker=".")
plt.ylim(0.0, 1.2)
plt.axhline(1.0, color="black", ls="--")
# no plt.legend() here: nothing on this figure carries a label
# raw string: "\e" is not a valid escape sequence in a normal string literal
plt.xlabel(r"particle $\eta$")
plt.ylabel("fraction with isPU>0.5")

## Particles

In [None]:
# pT spectrum of all Pythia particles compared to all MLPF target particles
b = np.logspace(-3, 4, 500)
plt.figure(figsize=(5, 5))
for coll, label in [("pythia", "Pythia"), ("ytarget", "MLPF target")]:
    particle_pt = awkward.flatten(arrs_awk[coll]["pt"])
    plt.hist(particle_pt, bins=b, label=label, histtype="step")
plt.xscale("log")
plt.yscale("log")
plt.legend(loc="best")

In [None]:
# Per-event sum of pT over the particles whose pid equals sample_pid, Pythia vs. MLPF target
fig = plt.figure()
ax = plt.axes()
b = np.logspace(-3, 4, 500)
for coll, label in [("pythia", "Pythia"), ("ytarget", "MLPF target")]:
    has_sample_pid = arrs_awk[coll]["pid"] == sample_pid
    sum_pt = awkward.sum(arrs_awk[coll]["pt"][has_sample_pid], axis=1)
    plt.hist(sum_pt, bins=b, label=label, histtype="step")
plt.xscale("log")
plt.yscale("log")
plt.legend(loc="best")
plt.xlabel("Sum $p_T$ [GeV]")
cms_label(ax)
sample_label(ax, sample_name)
save_img("{}_particle_sumpt.png".format(sample), cp_dir=Path("./"))

## Jets

In [None]:
# Jet pT spectra: CMSSW genJets vs. jets clustered from MLPF targets (with and without PU)
b = np.logspace(0, 4, 401)
plt.figure(figsize=(5, 5))
jet_labels = [
    ("cmssw", "genJet"),
    ("ytarget", "MLPF target"),
    ("ytarget_nopu", "MLPF target, no PU"),
]
for coll, label in jet_labels:
    jet_pt = np.abs(awkward.flatten(jets_coll[coll].pt))
    plt.hist(jet_pt, bins=b, histtype="step", label=label)
plt.xscale("log")
plt.yscale("log")
plt.legend(loc=1, fontsize=12)
plt.xlabel("jet $p_T$ [GeV]")

In [None]:
# Absolute jet eta: CMSSW genJets vs. jets clustered from MLPF targets (with and without PU)
b = np.linspace(0,6,401)
plt.figure(figsize=(5,5))
jet_labels = [
    ("cmssw", "genJet"),
    ("ytarget", "MLPF target"),
    ("ytarget_nopu", "MLPF target, no PU"),
]
for coll, label in jet_labels:
    plt.hist(np.abs(awkward.flatten(jets_coll[coll].eta)), bins=b, histtype="step", label=label)
plt.legend(loc=1, fontsize=12)
# raw string: "\e" is not a valid escape sequence in a normal string literal
plt.xlabel(r"jet $\eta$")

### One event display

In [None]:
# Index of the event to draw
iev = 2

plt.figure(figsize=(10,10))

# (eta, phi, pt, marker, legend label) of everything drawn for this event;
# marker=None selects matplotlib's default scatter marker
layers = [
    (arrs_awk["pythia"]["eta"][iev], arrs_awk["pythia"]["phi"][iev], arrs_awk["pythia"]["pt"][iev], None, "Pythia ptcl"),
    (arrs_awk["ytarget"]["eta"][iev], arrs_awk["ytarget"]["phi"][iev], arrs_awk["ytarget"]["pt"][iev], "s", "MLPF target"),
    (jets_coll["cmssw"].eta[iev], jets_coll["cmssw"].phi[iev], jets_coll["cmssw"].pt[iev], "v", "genJets"),
    (jets_coll["ytarget"].eta[iev], jets_coll["ytarget"].phi[iev], jets_coll["ytarget"].pt[iev], "^", "MLPF target jets"),
]
for eta, phi, pt, marker, label in layers:
    # marker area scales with the pT of the object
    plt.scatter(eta, phi, s=5*pt, marker=marker, alpha=0.5, label=label)

plt.legend(ncols=2, frameon=True)
# raw strings: "\e" and "\p" are not valid escape sequences in normal string literals
plt.xlabel(r"$\eta$")
plt.ylabel(r"$\phi$")
plt.xlim(-6,6)
plt.ylim(-5,5)

## Jet response

In [None]:
# pT ratio of matched jet pairs: jets clustered from Pythia particles over CMSSW genJets
plt.figure(figsize=(5, 5))
b = np.logspace(-2, 2, 600)

pythia_jet_pt = jets_coll["pythia"][pythia_to_cmssw["pythia"]].pt
cmssw_jet_pt = jets_coll["cmssw"][pythia_to_cmssw["cmssw"]].pt
plt.hist(awkward.flatten(pythia_jet_pt / cmssw_jet_pt), bins=b, histtype="step", lw=1, label="Pythia");

plt.xscale("log")
plt.yscale("log")
plt.xlabel("jet $p_T$ / genjet $p_T$")
plt.legend(loc=1, fontsize=12)
plt.axvline(1.0, color="black", ls="--", lw=0.5)

In [None]:
def plot_jet_ratio_ptcut(ptcut1, ptcut2):
    """Histogram the jet pT / genjet pT ratio of matched jets on log-log axes.

    Draws one curve each for PF, MLPF target and MLPF target without PU, using the
    notebook-level ``jets_coll`` and the ``cmssw_to_*`` matching results.

    Args:
        ptcut1: lower bound (inclusive) on the matched genjet pT.
        ptcut2: upper bound (exclusive) on the matched genjet pT.
    """
    plt.figure(figsize=(5, 5))
    b = np.logspace(-1, 1, 600)

    # (reco jet collection, its matching to the CMSSW genjets, legend label)
    comparisons = [
        ("ycand", cmssw_to_ycand, "PF"),
        ("ytarget", cmssw_to_ytarget, "MLPF target"),
        ("ytarget_nopu", cmssw_to_ytarget_nopu, "MLPF target, no PU"),
    ]
    for reco_name, matched, label in comparisons:
        gen_pt = jets_coll["cmssw"][matched["cmssw"]].pt
        reco_pt = jets_coll[reco_name][matched[reco_name]].pt
        in_range = (gen_pt >= ptcut1) & (gen_pt < ptcut2)
        plt.hist(awkward.flatten((reco_pt / gen_pt)[in_range]), bins=b, histtype="step", lw=1, label=label)

    plt.xscale("log")
    plt.yscale("log")
    plt.xlabel("jet $p_T$ / genjet $p_T$")
    plt.legend(loc=2, fontsize=12)
    plt.axvline(1.0, color="black", ls="--", lw=0.5)

def plot_jet_ratio_ptcut2(ptcut1, ptcut2):
    """Histogram the jet pT / genjet pT ratio between 0.5 and 1.5 and save the figure.

    Draws MLPF target (filled bars) and PF (step line) using the notebook-level
    ``jets_coll`` and ``cmssw_to_*`` matching results, adds the CMS and sample labels
    and writes ``<sample>_jet_pt_ratio.png`` through ``save_img``.

    Args:
        ptcut1: lower bound (inclusive) on the matched genjet pT.
        ptcut2: upper bound (exclusive) on the matched genjet pT.
    """
    fig = plt.figure()
    ax = plt.axes()
    b = np.linspace(0.5, 1.5, 100)

    # (reco jet collection, its matching to the CMSSW genjets, legend label, drawing options)
    comparisons = [
        ("ytarget", cmssw_to_ytarget, "MLPF target", dict(histtype="bar", lw=1)),
        ("ycand", cmssw_to_ycand, "PF", dict(histtype="step", lw=2)),
    ]
    for reco_name, matched, label, draw_opts in comparisons:
        gen_pt = jets_coll["cmssw"][matched["cmssw"]].pt
        reco_pt = jets_coll[reco_name][matched[reco_name]].pt
        in_range = (gen_pt >= ptcut1) & (gen_pt < ptcut2)
        plt.hist(awkward.flatten((reco_pt / gen_pt)[in_range]), bins=b, label=label, **draw_opts)

    plt.xlabel("jet $p_T$ / genjet $p_T$")
    plt.legend(loc=1, fontsize=12)
    plt.axvline(1.0, color="black", ls="--", lw=0.5)
    cms_label(ax)
    sample_label(ax, sample_name)
    plt.yscale("log")
    save_img("{}_jet_pt_ratio.png".format(sample), cp_dir=Path("./"))

In [None]:
# Jet response for matched genjets with 0 <= pT < 500, with a fixed y-axis range.
plot_jet_ratio_ptcut(0,500)
plt.ylim(1, 1e4)

In [None]:
# Jet response for matched genjets with 0 <= pT < 1000; this call also saves the figure.
plot_jet_ratio_ptcut2(0,1000)
#plt.ylim(0,75000)

In [None]:
def met(pt, phi):
    """Return, per event, the magnitude of the vector sum of the particle (pt, phi).

    Args:
        pt: per-event jagged array of particle transverse momenta.
        phi: per-event jagged array of particle azimuthal angles, same structure as ``pt``.

    Returns:
        One value per event: sqrt((sum px)^2 + (sum py)^2), summing over axis 1.
    """
    sum_px = awkward.sum(pt * np.cos(phi), axis=1)
    sum_py = awkward.sum(pt * np.sin(phi), axis=1)
    return np.sqrt(sum_px**2 + sum_py**2)

In [None]:
# MET distributions: CMSSW genMET vs. MET computed from PF and from the MLPF targets
b = np.logspace(-2, 4, 100)

# target particles that are not flagged as PU
target_nopu = arrs_awk["ytarget"]["ispu"] < 0.5

met_curves = [
    ("genMET", genmet_cmssw),
    ("PF", met(arrs_awk["ycand"]["pt"], arrs_awk["ycand"]["phi"])),
    ("MLPF targets", met(arrs_awk["ytarget"]["pt"], arrs_awk["ytarget"]["phi"])),
    ("MLPF targets, no PU", met(arrs_awk["ytarget"]["pt"][target_nopu], arrs_awk["ytarget"]["phi"][target_nopu])),
]
for label, values in met_curves:
    plt.hist(awkward.to_numpy(values), bins=b, histtype="step", lw=2, label=label)

plt.legend(loc=2)
plt.yscale("log")
plt.xscale("log")
plt.xlabel("MET [GeV]")

## Matching of PFElements to PF candidates or MLPF targets 

In [None]:
def plot_element_matching_frac(elemtype):
    """Plot how often PFElements of one type are matched to a target or PF candidate.

    Two figures are created from the notebook-level ``arrs_flat``: the energy spectrum
    of the selected PFElements, and the fraction of them, in bins of element energy,
    whose index-aligned ytarget (MLPF target) or ycand (PF) entry has pid != 0.

    Args:
        elemtype: value of ``Xelem["typ"]`` that selects the PFElements.
    """
    bins = np.logspace(-1, 3, 200)

    msk = arrs_flat["Xelem"]["typ"]==elemtype

    gen_pid = awkward.to_numpy(awkward.flatten(arrs_flat["ytarget"]["pid"][msk]))
    cand_pid = awkward.to_numpy(awkward.flatten(arrs_flat["ycand"]["pid"][msk]))
    elem_energy = awkward.to_numpy(awkward.flatten(arrs_flat["Xelem"]["energy"][msk]))

    # Energy spectrum of the selected elements. The original code referenced an undefined
    # name (track_pt) here, which raised NameError on every call.
    plt.figure(figsize=(5,5))
    plt.hist(elem_energy, bins);
    plt.xscale("log")
    plt.yscale("log")
    plt.xlabel("PFElement $E$ [GeV]")

    # Count all and matched elements per energy bin in one pass each.
    # (np.histogram includes the right edge of the last bin.)
    n_elem = np.histogram(elem_energy, bins=bins)[0]
    n_gen = np.histogram(elem_energy[gen_pid != 0], bins=bins)[0]
    n_cand = np.histogram(elem_energy[cand_pid != 0], bins=bins)[0]

    # Bins without any element have no defined fraction: they stay NaN and are not drawn.
    fracs_gen = np.full(len(n_elem), np.nan)
    fracs_cand = np.full(len(n_elem), np.nan)
    np.divide(n_gen, n_elem, out=fracs_gen, where=n_elem > 0)
    np.divide(n_cand, n_elem, out=fracs_cand, where=n_elem > 0)

    plt.figure(figsize=(5,5))
    # points are drawn at the lower edge of each bin
    plt.plot(bins[:-1], fracs_gen, marker=".", label="MLPF target")
    plt.plot(bins[:-1], fracs_cand, marker=".", label="PF")
    plt.xscale("log")
    plt.ylabel("matched PFElements")
    plt.xlabel("PFElement $E$ [GeV]")
    plt.ylim(0,1.1)
    plt.legend(loc="best")

In [None]:
# PFElement type 1 -- NOTE(review): presumably tracks; confirm against ELEM_NAMES_CMS
plot_element_matching_frac(1)

In [None]:
# PFElement type 4 -- NOTE(review): presumably ECAL clusters; confirm against ELEM_NAMES_CMS
plot_element_matching_frac(4)

In [None]:
# PFElement type 5 -- NOTE(review): presumably HCAL clusters; confirm against ELEM_NAMES_CMS
plot_element_matching_frac(5)

In [None]:
# PFElement type 6 -- NOTE(review): presumably GSF tracks; confirm against ELEM_NAMES_CMS
plot_element_matching_frac(6)

In [None]:
# PFElement type 8 -- NOTE(review): presumably HF EM clusters; confirm against ELEM_NAMES_CMS
plot_element_matching_frac(8)

In [None]:
# PFElement type 9 -- NOTE(review): presumably HF hadronic clusters; confirm against ELEM_NAMES_CMS
plot_element_matching_frac(9)

In [None]:
def plot_elem_vs_particle_ptratio(elemtype):
    """Histogram the pT ratio of matched particles to their PFElement, for one element type.

    Uses the index-aligned notebook-level ``arrs_flat`` arrays. For the MLPF targets
    (ytarget) and the PF candidates (ycand), entries with pid != 0 on elements of the
    requested type are selected and particle pT / PFElement pT is drawn on log-log axes.

    Args:
        elemtype: value of ``Xelem["typ"]`` that selects the PFElements.
    """
    plt.figure(figsize=(5, 5))
    ratio_bins = np.logspace(-3, 3, 600)

    elem_pt = arrs_flat["Xelem"]["pt"]
    is_elemtype = arrs_flat["Xelem"]["typ"] == elemtype

    for coll, label in [("ytarget", "MLPF target"), ("ycand", "PF")]:
        msk = is_elemtype & (arrs_flat[coll]["pid"] != 0)
        pt_ratio = arrs_flat[coll]["pt"] / elem_pt
        plt.hist(awkward.flatten(pt_ratio[msk]), bins=ratio_bins, histtype="step", label=label)

    plt.yscale("log")
    plt.xscale("log")
    plt.xlabel("target / PFElement $p_T$")
    plt.legend(loc=1, fontsize=12)

In [None]:
# PFElement type 1 -- NOTE(review): presumably tracks; confirm against ELEM_NAMES_CMS
plot_elem_vs_particle_ptratio(1)

In [None]:
# PFElement type 4 -- NOTE(review): presumably ECAL clusters; confirm against ELEM_NAMES_CMS
plot_elem_vs_particle_ptratio(4)

In [None]:
# PFElement type 5 -- NOTE(review): presumably HCAL clusters; confirm against ELEM_NAMES_CMS
plot_elem_vs_particle_ptratio(5)

In [None]:
# PFElement type 6 -- NOTE(review): presumably GSF tracks; confirm against ELEM_NAMES_CMS
plot_elem_vs_particle_ptratio(6)

In [None]:
# PFElement type 8 -- NOTE(review): presumably HF EM clusters; confirm against ELEM_NAMES_CMS
plot_elem_vs_particle_ptratio(8)

In [None]:
# PFElement type 9 -- NOTE(review): presumably HF hadronic clusters; confirm against ELEM_NAMES_CMS
plot_elem_vs_particle_ptratio(9)