**Notebook used to make particle-level efficiency and fake-rate plots, split by detector region (|η| range) and by pT range**

- Stacked histograms
- Particle pT, eta, phi
- Efficiency and Fake rate

In [None]:
import os, sys, glob
import pickle as pkl
import uproot
import awkward as ak
import vector
import numpy as np
vector.register_awkward()

import boost_histogram as bh
import numba
import mplhep
import sklearn
import sklearn.metrics
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Circle
import tqdm

mplhep.set_style(mplhep.styles.CMS)

In [None]:
sys.path += ["../../mlpf/plotting//"]
from plot_utils import EVALUATION_DATASET_NAMES, experiment_label
from plot_utils import SAMPLE_LABEL_CMS, pid_to_text, EXPERIMENT_LABELS

In [None]:
def sample_label(ax, sample, additional_text="", x=0.03, y=0.97, fontsize=20):
    """Write the human-readable name of ``sample`` onto ``ax``.

    Args:
        ax: Matplotlib Axes that receives the label.
        sample: Key into ``EVALUATION_DATASET_NAMES``.
        additional_text: Text appended directly after the dataset name.
        x, y: Anchor position in axes-fraction coordinates (``ax.transAxes``).
        fontsize: Font size of the label.

    Raises:
        KeyError: If ``sample`` is not a key of ``EVALUATION_DATASET_NAMES``.
    """
    text = EVALUATION_DATASET_NAMES[sample]
    # Use ax.text rather than plt.text: pyplot would attach the artist to the
    # *current* Axes, which in a multi-panel figure is not necessarily ``ax``.
    ax.text(x, y, text + additional_text, ha="left", va="top", transform=ax.transAxes, fontsize=fontsize)

def cms_label(ax):
    """Draw the CMS experiment label on ``ax`` and return what ``experiment_label`` returns."""
    # NOTE(review): the tag says 14 TeV while the sample directory names used in
    # this notebook carry "13p6" -- confirm which centre-of-mass energy is meant.
    label_options = dict(
        experiment="CMS",
        tag1=" Simulation Preliminary",
        tag2="Run 3 (14 TeV)",
        x1=0.13,
    )
    return experiment_label(ax, **label_options)

In [None]:
@numba.njit
def deltaphi(phi1, phi2):
    """Return ``phi1 - phi2`` wrapped by passing its sine and cosine through ``arctan2``."""
    raw_difference = phi1 - phi2
    sine = np.sin(raw_difference)
    cosine = np.cos(raw_difference)
    return np.arctan2(sine, cosine)

@numba.njit
def deltar(eta1, phi1, eta2, phi2):
    """Return the distance sqrt(d_eta^2 + d_phi^2) between two (eta, phi) points.

    The phi difference is the wrapped one computed by ``deltaphi``.
    """
    d_phi = deltaphi(phi1, phi2)
    d_eta = eta1 - eta2
    return np.sqrt(d_eta**2 + d_phi**2)

@numba.njit
def match_particles(eta1, eta2, phi1, phi2, deltaR_cut):
    """Greedily match particles of collection 1 to collection 2 by deltaR, event by event.

    Within each event the particles of collection 1 are visited in order. Each
    one is paired with the closest collection-2 particle that has not been used
    by an earlier match, provided their deltaR is below ``deltaR_cut``. A
    collection-2 particle is therefore used at most once per event.

    Args:
        eta1, phi1: Per-event coordinates of collection 1, indexed as ``eta1[iev][ip]``.
        eta2, phi2: Per-event coordinates of collection 2, indexed the same way.
        deltaR_cut: Maximum deltaR (exclusive) for a pair to count as matched.

    Returns:
        Three lists with one inner list per event: the matched indices into
        collection 1, the corresponding indices into collection 2, and the
        deltaR of each matched pair.
    """
    nev = len(eta1)
    ptcl_inds_1_ev = []
    ptcl_inds_2_ev = []
    best_drs_ev = []
    for iev in range(nev):
        ptcl_inds_1 = []
        ptcl_inds_2 = []
        best_drs = []

        # loop over the first collection
        # pfs_used[ip2] == 1 marks collection-2 particles already taken by an earlier match
        pfs_used = np.zeros(len(eta2[iev]))
        for ip1 in range(len(eta1[iev])):
            # compute deltaR from this particle to all particles in the other collection
            # 999 is a sentinel: entries skipped below keep it and so can never
            # pass the cut (assumes deltaR_cut < 999)
            drs = 999*np.ones(len(eta2[iev]), dtype=np.float64)

            # loop over the second collection
            for ip2 in range(len(eta2[iev])):
                if pfs_used[ip2]==1:
                    continue
                _eta1 = eta1[iev][ip1]
                _eta2 = eta2[iev][ip2]
                _phi1 = phi1[iev][ip1]
                _phi2 = phi2[iev][ip2]

                dr = deltar(_eta1, _phi1, _eta2, _phi2)
                drs[ip2] = dr

            # guard against events with an empty second collection (argmin of an empty array)
            if len(drs) > 0:
                # find closest match to this particle
                min_idx_dr = np.argmin(drs)

                # has to be closer than the deltaR_cut
                if drs[min_idx_dr] < deltaR_cut:
                    ptcl_inds_1.append(ip1)
                    ptcl_inds_2.append(min_idx_dr)
                    best_drs.append(drs[min_idx_dr])
                    pfs_used[min_idx_dr] = 1

        ptcl_inds_1_ev.append(ptcl_inds_1)
        ptcl_inds_2_ev.append(ptcl_inds_2)
        best_drs_ev.append(best_drs)
    return ptcl_inds_1_ev, ptcl_inds_2_ev, best_drs_ev

def sum_overflow_into_last_bin(all_values):
    """Fold the flow bins of a 1D histogram into its edge bins.

    Despite the name, both flow bins are folded: the overflow (last entry) is
    added to the last regular bin and the underflow (first entry) to the first
    regular bin.

    Args:
        all_values: Bin contents laid out as ``[underflow, bin_0, ..., bin_n, overflow]``.

    Returns:
        The regular bin contents (without the two flow entries) after folding.
        The input sequence is left untouched.
    """
    # Copy the slice: slicing a numpy array yields a view, so writing into it
    # directly would silently modify the caller's array as well.
    values = all_values[1:-1].copy()
    values[-1] = values[-1] + all_values[-1]
    values[0] = values[0] + all_values[0]
    return values
    
def to_bh(data, bins, cumulative=False):
    """Fill a 1D boost_histogram with ``data`` and fold its flow bins into the edge bins.

    Args:
        data: Values to histogram.
        bins: Bin edges for a ``bh.axis.Variable`` axis.
        cumulative: If true, replace the contents by the total of the regular
            bins minus their running sum before the flow bins are folded.

    Returns:
        The filled ``bh.Histogram``.
    """
    hist = bh.Histogram(bh.axis.Variable(bins))
    hist.fill(data)

    if cumulative:
        total = np.sum(hist.values())
        hist[:] = total - np.cumsum(hist)

    contents_with_flow = hist.values(flow=True)[:]
    hist[:] = sum_overflow_into_last_bin(contents_with_flow)
    return hist

def binom_error(n_sig, n_tot):
    """Standard deviation of a ratio ``n_sig / n_tot`` (efficiency or purity).

    Follows http://arxiv.org/abs/physics/0701199 . Entries with ``n_tot <= 0``
    get an error of 0.
    """
    sig_plus_one = n_sig + 1
    tot_plus_two = n_tot + 2

    second_moment = sig_plus_one * (n_sig + 2) / (tot_plus_two * (n_tot + 3))
    squared_mean = sig_plus_one ** 2 / (tot_plus_two ** 2)

    variance = np.where(n_tot > 0, second_moment - squared_mean, 0)
    return np.sqrt(variance)

def midpoints(x):
    """Return the centres between consecutive entries of ``x`` (e.g. bin centres from bin edges)."""
    upper = x[1:]
    lower = x[:-1]
    return (upper + lower) / 2

In [None]:
# NOTE(review): `dataset` is not referenced anywhere in the visible part of this notebook.
dataset = "cms"

# Maps the sample key to the name fragment used both in the input directory
# paths and in the output plot folder.
save_as = {
    "cms_pf_qcd_nopu": "QCD_noPU_13p6",
    "cms_pf_ttbar_nopu": "TTbar_noPU_13p6",
}

# Sample to process: keep exactly one of the two assignments below active.
# sample = "cms_pf_ttbar_nopu"
sample = "cms_pf_qcd_nopu"

In [None]:
# PDG IDs of neutrinos
# (process_file compares them against abs(pdgId), so antineutrinos are covered too)
vELE_PDGID = 12
vMU_PDGID  = 14
vTAU_PDGID = 16

def fix_ak(arr):
    """Return an ``ak.Array`` built from ``arr`` with every ``None`` entry replaced by ``[]``."""
    cleaned = []
    for entry in arr:
        cleaned.append([] if entry is None else entry)
    return ak.Array(cleaned)

def process_file(fn):
    """Load one pickled event file and return the gen / PF candidate kinematics.

    Generator particles are restricted to ``status == 1`` and neutrinos are
    dropped; PF candidates are passed through unfiltered.

    Args:
        fn: Path of the pickle file to read.

    Returns:
        An ``ak.zip`` (``depth_limit=1``) of the fields ``GenCands_{pt,eta,phi,pdgId}``
        and ``PFCands_{pt,eta,phi,pdgId}``.
    """
    # NOTE: pickle can execute arbitrary code while loading -- only read trusted files.
    with open(fn, "rb") as handle:
        events = ak.Array(pkl.load(handle))

    gen = events["packedGenParticles"]
    pf = events["packedPFCandidates"]

    # Selection on the generator particles: status == 1, excluding neutrinos of any flavour
    abs_pdg = abs(gen["pdgId"])
    is_neutrino = (abs_pdg == vELE_PDGID) | (abs_pdg == vMU_PDGID) | (abs_pdg == vTAU_PDGID)
    is_final = gen["status"] == 1
    keep = is_final & ~is_neutrino

    columns = {
        "GenCands_pt":    gen.pt[keep],
        "GenCands_eta":   gen.eta[keep],
        "GenCands_phi":   gen.phi[keep],
        "GenCands_pdgId": gen.pdgId[keep],
        "PFCands_pt":     pf.pt,
        "PFCands_eta":    pf.eta,
        "PFCands_phi":    pf.phi,
        "PFCands_pdgId":  pf.pdgId,
    }
    # depth_limit=1 zips at the event level only
    return ak.zip(columns, depth_limit=1)

from concurrent.futures import ThreadPoolExecutor
def load_multiprocess(files, max_workers=None):
    """Run ``process_file`` over ``files`` concurrently and concatenate the results.

    Despite the name, the work is dispatched to a thread pool
    (``ThreadPoolExecutor``), with a tqdm progress bar over the files.

    Args:
        files: Sequence of input file paths.
        max_workers: Forwarded to ``ThreadPoolExecutor``.

    Returns:
        ``ak.concatenate`` of all per-file results that are not ``None``.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        lazy_results = pool.map(process_file, files)
        progress = tqdm.tqdm(lazy_results, total=len(files))
        per_file = list(progress)

    # Drop placeholder entries before merging
    kept = [chunk for chunk in per_file if chunk is not None]
    return ak.concatenate(kept)

def load_singleprocess(files):
    """Run ``process_file`` over ``files`` one by one and concatenate the results.

    Loading is best-effort: a file that fails to process is reported on stdout,
    together with the reason, and then skipped.

    Args:
        files: Iterable of input file paths.

    Returns:
        ``ak.concatenate`` of the results of all files that could be processed.
    """
    results = []
    for fn in tqdm.tqdm(files):
        try:
            ret = process_file(fn)
        except Exception as e:
            # Keep going, but report *why* the file was skipped instead of
            # discarding the exception.
            print(f"could not process {fn}: {e!r}")
        else:
            results.append(ret)
    return ak.concatenate(results)

In [None]:
# Glob patterns of the per-job pickle files for the MLPF and the standard PF reconstruction
path_mlpf = f"/eos/user/j/jpata/mlpf/results/cms/CMSSW_15_0_5_mlpf_v2.5.0_p01_f8ae2f_test/cuda_False/{save_as[sample]}_mlpfpu/step3_MINI_*.pkl"
path_pf = f"/eos/user/j/jpata/mlpf/results/cms/CMSSW_15_0_5_mlpf_v2.5.0_p01_f8ae2f_test/cuda_False/{save_as[sample]}_pf/step3_MINI_*.pkl"

# mlpf
# glob returns paths in a filesystem-dependent order; sort before truncating so
# that the 1000-file cap selects the same files on every run.
files = sorted(glob.glob(path_mlpf))[:1000]
# data_mlpf =  load_multiprocess(files, 16)
data_mlpf = load_singleprocess(files)

# pf
files = sorted(glob.glob(path_pf))[:1000]
# data_pf =  load_multiprocess(files, 16)
data_pf = load_singleprocess(files)

In [None]:
import particle
from particle import Particle
from particle import PDGID # https://github.com/scikit-hep/particle

import numba
from numba import njit
from numba.typed import Dict
from numba.core import types

@numba.njit
def get_charge_numba(pids, pid_to_charge_numbadict):
    """Look up the charge of every entry of ``pids`` in ``pid_to_charge_numbadict``.

    Returns a float array with one charge per input PID.
    """
    n_particles = len(pids)
    charges = np.zeros(n_particles)
    # explicit index loop: this function is compiled by numba
    for idx in range(n_particles):
        charges[idx] = pid_to_charge_numbadict[pids[idx]]
    return charges
    
def get_charge_array(pdgids):
    """Return an ``ak.Array`` holding the electric charge for each PDG ID in ``pdgids``.

    The charge of each distinct PDG ID is looked up once through
    ``Particle.from_pdgid`` and stored in a numba typed dict, which the compiled
    helper ``get_charge_numba`` then applies to the full array.
    """
    charge_lookup = Dict.empty(key_type=types.int64, value_type=types.float64)
    for pid in np.unique(pdgids):
        charge_lookup[pid] = Particle.from_pdgid(pid).charge

    charges = get_charge_numba(pdgids, charge_lookup)
    return ak.Array(charges)

def remap_pid_gen(data):
    """Add simplified particle-ID fields to ``data`` in place.

    ``GenCands_pid`` is ``abs(pdgId)`` of the generator particles, except that
    every particle that is not an electron (11), muon (13) or photon (22) is
    relabelled 211 if its |charge| is 1 and 130 if its charge is 0.
    ``PFCands_pid`` is simply ``abs(pdgId)`` of the PF candidates.
    """
    gen_pdg = data["GenCands_pdgId"]
    flat_pdg = ak.flatten(gen_pdg)

    # writable flat copy that receives the remapped labels
    pid = np.abs(np.asarray(flat_pdg))

    abs_pdg = ak.flatten(np.abs(gen_pdg))
    abs_charge = np.abs(get_charge_array(flat_pdg))

    # everything that is not e / mu / gamma is treated as a hadron
    is_hadron = (abs_pdg != 11) & (abs_pdg != 13) & (abs_pdg != 22)
    pid[is_hadron & (abs_charge == 1)] = 211
    pid[is_hadron & (abs_charge == 0)] = 130

    # restore the per-event structure of the original field
    data["GenCands_pid"] = ak.unflatten(pid, ak.count(gen_pdg, axis=1))
    data["PFCands_pid"] = np.abs(data["PFCands_pdgId"])

# Add the "GenCands_pid" / "PFCands_pid" fields (in place) to both datasets
remap_pid_gen(data_pf)
remap_pid_gen(data_mlpf)

# Plot configs

In [None]:
# Colour per particle collection
color_code = {
    "Gen": "tab:blue",
    "PF": "tab:orange",
    "MLPF": "tab:red",
}

# pT bin edges, keyed by sample name and then by particle ID.
# NOTE(review): the efficiency / fake-rate cell further down rebinds `bins_pt`
# to a fresh dict, so these edges are not the ones used by that cell.
bins_pt = {
    "cms_pf_qcd_nopu": {
        211: np.linspace(0,500,41),
        130: np.linspace(0,200,41),
        22: np.linspace(0,200,41),
        11: np.linspace(0,50,21),
        13: np.linspace(0,50,21),
    },
    "cms_pf_ttbar_nopu": {
        211: np.linspace(0,60,31),
        130: np.linspace(0,40,31),
        22: np.linspace(0,40,31),
        11: np.linspace(0,100,31),
        13: np.linspace(0,100,31),
    },  
}

# eta bin edges, keyed by sample name and then by particle ID.
# NOTE(review): not referenced in the visible part of this notebook.
bins_eta = {
    "cms_pf_qcd_nopu": {
        211: np.linspace(-2.5,2.5,41),
        130: np.linspace(-3,3,41),
        22: np.linspace(-3,3,41),
        11: np.linspace(-2.5,2.5,41),
        13: np.linspace(-2.5,2.5,41),
    },
    "cms_pf_ttbar_nopu": {
        211: np.linspace(-2.5,2.5,41),
        130: np.linspace(-3,3,41),
        22: np.linspace(-3,3,41),
        11: np.linspace(-2.5,2.5,41),
        13: np.linspace(-2.5,2.5,41),
    },
}

# phi bin edges, keyed by sample name and then by particle ID.
# NOTE(review): not referenced in the visible part of this notebook.
bins_phi = {
    "cms_pf_qcd_nopu": {
        211: np.linspace(-3,3,41),
        130: np.linspace(-3,3,41),
        22: np.linspace(-3,3,41),
        11: np.linspace(-3,3,41),
        13: np.linspace(-3,3,41),
    },
    "cms_pf_ttbar_nopu": {
        211: np.linspace(-3,3,41),
        130: np.linspace(-3,3,41),
        22: np.linspace(-3,3,41),
        11: np.linspace(-3,3,21),
        13: np.linspace(-3,3,21),
    },  
}

# Marker and line style per particle collection.
# NOTE(review): the plotting cell below passes marker/linestyle literals
# directly and does not read these two dicts.
marker_style = {
    'PF': 's',
    'MLPF': 'o',
}
linestyle = {
    'Gen': '--',
    'PF': ':',
    'MLPF': '-',
}

# Colour per particle ID.
# NOTE(review): not referenced in the visible part of this notebook.
col_pid = {
    211: "tab:blue",
    130: "tab:orange",
    22: "tab:red",
    11: "tab:green",
    13: "tab:pink",    
}

In [None]:
# Output folder for the PDFs written by the plotting cell below (no error if it already exists)
os.makedirs(f"./plots/{save_as[sample]}/particle_eff_fakerate_phasespacesplit", exist_ok=True)

# Make particle-level plots

In [None]:
# (low, high) ranges in |eta|; the plotting cell draws one panel per range
eta_cuts = [
    (0, 1.3), 
    (1.3, 2.5), 
    (2.5, 2.7), 
    (2.7, 3.0),
]

# (low, high) ranges in pT; the plotting cell makes one figure per range.
# The last range is open-ended.
pt_cuts = [
    (0, 3), 
    (3, 5), 
    (5, 10), 
    (10, 20), 
    (20, 40), 
    (40, 100), 
    (100, np.inf),
]

# Display text for each cut. The keys must be exactly the tuples listed in
# pt_cuts / eta_cuts, because the plotting cell looks the labels up by tuple.
cut_label = {
    "pt": {
        (0, 3): r"$p_T < 3$ GeV",
        (3, 5): r"$3 < p_T < 5$ GeV",
        (5, 10): r"$5 < p_T < 10$ GeV",
        (10, 20): r"$10 < p_T < 20$ GeV",
        (20, 40): r"$20 < p_T < 40$ GeV",
        (40, 100): r"$40 < p_T < 100$ GeV",
        (100, np.inf): r"$p_T > 100$ GeV",
    },
    "eta": {
        (0, 1.3): r"$|\eta|<1.3$",
        (1.3, 2.5): r"$1.3 < |\eta| < 2.5$",
        (2.5, 2.7): r"$2.5 < |\eta| < 2.7$", 
        (2.7, 3.0): r"$2.7 < |\eta| < 3.0$",
    } , 
}

In [None]:
# Efficiency and fake rate vs pT for PF and MLPF: one figure per (particle ID,
# pT range), with one column per |eta| region (top row: efficiency, bottom
# row: fake rate).

# Start from an empty binning table: the pT bins are recomputed per pT range below,
# replacing any previously defined `bins_pt`.
bins_pt = {}
bins_pt[sample] = {}

# must define matching condition
# (maximum deltaR between a gen particle and a reco candidate to count as matched)
dR_cut = 0.15

for pid in [
    211,
    130,
#     22,
]:

    for pt_cut in pt_cuts:

        # 2 x 4 grid, flattened: axs[0..3] hold the efficiency panels, axs[4..7] the fake-rate panels
        fig, axs = plt.subplots(2, 4, figsize=(28, 12))
        axs = axs.flatten()    

        for i, eta_cut in enumerate(eta_cuts):
            
            # 4 equal-width bins (5 edges) across the pT range; the open-ended last range is drawn up to 200
            bins_pt[sample][pid] = np.linspace(pt_cut[0], pt_cut[1], 5) if pt_cut[1] != np.inf else np.linspace(pt_cut[0], 200, 5)
            
            ################
            # define baseline kinematic cuts
            # (strict inequalities on both ends; the same pT and |eta| window is applied to gen and reco)
            msk_pf_gen_pt = (data_pf["GenCands_pt"] > pt_cut[0]) & (data_pf["GenCands_pt"] < pt_cut[1])
            msk_pf_gen_eta = (abs(data_pf["GenCands_eta"]) > eta_cut[0]) & (abs(data_pf["GenCands_eta"]) < eta_cut[1])
            msk_pf_gen = msk_pf_gen_pt & msk_pf_gen_eta

            msk_pf_reco_pt = (data_pf["PFCands_pt"] > pt_cut[0]) & (data_pf["PFCands_pt"] < pt_cut[1])
            msk_pf_reco_eta = (abs(data_pf["PFCands_eta"]) > eta_cut[0]) & (abs(data_pf["PFCands_eta"]) < eta_cut[1])
            msk_pf_reco = msk_pf_reco_pt & msk_pf_reco_eta

            # apply baseline kinematic cuts
            eta1 = data_pf["GenCands_eta"][msk_pf_gen]
            phi1 = data_pf["GenCands_phi"][msk_pf_gen]

            eta2 = data_pf["PFCands_eta"][msk_pf_reco]
            phi2 = data_pf["PFCands_phi"][msk_pf_reco]

            # gen <-> reco matching is done before any particle-ID selection;
            # the returned indices refer to the masked (cut) collections
            pf_idx1, pf_idx2, pf_dr = match_particles(eta1, eta2, phi1, phi2, dR_cut)

            # define baseline kinematic cuts
            # (same selection as above, now on the MLPF dataset)
            msk_mlpf_gen_pt = (data_mlpf["GenCands_pt"] > pt_cut[0]) & (data_mlpf["GenCands_pt"] < pt_cut[1])
            msk_mlpf_gen_eta = (abs(data_mlpf["GenCands_eta"]) > eta_cut[0]) & (abs(data_mlpf["GenCands_eta"]) < eta_cut[1])
            msk_mlpf_gen = msk_mlpf_gen_pt & msk_mlpf_gen_eta

            msk_mlpf_reco_pt = (data_mlpf["PFCands_pt"] > pt_cut[0]) & (data_mlpf["PFCands_pt"] < pt_cut[1])
            msk_mlpf_reco_eta = (abs(data_mlpf["PFCands_eta"]) > eta_cut[0]) & (abs(data_mlpf["PFCands_eta"]) < eta_cut[1])
            msk_mlpf_reco = msk_mlpf_reco_pt & msk_mlpf_reco_eta

            # apply baseline kinematic cuts
            eta1 = data_mlpf["GenCands_eta"][msk_mlpf_gen]
            phi1 = data_mlpf["GenCands_phi"][msk_mlpf_gen]

            eta2 = data_mlpf["PFCands_eta"][msk_mlpf_reco]
            phi2 = data_mlpf["PFCands_phi"][msk_mlpf_reco]

            mlpf_idx1, mlpf_idx2, mlpf_dr = match_particles(eta1, eta2, phi1, phi2, dR_cut)

            ################
            # plotting below

            #pick genparticles, and genparticles matched to reco
            # (the particle-ID requirement is applied to the gen particle here ...)
            h_pf_gen =            to_bh(ak.flatten(data_pf["GenCands_pt"][msk_pf_gen][data_pf["GenCands_pid"][msk_pf_gen]==pid]), bins=bins_pt[sample][pid])
            h_pf_gen_matched =    to_bh(ak.flatten(data_pf["GenCands_pt"][msk_pf_gen][pf_idx1][data_pf["GenCands_pid"][msk_pf_gen][pf_idx1]==pid]), bins=bins_pt[sample][pid])

            #pick recoparticles, and recoparticles matched to gen
            # (... and to the reco candidate here)
            h_pf_reco =           to_bh(ak.flatten(data_pf["PFCands_pt"][msk_pf_reco][data_pf["PFCands_pid"][msk_pf_reco]==pid]), bins=bins_pt[sample][pid])
            h_pf_reco_matched =   to_bh(ak.flatten(data_pf["PFCands_pt"][msk_pf_reco][pf_idx2][data_pf["PFCands_pid"][msk_pf_reco][pf_idx2]==pid]), bins=bins_pt[sample][pid])

            #repeat for mlpf
            h_mlpf_gen =          to_bh(ak.flatten(data_mlpf["GenCands_pt"][msk_mlpf_gen][data_mlpf["GenCands_pid"][msk_mlpf_gen]==pid]), bins=bins_pt[sample][pid])
            h_mlpf_gen_matched =  to_bh(ak.flatten(data_mlpf["GenCands_pt"][msk_mlpf_gen][mlpf_idx1][data_mlpf["GenCands_pid"][msk_mlpf_gen][mlpf_idx1]==pid]), bins=bins_pt[sample][pid])

            h_mlpf_reco =         to_bh(ak.flatten(data_mlpf["PFCands_pt"][msk_mlpf_reco][data_mlpf["PFCands_pid"][msk_mlpf_reco]==pid]), bins=bins_pt[sample][pid])
            h_mlpf_reco_matched = to_bh(ak.flatten(data_mlpf["PFCands_pt"][msk_mlpf_reco][mlpf_idx2][data_mlpf["PFCands_pid"][msk_mlpf_reco][mlpf_idx2]==pid]), bins=bins_pt[sample][pid])

            #eff: fraction of all gen that were reconstructed
            heff_pf = h_pf_gen_matched/h_pf_gen
            #fake: fraction of all reco that were NOT matched to gen
            hfake_pf = (h_pf_reco - h_pf_reco_matched)/h_pf_reco

            heff_mlpf = h_mlpf_gen_matched/h_mlpf_gen
            hfake_mlpf = (h_mlpf_reco - h_mlpf_reco_matched)/h_mlpf_reco

            #eff plot
            # points at the bin centres, error bars from binom_error(matched, total)
            axs[i].errorbar(
                midpoints(heff_pf.axes[0].edges), heff_pf.values(), binom_error(h_pf_gen_matched.values(), h_pf_gen.values()), marker=".", label="PF", linestyle="--", color=color_code["PF"],
            )

            axs[i].errorbar(
                midpoints(heff_mlpf.axes[0].edges), heff_mlpf.values(), binom_error(h_mlpf_gen_matched.values(), h_mlpf_gen.values()), marker=".", label="MLPF", color=color_code["MLPF"],
            )
            
            axs[i].set_ylim(0, 1.3)
            axs[i].set_title(cut_label["eta"][eta_cut], pad=15)
            axs[i].set_ylabel("Efficiency", fontsize=22)
            axs[i].set_xlabel("$p_T^{gen}$ (GeV)", fontsize=22)
            axs[i].legend(fontsize=22, loc="upper right",  bbox_to_anchor=(1, 1 - 0.1))
            sample_label(axs[i], sample, additional_text=", no PU", fontsize=22)

            ### Fake Rate
            # drawn in the panel directly below the efficiency panel of the same |eta| region;
            # the error bars are computed from the matched and total reco counts
            axs[i + 4].errorbar(
                midpoints(hfake_pf.axes[0].edges), hfake_pf.values(),
                binom_error(h_pf_reco_matched.values(), h_pf_reco.values()),
                marker=".", label="PF", linestyle="--", color=color_code["PF"]
            )
            axs[i + 4].errorbar(
                midpoints(hfake_mlpf.axes[0].edges), hfake_mlpf.values(),
                binom_error(h_mlpf_reco_matched.values(), h_mlpf_reco.values()),
                marker=".", label="MLPF", color=color_code["MLPF"]
            )
            axs[i + 4].set_ylim(0, 1.0)
            axs[i + 4].set_title(cut_label["eta"][eta_cut], pad=15)
            axs[i + 4].set_ylabel("Fake rate", fontsize=22)
            axs[i + 4].set_xlabel("$p_T^{reco}$ (GeV)", fontsize=22)
            axs[i + 4].legend(fontsize=22, loc="upper right", bbox_to_anchor=(1, 1 - 0.1))

            sample_label(axs[i + 4], sample, additional_text=", no PU", fontsize=22)

        fig.suptitle(f"{pid_to_text[pid]}, {cut_label['pt'][pt_cut]}", fontsize=45)

        fig.subplots_adjust(wspace=0.25, hspace=0.4)
        # one PDF per (pid, pT range); the open-ended range ends up as "...toinf.pdf"
        plt.savefig(f"./plots/{save_as[sample]}/particle_eff_fakerate_phasespacesplit/{pid}_pt{pt_cut[0]}to{pt_cut[1]}.pdf")        
            
#         break
#     break