In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import glob
import tqdm
import awkward as ak
import boost_histogram as bh
import sys
import vector
import pickle

import mplhep
mplhep.style.use(mplhep.style.CMS)

import sys
sys.path.append("../../mlpf/")
from plotting.plot_utils import pid_to_text, EVALUATION_DATASET_NAMES, load_eval_data

# Input datasets

In [None]:
# Show the working directory: the relative paths used in this notebook
# ("../../mlpf/", "plots_mlpf_clic_2023/", ...) are resolved against it.
!pwd

In [None]:
!mkdir plots_mlpf_clic_2023
!ls -lrt plots_mlpf_clic_2023/*.pdf

In [None]:
# LaTeX (mathtext) legend labels for the samples plotted in this notebook.
label_tt = r"$\mathrm{t}\overline{\mathrm{t}}$"
label_qq = r"$\gamma/\mathrm{Z}^* \rightarrow \mathrm{hadrons}$"
label_zh = r"$ZH \rightarrow \tau \tau$"
label_ww = r"$WW \rightarrow \mathrm{hadrons}$"

# Upper limit on the number of parquet files load_data reads per sample;
# several plots below also scale their y-axis limits with this value.
num_files = 1000

In [None]:
def sum_overflow_into_last_bin(all_values):
    """Fold the flow entries of a 1D histogram into its outermost regular bins.

    Despite the name, both ends are handled: the first entry (underflow) is
    added to the first regular bin and the last entry (overflow) is added to
    the last regular bin.

    Args:
        all_values: 1D sequence of bin contents laid out as
            ``[underflow, bin_0, ..., bin_n, overflow]``. It needs at least
            three entries; shorter input raises ``IndexError``.

    Returns:
        A new sequence of the same type with the two flow entries removed
        (length ``len(all_values) - 2``). The input is left unmodified.
    """
    # Copy before editing: slicing a NumPy array yields a view, so writing to
    # the slice directly would silently change the caller's array as well.
    values = all_values[1:-1].copy()
    values[-1] = values[-1] + all_values[-1]
    values[0] = values[0] + all_values[0]
    return values


def to_bh(data, bins, cumulative=False, sum_overflow=True):
    """Fill a 1D boost-histogram with variable-width bins from ``data``.

    Args:
        data: values passed to ``Histogram.fill``.
        bins: bin edges handed to ``bh.axis.Variable``.
        cumulative: if true, every regular bin is replaced by the total of the
            regular bins minus the running sum up to and including that bin.
        sum_overflow: if true, the regular bins are overwritten with the result
            of ``sum_overflow_into_last_bin`` applied to the contents including
            the flow bins.

    Returns:
        The filled ``bh.Histogram``.
    """
    hist = bh.Histogram(bh.axis.Variable(bins))
    hist.fill(data)

    if cumulative:
        total = np.sum(hist.values())
        hist[:] = total - np.cumsum(hist)

    if sum_overflow:
        # NOTE(review): only the regular bins are assigned here, so the flow
        # bins themselves appear to keep their counts - confirm before
        # plotting with an option that adds the flow bins again.
        with_flow = hist.values(flow=True)
        hist[:] = sum_overflow_into_last_bin(with_flow[:])

    return hist

In [None]:
# Load the datasets, process to flattened (X,ygen,ycand) format

def load_data(path):
    """Read the track/cluster arrays from parquet files and merge them per event.

    Args:
        path: glob pattern selecting the parquet files. At most ``num_files``
            (module-level setting) of the matching files are read.

    Returns:
        dict of awkward arrays with the keys ``X_track``, ``X_cluster``,
        ``ygen_track``, ``ygen_cluster``, ``ycand_track``, ``ycand_cluster``
        plus ``X``, ``ygen`` and ``ycand``, where the latter three join the
        track and cluster entries of each event along axis 1. Only events
        passing the multiplicity selection below are kept.

    Raises:
        FileNotFoundError: if the pattern matches no file.
    """
    # glob returns matches in arbitrary order; sort so that the same subset of
    # files is picked on every run and on every machine.
    filelist = sorted(glob.glob(path))[:num_files]
    print(len(filelist))
    if not filelist:
        # Fail here with a clear message instead of later inside the
        # concatenation of an empty list.
        raise FileNotFoundError("no parquet files match {!r}".format(path))

    columns = [
        "X_track",
        "X_cluster",
        "ygen_track",
        "ygen_cluster",
        "ycand_track",
        "ycand_cluster",
    ]
    chunks = {col: [] for col in columns}

    for fn in tqdm.tqdm(filelist):
        dd = ak.from_parquet(fn)
        for col in columns:
            chunks[col].append(dd[col])

    data = {col: ak.concatenate(chunks[col]) for col in columns}

    # Keep only events with MORE than 5 tracks and MORE than 5 clusters.
    # NOTE(review): the previous comment said "at least 5", but the comparison
    # has always been strict (> 5); the selection itself is unchanged.
    msk = (ak.num(data["X_track"]) > 5) & (ak.num(data["X_cluster"]) > 5)

    # Per-event concatenation of the track and cluster entries.
    for name in ("X", "ygen", "ycand"):
        data[name] = ak.concatenate([data[name + "_track"], data[name + "_cluster"]], axis=1)

    return {key: arr[msk] for key, arr in data.items()}

def load_data_hits(path, num_files):
    """Read the ``X_hit`` arrays from parquet files and concatenate them.

    Args:
        path: glob pattern selecting the parquet files.
        num_files: maximum number of matching files to read.

    Returns:
        dict with the single key ``"X_hit"`` holding the concatenated array.

    Raises:
        FileNotFoundError: if the pattern matches no file.
    """
    # glob returns matches in arbitrary order; sort so that the same subset of
    # files is picked on every run.
    filelist = sorted(glob.glob(path))[:num_files]
    print(len(filelist))
    if not filelist:
        # Fail here with a clear message instead of later inside the
        # concatenation of an empty list.
        raise FileNotFoundError("no parquet files match {!r}".format(path))

    X_hit = ak.concatenate([ak.from_parquet(fn)["X_hit"] for fn in tqdm.tqdm(filelist)])

    return {
        "X_hit": X_hit,
    }

In [None]:
# Load the track/cluster level inputs, one dict of arrays per sample
# (at most num_files files each). Note that the sample read from the
# p8_ee_qq directory is stored under the name data_qcd.
data_tt = load_data("/local/joosep/mlpf/clic_edm4hep/p8_ee_tt_ecm380/*.parquet")
data_qcd = load_data("/local/joosep/mlpf/clic_edm4hep/p8_ee_qq_ecm380/*.parquet")
data_zh = load_data("/local/joosep/mlpf/clic_edm4hep/p8_ee_ZH_Htautau_ecm380/*.parquet")
data_ww = load_data("/local/joosep/mlpf/clic_edm4hep/p8_ee_WW_fullhad_ecm380/*.parquet")

In [None]:
# Same loader applied to the directory with the _PU10 suffix
# (presumably the ttbar sample with pileup - confirm with the dataset docs).
data_tt_pu10 = load_data("/local/joosep/mlpf/clic_edm4hep/p8_ee_tt_ecm380_PU10/*.parquet")

In [None]:
# Hit-level inputs from the clic_edm4hep_hits directories; at most 100 files
# are read per sample (second argument of load_data_hits).
data_tt_hits = load_data_hits("/local/joosep/mlpf/clic_edm4hep_hits/p8_ee_tt_ecm380/*.parquet", 100)
data_qq_hits = load_data_hits("/local/joosep/mlpf/clic_edm4hep_hits/p8_ee_qq_ecm380/*.parquet", 100)

## Number of PFelements per event

In [None]:
# Number of tracks per event, one step histogram per sample.
b = np.linspace(0, 200, 101)

h1, h2, h3, h4, h5 = [
    to_bh(ak.num(sample["X_track"]), b)
    for sample in (data_tt, data_qcd, data_zh, data_ww, data_tt_pu10)
]

fig = plt.figure()
ax = plt.axes()

# The PU10 histogram is drawn scaled by a factor of 10.
for hist_i, label_i in (
    (h1, label_tt),
    (h2, label_qq),
    (h3, label_zh),
    (h4, label_ww),
    (h5 * 10, label_tt + " PU10"),
):
    mplhep.histplot(hist_i, histtype="step", lw=2, label=label_i, ax=ax)

ax.set_xlabel("Number of tracks / event")
ax.set_ylabel("Number of events")
ax.legend()
ax.set_ylim(0, 10 * num_files)
ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
plt.savefig("plots_mlpf_clic_2023/num_tracks.pdf")

In [None]:
# Number of clusters per event, one step histogram per sample.
b = np.linspace(0, 500, 101)

h1, h2, h3, h4, h5 = [
    to_bh(ak.num(sample["X_cluster"]), b)
    for sample in (data_tt, data_qcd, data_zh, data_ww, data_tt_pu10)
]

# The PU10 histogram is drawn scaled by a factor of 10.
for hist_i, label_i in (
    (h1, label_tt),
    (h2, label_qq),
    (h3, label_zh),
    (h4, label_ww),
    (h5 * 10, label_tt + " PU10"),
):
    mplhep.histplot(hist_i, histtype="step", lw=2, label=label_i)

plt.xlabel("Number of clusters / event")
plt.ylabel("Number of events")
plt.legend()
plt.ylim(0, 15 * num_files)
plt.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
plt.savefig("plots_mlpf_clic_2023/num_clusters.pdf")

In [None]:
# Number of calorimeter hits per event for the two hit-level samples.
b = np.linspace(0, 15000, 101)

h1, h2 = [
    to_bh(ak.num(sample["X_hit"]), b)
    for sample in (data_tt_hits, data_qq_hits)
]

for hist_i, label_i in ((h1, label_tt), (h2, label_qq)):
    mplhep.histplot(hist_i, histtype="step", lw=2, label=label_i)

plt.xlabel("Number of calorimeter hits / event")
plt.ylabel("Number of events")
plt.legend()
plt.ylim(0, 500)
plt.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
plt.savefig("plots_mlpf_clic_2023/num_hits.pdf")

In [None]:
# Compare the spectra of generator particles (dashed) and Pandora PF
# candidates (solid) for every sample.

def _nonzero_class_column2(y):
    """Flattened column 2 (plotted below as pT) of all entries whose column 0 is nonzero."""
    return ak.flatten(y[y[:, :, 0] != 0][:, :, 2])


_pt_samples = (data_tt, data_qcd, data_zh, data_ww, data_tt_pu10)

gen_pt1, gen_pt2, gen_pt3, gen_pt4, gen_pt5 = [
    _nonzero_class_column2(sample["ygen"]) for sample in _pt_samples
]
cand_pt1, cand_pt2, cand_pt3, cand_pt4, cand_pt5 = [
    _nonzero_class_column2(sample["ycand"]) for sample in _pt_samples
]

b = np.logspace(-2, 3, 100)
# to_bh (sum_overflow defaults to True) already adds the under-/overflow to
# the first/last regular bin.
h1, h2, h3, h4, h5 = [to_bh(pt, b) for pt in (gen_pt1, gen_pt2, gen_pt3, gen_pt4, gen_pt5)]
h1c, h2c, h3c, h4c, h5c = [to_bh(pt, b) for pt in (cand_pt1, cand_pt2, cand_pt3, cand_pt4, cand_pt5)]

fig = plt.figure()
ax = plt.axes()

# flow="none": the edge bins already contain the folded flow entries, while
# the histogram's own flow bins are still populated. Plotting with
# flow="sum" would add those flow bins to the edge bins a second time.
for h_gen, h_cand, sample_label in (
    (h1, h1c, label_tt),
    (h2, h2c, label_qq),
    (h3, h3c, label_zh),
    (h4, h4c, label_ww),
    (h5, h5c, label_tt + " PU10"),
):
    prev = mplhep.histplot(h_gen, flow="none", histtype="step", lw=1, label=sample_label, ls="--")
    # Draw the candidates in the colour that was just used for the generator particles.
    mplhep.histplot(h_cand, flow="none", histtype="step", lw=2, color=prev[0].errorbar.get_children()[0].get_color())

plt.xscale("log")
plt.xlabel("particle $p_T$ [GeV]")
plt.ylabel("Particles / bin")
plt.legend()
plt.text(0.03, 0.97, "dashed - stable generator particles\nsolid - reconstructed Pandora PF particles", transform=ax.transAxes, va="top", ha="left", fontsize=16)
plt.ylim(0,500*num_files)
plt.ticklabel_format(axis="y", style="sci", scilimits=(0,0))
plt.savefig("plots_mlpf_clic_2023/gen_cand_particle_pt.pdf")


In [None]:
# Stacked spectrum of the ttbar generator particles, split by the value of
# column 0 of "ygen" (compared against the PDG-style codes in `pids`).
pids = [11, 13, 22, 130, 211]

fig, ax = plt.subplots()

b = np.logspace(-2, 3, 100)
ygen_tt = data_tt["ygen"]
hs = [
    to_bh(ak.flatten(ygen_tt[ygen_tt[:, :, 0] == pid][:, :, 2]), bins=b)
    for pid in pids
]
labels = [pid_to_text[pid] for pid in pids]

mplhep.histplot(hs, stack=True, histtype="fill", label=labels, ax=ax)
ax.set_xscale("log")
ax.legend(loc="best", frameon=False)
ax.text(0.03, 0.97, label_tt + ", stable generator particles", transform=ax.transAxes, va="top", ha="left")
ax.set_xlabel("particle $p_T$ [GeV]")
ax.set_ylabel("particles / bin")
ax.set_ylim(0, 500 * num_files)
ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
plt.savefig("plots_mlpf_clic_2023/gen_particle_pid_pt.pdf")

In [None]:
# Stacked spectrum of the ttbar Pandora PF candidates, split by the value of
# column 0 of "ycand" (compared against the PDG-style codes in `pids`).
pids = [11, 13, 22, 130, 211]

fig, ax = plt.subplots()

b = np.logspace(-2, 3, 100)
ycand_tt = data_tt["ycand"]
hs = [
    to_bh(ak.flatten(ycand_tt[ycand_tt[:, :, 0] == pid][:, :, 2]), bins=b)
    for pid in pids
]
labels = [pid_to_text[pid] for pid in pids]

mplhep.histplot(hs, stack=True, histtype="fill", label=labels, ax=ax)
ax.set_xscale("log")
ax.legend(loc="best", frameon=False)
ax.text(0.03, 0.97, label_tt + ", Pandora PF particles", transform=ax.transAxes, va="top", ha="left")
ax.set_xlabel("particle $p_T$ [GeV]")
ax.set_ylabel("particles / bin")
ax.set_ylim(0, 500 * num_files)
ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
plt.savefig("plots_mlpf_clic_2023/pf_particle_pid_pt.pdf")

In [None]:
# Load the model evaluation outputs (epoch_96 of clusters_best_tuned_gnn_clic_v130)
# for the four test samples. Only the first element of the tuple returned by
# load_eval_data is kept.
# NOTE(review): max_files=-1 presumably means "no limit" - confirm in plotting.plot_utils.
max_files = -1
yvals1, _, _ = load_eval_data("../../mlpf-clic-2023-results/clusters_best_tuned_gnn_clic_v130/evaluation/epoch_96/clic_edm_ttbar_pf/test/*.parquet", max_files=max_files)
yvals2, _, _ = load_eval_data("../../mlpf-clic-2023-results/clusters_best_tuned_gnn_clic_v130/evaluation/epoch_96/clic_edm_qq_pf/test/*.parquet", max_files=max_files)
yvals3, _, _ = load_eval_data("../../mlpf-clic-2023-results/clusters_best_tuned_gnn_clic_v130/evaluation/epoch_96/clic_edm_ww_fullhad_pf/test/*.parquet", max_files=max_files)
yvals4, _, _ = load_eval_data("../../mlpf-clic-2023-results/clusters_best_tuned_gnn_clic_v130/evaluation/epoch_96/clic_edm_zh_tautau_pf/test/*.parquet", max_files=max_files)

In [None]:
def plot_one_sample(yvals, pid, var, bins, label):
    """Draw the truth distribution and the reco/truth ratios of one sample.

    The function draws on the module-level axes ``a0`` (upper panel) and
    ``a1`` (lower panel); both must exist before it is called.

    Args:
        yvals: mapping providing ``gen_<var>``, ``cand_<var>``, ``pred_<var>``
            and the matching ``*_cls_id`` arrays.
        pid: class id; only entries whose ``*_cls_id`` equals it are used.
        var: name of the variable to histogram (e.g. ``"pt"``).
        bins: bin edges passed to ``to_bh``.
        label: legend label attached to the truth histogram.

    Returns:
        The stairs artist of the truth histogram drawn on ``a0``.
    """
    hists = []
    for value_prefix, cls_key in (
        ("gen_", "gen_cls_id"),
        ("cand_", "cand_cls_id"),
        ("pred_", "pred_cls_id"),
    ):
        selected = ak.flatten(yvals[value_prefix + var][yvals[cls_key] == pid])
        hists.append(to_bh(selected, bins, sum_overflow=False))
    hist_gen, hist_cand, hist_pred = hists

    # Upper panel: truth distribution, drawn with density=1.
    plt.sca(a0)
    truth_artists = mplhep.histplot(hist_gen, histtype="step", lw=1, flow="none", label=label, density=1)

    # Lower panel: candidate/truth (dashed) and prediction/truth (solid, same
    # colour as the dashed line) ratios.
    plt.sca(a1)
    pf_artists = mplhep.histplot(hist_cand / hist_gen, histtype="step", lw=1, ls="--", flow="none")
    mplhep.histplot(
        hist_pred / hist_gen,
        histtype="step",
        lw=2,
        ls="-",
        color=pf_artists[0].stairs.get_edgecolor(),
        flow="none",
    )
    return truth_artists[0].stairs

In [None]:
# For every particle class make two figures (pT and eta): the truth
# distribution on top, and the PF/truth and MLPF/truth ratios below.
for pid, ptclname in [
    (1, "charged hadrons"),
    (2, "neutral hadrons"),
    (3, "photons"),
    (4, "electrons"),
    (5, "muons"),
    ]:

    # (variable, bin edges, x-axis label, x-axis limits, log-scaled x-axis)
    for var, bins, xlabel, xlim, logx in [
        ("pt", np.logspace(0, 2, 61), "particle $p_T$ [GeV]", (10**0, 10**2), True),
        # raw string: "\e" in a normal string literal is an invalid escape sequence
        ("eta", np.linspace(-3, 3, 61), r"particle $\eta$", (-3, 3), False),
    ]:
        # a0/a1 have to stay module-level names: plot_one_sample draws on them.
        f, (a0, a1) = plt.subplots(2, 1, gridspec_kw={"height_ratios": [1, 1]}, sharex=True)
        h0 = plot_one_sample(yvals1, pid, var, bins, label_tt)
        h1 = plot_one_sample(yvals2, pid, var, bins, label_qq)
        h2 = plot_one_sample(yvals3, pid, var, bins, label_ww)
        h3 = plot_one_sample(yvals4, pid, var, bins, label_zh)

        plt.sca(a0)
        leg = plt.legend([h0, h1, h2, h3], [label_tt, label_qq, label_ww, label_zh], loc="best", title=ptclname)
        leg._legend_box.align = "left"
        plt.ylabel("truth particles / bin [a.u.]")
        # Double the upper limit to leave room for the legend.
        plt.ylim(top=2*a0.get_ylim()[1])

        plt.sca(a1)
        plt.axhline(1.0, color="black", ls="-")
        plt.ylim(bottom=0)
        if logx:
            plt.xscale("log")
        plt.xlim(*xlim)
        plt.ylabel("reco. / truth")
        plt.xlabel(xlabel)
        # Dummy black lines that only serve as handles for the line-style legend.
        l0,  = plt.plot([0],[0], ls="-", lw=1, color="black")
        l1,  = plt.plot([0],[0], ls="--", lw=1, color="black")
        l2,  = plt.plot([0],[0], ls="-", lw=2, color="black")
        leg = plt.legend([l0, l1, l2], ["truth", "PF", "MLPF"], loc="best")
        leg._legend_box.align = "left"
        plt.savefig("plots_mlpf_clic_2023/{}_{}.pdf".format(ptclname.replace(" ", "_"), var))

In [None]:
def plot_one_sample_jets(yvals, var, bins, label):
    """Draw the truth jet distribution and the reco/truth ratios of one sample.

    The function draws on the module-level axes ``a0`` (upper panel) and
    ``a1`` (lower panel); both must exist before it is called.

    Args:
        yvals: mapping providing ``jets_gen_<var>``, ``jets_cand_<var>`` and
            ``jets_pred_<var>``.
        var: name of the jet variable to histogram (e.g. ``"pt"``).
        bins: bin edges passed to ``to_bh``.
        label: accepted but not used; the truth histogram is always labelled
            ``"truth"``.

    Returns:
        The stairs artist of the prediction/truth ratio drawn on ``a1``
        (unlike ``plot_one_sample``, which returns the truth artist).
    """
    hist_gen, hist_cand, hist_pred = [
        to_bh(ak.flatten(yvals[key_prefix + var]), bins, sum_overflow=False)
        for key_prefix in ("jets_gen_", "jets_cand_", "jets_pred_")
    ]

    # Upper panel: truth distribution, drawn with density=1.
    plt.sca(a0)
    mplhep.histplot(hist_gen, histtype="step", lw=1, ls="-", label="truth", flow="none", density=1)

    # Lower panel: reference line at 1 plus the candidate/truth (dashed) and
    # prediction/truth (solid, same colour) ratios.
    plt.sca(a1)
    plt.axhline(1.0, ls="--", lw=1.0, color="black")
    pf_artists = mplhep.histplot(hist_cand / hist_gen, histtype="step", lw=1, ls="--", flow="none")
    mlpf_artists = mplhep.histplot(
        hist_pred / hist_gen,
        histtype="step",
        lw=2,
        ls="-",
        flow="none",
        color=pf_artists[0].stairs.get_edgecolor(),
    )
    return mlpf_artists[0].stairs

In [None]:
# Jet pT: truth distributions (top) and reco/truth ratios (bottom) for all samples.
bins = np.linspace(15, 150, 61)

# a0/a1 are read as module-level names by plot_one_sample_jets.
f, (a0, a1) = plt.subplots(2, 1, gridspec_kw={"height_ratios": [1, 1]}, sharex=True)
h0, h1, h2, h3 = [
    plot_one_sample_jets(sample_yvals, "pt", bins, sample_label)
    for sample_yvals, sample_label in (
        (yvals1, label_tt),
        (yvals2, label_qq),
        (yvals3, label_ww),
        (yvals4, label_zh),
    )
]

# Upper panel: sample legend; the y-range is doubled to make room for it.
leg = a0.legend([h0, h1, h2, h3], [label_tt, label_qq, label_ww, label_zh], loc="best")
leg._legend_box.align = "left"
a0.set_ylabel("truth jets / bin [a.u.]")
a0.set_ylim(top=2 * a0.get_ylim()[1])

# Lower panel: ratio axis with a solid reference line at 1.
a1.axhline(1.0, color="black", ls="-")
a1.set_ylim(bottom=0)
a1.set_xlim(15, 150)
a1.set_ylabel("reco. / truth")
a1.set_xlabel("jet $p_T$ [GeV]")
# Dummy black lines that only serve as handles for the line-style legend.
l0, l1, l2 = [
    a1.plot([0], [0], ls=line_style, lw=line_width, color="black")[0]
    for line_style, line_width in (("-", 1), ("--", 1), ("-", 2))
]
leg = a1.legend([l0, l1, l2], ["truth", "PF", "MLPF"], loc="best")
leg._legend_box.align = "left"
plt.savefig("plots_mlpf_clic_2023/jet_pt.pdf")

In [None]:
# Jet eta: truth distributions (top) and reco/truth ratios (bottom) for all samples.
bins = np.linspace(-3, 3, 61)

# a0/a1 are read as module-level names by plot_one_sample_jets.
f, (a0, a1) = plt.subplots(2, 1, gridspec_kw={"height_ratios": [1, 1]}, sharex=True)
h0 = plot_one_sample_jets(yvals1, "eta", bins, label_tt)
h1 = plot_one_sample_jets(yvals2, "eta", bins, label_qq)
h2 = plot_one_sample_jets(yvals3, "eta", bins, label_ww)
h3 = plot_one_sample_jets(yvals4, "eta", bins, label_zh)

# Upper panel: sample legend; the y-range is doubled to make room for it.
plt.sca(a0)
leg = plt.legend([h0, h1, h2, h3], [label_tt, label_qq, label_ww, label_zh], loc="best")
leg._legend_box.align = "left"
plt.ylabel("truth jets / bin [a.u.]")
plt.ylim(top=2*a0.get_ylim()[1])

# Lower panel: ratio axis with a solid reference line at 1.
plt.sca(a1)
plt.axhline(1.0, color="black", ls="-")
plt.ylim(bottom=0)
plt.xlim(-3, 3)
plt.ylabel("reco. / truth")
# raw string: "\e" in a normal string literal is an invalid escape sequence
plt.xlabel(r"jet $\eta$")
# Dummy black lines that only serve as handles for the line-style legend.
l0,  = plt.plot([0],[0], ls="-", lw=1, color="black")
l1,  = plt.plot([0],[0], ls="--", lw=1, color="black")
l2,  = plt.plot([0],[0], ls="-", lw=2, color="black")
leg = plt.legend([l0, l1, l2], ["truth", "PF", "MLPF"], loc="best")
leg._legend_box.align = "left"
plt.savefig("plots_mlpf_clic_2023/jet_eta.pdf")