In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import glob
import tqdm
import awkward as ak
import boost_histogram as bh
import sys
import vector

import mplhep
mplhep.style.use(mplhep.style.CMS)

import sys
sys.path.append("../mlpf/")
from plotting.plot_utils import pid_to_text, EVALUATION_DATASET_NAMES

In [None]:
# Legend labels (matplotlib mathtext) for the two samples compared in the plots below.
label_tt = r"$t\bar{t}$"
label_qcd = r"$Z^*/\gamma \rightarrow q\bar{q}$"
# Upper limit on the number of parquet files load_data reads per sample;
# the plotting cells also use it to scale their y-axis limits.
num_files = 100

In [None]:
def sum_overflow_into_last_bin(all_values):
    """Fold the flow entries of a bin-content sequence into its edge bins.

    Despite the name, both ends are handled: the first element (underflow)
    is added to the first regular bin and the last element (overflow) is
    added to the last regular bin.

    Args:
        all_values: sequence laid out as ``[underflow, bin_0, ..., bin_n, overflow]``,
            e.g. the result of ``Histogram.values(flow=True)``. It is not modified.

    Returns:
        A new sequence of the regular bins only (length ``len(all_values) - 2``)
        with the flow entries added to the first and last element.
    """
    # Slicing a numpy array yields a view, so writing into it would silently
    # alter the caller's data (e.g. the histogram storage). Work on a copy.
    values = all_values[1:-1].copy()
    values[-1] = values[-1] + all_values[-1]
    values[0] = values[0] + all_values[0]
    return values


def to_bh(data, bins, cumulative=False):
    """Fill a 1D boost-histogram with variable bin edges from ``data``.

    Args:
        data: values to histogram.
        bins: bin edges passed to ``bh.axis.Variable``.
        cumulative: if True, replace every in-range bin by the total in-range
            count minus the running sum up to and including that bin.

    Returns:
        The filled ``bh.Histogram`` whose regular bins have had the
        underflow/overflow contents folded into the first/last bin.
    """
    hist = bh.Histogram(bh.axis.Variable(bins))
    hist.fill(data)

    if cumulative:
        in_range_total = np.sum(hist.values())
        hist[:] = in_range_total - np.cumsum(hist)

    # Fold the flow bins into the edge bins of the visible range.
    contents_with_flow = hist.values(flow=True)
    hist[:] = sum_overflow_into_last_bin(contents_with_flow)
    return hist


In [None]:
# Load the datasets, process to flattened (X,ygen,ycand) format

def load_data(path):
    """Load and merge the per-event arrays from the parquet files matching ``path``.

    At most ``num_files`` files (module-level setting) are read. Only events
    with more than 5 tracks and more than 5 clusters are kept.

    Args:
        path: glob pattern of the parquet files to read.

    Returns:
        dict of awkward arrays with the keys ``X_track``, ``X_cluster``,
        ``ygen_track``, ``ygen_cluster``, ``ycand_track``, ``ycand_cluster``
        plus ``X``, ``ygen`` and ``ycand``, which are the track and cluster
        arrays concatenated along axis 1 (tracks first).

    Raises:
        FileNotFoundError: if no file matches ``path``.
    """
    # glob returns files in arbitrary order; sort so that the subset selected
    # by the num_files cut is the same on every run.
    filelist = sorted(glob.glob(path))[:num_files]
    print(len(filelist))
    if not filelist:
        raise FileNotFoundError("no parquet files match {!r}".format(path))

    keys = ["X_track", "X_cluster", "ygen_track", "ygen_cluster", "ycand_track", "ycand_cluster"]
    chunks = {key: [] for key in keys}
    for fn in tqdm.tqdm(filelist):
        dd = ak.from_parquet(fn)
        for key in keys:
            chunks[key].append(dd[key])

    merged = {key: ak.concatenate(chunks[key]) for key in keys}

    # Per-event concatenation of tracks followed by clusters.
    for name in ("X", "ygen", "ycand"):
        merged[name] = ak.concatenate([merged[name + "_track"], merged[name + "_cluster"]], axis=1)

    # keep only events with more than 5 tracks and more than 5 clusters
    msk = (ak.num(merged["X_track"]) > 5) & (ak.num(merged["X_cluster"]) > 5)

    return {key: arr[msk] for key, arr in merged.items()}

In [None]:
# Read both samples; each variable holds the dict of per-event awkward arrays returned by load_data.
data_tt = load_data("../data/clic_edm4hep/p8_ee_tt_ecm365/*.parquet")
data_qcd = load_data("../data/clic_edm4hep/p8_ee_qcd_ecm365/*.parquet")

## Number of PF elements per event

In [None]:
# Track multiplicity per event, one histogram per sample.
b = np.linspace(0, 200, 101)

h1, h2 = (to_bh(ak.num(sample["X_track"]), b) for sample in (data_tt, data_qcd))

for hist_i, label_i in ((h1, label_tt), (h2, label_qcd)):
    mplhep.histplot(hist_i, histtype="step", lw=2, label=label_i)

plt.xlabel("Number of tracks / event")
plt.ylabel("Number of events")
plt.legend()
# y-range grows with the number of files that were read
plt.ylim(0, 10*num_files)
plt.savefig("plots/clic/num_tracks.pdf", bbox_inches="tight")

In [None]:
# Cluster multiplicity per event, one histogram per sample.
b = np.linspace(0, 200, 101)

h1, h2 = (to_bh(ak.num(sample["X_cluster"]), b) for sample in (data_tt, data_qcd))

for hist_i, label_i in ((h1, label_tt), (h2, label_qcd)):
    mplhep.histplot(hist_i, histtype="step", lw=2, label=label_i)

plt.xlabel("Number of clusters / event")
plt.ylabel("Number of events")
plt.legend()
# y-range grows with the number of files that were read
plt.ylim(0,10*num_files)
plt.savefig("plots/clic/num_clusters.pdf", bbox_inches="tight")

In [None]:
# Feature 2 of every target entry whose feature 0 is non-zero
# (presumably pT and particle ID respectively - TODO confirm against the dataset definition).
ygen_tt = data_tt["ygen"]
ygen_qcd = data_qcd["ygen"]
gen_pt1 = ak.flatten(ygen_tt[ygen_tt[:, :, 0] != 0][:, :, 2])
gen_pt2 = ak.flatten(ygen_qcd[ygen_qcd[:, :, 0] != 0][:, :, 2])

# logarithmically spaced bin edges from 1e-2 to 1e3
b = np.logspace(-2,3,100)
h1 = to_bh(gen_pt1, b)
h2 = to_bh(gen_pt2, b)

fig = plt.figure()
ax = plt.axes()

for hist_i, label_i in ((h1, label_tt), (h2, label_qcd)):
    mplhep.histplot(hist_i, histtype="step", lw=2, label=label_i)

plt.xscale("log")
plt.xlabel("particle $p_T$ [GeV]")
plt.ylabel("Number of particles / bin")
plt.text(0.03, 0.97, "stable generator particles", transform=ax.transAxes, va="top", ha="left")
plt.legend()
plt.ylim(0,500*num_files)
plt.savefig("plots/clic/gen_particle_pt.pdf", bbox_inches="tight")

In [None]:
# Same selection as for the generator-level plot, applied to the "ycand" arrays
# (labelled "Pandora PF particles" below). The results get their own names so
# that the generator-level arrays gen_pt1/gen_pt2 are not silently overwritten
# with candidate data.
cand_pt1 = ak.flatten(data_tt["ycand"][data_tt["ycand"][:, :, 0]!=0][:, :, 2])
cand_pt2 = ak.flatten(data_qcd["ycand"][data_qcd["ycand"][:, :, 0]!=0][:, :, 2])

# logarithmically spaced bin edges from 1e-2 to 1e3
b = np.logspace(-2,3,100)
h1 = to_bh(cand_pt1, b)
h2 = to_bh(cand_pt2, b)

fig = plt.figure()
ax = plt.axes()

mplhep.histplot(h1, histtype="step", lw=2, label=label_tt)
mplhep.histplot(h2, histtype="step", lw=2, label=label_qcd)
plt.xscale("log")
plt.xlabel("particle $p_T$ [GeV]")
plt.ylabel("Number of particles / bin")
plt.text(0.03, 0.97, "Pandora PF particles", transform=ax.transAxes, va="top", ha="left")
plt.legend()
plt.ylim(0,500*num_files)
plt.savefig("plots/clic/pf_particle_pt.pdf", bbox_inches="tight")

In [None]:
# Particle IDs that get their own layer in the stacked histogram.
pids = [11, 13, 22, 130, 211]

fig = plt.figure()
ax = plt.axes()

b = np.logspace(-2,3,100)
ygen_tt = data_tt["ygen"]
# One histogram per ID: feature 2 of the entries whose feature 0 equals that ID.
hs = [to_bh(ak.flatten(ygen_tt[ygen_tt[:, :, 0] == pid][:, :, 2]), bins=b) for pid in pids]
labels = [pid_to_text[pid] for pid in pids]

mplhep.histplot(hs, stack=True, histtype="fill", label=labels)
plt.xscale("log")
plt.legend(loc="best", frameon=False)
plt.text(0.03, 0.97, label_tt + ", stable generator particles", transform=ax.transAxes, va="top", ha="left")
plt.xlabel("particle $p_T$ [GeV]")
plt.ylabel("particles / bin")
plt.ylim(0,500*num_files)
plt.savefig("plots/clic/gen_particle_pid_pt.pdf", bbox_inches="tight")

In [None]:
# Particle IDs that get their own layer in the stacked histogram.
pids = [11, 13, 22, 130, 211]

fig = plt.figure()
ax = plt.axes()

b = np.logspace(-2,3,100)
ycand_tt = data_tt["ycand"]
# One histogram per ID: feature 2 of the entries whose feature 0 equals that ID.
hs = [to_bh(ak.flatten(ycand_tt[ycand_tt[:, :, 0] == pid][:, :, 2]), bins=b) for pid in pids]
labels = [pid_to_text[pid] for pid in pids]

mplhep.histplot(hs, stack=True, histtype="fill", label=labels)
plt.xscale("log")
plt.legend(loc="best", frameon=False)
plt.text(0.03, 0.97, label_tt + ", Pandora PF particles", transform=ax.transAxes, va="top", ha="left")
plt.xlabel("particle $p_T$ [GeV]")
plt.ylabel("particles / bin")
plt.ylim(0,500*num_files)
plt.savefig("plots/clic/pf_particle_pid_pt.pdf", bbox_inches="tight")

## Energy per event

In [None]:
# Per-event sum of feature 5 (presumably the energy - TODO confirm) for the
# generator targets and the PF candidates of the ttbar sample.
E_gen_per_event, E_cand_per_event = (
    ak.to_numpy(ak.sum(data_tt[key][:, :, 5], axis=1)) for key in ("ygen", "ycand")
)

b = np.linspace(0, 500, 61)
h1 = to_bh(E_gen_per_event, b)
h2 = to_bh(E_cand_per_event, b)

for hist_i, label_i in ((h1, "Gen"), (h2, "PF")):
    mplhep.histplot(hist_i, histtype="step", lw=2, label=label_i)

plt.xlabel("Total E per event [GeV]")
plt.ylabel("Number of events")
plt.legend()

In [None]:
# 2D comparison of the per-event energy sums; relies on E_gen_per_event,
# E_cand_per_event and the bin edges b defined in the previous cell.
plt.figure(figsize=(12,10))
plt.hist2d(E_gen_per_event, E_cand_per_event, bins=(b, b), cmap="hot_r")
# dashed diagonal marks PF sum == Gen sum
plt.plot([0, 500], [0, 500], lw=0.5, ls="--", color="black")
plt.xlabel("Gen sum E [GeV]")
plt.ylabel("PF sum E [GeV]")
plt.colorbar(label="events / bin")

## Gen vs. PF energy of individual particles

In [None]:
# Select the entries where both the generator target and the PF candidate have
# a non-zero feature 0, then compare their feature 5 (presumably the energy - TODO confirm).
ygen_tt = data_tt["ygen"]
ycand_tt = data_tt["ycand"]
msk = (ygen_tt[:, :, 0] != 0) & (ycand_tt[:, :, 0] != 0)
gen_e = ak.to_numpy(ak.flatten(ygen_tt[msk][:, :, 5]))
cand_e = ak.to_numpy(ak.flatten(ycand_tt[msk][:, :, 5]))

b = np.logspace(-2, 3, 101)

plt.figure(figsize=(12,10))

plt.hist2d(gen_e, cand_e, bins=(b, b), cmap="hot_r")
# dashed diagonal marks PF E == Gen E
diagonal = [10**-2, 10**3]
plt.plot(diagonal, diagonal, lw=0.5, ls="--", color="black")
plt.xscale("log")
plt.yscale("log")

plt.xlabel("Gen particle E [GeV]")
plt.ylabel("PF particle E [GeV]")
plt.colorbar(label="particles / bin")

In [None]:
import pickle

In [None]:
# Collect the per-epoch training and validation losses of the listed runs.
# NOTE: unpickling can execute arbitrary code - only load files written by trusted training jobs.
losses_train = []
losses_val = []
for model in ["pytorch_52218749", "pytorch_52218746", "pytorch_52218747"]:
    # Context managers make sure the file handles are closed again.
    with open("../experiments/{}/MLPF/MLPF_test_native/mlpf_native_loss_train.pkl".format(model), "rb") as f_train:
        loss_train = pickle.load(f_train)
    with open("../experiments/{}/MLPF/MLPF_test_native/mlpf_native_loss_valid.pkl".format(model), "rb") as f_val:
        loss_val = pickle.load(f_val)
    losses_train.append(loss_train)
    losses_val.append(loss_val)
# Transpose so that axis 0 indexes the epoch and axis 1 the run.
losses_train = np.array(losses_train).T
losses_val = np.array(losses_val).T
epochs = np.arange(len(losses_train))

In [None]:
# Mean and standard deviation of the loss across runs, per epoch.
plt.figure(figsize=(10,5))
for split_name, losses in (("training", losses_train), ("validation", losses_val)):
    m = np.mean(losses, axis=1)
    s = np.std(losses, axis=1)
    # "\\pm" keeps the literal backslash without relying on an invalid escape
    # sequence ("\p"), which newer Python versions warn about.
    curve_label = "{},\n$L_{{final}}={:.2f} \\pm {:.2f}$".format(split_name, m[-1], s[-1])
    plt.plot(epochs, m, label=curve_label)
    # shaded band: mean +- one standard deviation
    plt.fill_between(epochs, m-s, m+s, alpha=0.5)

plt.legend()
plt.ylim(12,20)
plt.xlabel("Training epoch")
plt.ylabel("Reconstruction loss [a.u.]")
plt.title("Supervised MLPF training")
plt.xlim(0,500)
plt.savefig("plots/clic/supervised_mlpf_loss.pdf", bbox_inches="tight")

In [None]:
# Placeholder figure: it draws the same loss arrays as the supervised plot
# (see the FIXME text placed on the axes below).
plt.figure(figsize=(10,5))
ax = plt.axes()
for split_name, losses in (("training", losses_train), ("validation", losses_val)):
    m = np.mean(losses, axis=1)
    s = np.std(losses, axis=1)
    # "\\pm" keeps the literal backslash without relying on an invalid escape
    # sequence ("\p"), which newer Python versions warn about.
    curve_label = "{},\n$L_{{final}}={:.2f} \\pm {:.2f}$".format(split_name, m[-1], s[-1])
    plt.plot(epochs, m, label=curve_label)
    # shaded band: mean +- one standard deviation
    plt.fill_between(epochs, m-s, m+s, alpha=0.5)

plt.legend()
plt.ylim(12,20)
plt.xlabel("Training epoch")
plt.ylabel("Reconstruction loss [a.u.]")
plt.title("Semi-supervised MLPF training")
plt.xlim(0,500)
ax.text(0.2, 0.9, "FIXME: replace with real SSL training", color="red", transform=ax.transAxes)
plt.savefig("plots/clic/semi_supervised_mlpf_loss.pdf", bbox_inches="tight")

In [None]:
# Read every parquet file of the validation output of one training run.
datas = [
    ak.from_parquet(fi)
    for fi in glob.glob("../experiments/pytorch_52218749/MLPF/MLPF_test_native/native/valid_dataset_mlpf/*.parquet")
]

# Concatenate each field across files and bundle the result into a single record.
mlpf_reco = ak.Record(
    {
        field: ak.concatenate([d[field] for d in datas])
        for field in ("particles", "jets", "matched_jets")
    }
)

In [None]:
# Use the stored match indices to pick the paired jets and wrap them as vector arrays.
jets = mlpf_reco["jets"]
matched = mlpf_reco["matched_jets"]

# gen <-> MLPF prediction pairs
gp_genjet = vector.awk(jets["gen"][matched["gen_to_pred"]["gen"]])
gp_predjet = vector.awk(jets["pred"][matched["gen_to_pred"]["pred"]])

# gen <-> PF candidate pairs
gc_genjet = vector.awk(jets["gen"][matched["gen_to_cand"]["gen"]])
gc_candjet = vector.awk(jets["cand"][matched["gen_to_cand"]["cand"]])

In [None]:
# Flattened gen-jet pT of the matched pairs (one entry per matched jet).
pf_genjet_pt = ak.flatten(gc_genjet.pt)
mlpf_genjet_pt = ak.flatten(gp_genjet.pt)

# Response = reconstructed jet pT divided by the pT of the matched gen jet.
pf_response = ak.flatten(gc_candjet.pt) / pf_genjet_pt
mlpf_response = ak.flatten(gp_predjet.pt) / mlpf_genjet_pt

In [None]:
# Jet response distributions in bins of gen-jet pT, plus the 25/50/75th
# percentiles of each distribution for the summary plot.
genjet_bins = [10,20,40,60,80,100,200]
quantiles = (25, 50, 75)

x_vals = []
pf_vals = []
mlpf_vals = []
b = np.linspace(0,2,100)

# one panel per gen-jet pT bin
fig, axs = plt.subplots(2, 3, figsize=(3*5, 2*5))
axs = axs.flatten()
for ibin, (lim_low, lim_hi) in enumerate(zip(genjet_bins[:-1], genjet_bins[1:])):
    # bin centre, used as the x coordinate of the summary plot
    x_vals.append(np.mean([lim_low, lim_hi]))

    # bins are open at the lower and closed at the upper edge
    pf_subsample = pf_response[(pf_genjet_pt > lim_low) & (pf_genjet_pt <= lim_hi)]
    mlpf_subsample = mlpf_response[(mlpf_genjet_pt > lim_low) & (mlpf_genjet_pt <= lim_hi)]

    pf_vals.append([np.percentile(pf_subsample, q) for q in quantiles])
    mlpf_vals.append([np.percentile(mlpf_subsample, q) for q in quantiles])

    plt.sca(axs[ibin])
    plt.hist(pf_subsample, bins=b, histtype="step", lw=2, label="PF")
    plt.hist(mlpf_subsample, bins=b, histtype="step", lw=2, label="MLPF")
    plt.xlim(0,2)
    plt.xticks([0,0.5,1,1.5,2])
    plt.ylabel("Matched jets / bin")
    plt.xlabel("jet $p_{T,reco} / p_{T,gen}$")
    plt.axvline(1.0, ymax=0.7, color="black", ls="--")
    plt.legend(loc=1, fontsize=16)
    # raw string: "\l" is not a valid escape sequence in a normal string literal
    plt.title(r"${} \less p_{{T,gen}} \leq {}$".format(lim_low, lim_hi))
    plt.yscale("log")

plt.tight_layout()
plt.savefig("plots/clic/jet_response_binned.pdf", bbox_inches="tight")

# rows: gen-jet pT bins; columns of *_vals: 25th, 50th, 75th percentile
x_vals = np.array(x_vals)
pf_vals = np.array(pf_vals)
mlpf_vals = np.array(mlpf_vals)


In [None]:
# Summary of the binned jet response: median (top) and interquartile range
# (bottom) versus gen-jet pT. Columns of pf_vals/mlpf_vals are the
# 25th, 50th and 75th percentile.
fig, axs = plt.subplots(2, 1, sharex=True)

plt.sca(axs[0])
plt.plot(x_vals, pf_vals[:, 1], marker="o", label="PF")
plt.plot(x_vals, mlpf_vals[:, 1], marker="o", label="MLPF")
plt.ylim(0.75,1.25)
plt.ylabel("Response median")
plt.legend()

plt.sca(axs[1])
# IQR = 75th percentile - 25th percentile
plt.plot(x_vals, pf_vals[:, 2]-pf_vals[:, 0], marker="o", label="PF")
plt.plot(x_vals, mlpf_vals[:, 2]-mlpf_vals[:, 0], marker="o", label="MLPF")
plt.ylabel("Response IQR")
plt.legend()
plt.xlabel("gen-jet $p_T$ [GeV]")

plt.tight_layout()
plt.savefig("plots/clic/jet_response_med_iqr.pdf", bbox_inches="tight")