In [None]:
import pandas as pd
import json
import glob
import matplotlib.pyplot as plt
import numpy as np

import sklearn
import sklearn.metrics
import matplotlib
import scipy
import mplhep as hep

import pandas

In [None]:
def flatten(arr):
    """Collapse the first two axes of a 3D array into one.

    An input of shape (A, B, C) is returned reshaped to (A*B, C); the last
    axis is left as it is.
    """
    n_outer, n_inner, n_last = arr.shape[0], arr.shape[1], arr.shape[2]
    return arr.reshape((n_outer * n_inner, n_last))

In [None]:
def cms_label(x0=0.12, x1=0.23, x2=0.67, y=0.90):
    """Write the three-part CMS header onto the current figure.

    All three texts are placed with ``plt.figtext`` at the same height ``y``.

    Parameters
    ----------
    x0 : horizontal position of the bold "CMS" text.
    x1 : horizontal position of the italic "Simulation Preliminary" text.
    x2 : horizontal position of the run / sample description.
    y : vertical position shared by all three texts.
    """
    plt.figtext(x0, y, 'CMS', fontweight='bold', wrap=True, horizontalalignment='left', fontsize=12)
    plt.figtext(x1, y, 'Simulation Preliminary', style='italic', wrap=True, horizontalalignment='left', fontsize=10)
    # Raw string: "\m" and "\o" are not valid Python escape sequences, so the
    # non-raw form triggers an invalid-escape warning. The text itself is unchanged.
    plt.figtext(x2, y, r'Run 3 (14 TeV), $\mathrm{t}\overline{\mathrm{t}}$ events', wrap=False, horizontalalignment='left', fontsize=10)

def sample_label(ax, x=0.03, y=0.98):
    """Write the sample description inside the given axes.

    Parameters
    ----------
    ax : axes to draw on; ``x`` and ``y`` are interpreted through ``ax.transAxes``.
    x, y : position of the top-left corner of the text.
    """
    # Draw on `ax` itself rather than on whatever axes is current, so the text
    # always belongs to the axes whose transform positions it.
    # Raw string avoids the invalid "\m" / "\o" escape sequences.
    ax.text(x, y, r"$\mathrm{t}\overline{\mathrm{t}}$ events", va="top", ha="left", size=10, transform=ax.transAxes)


In [None]:
def apply_thresholds_f(ypred_raw_f, thresholds):
    """Choose a class index per row after applying per-class thresholds.

    For every index ``i`` of ``thresholds``, column ``i`` of the 2D array
    ``ypred_raw_f`` is zeroed wherever it is not strictly greater than
    ``thresholds[i]``. Columns without a threshold are left untouched.
    Returns the argmax over the last axis of the thresholded values.
    """
    keep = np.ones_like(ypred_raw_f)
    for col, cut in enumerate(thresholds):
        keep[:, col] = ypred_raw_f[:, col] > cut
    return np.argmax(ypred_raw_f * keep, axis=-1)

def apply_thresholds(ypred_raw, thresholds):
    """Choose a class index per element after applying per-class thresholds.

    The class scores are expected on the last axis of ``ypred_raw``. Any
    number of leading axes is accepted, so the same function works on
    per-event (3D) and flattened (2D) arrays.

    For every index ``i`` of ``thresholds``, score ``i`` is zeroed wherever it
    is not strictly greater than ``thresholds[i]``. Classes without a
    threshold are left untouched.

    Returns
    -------
    Integer array with the shape of ``ypred_raw`` minus its last axis, holding
    the argmax of the thresholded scores.
    """
    msk = np.ones_like(ypred_raw)
    for i in range(len(thresholds)):
        # `...` instead of `:, :` so the leading dimensions are not hard-coded
        msk[..., i] = ypred_raw[..., i] > thresholds[i]
    ypred_id = np.argmax(ypred_raw * msk, axis=-1)
    return ypred_id

In [None]:
# Short particle-class names keyed by class id (1..7); used in titles, legends and tick labels.
pid_names = dict(enumerate(
    ["ch.had", "n.had", "HFEM", "HFHAD", "g", "el", "mu"],
    start=1,
))

# Long particle-class names, keyed by the same class ids as pid_names.
pid_names_long = dict(enumerate(
    [
        "charged hadrons",
        "neutral hadrons",
        "HFEM",
        "HFHAD",
        "photons",
        "electrons",
        "muons",
    ],
    start=1,
))

# Names of the per-particle variables, keyed by their column index in the
# ycand / ypred arrays (column 0 holds the class id).
var_names = dict(enumerate(
    ["charge", "pt", "eta", "sin phi", "cos phi", "energy"],
    start=1,
))

x_labels = [
    "track",
    "PS1",
    "PS2",
    "ECAL",
    "HCAL",
    "GSF",
    "BREM",
    "HFEM",
    "HFHAD",
    "SC",
    "HO",
]
# Short class names in class-id order, used as confusion-matrix tick labels.
y_labels = [pid_names[cls_id] for cls_id in sorted(pid_names)]

In [None]:
# Experiment directory; the prediction files (pred*.npz) and the training
# history files (history_*.json) used below are looked up relative to it.
path = "../experiments/cms-gnn-dense-2cc4e7f9.gpu0.local/"

In [None]:
# Read the arrays stored in every prediction file of the experiment.
Xs = []
ygens = []
ycands = []
ypreds = []
ypreds_raw = []
# sorted(): glob returns files in filesystem order; sorting makes the event
# order of the concatenated arrays reproducible between runs and machines.
pred_files = sorted(glob.glob(path + "/pred*.npz"))
if not pred_files:
    raise FileNotFoundError("no pred*.npz files found in {}".format(path))
for fi in pred_files:
    # the context manager closes the .npz archive once its arrays are read
    with np.load(fi) as dd:
        Xs.append(dd["X"])
        ygens.append(dd["ygen"])
        ycands.append(dd["ycand"])
        ypreds.append(dd["ypred"])
        ypreds_raw.append(dd["ypred_raw"])

# Stack the files along the first axis.
X = np.concatenate(Xs)
ygen = np.concatenate(ygens)
ycand = np.concatenate(ycands)
ypred = np.concatenate(ypreds)
ypred_raw = np.concatenate(ypreds_raw)

#ad-hoc per-class classification thresholds (index = class id)
thresholds = np.array([0.0, 0.4, 0.63, 0.5, 0.57, 0.15, 0.98, 0.75])
ypred_id = apply_thresholds(ypred_raw, thresholds)

#ad-hoc correction factors for column 6 (energy) of predicted class 3 (HFEM)
#and class 4 (HFHAD).
# They are applied exactly once, on `ypred`, BEFORE the flat arrays are made.
# `ypred.reshape(...)` returns a view sharing memory with `ypred`, so scaling
# both `ypred` and `ypred_f` in place applied each factor twice (1.7**2, 0.7**2).
ypred[ypred_id==3, 6] *= 1.7
ypred[ypred_id==4, 6] *= 0.7

# Flattened versions: the first two axes are merged, the last axis is kept.
X_f = X.reshape((X.shape[0]*X.shape[1], X.shape[2]))
# rows whose first input feature is nonzero
# NOTE(review): presumably zero marks padded elements - confirm
msk_X = X_f[:, 0]!=0
ygen_f = ygen.reshape((ygen.shape[0]*ygen.shape[1], ygen.shape[2]))
ycand_f = ycand.reshape((ycand.shape[0]*ycand.shape[1], ycand.shape[2]))
ypred_f = ypred.reshape((ypred.shape[0]*ypred.shape[1], ypred.shape[2]))
ypred_raw_f = ypred_raw.reshape((ypred_raw.shape[0]*ypred_raw.shape[1], ypred_raw.shape[2]))

ypred_id_f = apply_thresholds_f(ypred_raw_f, thresholds)

In [None]:
# Boolean row mask over the flattened inputs: True where the first input
# feature is nonzero (same expression as msk_X).
# NOTE(review): presumably a zero first feature marks padding - confirm.
msk_f = X_f[:, 0]!=0

In [None]:
# Accuracy of the thresholded class prediction against the class id stored in
# column 0 of ycand, restricted to the rows selected by msk_f.
sklearn.metrics.accuracy_score(ycand_f[msk_f, 0], ypred_id_f[msk_f])

In [None]:
# Balanced accuracy for the same selection of rows and the same
# true / predicted class ids as the plain accuracy above.
sklearn.metrics.balanced_accuracy_score(ycand_f[msk_f, 0], ypred_id_f[msk_f])

In [None]:
# One scatter plot per class: per-event count of elements with that class in
# ycand (x axis) against the count in the thresholded prediction (y axis).
for icls in range(1, 8):
    n_mlpf = np.sum(ypred_id == icls, axis=1)
    n_pf = np.sum(ycand[:, :, 0] == icls, axis=1)

    # common axis range for both axes, padded around the observed counts
    lo = 0.5 * min(np.min(n_mlpf), np.min(n_pf))
    hi = 1.5 * max(np.max(n_mlpf), np.max(n_pf))

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(n_pf, n_mlpf, marker=".", alpha=0.8)
    ax.set_xlim(lo, hi)
    ax.set_ylim(lo, hi)
    # diagonal: equal counts on both axes
    ax.plot([lo, hi], [lo, hi], color="black", ls="--")
    ax.set_title(pid_names_long[icls], y=1.05)
    ax.set_xlabel("number of PFCandidates")
    ax.set_ylabel("number of MLPFCandidates")
    cms_label(x2=0.6, y=0.89)
    fig.savefig("num_cls{}.pdf".format(icls))


In [None]:
def load_history(path, max_epoch=None):
    """Load per-epoch JSON history files into a DataFrame, one row per epoch.

    Parameters
    ----------
    path : glob pattern matching the history files. The epoch number is read
        from the file name: the text between the last "_" and the following ".".
    max_epoch : number of epochs to return, i.e. epochs ``0 .. max_epoch-1``.
        If not given (or 0), every epoch from 0 up to and including the highest
        one found is returned.

    Returns
    -------
    pandas.DataFrame built from the loaded JSON objects, ordered by epoch.

    Raises
    ------
    ValueError : no file matches ``path`` and ``max_epoch`` was not given.
    KeyError : a requested epoch has no matching file.
    """
    ret = {}
    for fi in glob.glob(path):
        # close the file deterministically instead of leaking the handle
        with open(fi) as fobj:
            data = json.load(fobj)
        epoch = int(fi.split("_")[-1].split(".")[0])
        ret[epoch] = data

    if not max_epoch:
        if not ret:
            raise ValueError("no history files match {!r}".format(path))
        # Epochs are 0-indexed (the range below starts at 0), so the number of
        # epochs is the highest index plus one. Without the +1 the last epoch
        # was silently dropped.
        max_epoch = max(ret) + 1
    return pandas.DataFrame([ret[i] for i in range(max_epoch)])

In [None]:
# Training history of the experiment: rows for epochs 0..499, columns taken
# from the keys of the history_*.json files (loss, val_loss, ...).
history = load_history(path + "/history_*.json", 500)

In [None]:
# Total loss per epoch: "loss" (train) and "val_loss" (labelled "test").
ax = plt.axes()
last_val = history["val_loss"].values[-1]
ax.plot(history["loss"], label="train")
ax.plot(history["val_loss"], label="test")
# y range: +-10% around the validation loss of the last epoch
ax.set_ylim(last_val*0.9, last_val*1.1)
ax.legend(loc="best", frameon=False)
ax.set_xlabel("epoch")
ax.set_ylabel("Total loss")
cms_label()
sample_label(ax, x=0.03, y=0.10)
plt.savefig("loss.pdf", bbox_inches="tight")

In [None]:
# Classification loss per epoch: "cls_loss" (train) and "val_cls_loss" (labelled "test").
ax = plt.axes()
last_val = history["val_cls_loss"].values[-1]
ax.plot(history["cls_loss"], label="train")
ax.plot(history["val_cls_loss"], label="test")
# y range: +-10% around the validation loss of the last epoch
ax.set_ylim(last_val*0.9, last_val*1.1)
ax.legend(loc="best")
ax.set_xlabel("epoch")
ax.set_ylabel("Classification loss")
cms_label()
sample_label(ax, x=0.03, y=0.10)
plt.savefig("cls_loss.pdf", bbox_inches="tight")

In [None]:
# Energy loss per epoch: "energy_loss" (train) and "val_energy_loss" (labelled "test").
ax = plt.axes()
last_val = history["val_energy_loss"].values[-1]
ax.plot(history["energy_loss"], label="train")
ax.plot(history["val_energy_loss"], label="test")
# y range: +-10% around the validation loss of the last epoch
ax.set_ylim(last_val*0.9, last_val*1.1)
ax.legend(loc="best")
ax.set_xlabel("epoch")
ax.set_ylabel("Energy loss")
cms_label(x1=0.2, x2=0.6)
plt.savefig("energy_loss.pdf", bbox_inches="tight")

In [None]:
# pt loss per epoch: "pt_loss" (train) and "val_pt_loss" (labelled "test").
ax = plt.axes()
last_val = history["val_pt_loss"].values[-1]
ax.plot(history["pt_loss"], label="train")
ax.plot(history["val_pt_loss"], label="test")
# y range: +-10% around the validation loss of the last epoch
ax.set_ylim(last_val*0.9, last_val*1.1)
ax.legend(loc="best")
ax.set_xlabel("epoch")
ax.set_ylabel("Pt loss")
cms_label(x1=0.2, x2=0.6)
plt.savefig("pt_loss.pdf", bbox_inches="tight")

In [None]:
# sin(phi) loss per epoch: "sin_phi_loss" (train) and "val_sin_phi_loss" (labelled "test").
last_val = history["val_sin_phi_loss"].values[-1]
for column, curve_label in (("sin_phi_loss", "train"), ("val_sin_phi_loss", "test")):
    plt.plot(history[column], label=curve_label)
# y range: +-10% around the validation loss of the last epoch
plt.ylim(last_val*0.9, last_val*1.1)
plt.legend(loc="best")
plt.xlabel("epoch")
plt.ylabel("sin phi loss")
cms_label(x1=0.2, x2=0.6)
plt.savefig("sin_phi_loss.pdf", bbox_inches="tight")

In [None]:
# cos(phi) loss per epoch: "cos_phi_loss" (train) and "val_cos_phi_loss" (labelled "test").
ax = plt.axes()
last_val = history["val_cos_phi_loss"].values[-1]
ax.plot(history["cos_phi_loss"], label="train")
ax.plot(history["val_cos_phi_loss"], label="test")
# y range: +-10% around the validation loss of the last epoch
ax.set_ylim(last_val*0.9, last_val*1.1)
ax.legend(loc="best")
ax.set_xlabel("epoch")
ax.set_ylabel("cos phi loss")
cms_label(x1=0.2, x2=0.6)
plt.savefig("cos_phi_loss.pdf", bbox_inches="tight")

In [None]:
# eta loss per epoch: "eta_loss" (train) and "val_eta_loss" (labelled "test").
ax = plt.axes()
last_val = history["val_eta_loss"].values[-1]
ax.plot(history["eta_loss"], label="train")
ax.plot(history["val_eta_loss"], label="test")
# y range: +-10% around the validation loss of the last epoch
ax.set_ylim(last_val*0.9, last_val*1.1)
ax.legend(loc="best")
ax.set_xlabel("epoch")
ax.set_ylabel("eta loss")
cms_label(x1=0.2, x2=0.6)
plt.savefig("eta_loss.pdf", bbox_inches="tight")

In [None]:
# Charge loss per epoch: "charge_loss" (train) and "val_charge_loss" (labelled "test").
ax = plt.axes()
last_val = history["val_charge_loss"].values[-1]
ax.plot(history["charge_loss"], label="train")
ax.plot(history["val_charge_loss"], label="test")
# y range: +-10% around the validation loss of the last epoch
ax.set_ylim(last_val*0.9, last_val*1.1)
ax.legend(loc="best")
ax.set_xlabel("epoch")
ax.set_ylabel("charge loss")
cms_label(x1=0.2, x2=0.6)
plt.savefig("charge_loss.pdf", bbox_inches="tight")

In [None]:
# For every class: distribution of the raw classifier output of that class,
# split into elements whose ycand class id equals the class and all others.
# Only rows with a nonzero first input feature are used.
nonzero_input = X_f[:, 0] != 0
for icls in range(1, 8):
    fig, ax = plt.subplots()
    is_cls = ycand_f[:, 0] == icls
    cut = thresholds[icls]
    ax.hist(
        ypred_raw_f[is_cls & nonzero_input, icls],
        bins=100, density=1, histtype="step", lw=2,
        color="blue", label="true "+pid_names[icls],
    )
    ax.hist(
        ypred_raw_f[~is_cls & nonzero_input, icls],
        bins=100, density=1, histtype="step", lw=2,
        color="red", label="other particles",
    )
    # vertical marker at the threshold applied to this class
    ax.axvline(cut, 0, 0.7, ls="--", color="black", label="threshold: {:.2f}".format(cut), lw=1)
    ax.set_yscale("log")
    ax.set_title("Particle reconstruction for {}".format(pid_names[icls]), y=1.05)
    ax.set_xlabel("Classification output {}".format(icls))
    ax.set_ylabel("Normalized number of particles [a.u.]")
    ax.legend(loc=2, frameon=False)
    ax.set_ylim(1e-2, 1e4)
    cms_label(x1=0.2, x2=0.6)
    fig.savefig("cls_output_{}.pdf".format(icls))

In [None]:
#perm = np.random.permutation(ycand_f[msk_X].shape[0])[:100000]

# Confusion matrices of the ycand class id against the thresholded prediction,
# over the rows selected by msk_X and the class ids 0..7.
true_ids = ycand_f[msk_X, 0]
pred_ids = ypred_id_f[msk_X]
class_ids = range(8)

# normalized with normalize="true"
cm_norm = sklearn.metrics.confusion_matrix(true_ids, pred_ids, labels=class_ids, normalize="true")

# raw counts
cm = sklearn.metrics.confusion_matrix(true_ids, pred_ids, labels=class_ids)

In [None]:
# Normalized confusion matrix without the row and column of class 0.
fig, ax = plt.subplots(figsize=(8, 8))
image = ax.imshow(cm_norm[1:, 1:], cmap="Blues")
fig.colorbar(image, ax=ax)

cms_label(x1=0.18, x2=0.52, y=0.82)
tick_pos = range(len(y_labels))
ax.set_xticks(tick_pos)
ax.set_xticklabels(y_labels)
ax.set_yticks(tick_pos)
ax.set_yticklabels(y_labels)
ax.set_xlabel("Predicted PFCandidate")
ax.set_ylabel("True PFCandidate")
ax.set_title("MLPF trained on PF", y=1.03)
fig.savefig("cm_normed.pdf", bbox_inches="tight")

In [None]:
# Confusion matrix of raw counts without the row and column of class 0.
fig, ax = plt.subplots(figsize=(8, 8))
image = ax.imshow(cm[1:, 1:], cmap="Blues")
fig.colorbar(image, ax=ax)

cms_label(x1=0.18, x2=0.52, y=0.82)
tick_pos = range(len(y_labels))
ax.set_xticks(tick_pos)
ax.set_xticklabels(y_labels)
ax.set_yticks(tick_pos)
ax.set_yticklabels(y_labels)
ax.set_xlabel("Predicted PFCandidate")
ax.set_ylabel("True PFCandidate")
ax.set_title("MLPF trained on PF", y=1.03)
fig.savefig("cm.pdf", bbox_inches="tight")

In [None]:
# Histogram bin edges per variable, keyed by the variable's column index
# (the same keys as var_names): 100 evenly spaced edges between (low, high).
bin_ranges = {
    2: (0, 100),
    3: (-8, 8),
    4: (-1, 1),
    5: (-1, 1),
    6: (0, 500),
}
bins = {ivar: np.linspace(low, high, 100) for ivar, (low, high) in bin_ranges.items()}

In [None]:
# For every class and every variable in columns 2..6: overlay the distribution
# of the variable for elements with that class in ycand ("PF") and for elements
# with that thresholded predicted class ("MLPF").
for icls in range(1, 8):
    pf_sel = ycand_f[:, 0] == icls
    mlpf_sel = ypred_id_f == icls
    for ivar in range(2, 7):
        fig, ax = plt.subplots()
        edges = bins[ivar]
        #ax.hist(ygen_f[ygen_f[:, 0]==icls, ivar], bins=edges, histtype="step", lw=2, label="gen")
        ax.hist(ycand_f[pf_sel, ivar], bins=edges, histtype="step", lw=2, label="PF")
        ax.hist(ypred_f[mlpf_sel, ivar], bins=edges, histtype="step", lw=2, label="MLPF")
        ax.set_yscale("log")
        ax.legend()
        ax.set_title(pid_names_long[icls], y=1.05)
        ax.set_xlabel(var_names[ivar])
        ax.set_ylabel("Number of particles")
        cms_label(x1=0.2, x2=0.6)
        fig.savefig("distribution_icls{}_ivar{}.pdf".format(icls, ivar))

In [None]:
# Summary figure: one row per class (7), six panels per row.
#   panel 0    : per-event count of the class in ycand against the count in
#                the thresholded prediction
#   panels 1-5 : 2D histogram of the ycand value against the predicted value
#                of variables 2..6, for elements where both class ids equal
#                the class of the row
fig, axes = plt.subplots(7, 6, figsize=(6*6,7*5))

# `row_axes` (not `axes`): the grid must not be rebound while it is iterated
for row_axes, icls in zip(axes, range(1,8)):
    npred = np.sum(ypred_id == icls, axis=1)
    ncand = np.sum(ycand[:, :, 0] == icls, axis=1)

    # common, padded range for both axes of the multiplicity panel
    a = 0.5*min(np.min(npred), np.min(ncand))
    b = 1.5*max(np.max(npred), np.max(ncand))

    count_ax = row_axes[0]
    count_ax.scatter(ncand, npred, marker=".")

    count_ax.set_xlim(a,b)
    count_ax.set_ylim(a,b)
    count_ax.plot([a,b],[a,b], color="black", ls="--")
    count_ax.set_title(pid_names[icls])
    count_ax.set_xlabel("number of PFCandidates")
    count_ax.set_ylabel("number of MLPFCandidates")

    # elements where the ycand class id and the predicted class id both equal icls
    msk_both = (ycand_f[:, 0]==icls) & (ypred_id_f==icls)
    print(icls, np.sum(msk_both))

    for ivar, ax in zip([2,3,4,5,6], row_axes[1:]):

        hist = np.histogram2d(
            ycand_f[msk_both, ivar],
            ypred_f[msk_both, ivar], bins=(bins[ivar], bins[ivar])
        )
        # linear colour scale by default; logarithmic for pt (2) and energy (6)
        norm = matplotlib.colors.Normalize(vmin=0, vmax=max(10, np.max(hist[0])))
        if ivar == 2 or ivar == 6:
            norm =  matplotlib.colors.LogNorm(vmin=1, vmax=max(10, 10*np.max(hist[0])))
        hep.hist2dplot(
            hist, cmap="Blues",
            norm=norm,
            ax=ax
        )
        # diagonal: predicted value equal to the ycand value
        ax.plot([bins[ivar][0],bins[ivar][-1]], [bins[ivar][0], bins[ivar][-1]], color="black", ls="--")
        ax.set_title("pred. {}, {}".format(pid_names[icls], var_names[ivar]))
        ax.set_xlabel("true value (PFCandidate)")
        ax.set_ylabel("reconstructed value (MLPF)")
plt.tight_layout()
plt.savefig("full_performance.pdf", bbox_inches="tight")