In [None]:
import numpy as np
import matplotlib.pyplot as plt
import numba
import pickle
import glob
import pandas as pd
from matplotlib.colors import LogNorm

In [None]:
def get_df(fl, name):
    """Collect one entry per pickle file into a DataFrame.

    Every path in ``fl`` is unpickled and the object stored under key ``name``
    becomes one row of the result.

    Parameters
    ----------
    fl : iterable of str
        Paths of the pickle files to read.
    name : str
        Key looked up in each unpickled object.

    Returns
    -------
    pandas.DataFrame
        One row per input file, in the order of ``fl`` (empty for an empty ``fl``).

    Raises
    ------
    KeyError
        If an unpickled dict has no ``name`` entry.
    """
    rows = []
    for path in fl:
        # NOTE(security): pickle.load can run arbitrary code; only use this on
        # files produced by a trusted pipeline.
        # The context manager closes each handle immediately instead of
        # leaving it open until garbage collection.
        with open(path, "rb") as handle:
            payload = pickle.load(handle)
        rows.append(payload[name])
    return pd.DataFrame(rows)

def text_in_box(mat, thresh):
    """Write each matrix value as text on top of its cell in the current axes.

    Intended to be called right after ``plt.imshow(mat)``. ``imshow`` draws
    ``mat[row, col]`` at x=col, y=row, so the text for that element is placed
    at the same (x=col, y=row) position. (The earlier version used (x=row,
    y=col), which transposed the numbers relative to the coloured cells.)

    Parameters
    ----------
    mat : 2-D array supporting ``mat[row, col]`` indexing
        Values to print, formatted with three decimals. Need not be square.
    thresh : number
        Values strictly above this are drawn in white, all others in black.
    """
    n_rows = len(mat)
    n_cols = len(mat[0]) if n_rows else 0
    for row in range(n_rows):
        for col in range(n_cols):
            value = mat[row, col]
            plt.text(
                col,
                row,
                "{0:.3f}".format(value),
                ha="center",
                va="center",
                color="white" if value > thresh else "black",
            )

In [None]:
def plot_num_blocks(df_blocks, df_blocks_dummy, df_blocks_clue, df_blocks_gnn, sample):
    """Scatter predicted vs. true block counts for four block finders.

    Each data argument must support ``obj["num_blocks_true"]`` and
    ``obj["num_blocks_pred"]``. All four are drawn into one 5x5 inch figure
    with a dashed diagonal marking pred == true, and the figure is written to
    ``num_blocks_<sample>.pdf`` relative to the working directory.

    Parameters
    ----------
    df_blocks : drawn with legend label "Edge classifier".
    df_blocks_dummy : drawn with legend label "PFBlockAlgo".
    df_blocks_clue : drawn with legend label "CLUE".
    df_blocks_gnn : drawn with legend label "GNN".
    sample : value formatted into the plot title and the output file name.
    """
    upper = 5000  # both axes span [0, upper]
    # (data, marker, legend label) in drawing order
    layers = [
        (df_blocks, ".", "Edge classifier"),
        (df_blocks_dummy, "x", "PFBlockAlgo"),
        (df_blocks_clue, "^", "CLUE"),
        (df_blocks_gnn, "^", "GNN"),
    ]

    plt.figure(figsize=(5, 5))
    for data, marker, label in layers:
        plt.scatter(
            data["num_blocks_true"],
            data["num_blocks_pred"],
            marker=marker,
            label=label,
            alpha=0.5,
        )

    plt.xlim(0, upper)
    plt.ylim(0, upper)
    # Dashed y = x reference line.
    plt.plot([0, upper], [0, upper], color="black", lw=1, ls="--")

    plt.xlabel("number of blocks (true)")
    plt.ylabel("number of blocks (pred)")
    plt.title("Number of blocks, {0}".format(sample))
    plt.legend(frameon=False, loc="best")

    out_name = "num_blocks_{0}.pdf".format(sample)
    plt.savefig(out_name, bbox_inches="tight")

In [None]:
def plot_block_size(df_blocks, df_blocks_dummy, df_blocks_clue, df_blocks_gnn, sample):
    """Scatter predicted vs. true maximum block size for four block finders.

    Each data argument must support ``obj["max_block_size_true"]`` and
    ``obj["max_block_size_pred"]``. All four are drawn into one 5x5 inch
    figure with a dashed diagonal marking pred == true, and the figure is
    written to ``block_size_<sample>.pdf`` relative to the working directory.

    Parameters
    ----------
    df_blocks : drawn with legend label "Edge classifier".
    df_blocks_dummy : drawn with legend label "PFBlockAlgo".
    df_blocks_clue : drawn with legend label "CLUE".
    df_blocks_gnn : drawn with legend label "GNN".
    sample : value formatted into the plot title and the output file name.
    """
    upper = 3000  # both axes span [0, upper]
    # (data, marker, legend label) in drawing order
    layers = [
        (df_blocks, ".", "Edge classifier"),
        (df_blocks_dummy, "x", "PFBlockAlgo"),
        (df_blocks_clue, "^", "CLUE"),
        (df_blocks_gnn, "^", "GNN"),
    ]

    plt.figure(figsize=(5, 5))
    for data, marker, label in layers:
        plt.scatter(
            data["max_block_size_true"],
            data["max_block_size_pred"],
            marker=marker,
            label=label,
            alpha=0.3,
        )

    plt.xlim(0, upper)
    plt.ylim(0, upper)
    # Dashed y = x reference line.
    plt.plot([0, upper], [0, upper], color="black", lw=1, ls="--")

    plt.xlabel("maximum block size (true)")
    plt.ylabel("maximum block size (pred)")
    plt.title("Block finder model, {0}".format(sample))
    plt.legend(frameon=False, loc="best")

    out_name = "block_size_{0}.pdf".format(sample)
    plt.savefig(out_name, bbox_inches="tight")

In [None]:
def plot_precision_recall(df_blocks, df_blocks_dummy, df_blocks_clue, df_blocks_gnn, sample):
    """Scatter edge recall against edge precision for four block finders.

    Each data argument must support ``obj["edge_precision"]`` (x axis) and
    ``obj["edge_recall"]`` (y axis). All four are drawn into one 5x5 inch
    figure with both axes limited to [0, 1.2], and the figure is written to
    ``edge_precision_recall_<sample>.pdf`` relative to the working directory.

    Parameters
    ----------
    df_blocks : drawn with legend label "Edge classifier".
    df_blocks_dummy : drawn with legend label "PFBlockAlgo".
    df_blocks_clue : drawn with legend label "CLUE".
    df_blocks_gnn : drawn with legend label "GNN".
    sample : value formatted into the plot title and the output file name.
    """
    # (data, marker, legend label) in drawing order
    layers = [
        (df_blocks, ".", "Edge classifier"),
        (df_blocks_dummy, "x", "PFBlockAlgo"),
        (df_blocks_clue, "^", "CLUE"),
        (df_blocks_gnn, "^", "GNN"),
    ]

    plt.figure(figsize=(5, 5))
    for data, marker, label in layers:
        plt.scatter(
            data["edge_precision"],
            data["edge_recall"],
            marker=marker,
            alpha=0.5,
            label=label,
        )

    # Limits extend past 1.0 so points at the boundary are not clipped.
    plt.xlim(0, 1.2)
    plt.ylim(0, 1.2)

    plt.xlabel("edge precision: TP / (TP + FP)")
    plt.ylabel("edge recall: TP / (TP + FN)")
    plt.title("Edge classification, {0}".format(sample))
    plt.legend(frameon=False)

    out_name = "edge_precision_recall_{0}.pdf".format(sample)
    plt.savefig(out_name, bbox_inches="tight")

In [None]:
def plot_block_size_histo(df_blocks, df_blocks_dummy, df_blocks_clue, df_blocks_gnn, sample):
    """Overlay step histograms of the maximum block size for each block finder.

    Draws ``obj["max_block_size_pred"]`` for each of the four inputs plus
    ``df_blocks["max_block_size_true"]`` as the reference, all with the same
    40 logarithmically spaced bin edges between 10**0.1 and 10**4 and a
    log-scaled x axis. Each legend entry carries the mean of its values
    (``m=...``). The figure is written to ``max_block_size_<sample>.pdf``
    relative to the working directory.

    Parameters
    ----------
    df_blocks : predictions labelled "Edge classifier"; also supplies the
        "True blocks" reference curve.
    df_blocks_dummy : predictions labelled "PFBlockAlgo".
    df_blocks_clue : predictions labelled "CLUE".
    df_blocks_gnn : predictions labelled "GNN".
    sample : value formatted into the plot title and the output file name.
    """
    log_bins = np.logspace(0.1, 4, 40)
    # (legend prefix, values) in drawing order
    curves = [
        ("Edge classifier", df_blocks["max_block_size_pred"]),
        ("PFBlockAlgo", df_blocks_dummy["max_block_size_pred"]),
        # This label used to read "GLUE"; the other plots in this notebook
        # call the same input "CLUE".
        ("CLUE", df_blocks_clue["max_block_size_pred"]),
        ("GNN", df_blocks_gnn["max_block_size_pred"]),
        ("True blocks", df_blocks["max_block_size_true"]),
    ]

    plt.figure(figsize=(5, 5))
    for prefix, values in curves:
        plt.hist(
            values,
            bins=log_bins,
            histtype="step",
            lw=2,
            label="{0}, m={1:.0f}".format(prefix, np.mean(values)),
        )
    plt.xscale("log")
    plt.legend(frameon=False)
    plt.title("Maximum block size, {0}".format(sample))
    plt.savefig("max_block_size_{0}.pdf".format(sample), bbox_inches="tight")

In [None]:
# NuGun sample: load the per-file block summaries and draw the four comparison plots.
fl = glob.glob("../data/NuGun_run3/step3*.pkl")
df_blocks = get_df(fl, "blocks")
df_blocks_dummy = get_df(fl, "blocks_dummy")
df_blocks_clue = get_df(fl, "blocks_clue")
# df_blocks_gnn was passed to the plots below without being assigned in this
# cell, so a fresh "Restart & Run All" raised NameError here, and a stale
# kernel silently plotted another sample's GNN results.
# NOTE(review): assumes the NuGun pickles carry a "blocks_gnn" entry like the
# TTbar ones - confirm.
df_blocks_gnn = get_df(fl, "blocks_gnn")

plot_num_blocks(df_blocks, df_blocks_dummy, df_blocks_clue, df_blocks_gnn, "NuGun-Run3")
plot_block_size(df_blocks, df_blocks_dummy, df_blocks_clue, df_blocks_gnn, "NuGun-Run3")
plot_block_size_histo(df_blocks, df_blocks_dummy, df_blocks_clue, df_blocks_gnn, "NuGun-Run3")
plot_precision_recall(df_blocks, df_blocks_dummy, df_blocks_clue, df_blocks_gnn, "NuGun-Run3")

In [None]:
# QCD sample: load the per-file block summaries and draw the four comparison plots.
fl = glob.glob("../data/QCD_run3/step3*.pkl")
df_blocks = get_df(fl, "blocks")
df_blocks_dummy = get_df(fl, "blocks_dummy")
df_blocks_clue = get_df(fl, "blocks_clue")
# df_blocks_gnn was passed to the plots below without being assigned in this
# cell, so it was either undefined (fresh kernel) or left over from a
# different sample (stale kernel).
# NOTE(review): assumes the QCD pickles carry a "blocks_gnn" entry like the
# TTbar ones - confirm.
df_blocks_gnn = get_df(fl, "blocks_gnn")

plot_num_blocks(df_blocks, df_blocks_dummy, df_blocks_clue, df_blocks_gnn, "QCD-Run3")
plot_block_size(df_blocks, df_blocks_dummy, df_blocks_clue, df_blocks_gnn, "QCD-Run3")
plot_block_size_histo(df_blocks, df_blocks_dummy, df_blocks_clue, df_blocks_gnn, "QCD-Run3")
plot_precision_recall(df_blocks, df_blocks_dummy, df_blocks_clue, df_blocks_gnn, "QCD-Run3")

In [None]:
# TTbar sample: load the per-file block summaries (edge classifier, PFBlockAlgo
# baseline, CLUE, GNN) and draw the four comparison plots.
# `fl` stays bound to the TTbar file list after this cell and is reused by the
# later get_df(fl, ...) cells.
fl = glob.glob("../data/TTbar_run3/step3*.pkl")
df_blocks = get_df(fl, "blocks")
df_blocks_dummy = get_df(fl, "blocks_dummy")
df_blocks_clue = get_df(fl, "blocks_clue")
df_blocks_gnn = get_df(fl, "blocks_gnn")

# Each call saves a PDF whose name ends in "TTbar-Run3".
plot_num_blocks(df_blocks, df_blocks_dummy, df_blocks_clue, df_blocks_gnn, "TTbar-Run3")
plot_block_size(df_blocks, df_blocks_dummy, df_blocks_clue, df_blocks_gnn, "TTbar-Run3")
plot_block_size_histo(df_blocks, df_blocks_dummy, df_blocks_clue, df_blocks_gnn, "TTbar-Run3")
plot_precision_recall(df_blocks, df_blocks_dummy, df_blocks_clue, df_blocks_gnn, "TTbar-Run3")

In [None]:
# b = np.linspace(0,1,100)
# plt.hist(df_blocks["adjusted_mutual_info_score"], bins=b, label="Edge classifier");
# plt.hist(df_blocks_dummy["adjusted_mutual_info_score"], bins=b, label="PFBlockAlgo");
# plt.xlabel("adjusted MI score\n(higher is better)")
# plt.legend(frameon=False)

In [None]:
# Load the "cand_true_blocks" entry of every file in `fl`, one row per file.
# `fl` is whatever the most recently executed sample cell assigned (the TTbar
# list when the notebook is run top to bottom).
df_true_blocks = get_df(fl, "cand_true_blocks")

In [None]:
# Show which columns are available in the true-block candidate summary.
df_true_blocks.keys()

In [None]:
# One point per row of df_true_blocks: number of predicted candidates against
# number of true candidates.
plt.figure(figsize=(5,5))
plt.scatter(df_true_blocks["num_cands_true"], df_true_blocks["num_cands_pred"], marker=".")
plt.xlim(0,4000)
plt.ylim(0,4000)
plt.title("True blocks, true vs. predicted candidates")
# Dashed diagonal: predicted count equals true count.
plt.plot([0,4000], [0,4000], color="black", lw=1, ls="--")

plt.xlabel("number of true candidates")
plt.ylabel("number of predicted candidates")

In [None]:
# Confusion matrix of candidate counts, summed over all rows of df_true_blocks
# and normalised so that all cells add up to 100.
plt.figure(figsize=(5,5))
mat = df_true_blocks["ncand_confusion_matrix"].sum()
mat = 100.0 * mat / np.sum(mat)
plt.imshow(mat, cmap="Blues")
# Print each cell value; values above 60 are drawn in white.
text_in_box(mat, 60)
plt.colorbar()
# NOTE(review): assumes the matrix is 4x4 with row/column k meaning "k candidates" - confirm.
labels = range(4)
plt.xticks(range(len(labels)), labels=[int(x) for x in labels])
plt.yticks(range(len(labels)), labels=[int(x) for x in labels])
plt.xlim(-0.5,3.5)
plt.ylim(-0.5,3.5)
# The cells hold percentages, so say so in the title, as the pdgid
# confusion-matrix plots in this notebook already do.
plt.title("True blocks, true vs. predicted candidates (%)")
plt.xlabel("Number of true candidates")
plt.ylabel("Number of predicted candidates")

In [None]:
# pdgid confusion matrix, summed over all rows of df_true_blocks and
# normalised so that all cells add up to 100.
plt.figure(figsize=(10,10))
mat = df_true_blocks["pdgid_confusion_matrix"].sum()
mat = 100.0 * mat / np.sum(mat)
plt.imshow(mat, cmap="Blues")
# Print each cell value; values above 20 are drawn in white.
text_in_box(mat, 20)
plt.colorbar()
# NOTE(review): assumes the matrix rows/columns follow exactly this pdgid order - confirm
# against the code that fills "pdgid_confusion_matrix".
labels = [-211, -13, 0, 1, 2, 13, 22, 130, 211]
plt.xticks(range(len(labels)), labels=[int(x) for x in labels])
plt.yticks(range(len(labels)), labels=[int(x) for x in labels])
plt.xlim(-0.5,8.5)
plt.ylim(-0.5,8.5)

plt.title("True blocks, true vs. predicted candidates (%)")
plt.xlabel("pdgid of true candidates")
plt.ylabel("pdgid of predicted candidates")

In [None]:
# 2-D map of the "pt_matrix" entries summed over all rows of df_true_blocks.
# `bins` is only used for the axis extent (0 to 10 on both axes).
# NOTE(review): presumably these are the bin edges the matrix was filled with - confirm.
bins = np.linspace(0, 10, 20)
mat = df_true_blocks["pt_matrix"].sum()
plt.title("True blocks, true vs. predicted candidates")
# Log colour scale from 1 up to ten times the largest cell.
plt.imshow(mat, norm=LogNorm(vmin=1, vmax=10*np.max(mat)), origin="lower", cmap="Blues", extent=(min(bins), max(bins), min(bins), max(bins)))
plt.colorbar()

plt.xlabel("true candidate $p_T$ [GeV]")
plt.ylabel("predicted candidate $p_T$ [GeV]")

In [None]:
# 2-D map of the "eta_matrix" entries summed over all rows of df_true_blocks.
# `bins` is only used for the axis extent (-6 to 6 on both axes).
bins = np.linspace(-6, 6, 20)
mat = df_true_blocks["eta_matrix"].sum()
#mat = 100 * mat / np.sum(mat)
# Log colour scale from 1 up to ten times the largest cell.
plt.imshow(mat, norm=LogNorm(vmin=1, vmax=10*np.max(mat)), origin="lower", cmap="Blues", extent=(min(bins), max(bins), min(bins), max(bins)))
plt.colorbar()

plt.title("True blocks, true vs. predicted candidates")
# Raw strings: "\e" is not a valid Python escape sequence and triggers a
# DeprecationWarning/SyntaxWarning in a normal string literal. The resulting
# label text is unchanged.
plt.xlabel(r"true candidate $\eta$")
plt.ylabel(r"predicted candidate $\eta$")

In [None]:
# 2-D map of the "phi_matrix" entries summed over all rows of df_true_blocks.
# `bins` is only used for the axis extent (-4 to 4 on both axes).
mat = df_true_blocks["phi_matrix"].sum()
bins = np.linspace(-4, 4, 20)
# Log colour scale from 1 up to ten times the largest cell.
plt.imshow(mat, norm=LogNorm(vmin=1, vmax=10*np.max(mat)), origin="lower", cmap="Blues", extent=(min(bins), max(bins), min(bins), max(bins)))
plt.colorbar()

plt.title("True blocks, true vs. predicted candidates")
# Raw strings: "\p" is not a valid Python escape sequence and triggers a
# DeprecationWarning/SyntaxWarning in a normal string literal. The resulting
# label text is unchanged.
plt.xlabel(r"true candidate $\phi$")
plt.ylabel(r"predicted candidate $\phi$")

In [None]:
# Load the "cand_pred_blocks" entry of every file in `fl`, one row per file.
# `fl` is whatever the most recently executed sample cell assigned (the TTbar
# list when the notebook is run top to bottom).
df_pred_blocks = get_df(fl, "cand_pred_blocks")

In [None]:
# Show which columns are available in the predicted-block candidate summary.
df_pred_blocks.keys()

In [None]:
# One point per row of df_pred_blocks: number of predicted candidates against
# number of true candidates.
plt.figure(figsize=(5,5))
plt.scatter(df_pred_blocks["num_cands_true"], df_pred_blocks["num_cands_pred"], marker=".")
plt.xlim(0,4000)
plt.ylim(0,4000)
# Dashed diagonal: predicted count equals true count.
plt.plot([0,4000], [0,4000], color="black", lw=1, ls="--")

plt.title("True vs. predicted candidates\nusing predicted blocks")

# The plotted columns are candidate counts (num_cands_*), not block counts,
# so the axis labels say "candidates" to match the data and the title.
plt.xlabel("number of true candidates")
plt.ylabel("number of predicted candidates")

In [None]:
# One point per row of df_pred_blocks: number of matched candidates
# ("num_cands_matched") against number of true candidates.
plt.figure(figsize=(5,5))
plt.scatter(df_pred_blocks["num_cands_true"], df_pred_blocks["num_cands_matched"], marker=".")

plt.xlim(0,4000)
plt.ylim(0,4000)
# Dashed diagonal: matched count equals true count.
plt.plot([0,4000], [0,4000], color="black", lw=1, ls="--")

# NOTE(review): the title says "predicted" while the y axis shows matched candidates - confirm wording.
plt.title("True vs. predicted candidates\nusing predicted blocks")

plt.xlabel("number of true candidates")
plt.ylabel("number of matched candidates")

In [None]:
# pdgid confusion matrix, summed over all rows of df_pred_blocks and
# normalised so that all cells add up to 100.
plt.figure(figsize=(10,10))
mat = df_pred_blocks["pdgid_confusion_matrix"].sum()
mat = 100.0 * mat / np.sum(mat)
plt.imshow(mat, cmap="Blues")
# Print each cell value; values above 20 are drawn in white.
text_in_box(mat, 20)
plt.colorbar()
# NOTE(review): assumes the matrix rows/columns follow exactly this pdgid order - confirm
# against the code that fills "pdgid_confusion_matrix".
labels = [-211, -13, 0, 1, 2, 13, 22, 130, 211]
plt.xticks(range(len(labels)), labels=[int(x) for x in labels])
plt.yticks(range(len(labels)), labels=[int(x) for x in labels])
plt.xlim(-0.5,8.5)
plt.ylim(-0.5,8.5)

plt.title("Predicted blocks, true vs. predicted candidates (matched) (%)")
plt.xlabel("pdgid of true candidates")
plt.ylabel("pdgid of predicted candidates")

In [None]:
# 2-D map of the "pt_matrix" entries summed over all rows of df_pred_blocks.
# `bins` is only used for the axis extent (0 to 10 on both axes).
# NOTE(review): presumably these are the bin edges the matrix was filled with - confirm.
mat = df_pred_blocks["pt_matrix"].sum()
bins = np.linspace(0, 10, 20)

#mat = 100 * mat / np.sum(mat)
plt.title("Predicted blocks, true vs. matched candidates")
# Log colour scale from 1 up to ten times the largest cell.
plt.imshow(mat, norm=LogNorm(vmin=1, vmax=10*np.max(mat)), origin="lower", cmap="Blues", extent=(min(bins), max(bins), min(bins), max(bins)))
plt.colorbar()

plt.xlabel("true candidate $p_T$ [GeV]")
plt.ylabel("predicted candidate $p_T$ [GeV]")