In [None]:
import torch
import torch_geometric
import sklearn
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from torch_geometric.data import Data, DataLoader
import pandas
import mplhep

import sys
sys.path += ["../test"]

import train_end2end
%matplotlib inline

In [None]:
from collections import Counter

In [None]:
# Training run (directory name under ../data) and checkpoint epoch whose
# evaluation outputs are inspected in the rest of this notebook.
model = "PFNet6_TTbar_14TeV_TuneCUETP8M1_cfi_gen__npar_221712__cfg_ac56a704be__user_jpata__ntrain_1800__lr_0.001__1586582232"
epoch = 30

# Dataframe with the gen_*/cand_*/pred_* columns used by the cells below.
# NOTE: read_pickle unpickles the file, which can execute arbitrary code;
# only load files from a trusted source.
df_path = "../data/{}/epoch_{}/df.pkl.bz2".format(model, epoch)
big_df = pandas.read_pickle(df_path)
#big_df = pandas.read_pickle("../test/TTbar_14TeV_TuneCUETP8M1_cfi.pkl.bz2")

In [None]:
# Loss histories written next to the checkpoint: space-separated text files
# without a header row, so columns are addressed by integer position below.
csv_opts = dict(header=None, sep=" ")
losses_train = pandas.read_csv(
    "../data/{}/epoch_{}/losses_train.txt".format(model, epoch), **csv_opts
)
losses_test = pandas.read_csv(
    "../data/{}/epoch_{}/losses_test.txt".format(model, epoch), **csv_opts
)

# Row range [start_idx, idx) of the loss tables shown in the plots below.
start_idx = 0
idx = epoch

In [None]:
def rm(x, N):
    """Return the running mean of ``x`` over a window of ``N`` samples.

    Only fully overlapping windows are kept (``mode='valid'``), so the
    result has ``len(x) - N + 1`` entries; ``N=1`` returns the values of
    ``x`` unchanged (as a float array).
    """
    window = np.ones((N,)) / N  # uniform weights that sum to one
    smoothed = np.convolve(x, window, mode='valid')
    return smoothed

In [None]:
# Total loss (sum over all columns of the loss tables) for rows
# [start_idx, idx): train on the left axis (blue), test on a twin right
# axis (orange) so the two curves can have independent scales.
ax = plt.axes()
ax.plot(rm(losses_train.values.sum(axis=1)[start_idx:idx], 1), color="blue")
ax.set_ylabel("train loss", color="blue")
ax = ax.twinx()
ax.plot(rm(losses_test.values.sum(axis=1)[start_idx:idx], 1), color="orange")
# Fixed: this axis shows the test curve but was labelled "train loss".
ax.set_ylabel("test loss", color="orange");

In [None]:
# Column 0 of the loss tables for rows [start_idx, idx): train curve on the
# left axis (blue), test curve on a twin right axis (orange).
train_curve = rm(losses_train[0][start_idx:idx], 1)
test_curve = rm(losses_test[0][start_idx:idx], 1)

ax_train = plt.axes()
ax_train.plot(train_curve, color="blue")
ax_train.set_ylabel("train loss", color="blue")

# The twin shares the x axis and becomes the notebook's current `ax`.
ax = ax_train.twinx()
ax.plot(test_curve, color="orange")
ax.set_ylabel("test loss", color="orange")

In [None]:
# Column 1 of the loss tables for rows [start_idx, idx), smoothed with a
# 2-sample running mean: train on the left axis (blue), test on a twin
# right axis (orange).
ax = plt.axes()
ax.plot(rm(losses_train[1][start_idx:idx], 2), color="blue")
# Fixed: this axis shows the train curve but was labelled "test loss".
ax.set_ylabel("train loss", color="blue")
ax = ax.twinx()
ax.plot(rm(losses_test[1][start_idx:idx], 2), color="orange")
ax.set_ylabel("test loss", color="orange");

In [None]:
# Column 2 of the loss tables for rows [start_idx, idx): train on the left
# axis (blue), test on a twin right axis (orange).
ax = plt.axes()
ax.plot(rm(losses_train[2][start_idx:idx], 1), color="blue")
# Fixed: this axis shows the train curve but was labelled "test loss".
ax.set_ylabel("train loss", color="blue")
ax = ax.twinx()
ax.plot(rm(losses_test[2][start_idx:idx], 1), color="orange")
ax.set_ylabel("test loss", color="orange");

In [None]:
# Number of rows per generator-level particle id (displayed as the cell output).
Counter(big_df["gen_pid"])

In [None]:
# Rows with a nonzero generator-level pid; `msk` is reused by the next cell.
# (presumably gen_pid == 0 means "no gen particle" — TODO confirm)
msk = big_df["gen_pid"]!=0

# NOTE(review): only `import sklearn` is visible at the top of the notebook;
# `sklearn.metrics` may rely on another module having imported it — confirm.
# The first entry of class_labels is skipped, matching the mask above.
true_pid = big_df["gen_pid"][msk]
standard_pf_pid = big_df["cand_pid"][msk]
kept_labels = train_end2end.class_labels[1:]
confusion1 = sklearn.metrics.confusion_matrix(true_pid, standard_pf_pid, labels=kept_labels)

tick_names = [int(x) for x in train_end2end.class_labels][1:]
train_end2end.plot_confusion_matrix(cm=confusion1, target_names=tick_names, normalize=True)
plt.title("Standard PF")

In [None]:
# Same confusion matrix as for standard PF, but for the model prediction
# (pred_pid) against the generator-level pid, on the rows selected by `msk`.
true_pid = big_df["gen_pid"][msk]
mlpf_pid = big_df["pred_pid"][msk]
kept_labels = train_end2end.class_labels[1:]
confusion2 = sklearn.metrics.confusion_matrix(true_pid, mlpf_pid, labels=kept_labels)

tick_names = [int(x) for x in train_end2end.class_labels][1:]
train_end2end.plot_confusion_matrix(cm=confusion2, target_names=tick_names, normalize=True)
plt.title("ML-PF to generator-level")

In [None]:
# Bin edges for the efficiency plots: 21 uniformly spaced edges on [-4, 4]
# for eta and 21 log-spaced edges from 10**-1 to 10**2 for pt.
bins_eta = np.linspace(-4, 4, num=21)
bins_pt = np.logspace(-1, 2, num=21)

In [None]:
def get_eff(df, target_pid=None):
    """Return ``(fraction, uncertainty)`` of entries equal to ``target_pid``.

    ``df`` is an array-like of particle ids (a pandas Series when called
    through ``groupby(...).apply``). The fraction is the matching count
    divided by ``len(df)``; the uncertainty is ``sqrt(count) / len(df)``.
    """
    n_total = len(df)
    n_match = np.sum(df==target_pid)
    fraction = n_match / n_total
    uncertainty = np.sqrt(n_match) / n_total
    return (fraction, uncertainty)

In [None]:
def get_effs_cand_pred(pid, by, bins):
    """Binned identification efficiency of standard PF and ML-PF for one pid.

    Rows of the module-level ``big_df`` with ``gen_pid == pid`` are grouped
    by the bin index of column ``by`` and ``get_eff`` is applied to the
    ``cand_pid`` and ``pred_pid`` columns of each group.

    Side effect: writes the bin-index column ``by + "_bins"`` into ``big_df``.

    Binning uses ``np.searchsorted(bins, values)`` with its default left
    side, so group ``k`` holds values in ``(bins[k-1], bins[k]]`` and is
    reported at ``bins[k]``. Group ``0`` collects values ``<= bins[0]`` and
    is reported at ``bins[0]``. Group ``len(bins)`` collects values
    ``> bins[-1]`` (overflow) and is dropped.

    Parameters:
        pid: particle id to select on ``gen_pid`` and to match against.
        by: name of the ``big_df`` column to bin in (e.g. ``"gen_pt"``).
        bins: sorted, indexable sequence of bin edges.

    Returns:
        ``(xs, (ys_cand, es_cand), (ys_pred, es_pred))`` where ``xs`` are
        the bin positions and each ``(ys, es)`` pair holds efficiencies and
        their uncertainties as lists aligned with ``xs``.
    """
    bs = by + "_bins"
    big_df[bs] = np.searchsorted(bins, big_df[by])

    # Both PID columns share the same selection and grouping, so the set of
    # populated bins (and therefore xs) is identical for cand and pred.
    selected = big_df[big_df["gen_pid"] == pid]
    n_edges = len(bins)

    def _binned_eff(pid_column):
        """Return (xs, ys, es) for one PID column, excluding overflow."""
        vals = selected.groupby(bs)[pid_column].apply(get_eff, target_pid=pid)
        # Drop only the overflow group. The previous unconditional "[:-1]"
        # also discarded the last in-range bin when no overflow was present.
        vals = vals[vals.index < n_edges]
        xs = [bins[k] for k in vals.index]
        ys = [v[0] for v in vals.values]
        es = [v[1] for v in vals.values]
        return xs, ys, es

    xs, ys1, es1 = _binned_eff("cand_pid")
    _, ys2, es2 = _binned_eff("pred_pid")

    return xs, (ys1, es1), (ys2, es2)

In [None]:
#for pid in [211, -211, 130, 22, 1, 2, -11, 11]:
# For every particle id draw two figures, efficiency versus gen pt (log x
# axis) and versus gen eta, each comparing standard PF with ML-PF.
panels = [
    # (big_df column, bin edges, x label, log-scaled x axis)
    ("gen_pt", bins_pt, "gen pt", True),
    ("gen_eta", bins_eta, "gen eta", False),
]
for pid in [211, 130, 22, 1, 2]:
    for by, edges, xlabel, log_x in panels:
        plt.figure(figsize=(4,4))
        xs, (ys1, es1), (ys2, es2) = get_effs_cand_pred(pid, by, edges)
        plt.errorbar(xs, ys1, es1, lw=0, elinewidth=1, marker="v", label="standard PF")
        plt.errorbar(xs, ys2, es2, lw=0, elinewidth=1, marker="^", label="ML-PF")
        plt.legend(frameon=False)
        plt.ylim(0, 1.5)
        if log_x:
            plt.xscale("log")
        plt.xlabel(xlabel)
        plt.title("pid={}".format(pid))

In [None]:
# Side-by-side bars of the diagonal of each confusion matrix divided by its
# row sum, i.e. per class the fraction of gen particles given the same label.
xs = np.arange(len(confusion1))
bar_width = 0.2
class_names = [int(x) for x in train_end2end.class_labels][1:]

frac_standard = np.diag(confusion1)/confusion1.sum(axis=1)
frac_mlpf = np.diag(confusion2)/confusion2.sum(axis=1)

plt.bar(xs, frac_standard, width=bar_width, label="Standard PF")
plt.bar(xs + 0.2, frac_mlpf, width=bar_width, label="ML-PF (gen-level)")
# Ticks sit between the two bars of each class.
plt.xticks(xs+0.1, class_names);
#plt.yscale("log")
plt.ylabel("Fraction of GenParticles\nreconstructed")
plt.ylim(0,1.2)
plt.yticks(np.arange(0,1.1,0.1))
plt.legend(frameon=False)
#plt.axhline(1, color="black", lw=1.0)

In [None]:
def make_plot_reg(big_df, pid, pred_type, val, bins):
    """Draw three figures comparing a reconstructed quantity to its gen value.

    Only rows where both ``gen_pid`` and ``<pred_type>_pid`` equal ``pid``
    enter the histograms. The figures are:

    1. overlaid 1D histograms of ``gen_<val>`` ("true") and
       ``<pred_type>_<val>`` ("pred");
    2. a 2D histogram of true versus predicted values, titled with tpr,
       fpr, ngen and the Pearson correlation of the selected pairs;
    3. a histogram of ``|predicted / true|`` on [0, 2], titled with its
       mean and standard deviation.

    In the title, ``tpr`` is the fraction of rows with ``gen_pid == pid``
    that also have ``<pred_type>_pid == pid``. ``fpr`` is the fraction of
    rows with ``<pred_type>_pid == pid`` whose ``gen_pid`` differs, so it is
    normalised to the number of predicted rows, not to the number of
    negatives.

    Parameters:
        big_df: dataframe with ``gen_pid``, ``<pred_type>_pid``,
            ``gen_<val>`` and ``<pred_type>_<val>`` columns.
        pid: particle id to select.
        pred_type: column prefix of the reconstruction to evaluate
            (``"cand"`` or ``"pred"`` in this notebook).
        val: name of the quantity (``"pt"``, ``"eta"``, ``"phi"`` here).
        bins: bin edges used for the 1D and 2D histograms.
    """
    pid_col = "{}_pid".format(pred_type)
    true_col = "gen_{}".format(val)
    reco_col = "{}_{}".format(pred_type, val)

    # Compute each selection once and reuse it below.
    is_gen = big_df["gen_pid"] == pid
    is_reco = big_df[pid_col] == pid
    is_matched = is_gen & is_reco

    m = big_df[is_matched][[true_col, reco_col]].values
    true_vals = m[:, 0]
    reco_vals = m[:, 1]
    corr = np.corrcoef(true_vals, reco_vals)[0,1]

    plt.figure(figsize=(4,4))
    plt.hist(true_vals, bins=bins, histtype="step", lw=2, label="true")
    plt.hist(reco_vals, bins=bins, histtype="step", lw=2, label="pred")
    plt.xlabel(val)
    plt.legend(frameon=False)

    ngen = np.sum(is_gen)
    tpr = np.sum(is_matched) / float(ngen)
    fpr = np.sum(~is_gen & is_reco) / float(np.sum(is_reco))

    plt.figure(figsize=(4,4))
    plt.title("tpr={:.4f} fpr={:.4f}\nngen={} corr={:.4f}".format(tpr, fpr, ngen, corr))
    h = np.histogram2d(true_vals, reco_vals, bins=(bins, bins))
    mplhep.hist2dplot(h[0], h[1], h[2], cmap="Blues", cbar=False)
    plt.xlabel("True {}".format(val))
    plt.ylabel("Predicted {}".format(val))

    plt.figure(figsize=(4,4))
    # Ratio is predicted / true. Values above 100 (including inf from a
    # zero denominator) are clipped so they cannot dominate the mean/std
    # quoted in the title. No lower clip is needed after np.abs.
    var = np.abs(reco_vals / true_vals)
    var[var>100] = 100
    plt.hist(var, bins=np.linspace(0, 2, 101))
    # Fixed: the label previously read "true / predicted", the inverse of
    # the plotted ratio.
    plt.xlabel("predicted {} / true {}".format(val, val))
    plt.title("mu={:.4f} s={:.4f}".format(np.mean(var), np.std(var)))
    #plt.axvline(1.0, color="black")
    ##plt.yscale("log")


In [None]:
# pt regression plots for pid 211, first for standard PF ("cand"), then for
# the model prediction ("pred"), using 201 uniform bin edges on [0, 2].
pid = 211
val = "pt"
bins = np.linspace(0, 2, num=201)

for pred_type in ["cand", "pred"]:
    make_plot_reg(big_df, pid, pred_type, val, bins)

In [None]:
# eta regression plots for pid 211, first for standard PF ("cand"), then for
# the model prediction ("pred"), using 101 uniform bin edges on [-4, 4].
pid = 211
val = "eta"
bins = np.linspace(-4, 4, num=101)

for pred_type in ["cand", "pred"]:
    make_plot_reg(big_df, pid, pred_type, val, bins)

In [None]:
# phi regression plots for pid 211, first for standard PF ("cand"), then for
# the model prediction ("pred"), using 101 uniform bin edges on [-4, 4].
pid = 211
val = "phi"
bins = np.linspace(-4, 4, num=101)

for pred_type in ["cand", "pred"]:
    make_plot_reg(big_df, pid, pred_type, val, bins)

In [None]:
# n_preds = []
# n_trues = []
# for i in range(len(pred_ids)):
#     n_true = np.sum(true_ids[i]!=0)
#     n_pred = np.sum(pred_ids[i]!=0)
#     n_preds += [n_pred]
#     n_trues += [n_true]

In [None]:
# plt.figure(figsize=(5, 5))
# ax = plt.axes()
# plt.plot([1500,5000],[1500,5000], color="black", lw=0.5)
# plt.scatter(n_trues, n_preds, marker=".", alpha=0.5)
# plt.xlim(1500,5000)
# plt.ylim(1500,5000)
# plt.xlabel("Number of Target PF Candidates",fontsize=13)
# plt.ylabel("Number of Predicted GNN Candidates",fontsize=13)
# #plt.title("QCD Run3")

# plt.text(0.67, 1.05, "Run 3 (14 TeV)", transform=ax.transAxes, va="top", ha="left",size=12)
# plt.text(0.02, 0.98, "CMS", transform=ax.transAxes, va="top", ha="left",size=16, fontweight='bold')
# plt.text(0.18, 0.975, "Simulation Preliminary", transform=ax.transAxes, va="top", ha="left",size=12,style='italic')
# #plt.text(0.03, 0.92, "QCD dijet events", transform=ax.transAxes, va="top", ha="left",size=12)
# plt.text(0.03, 0.92, "$\mathrm{t}\overline{\mathrm{t}}$ events", transform=ax.transAxes, va="top", ha="left",size=12)
# plt.tight_layout()
# #plt.savefig("num_pred.pdf")

In [None]:
# cms = []
# for i in range(len(pred_ids)):
#     cm = sklearn.metrics.confusion_matrix(
#         true_ids[i],
#         pred_ids[i], labels=range(len(train_end2end.class_labels))
#     )
#     cms += [cm]
# cm = sum(cms)
# cm = cm / 1000.0
# cm = np.round(cm, 1)#.astype(np.int)

In [None]:
# train_end2end.plot_confusion_matrix(cm, [int(x) for x in train_end2end.class_labels], normalize=True)
# #plt.xlim(-0.5, 9.5)
# #plt.ylim(-0.5, 9.5)
# plt.title("Normalized Confusion Matrix (QCD Run3)")
# #plt.text(0.02, 0.98, "CMS Simulation, preliminary", transform=ax.transAxes, va="top", ha="left")
# #plt.tight_layout()
# plt.savefig("cm.pdf")

In [None]:
# pm = np.concatenate(pred_momenta)
# tm = np.concatenate(true_momenta)
# ti = np.concatenate(true_ids)
# pi = np.concatenate(pred_ids)


# pm[:, 0] = np.power(10, pm[:, 0])
# tm[:, 0] = np.power(10, tm[:, 0])

In [None]:
# plt.figure(figsize=(5, 5))

# ax = plt.axes()
# bins = np.linspace(0, 50, 100)
# h0 = plt.hist(pm[pi!=0, 0], bins=bins, histtype="step", lw=1, label="PF");
# h1 = plt.hist(tm[ti!=0, 0], bins=bins, histtype="step", lw=1, label="GNN");
# plt.yscale("log")
# plt.legend(frameon=False)
# plt.ylim(10, 1e7)

# plt.xlabel("Candidate $p_{\mathrm{T}}$ (a.u.)",fontsize=13)
# plt.ylabel("Number of Candidates",fontsize=13)
# #plt.title("QCD Run 3")

# plt.text(0.67, 1.05, "Run 3 (14 TeV)", transform=ax.transAxes, va="top", ha="left",size=12)
# plt.text(0.02, 0.98, "CMS", transform=ax.transAxes, va="top", ha="left",size=16, fontweight='bold')
# plt.text(0.18, 0.975, "Simulation Preliminary", transform=ax.transAxes, va="top", ha="left",size=12,style='italic')
# #plt.text(0.03, 0.92, "QCD dijet events", transform=ax.transAxes, va="top", ha="left",size=12)
# plt.text(0.03, 0.92, "$\mathrm{t}\overline{\mathrm{t}}$ events", transform=ax.transAxes, va="top", ha="left",size=12)
# plt.tight_layout()
# plt.savefig("pt_hist.pdf")

In [None]:
# plt.figure(figsize=(5, 5))
# ax = plt.axes()

# bins = np.linspace(-4, 4, 100)
# plt.hist(pm[pi!=0, 1], bins=bins, histtype="step", lw=1);
# plt.hist(tm[ti!=0, 1], bins=bins, histtype="step", lw=1);
# plt.yscale("log")

# plt.ylim(1000, 1e6)
# plt.xlabel("Candidate $\eta$ (a.u.)",fontsize=13)
# plt.ylabel("Number of Candidates",fontsize=13)
# #plt.title("QCD Run 3")
# plt.text(0.67, 1.05, "Run 3 (14 TeV)", transform=ax.transAxes, va="top", ha="left",size=12)
# plt.text(0.02, 0.98, "CMS", transform=ax.transAxes, va="top", ha="left",size=16, fontweight='bold')
# plt.text(0.18, 0.975, "Simulation Preliminary", transform=ax.transAxes, va="top", ha="left",size=12,style='italic')
# #plt.text(0.03, 0.92, "QCD dijet events", transform=ax.transAxes, va="top", ha="left",size=12)
# plt.text(0.03, 0.92, "$\mathrm{t}\overline{\mathrm{t}}$ events", transform=ax.transAxes, va="top", ha="left",size=12)
# plt.tight_layout()
# plt.savefig("eta_hist.pdf")

In [None]:
# plt.figure(figsize=(5, 5))

# ax = plt.axes()
# bins = np.linspace(-3, 3, 60)
# plt.hist(pm[pi!=0, 2], bins=bins, histtype="step", lw=1);
# plt.hist(tm[ti!=0, 2], bins=bins, histtype="step", lw=1);
# plt.yscale("log")
# plt.ylim(1000, 1e6)

# plt.xlabel("Candidate $\phi$ (a.u.)",fontsize=13)
# plt.ylabel("Number of Candidates",fontsize=13)
# #plt.title("QCD Run 3")

# plt.text(0.67, 1.05, "Run 3 (14 TeV)", transform=ax.transAxes, va="top", ha="left",size=12)
# plt.text(0.02, 0.98, "CMS", transform=ax.transAxes, va="top", ha="left",size=16, fontweight='bold')
# plt.text(0.18, 0.975, "Simulation Preliminary", transform=ax.transAxes, va="top", ha="left",size=12,style='italic')
# #plt.text(0.03, 0.92, "QCD dijet events", transform=ax.transAxes, va="top", ha="left",size=12)
# plt.text(0.03, 0.92, "$\mathrm{t}\overline{\mathrm{t}}$ events", transform=ax.transAxes, va="top", ha="left",size=12)
# plt.tight_layout()
# plt.savefig("phi_hist.pdf")

In [None]:
# plt.figure(figsize=(5, 5))
# ax = plt.axes()

# subidx = np.where((pi!=0)&(ti!=0))[0]
# rp = np.random.permutation(range(len(subidx)))[:1000]

# plt.scatter(pm[subidx[rp], 0], tm[subidx[rp], 0], marker=".", alpha=0.5)
# plt.xlim(0,2)
# plt.ylim(0,2)
# plt.plot([0,2],[0,2], color="black")

# plt.xlabel("Target PF Candidate $p_{\mathrm{T}}$ (a.u.)",fontsize=13)
# plt.ylabel("Predicted GNN Candidate $p_{\mathrm{T}}$ (a.u.)", fontsize=13)
# #plt.title("QCD Run 3, 1000 candidates")

# plt.text(0.67, 1.05, "Run 3 (14 TeV)", transform=ax.transAxes, va="top", ha="left",size=12)
# plt.text(0.02, 0.98, "CMS", transform=ax.transAxes, va="top", ha="left",size=16, fontweight='bold')
# plt.text(0.18, 0.975, "Simulation Preliminary", transform=ax.transAxes, va="top", ha="left",size=12,style='italic')
# #plt.text(0.03, 0.92, "QCD dijet events", transform=ax.transAxes, va="top", ha="left",size=12)
# plt.text(0.03, 0.92, "$\mathrm{t}\overline{\mathrm{t}}$ events", transform=ax.transAxes, va="top", ha="left",size=12)
# plt.tight_layout()
# plt.savefig("pt_corr.pdf")

In [None]:
# plt.figure(figsize=(5, 5))
# ax = plt.axes()

# plt.plot([-7, 7], [-7, 7], color="black", lw=0.5)
# plt.scatter(pm[subidx[rp], 1], tm[subidx[rp], 1], marker=".", alpha=0.5)
# plt.xlim(-7, 7)
# plt.ylim(-7, 7)

# plt.xlabel("Target PF Candidate $\eta$ (a.u.)",fontsize=13)
# plt.ylabel("Predicted GNN Candidate $\eta$ (a.u.)",fontsize=13)
# #plt.title("QCD Run 3, 1000 candidates")
# plt.text(0.67, 1.05, "Run 3 (14 TeV)", transform=ax.transAxes, va="top", ha="left",size=12)
# plt.text(0.02, 0.98, "CMS", transform=ax.transAxes, va="top", ha="left",size=16, fontweight='bold')
# plt.text(0.18, 0.975, "Simulation Preliminary", transform=ax.transAxes, va="top", ha="left",size=12,style='italic')
# #plt.text(0.03, 0.92, "QCD dijet events", transform=ax.transAxes, va="top", ha="left",size=12)
# plt.text(0.03, 0.92, "$\mathrm{t}\overline{\mathrm{t}}$ events", transform=ax.transAxes, va="top", ha="left",size=12)
# plt.tight_layout()
# plt.savefig("eta_corr.pdf")

In [None]:
# plt.figure(figsize=(5, 5))
# ax = plt.axes()

# plt.plot([-5, 5], [-5, 5], color="black", lw=0.5)
# plt.scatter(pm[subidx[rp], 2], tm[subidx[rp], 2], marker=".", alpha=0.5)
# plt.xlim(-3,3)
# plt.ylim(-3,3)


# plt.xlabel("Target PF Candidate $\phi$ (a.u.)",fontsize=13)
# plt.ylabel("Predicted GNN Candidate $\phi$ (a.u.)",fontsize=13)
# #plt.title("QCD Run3, 1000 candidates")

# plt.text(0.67, 1.05, "Run 3 (14 TeV)", transform=ax.transAxes, va="top", ha="left",size=12)
# plt.text(0.02, 0.98, "CMS", transform=ax.transAxes, va="top", ha="left",size=16, fontweight='bold')
# plt.text(0.18, 0.975, "Simulation Preliminary", transform=ax.transAxes, va="top", ha="left",size=12,style='italic')
# #plt.text(0.03, 0.92, "QCD dijet events", transform=ax.transAxes, va="top", ha="left",size=12)
# plt.text(0.03, 0.92, "$\mathrm{t}\overline{\mathrm{t}}$ events", transform=ax.transAxes, va="top", ha="left",size=12)
# plt.tight_layout()
# plt.savefig("phi_corr.pdf")

In [None]:
# import pandas as pd
# import tqdm

# import matplotlib as mpl
# mpl.rcParams['figure.figsize'] = [8.0, 6.0]
# mpl.rcParams['font.size'] = 12
# mpl.rcParams['legend.fontsize'] = 'large'
# mpl.rcParams['figure.titlesize'] = 'medium'

# d = full_dataset.get(1)
# d.batch = torch.zeros((len(d.x)), dtype=torch.long)
# d = d.to(device=device)
# train_end2end.data_prep(d, device=device)
# edges, cand_id_onehot, cand_momentum = model(d)
# output = edges.detach().cpu().numpy()
# d = full_dataset.get(1)
# x_data = d.x.detach().cpu().numpy()
# mask = ((x_data[:,4]==0) & (x_data[:,5]==0) & (x_data[:,6]==0) & (x_data[:,7]==0))
# good_index = np.zeros((x_data.shape[0],1,2),dtype=int)
# good_x = x_data[:,2:4].copy()                                                                            
# good_x[~mask] = x_data[~mask,2:4].copy()
# df = pd.DataFrame(good_x, columns=['eta','phi'])
# df['isTrack'] = ~mask
# row, col = d.edge_index.cpu().detach().numpy()
# y_truth = d.ycand.cpu().detach().numpy()

# min_phi = -1.25
# max_phi = 1.25
# min_eta = -1.25
# max_eta = 1.25
# extra = 1.0
# x = 'eta'
# y = 'phi'
# for plot_type in [['input'],['truth'],['output']]: 
#     k = 0
#     plt.figure(figsize=(8, 6))                        
#     for i, j in tqdm.tqdm(zip(row, col),total=len(y_truth)):
#         x1 = df[x][i]
#         x2 = df[x][j]
#         y1 = df[y][i]
#         y2 = df[y][j]
#         if (x1 < min_eta-extra or x1 > max_eta+extra) or (x2 < min_eta-extra or x2 > max_eta+extra): continue
#         if (y1 < min_phi-extra or y1 > max_phi+extra) or (y2 < min_phi-extra or y2 > max_phi+extra): continue
#         if 'input' in plot_type:
#             seg_args = dict(c='b',alpha=0.1,zorder=1)
#             plt.plot([df[x][i], df[x][j]],
#                  [df[y][i], df[y][j]], '-', **seg_args)
#         if 'truth' in plot_type and y_truth[k]:
#             seg_args = dict(c='r',alpha=0.8,zorder=2)
#             plt.plot([df[x][i], df[x][j]],
#                  [df[y][i], df[y][j]], '-', **seg_args)
#         if 'output' in plot_type:
#             seg_args = dict(c='g',alpha=output[k].item(),zorder=3)
#             plt.plot([df[x][i], df[x][j]],
#                  [df[y][i], df[y][j]], '-', **seg_args)
#         k+=1
#     cut_mask = (df[x] > min_eta-extra) & (df[x] < max_eta+extra) & (df[y] > min_phi-extra) & (df[y] < max_phi+extra)
#     cluster_mask = cut_mask & ~df['isTrack']
#     track_mask = cut_mask & df['isTrack']
#     plt.scatter(df[x][cluster_mask], df[y][cluster_mask],c='g',marker='o',s=50,zorder=4,alpha=1)
#     plt.scatter(df[x][track_mask], df[y][track_mask],c='b',marker='p',s=50,zorder=5,alpha=1)
#     plt.xlabel("Track or Cluster $\eta$",fontsize=18)
#     plt.ylabel("Track or Cluster $\phi$",fontsize=18)
#     plt.xlim(min_eta, max_eta)
#     plt.ylim(min_phi, max_phi)
#     plt.figtext(0.12, 0.90,'CMS',fontweight='bold', wrap=True, horizontalalignment='left', fontsize=20)
#     plt.figtext(0.22, 0.90,'Simulation Preliminary', style='italic', wrap=True, horizontalalignment='left', fontsize=18)
#     plt.figtext(0.67, 0.90,'Run 3 (14 TeV)',  wrap=True, horizontalalignment='left', fontsize=18)
#     plt.savefig('graph_%s_%s_%s.pdf'%(x,y,'_'.join(plot_type)))