In [None]:
# List the experiment directory; the prediction files loaded further down are read from a sub-directory of it.
!ls ../../experiments/pyg-cms_20250517_232752_544969/
# Create the output directory used by the savefig calls below; -p makes this a no-op if it already exists.
!mkdir -p pu_performance

In [None]:
import vector
from tqdm.auto import tqdm
import awkward as ak

import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

import numpy as np

import glob

import sys
sys.path += ["../../mlpf/plotting/"]
from plot_utils import experiment_label, sample_label

def cms_label(ax):
    """Put the CMS simulation label on ``ax``.

    Forwards to ``experiment_label`` with the fixed experiment name, the two
    tag strings and the ``x1`` position used for every figure in this notebook,
    and returns whatever ``experiment_label`` returns.
    """
    label_kwargs = dict(
        experiment="CMS",
        tag1="Simulation Preliminary",
        tag2="Run 3 (13.6 TeV)",
        x1=0.13,
    )
    return experiment_label(ax, **label_kwargs)

# Legend titles (matplotlib mathtext) for each sample, keyed by dataset name.
# NOTE: the variable name `prcoesses` (sic) is kept because the plotting cells
# further down look the titles up under exactly this spelling.
# The LaTeX parts are raw strings so that backslashes such as `\in` are not
# treated as (invalid) Python escape sequences; the newline is added separately.
_PU_RANGE = '\nPU 55-75'
prcoesses = {
    'cms_pf_ttbar': 't' + r'$\bar{t}$+PU' + _PU_RANGE,
    'cms_pf_qcd': r'QCD $p_{T}\in$[15, 3000] GeV' + _PU_RANGE,
    'cms_pf_qcd13p6': r'QCD $p_{T}\in$[15, 3000] GeV' + _PU_RANGE,
    'cms_pf_ztt': r'$Z\rightarrow \tau\tau$' + _PU_RANGE,
}

In [None]:
# Sample to analyse; must be one of the keys of the legend-title dict above.
which_process = 'cms_pf_qcd13p6'
# Upper bound on the number of prediction files that are read.
max_files = 500
#modify path as appropriate
pred_pattern = f'../../experiments/pyg-cms_20250517_232752_544969/preds_checkpoint-08-3.863894/{which_process}/*parquet'
# glob.glob returns matches in arbitrary order, so sort before truncating;
# otherwise the subset of files (and hence every plot) can differ between runs.
files = sorted(glob.glob(pred_pattern))[:max_files]
if not files:
    # Fail here with the offending pattern instead of a less obvious error downstream.
    raise FileNotFoundError(f'no parquet files match {pred_pattern!r}')
# Read every selected file and stitch them into a single awkward array.
arr = ak.concatenate([ak.from_parquet(file) for file in tqdm(files)])

In [None]:
# Display label (matplotlib mathtext) -> value compared against target.cls_id below.
# Insertion order is preserved on purpose: it fixes the order (and colours) of
# the curves in the ROC plot. Labels containing LaTeX are raw strings so that
# `\p`, `\m` and `\g` are not parsed as (invalid) Python escape sequences.
particle_types = {
    'ch. had.': 1,
    'n. had.': 2,
    r'$e^\pm$': 6,
    r'$\mu^\pm$': 7,
    r'$\gamma$': 5,
    'HF had.': 3,
    'HF e.m.': 4,
}

# Display label -> plain-ASCII tag used in the output file names of the
# per-particle-type histograms. Keys must match those of `particle_types`;
# raw strings avoid invalid escape sequences in the LaTeX labels.
particle_types_literal = {
    'ch. had.': 'chhad',
    'n. had.': 'nhad',
    r'$e^\pm$': 'e',
    r'$\mu^\pm$': 'mu',
    r'$\gamma$': 'gamma',
    'HF had.': 'hfhad',
    'HF e.m.': 'hfem',
}

# Cut on the target particle pT, applied to every particle type
# (presumably GeV, matching the "> 5 GeV" note in a later legend — confirm).
min_target_pt = 5
pt_mask = arr.particles.target.pt > min_target_pt  # & (arr.particles.pred.pt > 5)

# For each particle type store [target ispu, predicted ispu], restricted to
# particles whose target class matches and which pass the pT cut.
pu_target_pred = {}
for particle_type, cls_id in particle_types.items():
    keep = (arr.particles.target.cls_id == cls_id) & pt_mask
    pu_target_pred[particle_type] = [
        arr.particles.target[keep].ispu,
        arr.particles.pred[keep].ispu,
    ]

In [None]:
import mplhep as hep

# ROC curves of the pileup prediction, one curve per particle type.

# Apply the CMS style *before* creating the figure: rcParams such as the figure
# size are only read when the figure is created, so on a fresh kernel styling
# afterwards would leave this first figure with the default layout.
plt.style.use(hep.style.CMS)
fig, ax = plt.subplots()

for particle_type in particle_types:
    target_ispu, pred_ispu = pu_target_pred[particle_type]
    # Flatten once and reuse for both metrics. Target ispu == 1 is the positive class.
    is_pu_true = ak.flatten(target_ispu) == 1
    pu_score = ak.flatten(pred_ispu)
    fpr, tpr, thresholds = roc_curve(is_pu_true, pu_score)
    auc = roc_auc_score(is_pu_true, pu_score)
    # The two HF categories are drawn dashed to set them apart.
    ls = "--" if particle_type in ["HF had.", "HF e.m."] else "-"
    ax.plot(fpr, tpr, label = particle_type+', AUC = %0.3f' % auc, linewidth=2.5, ls=ls)

# Diagonal reference line.
ax.plot([0, 1], [0, 1],'k--', label='AUC=0.5')
ax.legend(loc = 'lower right', frameon=False, title=prcoesses[which_process], fontsize=16, title_fontsize=24)
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True positive rate', loc='top')
ax.set_xlabel('False positive rate', loc='right')

cms_label(ax)
#hep.cms.label("Preliminary", data=False, com=13.6, year='Run 3')

#modify path as appropriate
# Saved under its own name: the later ROC cell writes '{which_process}_pu_roc.pdf',
# which previously overwrote this figure whenever the notebook was run top to bottom.
fig.savefig(f'pu_performance/{which_process}_pu_roc_all.pdf')

In [None]:
# Insertion index of a 0.1 cut in the ascending (reversed) threshold array.
# `thresholds` is whatever the most recent roc_curve call assigned; when the cells
# are run in order that is the last iteration of the ROC loop in the previous cell.
# NOTE(review): looks like a leftover scratch check — confirm it is still needed.
np.searchsorted(thresholds[::-1], 0.1)

In [None]:
import mplhep as hep

# ROC curves for photons and neutral hadrons only, with a marker at the
# working point defined by a fixed cut on the predicted pileup score.

# Score cut whose working point is highlighted on each curve.
pu_threshold = 0.1

# Style first, then create the figure, so the figure itself is built with the CMS rcParams.
plt.style.use(hep.style.CMS)
fig, ax = plt.subplots()

for particle_type, ls in zip([r'$\gamma$', 'n. had.'], ["-", "--"]):
    target_ispu, pred_ispu = pu_target_pred[particle_type]
    # Flatten once and reuse for both metrics. Target ispu == 1 is the positive class.
    is_pu_true = ak.flatten(target_ispu) == 1
    pu_score = ak.flatten(pred_ispu)
    fpr, tpr, thresholds = roc_curve(is_pu_true, pu_score)
    # roc_curve returns decreasing thresholds, and (fpr[i], tpr[i]) describe the
    # selection score >= thresholds[i]. The working point of the cut is therefore
    # the LAST entry whose threshold is still >= the cut. The previous
    # `len(thresholds) - searchsorted(...)` pointed one entry further (first
    # threshold below the cut) and ran past the end of the arrays when no
    # threshold was below the cut.
    n_at_or_above = int(np.count_nonzero(thresholds >= pu_threshold))
    thresh_idx = max(n_at_or_above - 1, 0)
    auc = roc_auc_score(is_pu_true, pu_score)
    c = ax.plot(fpr, tpr, label = particle_type+', AUC = %0.3f' % auc, linewidth=2.5, ls=ls)
    # Marker in the same colour as the curve it belongs to.
    ax.plot([fpr[thresh_idx]], [tpr[thresh_idx]], marker="o", color=c[0].get_color(), markersize=10)

# Diagonal reference line.
ax.plot([0, 1], [0, 1],'k--', label='AUC=0.5')
ax.legend(loc = 'lower right', frameon=False, title=prcoesses[which_process] + "\n$p_{T,ptcl}$ > 5 GeV", fontsize=16, title_fontsize=24)
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True positive rate', loc='top')
ax.set_xlabel('False positive rate', loc='right')

cms_label(ax)
#hep.cms.label("Preliminary", data=False, com=13.6, year='Run 3')

#modify path as appropriate
fig.savefig(f'pu_performance/{which_process}_pu_roc.pdf')

In [None]:
# One figure per particle type: normalised distributions of the predicted
# pileup score, split by the target ispu flag (0 -> 'non-PU', 1 -> 'PU').
plt.style.use(hep.style.CMS)
bins = np.linspace(0,1,100)

for particle_type, (target_ispu, pred_ispu) in pu_target_pred.items():

    fig, ax = plt.subplots()

    for hist_label, ispu_flag in (('non-PU', 0), ('PU', 1)):
        scores = ak.flatten(pred_ispu[target_ispu == ispu_flag])
        ax.hist(scores, histtype='step', bins=bins, label=hist_label, linewidth=2.5, density=True)

    ax.set_yscale('log')
    ax.set_xlabel("MLPF pileup pred. ", loc='right')
    ax.set_ylabel('Particles', loc='top')

    cms_label(ax)

    ax.legend(title=prcoesses[which_process]+'\n'+particle_type, frameon=False, loc=1)
    # Raise the upper y limit by a factor 100 (two decades on the log axis)
    # to leave room for the label and legend above the histograms.
    ax.set_ylim(top=ax.get_ylim()[1]*100)

    #modify path as appropriate
    fig.savefig(f'pu_performance/{which_process}_{particle_types_literal[particle_type]}_pu.pdf')