In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import glob
import tqdm
import awkward as ak
import boost_histogram as bh
import sys

import mplhep
mplhep.style.use(mplhep.style.CMS)

In [None]:
# Directory containing the input parquet files, relative to the notebook's
# working directory. The loading cell globs "*.parquet" inside this directory.
path = "../data/clic_edm4hep/p8_ee_qcd_ecm365/"

In [None]:
def sum_overflow_into_last_bin(all_values):
    """Fold the two outermost entries into their neighbouring bins.

    Despite the name, both ends are handled: the first element of
    ``all_values`` (the underflow count when called from ``to_bh``) is added
    to the first regular bin, and the last element (the overflow count) is
    added to the last regular bin.

    Parameters
    ----------
    all_values : sequence supporting slicing and ``.copy()``
        One-dimensional bin contents including the two flow entries, i.e.
        ``[underflow, bin_0, ..., bin_n, overflow]``. Must contain at least
        one regular bin, otherwise indexing the empty interior raises
        ``IndexError``.

    Returns
    -------
    Same type as ``all_values[1:-1]``
        A new container with the regular bins only, where the edge bins
        include the flow counts. ``all_values`` itself is not modified.
    """
    # Slicing a NumPy array yields a view; copy it so that the assignments
    # below do not write through into the caller's buffer (e.g. the live
    # storage of a histogram).
    values = all_values[1:-1].copy()
    values[-1] = values[-1] + all_values[-1]
    values[0] = values[0] + all_values[0]
    return values


def to_bh(data, bins, cumulative=False):
    """Histogram ``data`` into a 1-D boost-histogram with variable-width bins.

    Parameters
    ----------
    data : array-like
        Values handed to ``Histogram.fill``.
    bins : array-like
        Bin edges used to build the ``bh.axis.Variable`` axis.
    cumulative : bool, optional
        When True, every regular bin is replaced by the sum of all regular
        bins minus the running sum up to and including that bin.

    Returns
    -------
    bh.Histogram
        The filled histogram. Its first and last regular bins additionally
        hold the underflow and overflow counts, as folded in by
        ``sum_overflow_into_last_bin``.
    """
    hist = bh.Histogram(bh.axis.Variable(bins))
    hist.fill(data)

    if cumulative:
        # np.cumsum is applied to the histogram object itself, i.e. it relies
        # on the histogram being convertible to an array of its bin contents.
        total = np.sum(hist.values())
        hist[:] = total - np.cumsum(hist)

    # Bin contents including the underflow/overflow entries at both ends.
    contents_with_flow = hist.values(flow=True)[:]
    hist[:] = sum_overflow_into_last_bin(contents_with_flow)
    return hist


In [None]:
# Load the datasets, process to flattened (X,ygen,ycand) format
ret = []  # not used in this cell; kept so any later cell that expects it still finds it

# glob.glob returns matches in arbitrary (filesystem-dependent) order, so sort
# before truncating; otherwise the 50-file subset can change between runs or machines.
filelist = sorted(glob.glob("{}/*.parquet".format(path)))[:50]
print(len(filelist))
if not filelist:
    # Fail early with a clear message instead of an opaque error from concatenating nothing.
    raise FileNotFoundError("no *.parquet files found in {}".format(path))

# Per-file chunks, one list per column read from the parquet files.
X_track = []
X_cluster = []

ygen_track = []
ygen_cluster = []

ycand_track = []
ycand_cluster = []

for fn in tqdm.tqdm(filelist):
    dd = ak.from_parquet(fn)

    X_track.append(dd["X_track"])
    X_cluster.append(dd["X_cluster"])

    ygen_track.append(dd["ygen_track"])
    ygen_cluster.append(dd["ygen_cluster"])

    ycand_track.append(dd["ycand_track"])
    ycand_cluster.append(dd["ycand_cluster"])

# Stitch the per-file chunks together along the event axis.
X_track = ak.concatenate(X_track)
X_cluster = ak.concatenate(X_cluster)
ygen_track = ak.concatenate(ygen_track)
ygen_cluster = ak.concatenate(ygen_cluster)
ycand_track = ak.concatenate(ycand_track)
ycand_cluster = ak.concatenate(ycand_cluster)


# Event selection: more than 5 tracks, more than 5 clusters, and more than 2
# track / cluster elements whose ygen column 0 is non-zero.
# NOTE(review): column 0 is presumably the particle type with 0 meaning
# "no associated gen particle" -- confirm against the dataset schema.
msk = (
    (ak.num(X_track) > 5)
    & (ak.num(X_cluster) > 5)
    & (ak.sum(ygen_track[:, :, 0] != 0, axis=1) > 2)
    & (ak.sum(ygen_cluster[:, :, 0] != 0, axis=1) > 2)
)

# Within each event, append the clusters after the tracks (axis=1), then keep
# only the selected events.
X = ak.concatenate([X_track, X_cluster], axis=1)[msk]
ygen = ak.concatenate([ygen_track, ygen_cluster], axis=1)[msk]
ycand = ak.concatenate([ycand_track, ycand_cluster], axis=1)[msk]

## Number of PFElements per event

In [None]:
# Distribution of the number of track and cluster elements per event,
# histogrammed from 0 to 200 in 100 equal-width bins.
# NOTE(review): this uses X_track / X_cluster as loaded, i.e. before the `msk`
# event selection is applied -- confirm that this is intended.
b = np.linspace(0, 200, 101)

h1, h2 = (to_bh(ak.num(elems), b) for elems in (X_track, X_cluster))

for hist, tag, col in ((h1, "tracks", "black"), (h2, "clusters", "red")):
    mplhep.histplot(hist, histtype="step", lw=2, label=tag, color=col)

plt.ylabel("Number of events")
plt.xlabel("Number of PFElements / event")
plt.legend()

In [None]:
# Compare column 2 of the gen-level and PF-candidate targets, keeping only
# entries whose column 0 is non-zero, flattened over all selected events.
# NOTE(review): column 2 is taken to be pT (per the axis label) and column 0
# the particle type -- confirm against the dataset schema.
b = np.logspace(-2, 3, 100)

gen_pt, cand_pt = (
    ak.flatten(target[target[:, :, 0] != 0][:, :, 2]) for target in (ygen, ycand)
)

h1, h2 = (to_bh(values, b) for values in (gen_pt, cand_pt))

for hist, tag in ((h1, "Gen"), (h2, "PF")):
    mplhep.histplot(hist, histtype="step", lw=2, label=tag)

plt.xscale("log")
plt.ylabel("Number of particles / bin")
plt.xlabel("particle $p_T$ [GeV]")
plt.legend()

## Energy per event

In [None]:
# Per-event sum of column 5 for the gen-level and PF-candidate targets.
# NOTE(review): column 5 is taken to be the energy (per the axis label) --
# confirm against the dataset schema.
E_gen_per_event, E_cand_per_event = (
    ak.to_numpy(ak.sum(target[:, :, 5], axis=1)) for target in (ygen, ycand)
)

# 60 equal-width bins from 0 to 500; `b` is also read by the 2D plot cell that follows.
b = np.linspace(0, 500, 61)
h1, h2 = (to_bh(energies, b) for energies in (E_gen_per_event, E_cand_per_event))

for hist, tag in ((h1, "Gen"), (h2, "PF")):
    mplhep.histplot(hist, histtype="step", lw=2, label=tag)

plt.ylabel("Number of events")
plt.xlabel("Total E per event [GeV]")
plt.legend()

In [None]:
# 2D comparison of the per-event summed energy: gen-level (x) vs. PF (y).
#
# Define the bin edges here instead of relying on whatever `b` currently holds:
# several other cells rebind `b` (including to log-spaced edges), so running
# this cell out of order would otherwise silently bin with the wrong edges.
b = np.linspace(0, 500, 61)

plt.figure(figsize=(12,10))
plt.hist2d(E_gen_per_event, E_cand_per_event, bins=(b, b), cmap="hot_r")
# Diagonal y = x spanning the binned range, as a visual reference for PF == Gen.
plt.plot([b[0], b[-1]], [b[0], b[-1]], lw=0.5, ls="--", color="black")
plt.xlabel("Gen sum E [GeV]")
plt.ylabel("PF sum E [GeV]")
plt.colorbar(label="events / bin")

## Gen vs. PF energy of individual particles

In [None]:
# Keep only element slots where both the gen-level and the PF-candidate target
# have a non-zero column 0, then compare column 5 of the two, entry by entry.
# NOTE(review): column 0 is presumably the particle type and column 5 the
# energy (per the axis labels) -- confirm against the dataset schema.
# Note that this rebinds `msk`, which the loading cell used for the event selection.
msk = (ygen[:, :, 0] != 0) & (ycand[:, :, 0] != 0)
gen_e, cand_e = (
    ak.to_numpy(ak.flatten(target[msk][:, :, 5])) for target in (ygen, ycand)
)

# 100 log-spaced bins between 1e-2 and 1e3 on both axes.
b = np.logspace(-2, 3, 101)

plt.figure(figsize=(12,10))

plt.hist2d(gen_e, cand_e, bins=(b, b), cmap="hot_r")
# Diagonal y = x as a visual reference for PF == Gen.
plt.plot([1e-2, 1e3], [1e-2, 1e3], lw=0.5, ls="--", color="black")
plt.yscale("log")
plt.xscale("log")

plt.ylabel("PF particle E [GeV]")
plt.xlabel("Gen particle E [GeV]")
plt.colorbar(label="particles / bin")