In [None]:
import glob
import math
from collections import Counter

import matplotlib
import matplotlib.pyplot as plt
import numba
import numpy as np
import sklearn
import sklearn.metrics

In [None]:
@numba.njit
def get_types_in_block(X, y, blk):
    """Return the sorted element types of one block as a list of ints.

    Selects the rows of ``X`` whose entry in ``y`` equals ``blk``, takes
    column 0 of those rows, sorts the values and casts each one to ``int``.
    """
    block_types = X[y == blk, 0]
    return [int(t) for t in sorted(block_types)]

Load all elements

In [None]:
#element-type signature (sorted tuple of types) of every block in every event
all_sgs = []

#per-event multiplicities, one entry per loaded event
num_clusters = []
num_tracks = []
num_cands = []
num_blocks = []

#glob returns paths in arbitrary order; sort them so that the same 500 events
#are selected on every run
for path in sorted(glob.glob("../data/TTbar/*ev*.npz"))[:500]:
    #let np.load own the file handle and close it when the `with` block exits,
    #instead of leaking one open file object per event
    with np.load(path) as npz:
        #copy the arrays out of the archive so that `data` stays usable after it
        #is closed (later cells index `data` of the last loaded event)
        data = {
            key: npz[key]
            for key in ("elements", "element_block_id", "candidates", "candidate_block_id")
        }

    #list of PF input elements in the event
    X = data["elements"]

    #tracks have type=1
    num_clusters += [np.sum(X[:, 0] != 1)]
    num_tracks += [np.sum(X[:, 0] == 1)]

    #unique ID for each cluster/block of elements that the PFAlgo considered independently
    #this can be considered as the target output of an improved PFBlockAlgo
    y = data["element_block_id"]
    block_ids_in_event = np.unique(y)
    num_blocks += [len(block_ids_in_event)]

    #List of candidates produced in the event.
    #This can be considered as the output of PFAlgo
    cands = data["candidates"]
    num_cands += [len(cands)]

    #get the types of the elements for each cluster/block
    sgs = [tuple(get_types_in_block(X, y, blk)) for blk in block_ids_in_event]
    all_sgs += sgs

In [None]:
#overlay the per-event cluster and track multiplicities on a common binning
nelem_bins = np.linspace(0,5000,100)
for counts, name in [(num_clusters, "clusters"), (num_tracks, "tracks")]:
    plt.hist(counts, bins=nelem_bins, label=name, alpha=0.5)
plt.legend()
plt.xlabel("number of elements")
plt.ylabel("number of events")

In [None]:
#distribution of the number of candidates per event
cand_bins = np.linspace(0,4000,100)
plt.hist(num_cands, bins=cand_bins);
plt.xlabel("number of candidates")
plt.ylabel("number of events")

In [None]:
#distribution of the number of distinct block ids per event
block_bins = np.linspace(0,4000,100)
plt.hist(num_blocks, bins=block_bins);
plt.xlabel("number of miniblocks")
plt.ylabel("number of events")

Now we look at the number of blocks of a certain size.

In [None]:
#count how many blocks there are of each size (number of elements in the block)
block_sizes = Counter(map(len, all_sgs))
print("block sizes", block_sizes)

In [None]:
#block size distribution on a linear scale, one bin per size from 0 to 100
sizes_per_block = [len(sg) for sg in all_sgs]
size_bins = np.linspace(0,100,101)
plt.hist(sizes_per_block, bins=size_bins);
plt.xlabel("block size")
plt.ylabel("Number of blocks")

In [None]:
#same block size distribution, with a logarithmic y axis to expose the tail
sizes_per_block = [len(sg) for sg in all_sgs]
size_bins = np.linspace(0,100,101)
plt.hist(sizes_per_block, bins=size_bins);
plt.yscale("log")
plt.xlabel("block size")
plt.ylabel("number of blocks")

Let's look at what the blocks of size 1, 2, 3 and 4 are made of.

In [None]:
def plot_block_nelem(blocks_nelem):
    """Draw a bar chart of a mapping from label to count on the current figure.

    One bar is drawn per entry of ``blocks_nelem``, in the mapping's iteration
    order; the keys are used as x tick labels, rotated by 90 degrees.
    """
    tick_labels = list(blocks_nelem.keys())
    counts = np.array(list(blocks_nelem.values()))
    positions = np.arange(len(tick_labels))

    plt.bar(positions, counts)
    plt.xticks(positions, tick_labels, rotation=90)
    

for blocksize in range(1,5):
    #comma-joined type signature of every block that has exactly `blocksize` elements
    sizes = [",".join(str(t) for t in sg) for sg in all_sgs if len(sg)==blocksize]
    blocks_nelem = Counter(sizes)
    print("{0}-element blocks".format(blocksize), blocks_nelem)

    #share of all blocks that have this size, shown in the figure title
    nsel = len(sizes)
    percent = 100.0*nsel/len(all_sgs)

    plt.figure(figsize=(4,4))
    plt.title("Blocks of size {0}: {1} ({2:.0f}%)".format(blocksize, nsel, percent))
    plot_block_nelem(blocks_nelem)
    plt.xlabel("Block element types")

Look at the first 20 blocks of the last loaded event.

In [None]:
#This cell uses `data`, `X` and `cands` as left behind by the last iteration of
#the event loading loop, i.e. it describes the last loaded event.
block_ids = data["element_block_id"]
#read the candidate block ids once, outside the loop: if `data` is an .npz
#archive, every `data[...]` access reads the array from the file again
cand_block_ids = data["candidate_block_id"]
inds_elem = np.arange(len(X))
inds_cand = np.arange(len(cands))
for blk in np.unique(block_ids)[:20]:
    #boolean masks selecting the candidates / elements that belong to this block
    candidates_from_block = cand_block_ids == blk
    elems_in_block = block_ids == blk

    print("in block", blk, "had the following elements: {0}".format(get_types_in_block(X, block_ids, blk)))
    for ielem in inds_elem[elems_in_block]:
        print("  elements[{0}]: type={1} energy={2:.2f}".format(ielem, int(X[ielem, 0]), X[ielem, 1]))
    print("from which the following candidates were produced")
    for icand in inds_cand[candidates_from_block]:
        print("  candidates[{0}]: pdgid={1} pt={2:.2f}".format(icand, int(cands[icand, 0]), cands[icand, 1]))
    print()

In [None]:
def get_unique_X_y(X, Xbl, y, ybl, blsize=3, maxn=3):
    """Build fixed-size (elements, candidates) array pairs, one pair per small block.

    Parameters:
        X: 2D array with one row per element.
        Xbl: block id of every row of ``X``.
        y: 2D array with one row per candidate.
        ybl: block id of every row of ``y``.
        blsize: blocks with more than this many elements are skipped; the kept
            blocks are zero-padded to exactly ``blsize`` element rows.
        maxn: at most this many candidate rows are kept per block (extra rows
            are dropped); the result is zero-padded to exactly ``maxn`` rows.

    Returns:
        ``(Xs, ys)``: two lists of equal length, holding for every kept block
        (in ascending block id order) the padded element array and the padded
        candidate array.
    """
    Xs, ys = [], []
    for bl in np.unique(Xbl):
        block_elems = X[Xbl == bl]
        n_elem = block_elems.shape[0]

        #keep only miniblocks with at most blsize elements to simplify the problem
        if n_elem > blsize:
            continue

        block_cands = y[ybl == bl][:maxn]
        n_cand = block_cands.shape[0]

        #append zero rows at the end so that every block has the same shape
        Xs.append(np.pad(block_elems, ((0, blsize - n_elem), (0, 0)), mode="constant"))
        ys.append(np.pad(block_cands, ((0, maxn - n_cand), (0, 0)), mode="constant"))

    return Xs, ys

In [None]:
all_Xs = []
all_ys = []

#glob returns paths in arbitrary order; sort them so that the same 500 events
#are selected on every run
for path in sorted(glob.glob("../data/TTbar/*ev*.npz"))[:500]:
    #np.load owns the file handle here and closes it when the `with` block exits
    with np.load(path) as npz:
        Xs, ys = get_unique_X_y(npz["elements"], npz["element_block_id"], npz["candidates"], npz["candidate_block_id"])

    #collect the per-block arrays in flat lists, so that an event in which no
    #block was selected simply contributes nothing
    all_Xs.extend(Xs)
    all_ys.extend(ys)

#one row per selected block: (number of blocks, padded rows per block, features)
all_Xs = np.stack(all_Xs)
all_ys = np.stack(all_ys)

#shuffle the blocks with a fixed seed, so that the train/validation split taken
#from this order is the same on every run
shuf = np.random.RandomState(0).permutation(len(all_Xs))
all_Xs = all_Xs[shuf]
all_ys = all_ys[shuf]

In [None]:
#display the shape of the stacked element array as a quick sanity check
all_Xs.shape

In [None]:
#feature 0 of every element / candidate row is split off as its "type";
#the remaining features are kept as the kinematic part
all_Xs_types, all_Xs_kin = all_Xs[:, :, 0], all_Xs[:, :, 1:]
all_ys_types, all_ys_kin = all_ys[:, :, 0], all_ys[:, :, 1:]

#flatten the kinematic part of every block into a single feature vector
nblk_X, nrow_X, nfeat_X = all_Xs_kin.shape
all_Xs_kin = np.copy(all_Xs_kin.reshape(nblk_X, nrow_X*nfeat_X))

nblk_y, nrow_y, nfeat_y = all_ys_kin.shape
all_ys_kin = np.copy(all_ys_kin.reshape(nblk_y, nrow_y*nfeat_y))

In [None]:
import sklearn.preprocessing

In [None]:
#fit one StandardScaler on the flattened element kinematics and one on the
#flattened candidate kinematics
#NOTE(review): both scalers are fitted on every row, including the rows that are
#later held out as validation data (X[ntrain:], y[ntrain:]) - confirm this is intended
scaler_X = sklearn.preprocessing.StandardScaler().fit(all_Xs_kin)
scaler_y = sklearn.preprocessing.StandardScaler().fit(all_ys_kin)

In [None]:
def _dense_one_hot_encoder():
    """Create a OneHotEncoder that returns dense arrays on old and new scikit-learn.

    Newer scikit-learn releases name the option ``sparse_output`` and no longer
    accept ``sparse``; older releases only know ``sparse`` and raise TypeError
    for the unknown keyword, in which case the old spelling is used.
    """
    try:
        return sklearn.preprocessing.OneHotEncoder(categories="auto", sparse_output=False)
    except TypeError:
        return sklearn.preprocessing.OneHotEncoder(categories="auto", sparse=False)


#separate encoders for the element types (inputs) and the candidate types (targets)
enc_X = _dense_one_hot_encoder()
enc_y = _dense_one_hot_encoder()

In [None]:
#model input: one-hot encoded element types followed by the scaled element kinematics
trf = enc_X.fit_transform(all_Xs_types)
X = np.hstack([trf, scaler_X.transform(all_Xs_kin)])

#model target: one-hot encoded candidate types followed by the scaled candidate kinematics
trf = enc_y.fit_transform(all_ys_types)
y = np.hstack([trf, scaler_y.transform(all_ys_kin)])

#number of leading target columns that belong to the one-hot part
num_onehot_y = trf.shape[1]

In [None]:
import keras

#fully connected network mapping the encoded elements of a block (X) to the
#encoded candidates of the block (y)
nunit = 512
dropout = 0.2
n_hidden_blocks = 4

model = keras.models.Sequential()
model.add(keras.layers.Dense(nunit, input_shape=(X.shape[1], )))

#identical hidden blocks: activation -> dropout -> dense -> batch normalisation
for i_block in range(n_hidden_blocks):
    #keras.layers.LeakyReLU is used instead of keras.layers.advanced_activations.LeakyReLU,
    #because the advanced_activations module does not exist in newer Keras releases
    model.add(keras.layers.LeakyReLU())
    model.add(keras.layers.Dropout(dropout))
    model.add(keras.layers.Dense(nunit))
    model.add(keras.layers.BatchNormalization())

model.add(keras.layers.LeakyReLU())
#linear output layer with one unit per target column
model.add(keras.layers.Dense(y.shape[1]))

#the learning rate is passed positionally: the keyword is `lr` in old Keras and
#`learning_rate` in newer releases, but it is the first argument in both
opt = keras.optimizers.Adam(1e-3)

model.compile(loss="mse", optimizer=opt)
model.summary()

In [None]:
#display one encoded input row and its target row as a sanity check
X[4, :], y[4, :]

In [None]:
#the first 80% of the rows are used for training, the rest for validation
ntrain = int(0.8*len(all_Xs))
train_set = (X[:ntrain], y[:ntrain])
valid_set = (X[ntrain:], y[ntrain:])

ret = model.fit(*train_set, validation_data=valid_set, batch_size=1000, epochs=100)

In [None]:
#training and validation loss per epoch; labelled so the two curves can be told apart
plt.plot(ret.history["loss"], label="training")
plt.plot(ret.history["val_loss"], label="validation")
plt.yscale("log")
plt.xlabel("epoch")
plt.ylabel("loss (MSE)")
plt.legend();

In [None]:
#run the model on every row (training and validation rows alike)
pp = model.predict(X, batch_size=10000)

In [None]:
#decode the one-hot part of the prediction back to candidate types
#NOTE(review): the scores are thresholded at 0.5 before decoding, which discards
#their ranking; presumably a row without any score above 0.5 decodes to some
#default category - confirm, and check whether decoding the raw scores is intended
pp_candids = enc_y.inverse_transform(pp[:, :num_onehot_y]>0.5)
#number of non-zero candidate types per row, predicted and true
ncands = np.sum(pp_candids!=0, axis=1)
ncands_true = np.sum(all_ys_types!=0, axis=1)

In [None]:
#Boolean row masks used to select subsets of the predictions.
#The builtin `bool` is used as dtype: the `np.bool` alias was removed from NumPy.

#rows that were not used for training
msk_test = np.zeros(len(X), dtype=bool)
msk_test[ntrain:] = True

#rows with exactly one true / exactly one predicted candidate
msk_1true = ncands_true == 1
msk_1pred = ncands == 1

#rows with exactly two / three true candidates
msk_2true = ncands_true == 2
msk_3true = ncands_true == 3

In [None]:
#display the distinct candidate type values that occur in the targets
np.unique(all_ys_types)

In [None]:
#rows whose first candidate slot has type 0
#presumably these are blocks without any candidate, since padded rows are all zeros - confirm
m = all_ys_types[:, 0] == 0

In [None]:
#count the element type combinations (sorted, as ints) among the rows selected by `m`
Counter([tuple(sorted(map(int, i))) for i in all_Xs_types[m]])

In [None]:
#confusion matrix of the number of candidates per block on the held-out rows
cmatrix_ncands = sklearn.metrics.confusion_matrix(ncands_true[msk_test], ncands[msk_test], labels=[0,1,2,3])
plt.imshow(cmatrix_ncands, norm=matplotlib.colors.LogNorm())
plt.colorbar()
#confusion_matrix puts the true class on the rows and the predicted class on the
#columns; imshow draws rows along y and columns along x
plt.xlabel("predicted ncand")
plt.ylabel("true ncand")

In [None]:
#confusion matrix of the first candidate's type, restricted to held-out rows
#that have exactly one true candidate
labels = np.unique(all_ys_types)
sel_test_1true = msk_test & msk_1true
mat = sklearn.metrics.confusion_matrix(
    all_ys_types[sel_test_1true, 0],
    pp_candids[sel_test_1true, 0],
    labels=labels,
)
mat

In [None]:
import matplotlib

In [None]:
plt.imshow(mat, norm=matplotlib.colors.LogNorm())
plt.colorbar()
#`mat` has the true class on the rows and the predicted class on the columns;
#imshow draws rows along y and columns along x
plt.xlabel("predicted pdgid")
plt.ylabel("true pdgid")

In [None]:
#undo the standardisation of the kinematic part of the prediction
#(the columns after the one-hot part)
pp_transformed = scaler_y.inverse_transform(pp[:, num_onehot_y:])

In [None]:
#zero the momentum of every candidate slot that was not predicted
@numba.njit
def postprocess_cand_momentum(pp_transformed, ncands_pred):
    """Zero, in place, the trailing columns of every row of ``pp_transformed``.

    For row ``i`` the first ``ncands_pred[i]*3`` columns are left untouched and
    all remaining columns are set to 0. Nothing is returned.
    """
    nrows = len(ncands_pred)
    for irow in range(nrows):
        #index of the first column that does not belong to a predicted candidate
        first_unused = ncands_pred[irow]*3
        pp_transformed[irow, first_unused:] = 0

In [None]:
#display one row of the unscaled prediction before it is post-processed
pp_transformed[1, :]

In [None]:
#modifies pp_transformed in place, using the predicted number of candidates per row
postprocess_cand_momentum(pp_transformed, ncands)

In [None]:
#held-out rows with exactly one true and exactly one predicted candidate
sel_1cand = msk_test & msk_1true & msk_1pred
true_pt = all_ys_kin[sel_1cand, 0]
pred_pt = pp_transformed[sel_1cand, 0]

plt.figure(figsize=(4,4))
plt.scatter(true_pt, pred_pt, marker=".")
#diagonal: perfect prediction
plt.plot([0,10],[0,10], lw=1, color="black")
plt.xlim(-1,10)
plt.ylim(-1,10)
plt.xlabel("First PFCandidate pT (true)")
plt.ylabel("First PFCandidate pT (predicted)")

plt.figure(figsize=(4,4))
bins = np.linspace(0,5,60)
#label the two overlaid histograms so that they can be told apart
plt.hist(true_pt, bins=bins, histtype="step", lw=2, label="true");
plt.hist(pred_pt, bins=bins, histtype="step", lw=2, label="predicted");
plt.xlabel("First PFCandidate pT")
plt.ylabel("number of blocks")
plt.legend();

In [None]:
#held-out rows with exactly one true and exactly one predicted candidate
sel_1cand = msk_test & msk_1true & msk_1pred
true_eta = all_ys_kin[sel_1cand, 1]
pred_eta = pp_transformed[sel_1cand, 1]

plt.figure(figsize=(4,4))

plt.scatter(true_eta, pred_eta, marker=".")
#diagonal: perfect prediction
plt.plot([-6,6],[-6,6], lw=1, color="black")
plt.xlim(-6,6)
plt.ylim(-6,6)
plt.xlabel("First PFCandidate eta (true)")
plt.ylabel("First PFCandidate eta (predicted)")

plt.figure(figsize=(4,4))
bins = np.linspace(-6,6,60)
#label the two overlaid histograms so that they can be told apart
plt.hist(true_eta, bins=bins, histtype="step", lw=2, label="true");
plt.hist(pred_eta, bins=bins, histtype="step", lw=2, label="predicted");
plt.xlabel("First PFCandidate eta")
plt.ylabel("number of blocks")
plt.legend();

In [None]:
#held-out rows with exactly one true and exactly one predicted candidate
sel_1cand = msk_test & msk_1true & msk_1pred
#third kinematic column of the first candidate
#NOTE(review): labelled as phi below, presumably following pT and eta - confirm
true_phi = all_ys_kin[sel_1cand, 2]
pred_phi = pp_transformed[sel_1cand, 2]

plt.figure(figsize=(4,4))

plt.scatter(true_phi, pred_phi, marker=".")
#diagonal: perfect prediction
plt.plot([-4,4],[-4,4], lw=1, color="black")
plt.xlim(-4,4)
plt.ylim(-4,4)
plt.xlabel("First PFCandidate phi (true)")
plt.ylabel("First PFCandidate phi (predicted)")

plt.figure(figsize=(4,4))
bins = np.linspace(-4,4,60)
#label the two overlaid histograms so that they can be told apart
plt.hist(true_phi, bins=bins, histtype="step", lw=2, label="true");
plt.hist(pred_phi, bins=bins, histtype="step", lw=2, label="predicted");
plt.xlabel("First PFCandidate phi")
plt.ylabel("number of blocks")
plt.legend();