In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="-1"    
os.environ["KERAS_BACKEND"] = "tensorflow"

import numpy as np
import glob
import matplotlib.pyplot as plt
import numba
from collections import Counter
import math
import sklearn
import sklearn.metrics
import sklearn.ensemble
import scipy.sparse
import keras
import sys
import pickle
sys.path += ["../test"]
from train_clustering import encode_triu, decode_triu
from train_regression import get_unique_X_y

from matplotlib.colors import LogNorm

In [None]:
@numba.njit
def get_types_in_block(X, y, blk):
    """Return the sorted element types of the rows of X that belong to block blk.

    Rows of X are selected where the corresponding entry of y equals blk, and
    column 0 of those rows (the element type) is read.

    Returns a list of ints in ascending order.
    """
    # Each selected value is cast to a plain int after sorting.
    return [int(x) for x in sorted(X[y==blk, 0])]

def get_blocksize_candsize_matrix(el_bl_id, cand_bl_id, bins=None):
    """Histogram blocks by (number of elements, number of candidates).

    Parameters
    ----------
    el_bl_id : array-like
        Block id of every element (one entry per element).
    cand_bl_id : array-like
        Block id of every candidate (one entry per candidate).
    bins : array-like, optional
        Bin edges used for both axes. Defaults to ``np.linspace(0, 20, 21)``,
        i.e. unit-wide bins from 0 to 20.

    Returns
    -------
    numpy.ndarray
        2D array ``c`` in which ``c[i, j]`` is the number of blocks whose
        element count falls into bin ``i`` and whose candidate count falls
        into bin ``j``. Blocks outside the bin range are not counted.
    """
    if bins is None:
        bins = np.linspace(0, 20, 21)

    # Only block ids that occur among the elements are considered; a block
    # that produced no candidate gets a candidate count of 0.
    blids, n_elems = np.unique(el_bl_id, return_counts=True)
    cands_counter = Counter(cand_bl_id)
    n_cands = np.array([cands_counter[bl] for bl in blids], dtype=np.float32)

    c, _, _ = np.histogram2d(n_elems.astype(np.float32), n_cands, bins=(bins, bins))
    return c

Load all elements

In [None]:
all_sgs = []

num_clusters = []
num_tracks = []
num_cands = []
num_blocks = []

blsize_candsize_matrices = []

# Sort the glob result so the same 10 files are selected on every run
# (glob returns paths in arbitrary order).
for path in sorted(glob.glob("../data/TTbar/*ev*.npz"))[:10]:
    # The context manager closes the archive once the arrays have been read.
    with np.load(path) as data:
        #list of PF input elements in the event
        X = data["elements"]

        #unique ID for each cluster/block of elements that the PFAlgo considered independently
        #this can be considered as the target output of an improved PFBlockAlgo
        y = data["element_block_id"]

        #List of candidates produced in the event.
        #This can be considered as the output of PFAlgo
        cands = data["candidates"]
        cand_block_id = data["candidate_block_id"]

    #tracks have type=1
    is_track = X[:, 0] == 1
    num_clusters.append(np.sum(~is_track))
    num_tracks.append(np.sum(is_track))

    unique_blocks = np.unique(y)
    num_blocks.append(len(unique_blocks))
    num_cands.append(len(cands))

    #get the types of the elements for each cluster/block
    sgs = [tuple(get_types_in_block(X, y, blk)) for blk in unique_blocks]
    all_sgs.extend(sgs)

    blsize_candsize_matrices.append(get_blocksize_candsize_matrix(y, cand_block_id))

In [None]:
# Integer bin edges 0, 1, ..., 20 used as axis ticks and x positions below.
bins = np.linspace(start=0, stop=20, num=21)

In [None]:
plt.figure(figsize=(6,6))
# Sum the per-event 2D histograms into one matrix: rows index the number of
# elements in a block, columns the number of candidates produced from it.
cmat = sum(blsize_candsize_matrices)
# interpolation="none" (the string) turns interpolation off; passing the
# Python value None would only select the rcParams default.
# With the log colour scale, empty cells lie below vmin=1.
plt.imshow(cmat, norm=LogNorm(vmin=1, vmax=10*np.sum(cmat)), origin="lower", interpolation="none")

plt.colorbar()
plt.xticks(bins);
plt.yticks(bins);

plt.title("Miniblock size to number of\nproduced PFCandidates")
plt.xlabel("number of candidates")
plt.ylabel("number of elements in block")

In [None]:
# cmat holds one entry per block, so summing over the candidate axis and
# accumulating gives the fraction of blocks with at most the given number of
# elements. The y-label therefore names blocks, not candidates.
block_fraction = cmat.sum(axis=1).cumsum()/np.sum(cmat)
plt.plot(bins[:-1], block_fraction, marker="o")
plt.xticks(bins);
plt.xlabel("maximum block size")
plt.ylabel("fraction of blocks")
plt.xlim(0,3)

In [None]:
# Same cumulative curve as a zoom on the tail (block sizes 2..20, top 10%).
# cmat holds one entry per block, so the curve is the fraction of blocks with
# at most the given number of elements; the y-label names blocks accordingly.
block_fraction = cmat.sum(axis=1).cumsum()/np.sum(cmat)
plt.plot(bins[:-1], block_fraction, marker="o")
plt.xticks(bins);
plt.xlabel("maximum block size")
plt.ylabel("fraction of blocks")
plt.ylim(0.9, 1.0)
plt.xlim(2,20)

In [None]:
# Overlay the per-event multiplicities on one common binning.
count_bins = np.linspace(0, 5000, 100)
for counts_per_event, series_label in (
    (num_clusters, "clusters"),
    (num_tracks, "tracks"),
    (num_blocks, "blocks"),
    (num_cands, "candidates"),
):
    plt.hist(counts_per_event, bins=count_bins, label=series_label, alpha=0.5)
plt.legend()
plt.xlabel("number of els/cands/blocks")
plt.ylabel("number of events")

Now we look at the number of blocks of a certain size.

In [None]:
# Tally how many blocks exist for each block size (number of elements).
block_sizes = Counter(map(len, all_sgs))
print("block sizes", block_sizes)

In [None]:
# Block-size distribution with unit-wide bins from 0 to 100 (linear scale).
sizes_of_blocks = list(map(len, all_sgs))
plt.hist(sizes_of_blocks, bins=np.linspace(0,100,101));
plt.xlabel("block size")
plt.ylabel("Number of blocks")

In [None]:
# Same block-size distribution, drawn with a logarithmic y axis.
sizes_of_blocks = list(map(len, all_sgs))
plt.hist(sizes_of_blocks, bins=np.linspace(0,100,101));
plt.yscale("log")
plt.xlabel("block size")
plt.ylabel("number of blocks")

Let's look at what the blocks of size 1, 2, 3 and 4 are made of.

In [None]:
def plot_block_nelem(blocks_nelem):
    """Draw a bar chart with one bar per entry of blocks_nelem.

    blocks_nelem maps a label to a count. Bars appear in the mapping's
    iteration order and the labels are used as x tick labels, rotated by
    90 degrees. The chart is drawn on the current matplotlib axes.
    """
    entries = list(blocks_nelem.items())
    labels = [label for label, _ in entries]
    heights = np.array([count for _, count in entries])
    positions = np.arange(len(entries))

    plt.bar(positions, heights)
    plt.xticks(positions, labels, rotation=90)
    

# For each small block size, count how often every combination of element
# types occurs among blocks of that size and draw the counts as a bar chart.
for blocksize in range(1,5):
    sizes = []
    for sg in all_sgs:
        if len(sg) == blocksize:
            sizes.append(",".join(str(elem_type) for elem_type in sg))
    blocks_nelem = Counter(sizes)
    print("{0}-element blocks".format(blocksize), blocks_nelem)

    plt.figure(figsize=(4,4))
    n_selected = len(sizes)
    percent_of_all = 100.0*n_selected/len(all_sgs)
    plt.title("Blocks of size {0}: {1} ({2:.0f}%)".format(blocksize, n_selected, percent_of_all))
    plot_block_nelem(blocks_nelem)
    plt.xlabel("Block element types")

Look at the first 20 blocks.

In [None]:
# Pass paths instead of manually opened file objects so that numpy/scipy own
# the file handles; the original file objects were never closed.
# `data` stays a lazily-read archive because later cells index it by key.
data = np.load("../data/TTbar/step3_AOD_1_ev0.npz")

dm = scipy.sparse.load_npz("../data/TTbar/step3_AOD_1_dist0.npz").todense()
# Zero entries (which includes everything not stored in the sparse matrix)
# are replaced by the sentinel value 999.
dm[dm==0] = 999

In [None]:
block_ids = data["element_block_id"]
inds_elem = np.arange(len(data["elements"]))
inds_cand = np.arange(len(data["candidates"]))

# For each of the first 20 distinct block ids, list the elements that make up
# the block and the candidates that carry the same block id.
for blk in np.unique(block_ids)[:20]:
    elems_in_block = data["element_block_id"] == blk
    candidates_from_block = data["candidate_block_id"] == blk

    types_in_block = get_types_in_block(data["elements"], data["element_block_id"], blk)
    print("in block", blk, "had the following elements: {0}".format(types_in_block))
    for ielem in inds_elem[elems_in_block]:
        elem_row = data["elements"][ielem]
        print("  elements[{0}]: type={1} energy={2:.2f}".format(ielem, int(elem_row[0]), elem_row[1]))
    print("from which the following candidates were produced")
    for icand in inds_cand[candidates_from_block]:
        cand_row = data["candidates"][icand]
        print("  candidates[{0}]: pdgid={1} pt={2:.2f}".format(icand, int(cand_row[0]), cand_row[1]))
    print()

In [None]:
import matplotlib.colors as mcolors


In [None]:
fig = plt.figure(figsize=(15,5))

# Read each array from the archive once instead of on every use below.
elements = data["elements"]
element_block_id = data["element_block_id"]
candidates = data["candidates"]
candidate_block_id = data["candidate_block_id"]

# Pick up to 10 blocks at random. A locally seeded generator keeps the
# selection (and therefore the figure) reproducible across re-runs.
block_ids = np.unique(element_block_id)
np.random.RandomState(0).shuffle(block_ids)
block_ids = block_ids[:10]
# One colour per selected block (the Tableau palette has 10 entries).
colors = list(mcolors.TABLEAU_COLORS)

block_masks = [(element_block_id == bid) for bid in block_ids]

# Element type codes: 1 = track; 4 and 5 are used here as ECAL and HCAL.
track_mask = (elements[:, 0] == 1)
ecal_mask = (elements[:, 0] == 4)
hcal_mask = (elements[:, 0] == 5)

# Three panels sharing the same axis range.
# NOTE(review): the plotted column pairs are presumably (eta, phi) at the
# respective surface — confirm against the producer of the .npz files.
ax1 = plt.subplot(1,3,1)
ax1.set_title("tracker surface")
ax2 = plt.subplot(1,3,2)
ax2.set_title("ECAL surface")
ax3 = plt.subplot(1,3,3)
ax3.set_title("HCAL surface")
for ax in (ax1, ax2, ax3):
    ax.set_xlim(-5, 5)
    ax.set_ylim(-5, 5)

for ibl, (bid, in_block) in enumerate(zip(block_ids, block_masks)):
    color = colors[ibl]
    block_tracks = elements[track_mask & in_block]
    block_cands = candidates[candidate_block_id == bid]
    block_ecal = elements[ecal_mask & in_block]
    block_hcal = elements[hcal_mask & in_block]

    #tracks (columns 2, 3) and the candidates produced from the block
    ax1.scatter(block_tracks[:, 2], block_tracks[:, 3], marker="x", color=color)
    ax1.scatter(block_cands[:, 2], block_cands[:, 3], marker="o", color=color, alpha=0.5, s=50)

    #tracks (columns 4, 5) together with the ECAL elements
    ax2.scatter(block_tracks[:, 4], block_tracks[:, 5], marker="x", color=color)
    ax2.scatter(block_ecal[:, 2], block_ecal[:, 3], marker="s", color=color, alpha=0.5, s=100)

    #tracks (columns 6, 7) together with the HCAL elements
    ax3.scatter(block_tracks[:, 6], block_tracks[:, 7], marker="x", color=color)
    ax3.scatter(block_hcal[:, 2], block_hcal[:, 3], marker="s", color=color, alpha=0.5, s=100)


In [None]:
# NOTE(review): `block_id1` was not defined in any earlier cell, so this cell
# raised NameError on a fresh kernel. Using the first of the selected block
# ids is an assumption — confirm which block was meant.
block_id1 = block_ids[0]
data["candidates"][data["candidate_block_id"] == block_id1]

In [None]:
# NOTE(review): `block_mask1` / `block_mask2` were not defined in any earlier
# cell, so this cell raised NameError on a fresh kernel. Using the masks of
# the first two selected blocks is an assumption — confirm which were meant.
block_mask1, block_mask2 = block_masks[0], block_masks[1]

# Track, ECAL and HCAL elements that belong to either of the two blocks.
all_elems = (track_mask | ecal_mask | hcal_mask) & (block_mask1 | block_mask2)
inds = np.nonzero(all_elems)[0]

In [None]:
# Display all columns of the elements selected by the all_elems mask.
data["elements"][all_elems, :]

In [None]:
# Print each pair of selected elements once (i1 < i2): the two indices, the
# two element types and the corresponding dm entry.
for i1 in inds:
    for i2 in inds:
        if i1 >= i2:
            continue
        type1 = data["elements"][i1, 0]
        type2 = data["elements"][i2, 0]
        print(i1, i2, type1, type2, dm[i1, i2])