In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="-1"    
os.environ["KERAS_BACKEND"] = "tensorflow"

import numpy as np
import glob
import matplotlib.pyplot as plt
import numba
from collections import Counter
import math
import sklearn
import sklearn.metrics
import sklearn.ensemble
import sklearn.cluster
import scipy.sparse
import keras
import sys
import pickle
import matplotlib

sys.path += ["../test"]
from train_clustering import encode_triu, decode_triu
from train_regression import get_unique_X_y

from matplotlib.colors import LogNorm

In [None]:
@numba.njit
def get_types_in_block(X, y, blk):
    """Return the sorted column-0 values of the rows of X belonging to one block.

    Rows are selected where ``y == blk``; column 0 of those rows is sorted in
    ascending order and every value is converted to ``int``.
    The function is compiled by numba (``njit``).
    """
    # presumably column 0 holds the element type code - the caller in this
    # notebook passes data["elements"] and treats the result as "types".
    return [int(x) for x in sorted(X[y==blk, 0])]

def get_blocksize_candsize_matrix(el_bl_id, cand_bl_id):
    """Histogram the blocks by (number of elements, number of candidates).

    el_bl_id:   block id of every element.
    cand_bl_id: block id of every candidate.

    Returns a 20x20 array of block counts whose first axis is the number of
    elements in the block and whose second axis is the number of candidates
    attributed to it (unit-width bins from 0 to 20). Only block ids that
    appear in ``el_bl_id`` are counted.
    """
    n_elems_per_block = Counter(el_bl_id)
    n_cands_per_block = Counter(cand_bl_id)

    unique_blocks = np.unique(el_bl_id)
    sizes = np.zeros((len(unique_blocks), 2), dtype=np.float32)
    for row, bl in enumerate(unique_blocks):
        # a Counter returns 0 for blocks that produced no candidate
        sizes[row, 0] = n_elems_per_block[bl]
        sizes[row, 1] = n_cands_per_block[bl]

    edges = np.linspace(0,20,21)
    counts, _, _ = np.histogram2d(sizes[:, 0], sizes[:, 1], bins=(edges, edges))
    return counts

Load all elements

In [None]:
all_sgs = []

# per-event multiplicities
num_clusters = []
num_tracks = []
num_cands = []
num_blocks = []

blsize_candsize_matrices = []

# glob returns files in filesystem order, so sort before slicing: otherwise "[:10]"
# would pick a different subset of events from one machine/run to the next.
for path in sorted(glob.glob("../data/TTbar/*ev*.npz"))[:10]:
    # np.load on a path returns an NpzFile that owns its file handle; the
    # with-block closes it as soon as the arrays have been read into memory.
    with np.load(path) as data:
        #list of PF input elements in the event
        X = data["elements"]

        #unique ID for each cluster/block of elements that the PFAlgo considered independently
        #this can be considered as the target output of an improved PFBlockAlgo
        y = data["element_block_id"]

        #List of candidates produced in the event.
        #This can be considered as the output of PFAlgo
        cands = data["candidates"]
        cand_block_id = data["candidate_block_id"]

    #tracks have type=1
    num_clusters += [np.sum(X[:, 0] != 1)]
    num_tracks += [np.sum(X[:, 0] == 1)]

    unique_blocks = np.unique(y)
    num_blocks += [len(unique_blocks)]
    num_cands += [len(cands)]

    #get the types of the elements for each cluster/block
    sgs = [tuple(get_types_in_block(X, y, blk)) for blk in unique_blocks]
    all_sgs += sgs

    blsize_candsize_matrices += [get_blocksize_candsize_matrix(y, cand_block_id)]

In [None]:
# Edges 0, 1, ..., 20; used below for the axis ticks and (without the last edge)
# as x positions of the cumulative plots.
bins = np.linspace(0,20,21)

In [None]:
# Add up the per-event (elements, candidates) histograms and draw the total
# with a logarithmic color scale.
cmat = sum(blsize_candsize_matrices)
log_norm = LogNorm(vmin=1, vmax=10*np.sum(cmat))

plt.figure(figsize=(6,6))
plt.imshow(cmat, origin="lower", interpolation=None, norm=log_norm)
plt.colorbar()

plt.xticks(bins);
plt.yticks(bins);

# rows of cmat (vertical axis) are element counts, columns (horizontal axis) candidate counts
plt.title("Miniblock size to number of\nproduced PFCandidates")
plt.xlabel("number of candidates")
plt.ylabel("number of elements in block")

In [None]:
# Every entry of cmat is a number of blocks, so summing over the candidate axis and
# accumulating over the element axis gives the cumulative fraction of *blocks* whose
# element count is at most x. The axis label previously said "fraction of candidates",
# which this quantity is not (that would require weighting by the candidate count).
block_fraction = cmat.sum(axis=1).cumsum()/np.sum(cmat)
plt.plot(bins[:-1], block_fraction, marker="o")
plt.xticks(bins);
plt.xlabel("maximum block size")
plt.ylabel("fraction of blocks")
plt.xlim(0,3)

In [None]:
# Same cumulative curve, zoomed on the tail (block sizes 2..20, fractions above 0.9).
# cmat holds block counts, so the curve is a fraction of blocks; the label previously
# said "fraction of candidates", which would need a weighting by the candidate count.
block_fraction = cmat.sum(axis=1).cumsum()/np.sum(cmat)
plt.plot(bins[:-1], block_fraction, marker="o")
plt.xticks(bins);
plt.xlabel("maximum block size")
plt.ylabel("fraction of blocks")
plt.ylim(0.9, 1.0)
plt.xlim(2,20)

In [None]:
# Overlay the per-event multiplicities on a common binning.
multiplicity_bins = np.linspace(0, 5000, 100)
for per_event_counts, legend_label in (
    (num_clusters, "clusters"),
    (num_tracks, "tracks"),
    (num_blocks, "blocks"),
    (num_cands, "candidates"),
):
    plt.hist(per_event_counts, bins=multiplicity_bins, label=legend_label, alpha=0.5)
plt.legend()
plt.xlabel("number of els/cands/blocks")
plt.ylabel("number of events")

Now we look at the number of blocks of a certain size.

In [None]:
# Tally how many blocks exist for each block size (number of elements in the block).
block_sizes = Counter(len(sg) for sg in all_sgs)
print("block sizes", block_sizes)

In [None]:
# Block-size distribution, linear y axis.
plt.hist(list(map(len, all_sgs)), bins=np.linspace(0,100,101));
plt.xlabel("block size")
plt.ylabel("Number of blocks")

In [None]:
# Block-size distribution again, with a logarithmic y axis to expose the tail.
plt.hist(list(map(len, all_sgs)), bins=np.linspace(0,100,101));
plt.xlabel("block size")
plt.yscale("log")
plt.ylabel("number of blocks")

Let's look at what the blocks of size 1, 2, 3 and 4 are made of.

In [None]:
def plot_block_nelem(blocks_nelem):
    """Draw a bar chart of a mapping label -> count on the current axes.

    One bar is drawn per item, in the mapping's iteration order; the keys are
    used as (vertically rotated) tick labels.
    """
    labels = list(blocks_nelem.keys())
    heights = np.array(list(blocks_nelem.values()))
    positions = np.arange(len(labels))

    plt.bar(positions, heights)
    plt.xticks(positions, labels, rotation=90)
    

# For block sizes 1..4: count every distinct combination of element types and plot it.
for blocksize in range(1,5):
    # one comma-joined type string (e.g. "1,4,5") per block of this size
    sizes = [",".join(str(tp) for tp in sg) for sg in all_sgs if len(sg)==blocksize]
    blocks_nelem = Counter(sizes)
    print("{0}-element blocks".format(blocksize), blocks_nelem)

    percentage = 100.0*len(sizes)/len(all_sgs)
    plt.figure(figsize=(4,4))
    plt.title("Blocks of size {0}: {1} ({2:.0f}%)".format(blocksize, len(sizes), percentage))
    plot_block_nelem(blocks_nelem)
    plt.xlabel("Block element types")

Look at the first 20 blocks.

In [None]:
# Read every array of the event into an in-memory dict. An NpzFile is lazy: each
# data["..."] access re-reads the array from the archive, and the file object
# that used to be opened here was never closed.
with np.load("../data/TTbar/step3_AOD_1_ev0.npz") as npz_file:
    data = {key: npz_file[key] for key in npz_file.files}

# Given a path, scipy opens and closes the file itself.
dm = scipy.sparse.load_npz("../data/TTbar/step3_AOD_1_dist0.npz").todense()
# Entries equal to 0 (which includes everything not stored in the sparse file)
# are replaced by the sentinel 999.
# NOTE(review): todense() yields np.matrix; consider toarray() if no cell relies on matrix semantics.
dm[dm==0] = 999

In [None]:
# Print, for the first 20 block ids, the elements of the block and the candidates made from it.
block_ids = data["element_block_id"]
inds_elem = np.arange(len(data["elements"]))
inds_cand = np.arange(len(data["candidates"]))
for blk in np.unique(block_ids)[:20]:
    elems_in_block = data["element_block_id"] == blk
    candidates_from_block = data["candidate_block_id"] == blk

    types_in_block = get_types_in_block(data["elements"], data["element_block_id"], blk)
    print("in block", blk, "had the following elements: {0}".format(types_in_block))
    for ielem in inds_elem[elems_in_block]:
        elem = data["elements"][ielem]
        # column 0 is printed as the type, column 1 as the energy
        print("  elements[{0}]: type={1} energy={2:.2f}".format(ielem, int(elem[0]), elem[1]))

    print("from which the following candidates were produced")
    for icand in inds_cand[candidates_from_block]:
        cand = data["candidates"][icand]
        # column 0 is printed as the pdgid, column 1 as the pt
        print("  candidates[{0}]: pdgid={1} pt={2:.2f}".format(icand, int(cand[0]), cand[1]))
    print()

In [None]:
import matplotlib.colors as mcolors

In [None]:
# NONE=0,
# TRACK=1, 
# PS1=2, 
# PS2=3, 
# ECAL=4, 
# HCAL=5,
# GSF=6,
# BREM=7,
# HFEM=8,
# HFHAD=9,
# SC=10,
# HO=11

In [None]:
def get_tracker_surface(elements, candidates):
    """Collect the points to draw on the tracker-surface panel.

    Returns ``(coords_trk, coords_cand)``:
    coords_trk  - ``(col2, col3)`` of every type-1 (TRACK) element whose two
                  coordinates are both non-zero;
    coords_cand - ``(col2, col3, col0)`` of every candidate.
    """
    coords_trk = [
        (el[2], el[3])
        for el in elements
        if el[0] == 1 and el[2] != 0 and el[3] != 0
    ]
    coords_cand = [(cand[2], cand[3], cand[0]) for cand in candidates]
    return coords_trk, coords_cand

def get_ecal_surface(elements):
    """Collect the points to draw on the ECAL panel.

    Returns ``(coords_trk, coords_cal)``:
    coords_trk - ``(col4, col5)`` of type-1/6 (TRACK/GSF) elements for which
                 both of these columns are non-zero;
    coords_cal - ``(col2, col3)`` of type 2, 3, 4 and 10 (PS1, PS2, ECAL, SC) elements.
    """
    coords_trk = []
    coords_cal = []
    for el in elements:
        tp = el[0]
        if tp in (1, 6) and not (el[4] == 0 or el[5] == 0):
            coords_trk.append((el[4], el[5]))
        elif tp in (2, 3, 4, 10):
            coords_cal.append((el[2], el[3]))
    return coords_trk, coords_cal

def get_hcal_surface(elements):
    """Collect the points to draw on the HCAL panel.

    Returns ``(coords_trk, coords_cal)``:
    coords_trk - ``(col6, col7)`` of type-1 (TRACK) elements for which both
                 of these columns are non-zero;
    coords_cal - ``(col2, col3)`` of type-5 (HCAL) elements.
    """
    coords_trk = [
        (el[6], el[7])
        for el in elements
        if el[0] == 1 and el[6] != 0 and el[7] != 0
    ]
    coords_cal = [(el[2], el[3]) for el in elements if el[0] == 5]
    return coords_trk, coords_cal

def get_hf_surface(elements):
    """Collect the points to draw on the HF panel.

    Returns the ``(col2, col3)`` pairs of all elements of type 8, 9 or 11
    (HFEM, HFHAD and HO in the element type table of this notebook).
    """
    return [(el[2], el[3]) for el in elements if el[0] in (8, 9, 11)]

def render_candidates(ax, cands, color, alpha=1.0, do_label=True):
    """Draw candidates on ``ax`` as filled circles, optionally with a text label.

    ax:       object providing ``scatter`` and ``text`` (a matplotlib Axes).
    cands:    iterable of ``(x, y, label_value)`` triples.
    color:    marker color.
    alpha:    opacity of markers and labels.
    do_label: when true, write ``int(label_value)`` next to every marker.

    Nothing is drawn for an empty ``cands``.
    """
    cands = list(cands)
    if not cands:
        return

    # One scatter call for all points: creating one artist per point is very slow
    # when a whole event (thousands of points) is drawn as background.
    xs = [c[0] for c in cands]
    ys = [c[1] for c in cands]
    ax.scatter(xs, ys, s=50, marker="o", color=color, alpha=alpha)

    if do_label:
        for c in cands:
            ax.text(c[0]+0.5, c[1], "{0}".format(int(c[2])), ha="left", va="center", alpha=alpha)
        
def render_tracks(ax, tracks, color, alpha=1.0):
    """Draw track positions on ``ax`` as "x" markers.

    ax:     object providing ``scatter`` (a matplotlib Axes).
    tracks: iterable of ``(x, y)`` pairs.
    color:  marker color.
    alpha:  marker opacity.

    Nothing is drawn for an empty ``tracks``.
    """
    tracks = list(tracks)
    if not tracks:
        return
    # A single batched scatter call instead of one artist per point (much faster
    # for the thousands of background points of a full event).
    ax.scatter([t[0] for t in tracks], [t[1] for t in tracks], marker="x", color=color, alpha=alpha)
        
def render_calo(ax, clusters, color, alpha=1.0):
    """Draw calorimeter clusters on ``ax`` as square markers.

    ax:       object providing ``scatter`` (a matplotlib Axes).
    clusters: iterable of ``(x, y)`` pairs.
    color:    marker color.
    alpha:    marker opacity.

    Nothing is drawn for an empty ``clusters``.
    """
    clusters = list(clusters)
    if not clusters:
        return
    # A single batched scatter call instead of one artist per point (much faster
    # for the thousands of background points of a full event).
    ax.scatter([c[0] for c in clusters], [c[1] for c in clusters], s=50, marker="s", color=color, alpha=alpha)

In [None]:
# Extract the per-surface coordinates of a single block (id 10) of the loaded event.
block_id = 10
elems_of_block = data["elements"][data["element_block_id"] == block_id]
cands_of_block = data["candidates"][data["candidate_block_id"] == block_id]

tracker_coords_trk, tracker_coords_cand = get_tracker_surface(elems_of_block, cands_of_block)
ecal_coords_trk, ecal_coords_cal = get_ecal_surface(elems_of_block)
hcal_coords_trk, hcal_coords_cal = get_hcal_surface(elems_of_block)

In [None]:
import matplotlib.colors as mcolors
import ipywidgets.widgets as widgets
# Unique block ids of the loaded event; plot_blocks_index below slices into this array.
block_ids = np.unique(data["element_block_id"])

@widgets.interact_manual(index=(0, 1000), num_blocks=(1,5))
def plot_blocks_index(index=0, num_blocks=1):
    """Draw a group of consecutive blocks on four panels (tracker, ECAL, HCAL, HF).

    index:      which group of blocks to show; the group covers
                ``block_ids[index*num_blocks : index*num_blocks + num_blocks]``.
    num_blocks: number of blocks per group.

    The whole event is drawn first in faint gray, then every selected block is
    overlaid in its own color. Reads the notebook globals ``data`` and ``block_ids``.
    """
    elements = data["elements"]
    candidates = data["candidates"]
    element_block_id = data["element_block_id"]
    candidate_block_id = data["candidate_block_id"]

    def _abbreviate(values):
        # "[a, b, ...]" showing at most five entries followed by "..."
        shown = list(values)
        if len(shown)>5:
            shown = shown[:5] + ["..."]
        return "[" + ", ".join([str(x) for x in shown]) + "]"

    colors = list(mcolors.TABLEAU_COLORS)

    plt.figure(figsize=(10, 10), dpi=100)

    # four panels with identical axis ranges
    panels = []
    for position, panel_title in enumerate(["tracker surface", "ECAL", "HCAL", "HF"], start=1):
        panel = plt.subplot(2, 2, position)
        plt.title(panel_title)
        plt.xlim(-6, 6)
        plt.ylim(-4, 4)
        panels.append(panel)
    ax1, ax2, ax3, ax4 = panels

    first = index*num_blocks
    block_ids_plot = block_ids[first:first + num_blocks]

    # figure title: the selected block ids, then one "elements -> candidates" line per block
    title_lines = ["Blocks [{0}]".format(", ".join([str(s) for s in block_ids_plot]))]
    for blid in block_ids_plot:
        elem_types = elements[element_block_id == blid][:, 0].astype(int)
        cand_ids = candidates[candidate_block_id == blid][:, 0].astype(int)
        title_lines.append("Block {0}: {1} -> {2}".format(
            blid,
            _abbreviate(elem_types),
            _abbreviate(cand_ids),
        ))
    plt.suptitle("\n".join(title_lines), y=0.9, va="bottom")

    def _draw(elems, cands, color, marker_alpha, track_alpha, do_label):
        # project the given elements/candidates onto every surface and render them
        tracker_coords_trk, tracker_coords_cand = get_tracker_surface(elems, cands)
        ecal_coords_trk, ecal_coords_cal = get_ecal_surface(elems)
        hcal_coords_trk, hcal_coords_cal = get_hcal_surface(elems)
        hf_coords_cal = get_hf_surface(elems)

        render_candidates(ax1, tracker_coords_cand, color, alpha=marker_alpha, do_label=do_label)
        render_tracks(ax1, tracker_coords_trk, color, alpha=track_alpha)

        render_calo(ax2, ecal_coords_cal, color, alpha=marker_alpha)
        render_tracks(ax2, ecal_coords_trk, color, alpha=track_alpha)

        render_calo(ax3, hcal_coords_cal, color, alpha=marker_alpha)
        render_tracks(ax3, hcal_coords_trk, color, alpha=track_alpha)

        render_calo(ax4, hf_coords_cal, color, alpha=marker_alpha)

    # faint gray background: the complete event, without labels
    _draw(elements, candidates, "gray", 0.05, 0.05, False)

    # highlighted overlay: one color per selected block
    for block_id in block_ids_plot:
        color = colors.pop()
        _draw(
            elements[element_block_id == block_id],
            candidates[candidate_block_id == block_id],
            color, 0.5, 1.0, True,
        )

In [None]:
# for i in range(585):
#     plot_blocks_index(i, 3)
#     plt.savefig("blocks_idx{0}.png".format(i), bbox_inches=0)
#     plt.clf()

In [None]:
import networkx as nx

In [None]:
def cluster_with_dm_aspf(dm):
    """Label the connected components of the graph defined by a distance matrix.

    dm: square (N, N) array or np.matrix. Elements i and j are linked when
        ``dm[i, j]`` is neither 0 nor the "no link" sentinel 999; a link in
        either direction connects the two elements.

    Returns an int64 array of length N holding the component index of every
    element. Components are numbered from 0 in order of their lowest element index.

    The components are found with scipy's csgraph instead of
    ``nx.from_numpy_matrix`` (removed in networkx 3.0), and the output length now
    follows ``dm`` instead of the notebook-global ``data``.
    """
    from scipy.sparse.csgraph import connected_components

    dist_arr = np.asarray(dm)
    n_elem = dist_arr.shape[0]
    if n_elem == 0:
        return np.zeros(0, dtype=np.int64)

    # same link definition as before: anything that is not 0 and not the 999 sentinel
    adjacency = (dist_arr != 999) & (dist_arr != 0)
    graph = scipy.sparse.csr_matrix(adjacency, dtype=np.float64)
    _, labels = connected_components(graph, directed=False)
    return labels.astype(np.int64)

def cluster_with_dm_optics(dm):
    """Cluster the elements with OPTICS on a precomputed distance matrix.

    dm: square array or np.matrix of pairwise distances. Zero entries are
        replaced by the sentinel 999 (on a copy; ``dm`` itself is not modified).

    Returns the labels from ``OPTICS.fit_predict`` (min_samples=1, max_eps=0.25).
    """
    # Work on a plain ndarray copy: dm arrives as np.matrix (scipy's todense()),
    # and newer scikit-learn releases reject np.matrix input.
    dm2 = np.array(dm)
    dm2[dm2==0] = 999
    # The leftover debug print of the full matrix has been removed.
    optics = sklearn.cluster.OPTICS(min_samples=1, metric="precomputed", max_eps=0.25)
    return optics.fit_predict(dm2)

In [None]:
# Cluster the elements of the loaded event from its distance matrix in two ways:
# connected components of the link graph, and OPTICS.
block_id_aspf = cluster_with_dm_aspf(dm)
block_id_optics = cluster_with_dm_optics(dm)

In [None]:
# Adjusted Rand index between the stored block ids and the connected-component labels.
sklearn.metrics.adjusted_rand_score(data["element_block_id"], block_id_aspf)

In [None]:
# Adjusted Rand index between the stored block ids and the OPTICS labels.
sklearn.metrics.adjusted_rand_score(data["element_block_id"], block_id_optics)

In [None]:
# Adjusted mutual information between the stored block ids and the connected-component labels.
sklearn.metrics.adjusted_mutual_info_score(data["element_block_id"], block_id_aspf, average_method='arithmetic')

In [None]:
# Adjusted mutual information between the stored block ids and the OPTICS labels.
sklearn.metrics.adjusted_mutual_info_score(data["element_block_id"], block_id_optics, average_method='arithmetic')

In [None]:
# np.bool was only an alias of the builtin bool; it was deprecated in NumPy 1.20 and
# removed in 1.24, so use the builtin directly.
msk_tracks_in_ecal = np.zeros(len(data["elements"]), dtype=bool)

In [None]:
@numba.njit
def find_tracks_in_ecal(elements):
    """Boolean mask of type-1/6 (TRACK/GSF) elements with non-zero columns 4 and 5.

    Returns a bool array with one entry per row of ``elements``.
    """
    n_elem = len(elements)
    flags = np.zeros((n_elem, ), dtype=np.int32)
    for idx in range(n_elem):
        el_type = elements[idx, 0]
        is_track = el_type == 1 or el_type == 6
        if is_track and elements[idx, 4] != 0 and elements[idx, 5] != 0:
            flags[idx] = 1
    return flags == 1

@numba.njit
def find_tracks_in_hcal(elements):
    """Boolean mask of type-1 (TRACK) elements with non-zero columns 6 and 7.

    Returns a bool array with one entry per row of ``elements``.
    """
    n_elem = len(elements)
    flags = np.zeros((n_elem, ), dtype=np.int32)
    for idx in range(n_elem):
        if elements[idx, 0] == 1 and elements[idx, 6] != 0 and elements[idx, 7] != 0:
            flags[idx] = 1
    return flags == 1

@numba.njit
def find_hcal_clusters(elements):
    """Boolean mask of the type-5 (HCAL) elements, one entry per row of ``elements``."""
    n_elem = len(elements)
    flags = np.zeros((n_elem, ), dtype=np.int32)
    for idx in range(n_elem):
        if elements[idx, 0] == 5:
            flags[idx] = 1
    return flags==1

@numba.njit
def find_ecal_clusters(elements):
    """Boolean mask of the type-4 (ECAL) elements, one entry per row of ``elements``."""
    n_elem = len(elements)
    flags = np.zeros((n_elem, ), dtype=np.int32)
    for idx in range(n_elem):
        if elements[idx, 0] == 4:
            flags[idx] = 1
    return flags==1

In [None]:
# Select tracks with coordinates in columns 4-5 and the ECAL clusters, then draw both.
msk_trks_ecal = find_tracks_in_ecal(data["elements"])
msk_ecal_clusters = find_ecal_clusters(data["elements"])

# tracks use columns 4-5, clusters use columns 2-3
trks_ecal = data["elements"][msk_trks_ecal, 4:6]
ecal_clusters = data["elements"][msk_ecal_clusters, 2:4]

plt.figure(figsize=(20, 20))
plt.title("ECAL surface")
plt.scatter(trks_ecal[:, 0], trks_ecal[:, 1], marker="x", color="gray")
plt.scatter(ecal_clusters[:, 0], ecal_clusters[:, 1], s=100, marker="s", alpha=0.5, color="gray")

In [None]:
# Select tracks with coordinates in columns 6-7 and the HCAL clusters, then draw both.
msk_trks_hcal = find_tracks_in_hcal(data["elements"])
msk_hcal_clusters = find_hcal_clusters(data["elements"])

# tracks use columns 6-7, clusters use columns 2-3
trks_hcal = data["elements"][msk_trks_hcal, 6:8]
hcal_clusters = data["elements"][msk_hcal_clusters, 2:4]

plt.figure(figsize=(20, 20))
plt.title("HCAL surface")
plt.scatter(trks_hcal[:, 0], trks_hcal[:, 1], marker="x", color="gray")
plt.scatter(hcal_clusters[:, 0], hcal_clusters[:, 1], s=100, marker="s", alpha=0.5, color="gray")

In [None]:
@numba.njit
def dist(points, i, j):
    """Distance between points i and j with the second coordinate treated as periodic.

    The difference of coordinate 1 is wrapped into [-pi, pi) before it is
    combined in quadrature with the plain difference of coordinate 0.
    """
    deta = points[i][0] - points[j][0]
    # wrap the angular difference into [-pi, pi)
    dphi = np.mod(points[i][1] - points[j][1] + np.pi, 2*np.pi) - np.pi
    return np.sqrt(dphi**2 + deta**2)

In [None]:
@numba.njit
def fill_local_density(points, points_data, dc=0.3):
    """Fill column 0 of ``points_data`` with the local density of every point.

    The density of point i is the sum of the weights (column 1) of all points
    closer than ``dc``; the point's own weight counts fully, the weights of
    its neighbours count half. ``points_data`` is modified in place.
    """
    n_points = len(points)
    points_data[:, 0] = 0

    for a in range(n_points):
        for b in range(n_points):
            if dist(points, a, b) < dc:
                scale = 0.5
                if a == b:
                    scale = 1.0
                points_data[a, 0] += points_data[b, 1]*scale

@numba.njit
def find_nearest_higher(points, points_data):
    """Find, for every point, the closest point with a strictly higher density.

    Writes the distance to that point into column 2 of ``points_data`` and
    its index into column 3. A point without any higher-density point closer
    than 999 keeps distance 999.0 and index -1.
    """
    n_points = len(points)

    for a in range(n_points):
        best_dist = 999.0
        best_idx = -1
        rho_a = points_data[a, 0]

        for b in range(n_points):
            # only points with strictly higher density (column 0) are eligible
            if points_data[b, 0] > rho_a:
                d = dist(points, a, b)
                if d < best_dist:
                    best_idx = b
                    best_dist = d

        points_data[a, 2] = best_dist
        points_data[a, 3] = best_idx
        

def assign_cluster_id(points, points_types, points_data, rho_crit=10, delta_crit=0.2):
    """Assign a cluster id to every point from its density and nearest-higher link.

    points:       sequence of points; only its length is used here.
    points_types: element type of every point; only type-5 (HCAL) points may seed a cluster.
    points_data:  array with columns (rho, weight, delta, nearestHigher) as filled
                  by fill_local_density and find_nearest_higher.
    rho_crit:     a seed needs ``rho > rho_crit``.
    delta_crit:   a seed needs ``delta > delta_crit``; a point with
                  ``rho <= rho_crit`` and ``delta > 2*delta_crit`` is an outlier.

    Returns an int32 array with one cluster id per point. Seeds are numbered from 0
    in index order and pass their id down the chain of followers. Points that end
    up in no cluster (outliers, points whose chain never reaches a seed) get -1.
    """
    Np = len(points)
    # -1 = "not assigned". Initialising with 0 would silently merge every outlier and
    # every point unreachable from a seed into the cluster of the first seed (id 0).
    cluster_id = np.full((Np,), -1, dtype=np.int32)
    nClusters = 0

    buffer_seeds = []
    followers = {i: [] for i in range(Np)}

    for i in range(Np):
        rho = points_data[i, 0]
        delta = points_data[i, 2]
        isSeed = (rho > rho_crit) and (delta > delta_crit)
        isOutlier = (rho <= rho_crit) and (delta > 2*delta_crit)

        if isSeed and points_types[i]==5:
            cluster_id[i] = nClusters
            nClusters += 1
            buffer_seeds.append(i)
        elif not isOutlier:
            #add as a follower to the nearest highest point
            #(stored as a float in points_data, so convert to a proper index)
            nearestHighest = int(points_data[i, 3])
            if nearestHighest != -1:
                followers[nearestHighest].append(i)

    #Now set the cluster ID for all children of all seeds
    while len(buffer_seeds) > 0:
        i = buffer_seeds.pop()
        for fl in followers[i]:
            cluster_id[fl] = cluster_id[i]
            buffer_seeds.append(fl)

    return cluster_id

In [None]:
# Stack the HCAL-surface tracks and the HCAL clusters into one point set and cluster it.
points = np.vstack([trks_hcal, hcal_clusters])
points_types = np.hstack([data["elements"][msk_trks_hcal, 0], data["elements"][msk_hcal_clusters, 0]])
# block ids of the same points, in the same order
clid_true = np.hstack([data["element_block_id"][msk_trks_hcal], data["element_block_id"][msk_hcal_clusters]])

#rho, weight, delta, nearestHigher
points_data = np.zeros((len(points), 4))
points_data[:, 1] = 1.0

fill_local_density(points, points_data)
find_nearest_higher(points, points_data)
clid = assign_cluster_id(points, points_types, points_data, rho_crit=0.5, delta_crit=0.1)
nclusters = len(np.unique(clid))

In [None]:
# Distribution of the local density (column 0 of points_data).
plt.hist(points_data[:, 0], bins=np.linspace(0,30,100));

In [None]:
# Distribution of the distance to the nearest higher-density point (column 2 of points_data).
plt.hist(points_data[:, 2], bins=np.linspace(0,1,100));

In [None]:
# Tracks ("x") and HCAL clusters (squares) that enter the clustering.
plt.figure(figsize=(10, 10))
plt.scatter(trks_hcal[:, 0], trks_hcal[:, 1], marker="x")
plt.scatter(hcal_clusters[:, 0], hcal_clusters[:, 1], marker="s", alpha=0.2)

In [None]:
# All points, with the marker size set to 5x the local density (column 0 of points_data).
plt.figure(figsize=(10, 10))
plt.scatter(points[:, 0], points[:, 1], points_data[:, 0]*5, alpha=0.2)

In [None]:
# Density-scaled points plus a line from every point to its nearest higher-density point.
plt.figure(figsize=(10, 10))
plt.scatter(points[:, 0], points[:, 1], points_data[:, 0]*5, alpha=0.2)

for p0, nearest in zip(points, points_data[:, 3]):
    idx_nearest = int(nearest)
    if idx_nearest == -1:
        # no higher-density point was found for this one
        continue
    p1 = points[idx_nearest]
    plt.plot([p0[0], p1[0]], [p0[1], p1[1]], marker=None, lw=1, color="black")

In [None]:
# Display both lengths side by side: there should be one cluster id per point type.
len(clid), len(points_types)

In [None]:
def compute_cluster_centers(points, points_types, clid):
    """Mean position of the type-5 (HCAL) points of every cluster.

    points:       (N, >=2) array; columns 0 and 1 are averaged.
    points_types: array of N element types.
    clid:         array of N cluster ids.

    Returns a dict ``cluster id -> (mean of column 0, mean of column 1)``.
    Clusters without any type-5 point are left out.
    """
    is_hcal = points_types==5
    centers = {}
    for label in np.unique(clid):
        members = is_hcal & (clid==label)
        if not members.any():
            continue
        centers[label] = (points[members, 0].mean(), points[members, 1].mean())
    return centers

In [None]:
# Link every point to the center of the cluster it was assigned to.
centers = compute_cluster_centers(points, points_types, clid)

plt.figure(figsize=(10, 10))
plt.scatter(points[:, 0], points[:, 1], alpha=0.2)

# centers is keyed in np.unique(clid) order and only holds clusters that have a center
for i in centers:
    cx, cy = centers[i]
    ps = points[clid==i]
    for p in ps:
        # only draw the link when the second coordinates differ by less than 1.0
        if abs(p[1] - cy) < 1.0:
            plt.plot([p[0], cx], [p[1], cy], lw=1, marker=None, color="black")

In [None]:
# Same picture for the stored block ids (clid_true): link every point to its block center.
centers = compute_cluster_centers(points, points_types, clid_true)

plt.figure(figsize=(10, 10))
plt.scatter(trks_hcal[:, 0], trks_hcal[:, 1], marker="x")
plt.scatter(hcal_clusters[:, 0], hcal_clusters[:, 1], marker="s", alpha=0.2)

# centers is keyed in np.unique(clid_true) order and only holds blocks that have a center
for i in centers:
    cx, cy = centers[i]
    ps = points[clid_true==i]
    for p in ps:
        # only draw the link when the second coordinates differ by less than 1.0
        if abs(p[1] - cy) < 1.0:
            plt.plot([p[0], cx], [p[1], cy], lw=1, marker=None, color="black")

In [None]:
# Adjusted Rand index between the stored block ids of the HCAL-surface points and the
# cluster ids computed above.
sklearn.metrics.adjusted_rand_score(clid_true, clid)