In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="-1"    
os.environ["KERAS_BACKEND"] = "tensorflow"

import numpy as np
import glob
import matplotlib.pyplot as plt
import numba
from collections import Counter
import math
import sklearn
import sklearn.metrics
import sklearn.ensemble
import sklearn.cluster
import scipy.sparse
import keras
import sys
import pickle
import matplotlib

sys.path += ["../test"]
from train_clustering import encode_triu, decode_triu
from train_regression import get_unique_X_y
from benchmark_solution import create_points

from matplotlib.colors import LogNorm

In [None]:
@numba.njit
def get_types_in_block(X, y, blk):
    """Return the sorted integer values of column 0 of ``X`` for the rows where ``y == blk``.

    Callers in this notebook pass the per-event element array as ``X`` (column 0
    is used as the element type code) and the per-element block id as ``y``.
    The result is a list of Python ints in ascending order.
    """
    # select the rows of the block, take column 0, sort, and cast each value to int
    return [int(x) for x in sorted(X[y==blk, 0])]

def get_blocksize_candsize_matrix(el_bl_id, cand_bl_id, max_size=20):
    """Histogram blocks by (number of elements, number of candidates).

    Parameters
    ----------
    el_bl_id : array-like
        Block id of every input element.
    cand_bl_id : array-like
        Block id of every produced candidate.
    max_size : int, optional
        Upper edge of the histogram on both axes. The default of 20 reproduces
        the original hard-coded binning ``np.linspace(0, 20, 21)``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_size, max_size)``; entry ``[i, j]`` is the number
        of blocks with ``i`` elements and ``j`` candidates. Blocks that have no
        candidates are counted in column 0. Values above ``max_size`` fall
        outside the histogram range and are not counted; because the last
        histogram bin is closed on the right, a value equal to ``max_size``
        is counted in the last bin.
    """
    block_ids = np.unique(el_bl_id)
    elems_per_block = Counter(el_bl_id)
    cands_per_block = Counter(cand_bl_id)

    # column 0: number of elements in the block, column 1: number of candidates
    sizes = np.zeros((len(block_ids), 2), dtype=np.float32)
    for irow, bl in enumerate(block_ids):
        sizes[irow, 0] = elems_per_block[bl]
        sizes[irow, 1] = cands_per_block[bl]

    edges = np.linspace(0, max_size, max_size + 1)
    counts, _, _ = np.histogram2d(sizes[:, 0], sizes[:, 1], bins=(edges, edges))
    return counts

Load all elements

In [None]:
# Per-block tuples of sorted element types, accumulated over all events
all_sgs = []

# Per-event multiplicities
num_clusters = []
num_tracks = []
num_cands = []
num_blocks = []

# Per-event (block size, candidate count) histograms
blsize_candsize_matrices = []

# Sorted so that the per-event lists are filled in a reproducible order
for path in sorted(glob.glob("../data/TTbar_run3/*ev*.npz")):
    # Read the arrays inside a context manager so the archive is closed
    # again for every event instead of leaking one open file per event.
    with np.load(path) as data:
        #list of PF input elements in the event
        X = data["elements"]

        #unique ID for each cluster/block of elements that the PFAlgo considered independently
        #this can be considered as the target output of an improved PFBlockAlgo
        y = data["element_block_id"]

        #List of candidates produced in the event.
        #This can be considered as the output of PFAlgo
        cands = data["candidates"]

        cand_block_id = data["candidate_block_id"]

    #tracks have type=1
    num_clusters += [np.sum(X[:, 0] != 1)]
    num_tracks += [np.sum(X[:, 0] == 1)]

    unique_blocks = np.unique(y)
    num_blocks += [len(unique_blocks)]
    num_cands += [len(cands)]

    #get the types of the elements for each cluster/block
    sgs = [tuple(get_types_in_block(X, y, blk)) for blk in unique_blocks]
    all_sgs += sgs

    blsize_candsize_matrices += [get_blocksize_candsize_matrix(y, cand_block_id)]

In [None]:
# Integer edges 0..20 used for tick positions and x-values in the plots below;
# these match the edges used inside get_blocksize_candsize_matrix.
bins = np.linspace(0,20,21)

In [None]:
plt.figure(figsize=(6,6))

# Sum the per-event matrices into one (elements in block) x (candidates) map
cmat = sum(blsize_candsize_matrices)
n_entries = np.sum(cmat)
color_norm = LogNorm(vmin=1, vmax=10*n_entries)
plt.imshow(cmat, norm=color_norm, origin="lower", interpolation=None)

plt.colorbar()
plt.xticks(bins);
plt.yticks(bins);

# rows of cmat (y axis) are block sizes, columns (x axis) are candidate counts
plt.title("Miniblock size to number of\nproduced PFCandidates")
plt.xlabel("number of candidates")
plt.ylabel("number of elements in block")

In [None]:
# cmat[i, j] counts blocks with i elements and j candidates, so the number of
# candidates coming from blocks of size i is sum_j j * cmat[i, j]. Summing
# cmat alone (as before) gives the fraction of *blocks*, not of candidates.
# Caveat: the last bin merges 19 and 20 candidates and is weighted as 19.
cands_per_blocksize = (cmat * bins[:-1]).sum(axis=1)
plt.plot(bins[:-1], cands_per_blocksize.cumsum()/cands_per_blocksize.sum(), marker="o")
plt.xticks(bins);
plt.xlabel("maximum block size")
plt.ylabel("fraction of candidates")
plt.xlim(0,3)

In [None]:
# cmat[i, j] counts blocks with i elements and j candidates, so the number of
# candidates coming from blocks of size i is sum_j j * cmat[i, j]. Summing
# cmat alone (as before) gives the fraction of *blocks*, not of candidates.
# Caveat: the last bin merges 19 and 20 candidates and is weighted as 19.
cands_per_blocksize = (cmat * bins[:-1]).sum(axis=1)
plt.plot(bins[:-1], cands_per_blocksize.cumsum()/cands_per_blocksize.sum(), marker="o")
plt.xticks(bins);
plt.xlabel("maximum block size")
plt.ylabel("fraction of candidates")
# zoom into the tail of the cumulative distribution
plt.ylim(0.9, 1.0)
plt.xlim(2,20)
plt.savefig("cand_blocksize.pdf", bbox_inches="tight")

In [None]:
# Per-event multiplicities, all drawn with the same binning
multiplicity_bins = np.linspace(0, 5000, 100)
multiplicities = [
    (num_clusters, "clusters"),
    (num_tracks, "tracks"),
    (num_blocks, "blocks"),
    (num_cands, "candidates"),
]
for per_event_counts, legend_label in multiplicities:
    plt.hist(per_event_counts, bins=multiplicity_bins, label=legend_label, histtype="step", lw=2)

plt.legend(frameon=False)
plt.xlabel("number of els/cands/blocks")
plt.ylabel("number of events")
plt.savefig("num_elems.pdf", bbox_inches="tight")

Now we look at the number of blocks of a certain size.

In [None]:
# Number of blocks for every block size (size = number of elements in the block)
block_sizes = Counter(map(len, all_sgs))
print("block sizes", block_sizes)

In [None]:
# Block-size distribution on a linear scale
block_lengths = list(map(len, all_sgs))
plt.hist(block_lengths, bins=np.linspace(0, 100, 101));
plt.xlabel("block size")
plt.ylabel("Number of blocks")

In [None]:
# Block-size distribution as an outline histogram with a logarithmic y axis
block_lengths = list(map(len, all_sgs))
plt.hist(block_lengths, bins=np.linspace(0, 100, 101), histtype="step", lw=2);
plt.yscale("log")
plt.xlabel("block size")
plt.ylabel("number of blocks")
plt.savefig("block_sizes.pdf", bbox_inches="tight")

Let's look at what the blocks of size 1, 2, 3 and 4 are made of.

In [None]:
def plot_block_nelem(blocks_nelem):
    """Draw a bar chart of a mapping ``label -> count`` on the current axes.

    One bar is drawn per item, in the mapping's iteration order, and the
    keys are used as x tick labels rotated by 90 degrees.
    """
    labels = list(blocks_nelem.keys())
    heights = np.array(list(blocks_nelem.values()))
    positions = np.arange(len(labels))

    plt.bar(positions, heights)
    plt.xticks(positions, labels, rotation=90)
    

# For each block size, count how often every combination of element types occurs
for blocksize in (1, 2, 3, 4):
    type_combos = [",".join(str(tp) for tp in sg) for sg in all_sgs if len(sg) == blocksize]
    blocks_nelem = Counter(type_combos)
    print("{0}-element blocks".format(blocksize), blocks_nelem)

    n_selected = len(type_combos)
    percent_of_all = 100.0*n_selected/len(all_sgs)

    plt.figure(figsize=(4,4))
    plt.title("Blocks of size {0}: {1} ({2:.0f}%)".format(blocksize, n_selected, percent_of_all))
    plot_block_nelem(blocks_nelem)
    plt.xlabel("Block element types")

Look at the 10 largest blocks of a single event.

In [None]:
# Read every array of the event into memory and close the archive right away;
# `data` is a plain dict with the same keys as the .npz file, so the cells
# below can keep indexing it as data["..."].
with np.load("../data/TTbar_run3/step3_ntuple_10_ev39.npz") as npz_file:
    data = {key: npz_file[key] for key in npz_file.files}

# load_npz accepts a path and manages the file handle itself
dm = scipy.sparse.load_npz("../data/TTbar_run3/step3_ntuple_10_dist39.npz").todense()

In [None]:
# (block id, number of elements) pairs, largest blocks first
largest_blocks = Counter(data["element_block_id"]).most_common()
largest_blocks[:10]

In [None]:
# Fetch every array once: when `data` is a lazily-read .npz archive, each
# data["..."] lookup reads the array again, which is costly inside the loops.
elements = data["elements"]
candidates = data["candidates"]
block_ids = data["element_block_id"]
cand_block_ids = data["candidate_block_id"]

inds_elem = np.arange(len(elements))
inds_cand = np.arange(len(candidates))
for blk, blksize in largest_blocks[:10]:
    # boolean masks selecting the candidates / elements that belong to this block
    candidates_from_block = cand_block_ids == blk
    elems_in_block = block_ids == blk
    tps = get_types_in_block(elements, block_ids, blk)
    print("in block", blk, "had the following elements: {0}".format(Counter(tps)))
    for ielem in inds_elem[elems_in_block]:
        print("  elements[{0}]: type={1} energy={2:.2f}".format(ielem, int(elements[ielem, 0]), elements[ielem, 1]))
    print("from which the following {0} candidates were produced".format(len(inds_cand[candidates_from_block])))
    for icand in inds_cand[candidates_from_block]:
        print("  candidates[{0}]: pdgid={1} pt={2:.2f}".format(icand, int(candidates[icand, 0]), candidates[icand, 1]))
    print()

# Scratchpad

In [None]:
# NONE=0,
# TRACK=1, 
# PS1=2, 
# PS2=3, 
# ECAL=4, 
# HCAL=5,
# GSF=6,
# BREM=7,
# HFEM=8,
# HFHAD=9,
# SC=10,
# HO=11

In [None]:
import networkx as nx
import pandas

In [None]:
def make_df(points_data, points_pos, points_to_elem, elems_block_id):
    """Build the per-point dataframe used by the 3D event display.

    Parameters
    ----------
    points_data : numpy.ndarray
        2D array with three columns that become ``id``, ``type`` and ``layer``;
        column 0 is also used as the dataframe index.
    points_pos : numpy.ndarray
        2D array whose columns 0, 1, 2 become ``pos_eta``, ``pos_phi``, ``energy``.
    points_to_elem : indexable
        Maps the positional point index to an index into ``elems_block_id``.
    elems_block_id : indexable
        Block id of every element.

    Returns
    -------
    pandas.DataFrame
        One row per point, with the block id, drawing attributes and the
        plotting coordinates ``pos_x``, ``pos_y``, ``pos_z``.
    """
    df = pandas.DataFrame(points_data.copy(),
        columns=["id", "type", "layer"],
        index=points_data[:, 0])

    df["block_id"] = [elems_block_id[points_to_elem[ip]] for ip in range(len(df))]
    df["pos_eta"] = np.array(points_pos[:, 0])
    df["pos_phi"] = np.array(points_pos[:, 1])
    df["energy"] = np.array(points_pos[:, 2])
    # Float from the start: 0.2 is written into this column below, and setting
    # a float into an integer column is a deprecated incompatible-dtype
    # assignment in pandas.
    df["size"] = 1.0
    df["symbol"] = "dot"
    df["color"] = df["type"]
    # spread the layers out: layer k is drawn at radius 1 + 2k
    df["layer"] = 1 + 2*df["layer"]

    # cylinder-like layout: eta along x, phi as the angle around the x axis
    df["pos_x"] = 2*df["pos_eta"]
    df["pos_y"] = df["layer"]*np.cos(df["pos_phi"])
    df["pos_z"] = df["layer"]*np.sin(df["pos_phi"])

    # points of type 1 get a smaller marker size
    df.loc[df["type"]==1, "size"] = 0.2
    return df

In [None]:
import itertools
def color_largest_blocks(block_ids, highlight_blocks):
    """Return one color name per entry of ``block_ids``.

    Every block listed in ``highlight_blocks`` is assigned a color from a
    fixed palette of eight colors, which repeats when there are more
    highlighted blocks than colors. Entries of ``block_ids`` that are not
    highlighted are colored ``"gray"``.
    """
    cols_to_take = itertools.cycle(["red", "green", "blue", "orange", "purple", "cyan", "yellow", "brown"])
    colmap = {t: next(cols_to_take) for t in highlight_blocks}
    # one dict lookup per entry instead of scanning highlight_blocks each time
    return [colmap.get(i, "gray") for i in block_ids]


def cluster_pfblockalgo(Nelem, distance_matrix):
    """Assign a block id to every element from the connected components of a graph.

    Two elements are linked when their entry in ``distance_matrix`` is
    non-zero; every connected component of the resulting graph becomes one
    block.

    Parameters
    ----------
    Nelem : int
        Number of elements, i.e. the length of the returned array.
    distance_matrix : dense 2D array or numpy.matrix
        Pairwise element distances. The input is not modified.

    Returns
    -------
    numpy.ndarray
        int32 array of length ``Nelem`` with the component index of each element.
    """
    # np.array() both copies the input and turns a numpy.matrix into an ndarray
    adjacency = np.array(distance_matrix)
    adjacency[adjacency>0] = 1
    # from_numpy_matrix was removed in networkx 3.0; from_numpy_array is the replacement
    g = nx.from_numpy_array(adjacency)

    block_id_aspf = np.zeros((Nelem, ), dtype=np.int32)
    for ibl, conn in enumerate(nx.connected_components(g)):
        block_id_aspf[np.array(list(conn), dtype=np.int32)] = ibl
    return block_id_aspf

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import colorlover as cl

def _make_point_trace(dfsel, mask, symbol, opacity, color, name):
    """Build one Scatter3d marker trace from the rows of ``dfsel`` selected by ``mask``."""
    return go.Scatter3d(
        x=dfsel.loc[mask, "pos_x"].values,
        y=dfsel.loc[mask, "pos_y"].values,
        z=dfsel.loc[mask, "pos_z"].values,
        mode="markers",
        marker={
            "symbol": symbol,
            "opacity": opacity,
            "size": 5,
            "color": color,
        },
        name=name,
    )


def draw_plot(dfsel, highlight_blocks, point_to_point_link, title, layers_to_plot=(1, 3, 5, 7), do_tracks=True):
    """Draw a 3D display of the points in ``dfsel`` and highlight selected blocks.

    Parameters
    ----------
    dfsel : pandas.DataFrame
        Needs the columns ``block_id``, ``layer``, ``type``, ``pos_x``,
        ``pos_y`` and ``pos_z``; the index holds the point ids referenced by
        ``point_to_point_link``.
    highlight_blocks : list-like
        Block ids that are drawn in color; all other points are faint gray.
    point_to_point_link : 2D array
        Columns 0 and 1 hold the ids of two points to connect with a line.
    title : str
        Figure title.
    layers_to_plot : list-like, optional
        Values of the ``layer`` column for which highlighted points are drawn.
        The gray background points are not filtered by layer.
    do_tracks : bool, optional
        Whether to add the connecting lines to the figure.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure, after it has been shown with ``fig.show()``.
    """
    # isin() also handles an empty highlight/layer list, where stacking the
    # per-value masks would fail
    msk_blocks = dfsel["block_id"].isin(highlight_blocks)
    msk_layers = dfsel["layer"].isin(layers_to_plot)

    # types 1 and 6 are drawn with the track marker
    trk = (dfsel["type"]==1) | (dfsel["type"]==6)

    sel_trk_blk = trk & msk_blocks & msk_layers
    sel_other_blk = (~trk) & msk_blocks & msk_layers

    points_trk_blk = _make_point_trace(
        dfsel, sel_trk_blk, "cross", 0.8,
        color_largest_blocks(dfsel.loc[sel_trk_blk, "block_id"], highlight_blocks),
        "track point in block")
    points_trk = _make_point_trace(
        dfsel, trk & ~msk_blocks, "cross", 0.05, "gray", "track point")
    points_other_blk = _make_point_trace(
        dfsel, sel_other_blk, "circle", 0.8,
        color_largest_blocks(dfsel.loc[sel_other_blk, "block_id"], highlight_blocks),
        "calo cluster in block")
    points_other = _make_point_trace(
        dfsel, ~trk & ~msk_blocks, "circle", 0.05, "gray", "calo cluster")

    # Line segments are encoded as [start, end, None] triplets; the None
    # separates consecutive segments.
    line_points_x = []
    line_points_y = []
    line_points_z = []

    highlighted = set(highlight_blocks)
    for ip in range(len(point_to_point_link)):
        p0 = point_to_point_link[ip, 0]
        p1 = point_to_point_link[ip, 1]
        # Check that both endpoints exist BEFORE looking them up; the lookup
        # would otherwise raise KeyError for links to points missing from dfsel.
        if p0 not in dfsel.index or p1 not in dfsel.index:
            continue
        if dfsel.loc[p0, "block_id"] in highlighted or dfsel.loc[p1, "block_id"] in highlighted:
            line_points_x += [dfsel.loc[p0, "pos_x"], dfsel.loc[p1, "pos_x"], None]
            line_points_y += [dfsel.loc[p0, "pos_y"], dfsel.loc[p1, "pos_y"], None]
            line_points_z += [dfsel.loc[p0, "pos_z"], dfsel.loc[p1, "pos_z"], None]

    tracks = go.Scatter3d(
        x=line_points_x,
        y=line_points_y,
        z=line_points_z,
        mode="lines",
        opacity=0.2,
        line={"color": "black"},
        name="track between layers")

    # background traces first so the highlighted points are drawn on top
    data=[
            points_trk,
            points_other,
            points_trk_blk,
            points_other_blk,
        ]

    if do_tracks:
        data += [tracks]
    fig = go.Figure(data=data)

    fig.update_layout(
        autosize=False,
        width=700,
        height=500,
        margin=go.layout.Margin(
            l=50,
            r=0,
            b=0,
            t=50,
        ),
        title=title,
        scene_camera={
            "eye": dict(x=0.8, y=0.8, z=0.8)
        }
    )

    fig.show()
    return fig

In [None]:
# Expand the event's elements into points (create_points comes from benchmark_solution)
# and attach the block ids stored in the file to every point.
points_data, points_pos, point_to_point_link, point_to_elem = create_points(data["elements"])
df = make_df(points_data, points_pos, point_to_elem, data["element_block_id"])

In [None]:
# (block id, number of points) pairs, largest blocks first
largest_blocks = Counter(df["block_id"]).most_common()
largest_blocks[:10]

In [None]:
# Highlight three blocks of the dataframe built from data["element_block_id"].
# NOTE(review): the ids 22, 189, 229 are hard-coded - presumably read off the
# largest_blocks listing for this event; re-check them if the input file changes.
fig = draw_plot(df, [22, 189, 229], point_to_point_link, "PFAlgo-based true blocks")
fig.write_image("blocks_true.pdf")

In [None]:
# Same points, but with block ids taken from the connected components of the distance matrix
pfblockalgo_block_ids = cluster_pfblockalgo(len(data["elements"]), dm)
df_pfalgo = make_df(points_data, points_pos, point_to_elem, pfblockalgo_block_ids)

# rank the blocks by their number of type-1 points, largest first
is_type1_point = df_pfalgo["type"]==1
largest_blocks = Counter(df_pfalgo["block_id"][is_type1_point]).most_common()
largest_blocks[:10]

In [None]:
# Blocks 0, 1, 2 of the distance-matrix clustering; highlighted points limited to layer value 1
fig = draw_plot(df_pfalgo, [0, 1, 2], point_to_point_link, "PFBlockAlgo-based blocks, tracker surface", [1])
fig.write_image("blocks_pfblockalgo_tracker.pdf")

In [None]:
# Same selection (layer value 1) without the connecting lines
fig = draw_plot(df_pfalgo, [0, 1, 2], point_to_point_link, "PFBlockAlgo-based blocks, tracker surface", [1], do_tracks=False)
fig.write_image("blocks_pfblockalgo_tracker_notracks.pdf")

In [None]:
# Blocks 0, 1, 2 with highlighted points limited to layer value 3
fig = draw_plot(df_pfalgo, [0, 1, 2], point_to_point_link, "PFBlockAlgo-based blocks, ECAL surface", [3])
fig.write_image("blocks_pfblockalgo_ecal.pdf")

In [None]:
# Same selection (layer value 3) without the connecting lines
fig = draw_plot(df_pfalgo, [0, 1, 2], point_to_point_link, "PFBlockAlgo-based blocks, ECAL surface", [3], do_tracks=False)
fig.write_image("blocks_pfblockalgo_ecal_notracks.pdf")

In [None]:
# Blocks 0, 1, 2 with highlighted points limited to layer value 5
fig = draw_plot(df_pfalgo, [0, 1, 2], point_to_point_link, "PFBlockAlgo-based blocks, HCAL surface", [5])
fig.write_image("blocks_pfblockalgo_hcal.pdf")

In [None]:
# Same selection (layer value 5) without the connecting lines
fig = draw_plot(df_pfalgo, [0, 1, 2], point_to_point_link, "PFBlockAlgo-based blocks, HCAL surface", [5], do_tracks=False)
fig.write_image("blocks_pfblockalgo_hcal_notracks.pdf")

In [None]:
# Blocks 0, 1, 2 on all default layers, with connecting lines
fig = draw_plot(df_pfalgo, [0, 1, 2], point_to_point_link, "PFBlockAlgo-based blocks")
fig.write_image("blocks_pfblockalgo.pdf")

In [None]:
# Highlight every block of the distance-matrix clustering; the color palette
# has eight entries and repeats across blocks.
fig = draw_plot(df_pfalgo, np.unique(df_pfalgo["block_id"]), point_to_point_link, "PFBlockAlgo-based blocks")
fig.write_image("blocks_pfblockalgo_all.pdf")

In [None]:
# Highlight every block of the block assignment stored in the file.
# Title fixed from "PFBAlgo" to "PFAlgo" to match the other true-blocks figure.
fig = draw_plot(df, np.unique(df["block_id"]), point_to_point_link, "PFAlgo-based true blocks")
fig.write_image("blocks_true_all.pdf")