In [1]:
import numpy as np
import torch
import torch.nn as nn
from nets import GraphNN_KNN
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, accuracy_score, average_precision_score
from torch_geometric.data import DataLoader
from preprocessing import preprocess_dataset
from utils import RunningAverageMeter, plot_aucs
from tqdm import tqdm


def predict_one_shower(shower, graph_embedder, edge_classifier):
    """Score every edge of one shower graph.

    Args:
        shower: graph object exposing ``y`` (per-node labels) and
            ``edge_index`` (row 0 = source node ids, row 1 = target node ids).
        graph_embedder: callable mapping ``shower`` to per-node embeddings.
        edge_classifier: callable mapping the concatenated endpoint
            embeddings of each edge to one score per edge.

    Returns:
        ``(edge_labels_true, edge_labels_predicted)``, both flattened to 1-D.
        The true label of an edge is whether both endpoints share the same
        ``y`` value.
    """
    node_embeddings = graph_embedder(shower)
    sources = shower.edge_index[0]
    targets = shower.edge_index[1]

    # An edge is "true" when it connects two nodes with the same label.
    same_label = shower.y[sources] == shower.y[targets]

    # One row per edge: [source embedding | target embedding].
    endpoint_pairs = torch.cat([node_embeddings[sources], node_embeddings[targets]], dim=1)
    scores = edge_classifier(endpoint_pairs)

    return same_label.view(-1), scores.view(-1)

In [2]:
# Run configuration.
datafile = './data/train_.pt'
epochs = 1000
learning_rate = 1e-3
dim_out = 144
device = torch.device('cpu')

# Load the showers and hold out a test split (fixed seed for repeatability).
showers = preprocess_dataset(datafile)
showers_train, showers_test = train_test_split(showers, random_state=1337)

train_loader = DataLoader(showers_train, batch_size=1, shuffle=True)
test_loader = DataLoader(showers_test, batch_size=1, shuffle=True)

# `k` is taken from the second dimension of the node feature matrix.
k = showers[0].x.shape[1]
print(k)

graph_embedder = GraphNN_KNN(dim_out=dim_out, k=k).to(device)

# The edge classifier consumes two concatenated node embeddings
# (hence dim_out * 2 inputs) and ends in a sigmoid, so it emits one
# value in (0, 1) per edge.
classifier_layers = [
    nn.Linear(dim_out * 2, 144),
    nn.Tanh(),
    nn.Linear(144, 144),
    nn.Tanh(),
    nn.Linear(144, 32),
    nn.Tanh(),
    nn.Linear(32, 1),
    nn.Sigmoid(),
]
edge_classifier = nn.Sequential(*classifier_layers).to(device)

10


In [3]:
# Restore the saved weights for both networks and switch them to eval mode.
for module, checkpoint_path in (
    (graph_embedder, 'graph_embedder.pt'),
    (edge_classifier, 'edge_classifier.pt'),
):
    module.load_state_dict(torch.load(checkpoint_path, map_location=device))
    module.eval()

# Show the classifier architecture as the cell output.
edge_classifier

Sequential(
  (0): Linear(in_features=288, out_features=144, bias=True)
  (1): Tanh()
  (2): Linear(in_features=144, out_features=144, bias=True)
  (3): Tanh()
  (4): Linear(in_features=144, out_features=32, bias=True)
  (5): Tanh()
  (6): Linear(in_features=32, out_features=1, bias=True)
  (7): Sigmoid()
)

In [4]:
import networkx as nx

def preprocess_torch_shower_to_nx(shower, scale=10000, threshold=0.5):
    """Convert one torch shower graph into a weighted ``networkx.DiGraph``.

    Nodes are numbered ``0 .. len(shower.y) - 1`` and carry the first five
    columns of ``shower.x`` as ``features`` (SX, SY, SZ, TX, TY) plus the
    node label as ``signal``. Edges are scored with ``predict_one_shower``;
    only edges whose predicted score is strictly greater than ``threshold``
    are kept, and each kept edge gets the weight ``-log(score)``.

    The function reads the notebook-level names ``device``,
    ``graph_embedder`` and ``edge_classifier``; they must be defined before
    it is called.

    Args:
        shower: graph object exposing ``x``, ``y``, ``edge_index`` and
            ``shower_data``.
        scale: currently unused; kept so existing callers keep working.
        threshold: minimum predicted edge score (exclusive) for an edge to
            be added to the graph.

    Returns:
        The ``nx.DiGraph``. ``G.graph['showers_data']`` holds one dict per
        unique label in ``shower.y`` with that shower's reference values.
    """
    y = shower.y.cpu().detach().numpy()
    x = shower.x.cpu().detach().numpy()

    # One summary record per distinct shower label, in np.unique (sorted) order.
    showers_data = []
    for shower_id in tqdm(np.unique(y)):
        # All rows of one shower are collapsed with unique(); the first row is used.
        shower_data = shower.shower_data[y==shower_id].unique(dim=0).detach().cpu().numpy()[0]
        showers_data.append(
            {
            'numtracks': shower_data[-2],
            'signal': shower_id,
            'ele_P': shower_data[0],
            'ele_SX': shower_data[1],
            'ele_SY': shower_data[2],
            'ele_SZ': shower_data[3],
            'ele_TX': shower_data[4],
            'ele_TY': shower_data[5]
            }
        )

    # The node id is simply the row index into x / y.
    nodes_to_add = [
        (
            node_id,
            {
                'features': {
                    'SX': x[node_id, 0],
                    'SY': x[node_id, 1],
                    'SZ': x[node_id, 2],
                    'TX': x[node_id, 3],
                    'TY': x[node_id, 4],
                },
                'signal': y[node_id]
            }
        )
        for node_id in range(len(y))
    ]

    # Pure inference: skip autograd bookkeeping for the (potentially very
    # large) edge tensor.
    with torch.no_grad():
        _, weights = predict_one_shower(shower.to(device), graph_embedder=graph_embedder, edge_classifier=edge_classifier)
    weights = weights.detach().cpu().numpy()
    edge_index = shower.edge_index.t().detach().cpu().numpy()

    # Build the mask once and apply it to both arrays so they stay aligned.
    keep = weights > threshold
    edge_index = edge_index[keep]
    # A higher score gives a smaller -log weight.
    weights = -np.log(weights[keep])
    print(len(weights))
    edges_to_add = [(p0, p1, weight) for (p0, p1), weight in zip(edge_index, weights)]

    G = nx.DiGraph()
    G.add_nodes_from(nodes_to_add)
    G.add_weighted_edges_from(edges_to_add)

    G.graph['showers_data'] = showers_data
    return G

In [26]:
# Build the directed graph for the first shower, keeping only edges whose
# predicted score exceeds 0.95.
G = preprocess_torch_shower_to_nx(showers[0], threshold=0.95)

100%|██████████| 200/200 [00:00<00:00, 284.81it/s]


557882


In [27]:
from hdbscan import run_hdbscan_on_brick, run_hdbscan

In [29]:
%%time
# Cluster the filtered graph G; the call returns a graph, the clusters and roots.
# NOTE(review): min_cl / cl_size are presumably minimum cluster sizes — confirm
# against run_hdbscan_on_brick.
graphx, clusters, roots = run_hdbscan_on_brick(G, min_cl=40, cl_size=40)

277, 3447, 27746, 448, 7858, 37931, 572, 925, 659, 1529, 2350, 478, 50, 

100%|██████████| 13/13 [00:53<00:00,  4.10s/it]

CPU times: user 59.5 s, sys: 1.59 s, total: 1min 1s
Wall time: 1min





In [11]:
# Track-level counters.
selected_tracks, total_tracks = 0, 0

# Shower-level counters, one per outcome category.
number_of_lost_showers = 0
number_of_broken_showers = 0
number_of_stucked_showers = 0
number_of_good_showers = 0
number_of_survived_showers = 0
total_number_of_showers = 0
second_to_first_ratios = []

# Estimated ("raw") versus reference ("true") values, appended in matching
# order. Every list is a distinct object.
E_raw, E_true = [], []
x_raw, x_true = [], []
y_raw, y_true = [], []
z_raw, z_true = [], []
tx_raw, tx_true = [], []
ty_raw, ty_true = [], []

In [12]:
import clustering_metrics
from clustering_metrics import class_disbalance_graphx, class_disbalance_graphx__
from clustering_metrics import estimate_e, estimate_start_xyz, estimate_txty

In [13]:
# Classify every shower as lost / stuck / broken / good from the clusters
# it was assigned to, and collect estimates for the good ones.
#
# The inputs are built directly from `graphx` and `clusters` (the outputs of
# run_hdbscan_on_brick), so this cell runs top-to-bottom on a fresh kernel
# without needing a `clusterized_brick` dict to exist already.
showers_data = graphx.graph['showers_data']
clusters = [nx.DiGraph(graphx.subgraph(cluster.nodes)) for cluster in clusters]

# Look showers up by their label instead of by list position, so the
# mapping stays correct even if labels are not exactly 0..N-1.
showers_by_signal = {shower_data['signal']: shower_data for shower_data in showers_data}

for shower_data in showers_data:
    shower_data['clusters'] = []

for cluster in clusters:
    # Compute the per-label counts once; they are both printed and consumed.
    label_counts = class_disbalance_graphx(cluster)
    print(label_counts)
    selected_tracks += len(cluster)
    for label, label_count in label_counts:
        matched_shower = showers_by_signal[label]
        # A cluster is attributed to a shower when it holds at least 10%
        # of that shower's tracks.
        if label_count / matched_shower['numtracks'] >= 0.1:
            matched_shower['clusters'].append(cluster)

for shower_data in showers_data:
    total_tracks += shower_data['numtracks']

for shower_data in showers_data:
    total_number_of_showers += 1

    # Number of this shower's own tracks in each attributed cluster.
    signals_per_cluster = []
    idx_cluster = []
    for i, cluster in enumerate(shower_data['clusters']):
        labels, counts = class_disbalance_graphx__(cluster)
        signals_per_cluster.append(counts[labels==shower_data['signal']][0])
        idx_cluster.append(i)
    signals_per_cluster = np.array(signals_per_cluster)
    idx_cluster = np.array(idx_cluster)
    second_to_first_ratio = 0.

    # No cluster was attributed to this shower.
    if len(signals_per_cluster) == 0:
        number_of_lost_showers += 1
        continue
    if len(signals_per_cluster) == 1:
        second_to_first_ratio = 0.
        second_to_first_ratios.append(second_to_first_ratio)
    else:
        # Ratio of the second-largest to the largest signal count.
        second_to_first_ratio = np.sort(signals_per_cluster)[-2] / signals_per_cluster.max()
        second_to_first_ratios.append(second_to_first_ratio)

    # The cluster holding most of this shower's tracks.
    cluster = shower_data['clusters'][np.argmax(signals_per_cluster)]

    # not enough signal
    # NOTE(review): showers skipped here are not counted in any category.
    if (signals_per_cluster.max() / shower_data['numtracks']) <= 0.1:
        continue


    labels, counts = class_disbalance_graphx__(cluster)
    counts = counts/ counts.sum()
    # high contamination
    if counts[labels==shower_data['signal']] < 0.9:
        number_of_stucked_showers += 1
        continue

    # The shower is split across clusters: the runner-up is too large.
    if second_to_first_ratio > 0.3:
        number_of_broken_showers += 1
        continue


    ## good showers next
    number_of_good_showers += 1
    # E
    E_raw.append(estimate_e(cluster))
    E_true.append(shower_data['ele_P'])

    # x, y, z
    x, y, z = estimate_start_xyz(cluster)

    x_raw.append(x)
    x_true.append(shower_data['ele_SX'])

    y_raw.append(y)
    y_true.append(shower_data['ele_SY'])

    z_raw.append(z)
    z_true.append(shower_data['ele_SZ'])

    # tx, ty
    tx, ty = estimate_txty(cluster)

    tx_raw.append(tx)
    tx_true.append(shower_data['ele_TX'])

    ty_raw.append(ty)
    ty_true.append(shower_data['ele_TY'])

[(33, 21), (51, 1), (72, 2), (146, 1), (169, 20)]
[(38, 2), (198, 147)]
[(38, 135)]
[(38, 169), (54, 3)]
[(54, 188)]
[(30, 109), (179, 1)]
[(30, 70)]
[(30, 47)]
[(30, 100)]
[(88, 97)]
[(88, 59)]
[(158, 856), (185, 12)]
[(158, 188)]
[(179, 42)]
[(79, 45), (179, 9)]
[(4, 1), (136, 300)]
[(136, 50)]
[(27, 2), (29, 207)]
[(29, 111)]
[(27, 101), (136, 1)]
[(156, 98)]
[(156, 48)]
[(56, 45)]
[(56, 225), (156, 43)]
[(156, 45)]
[(56, 274)]
[(4, 18), (27, 3), (95, 84)]
[(57, 399)]
[(78, 62)]
[(20, 1), (78, 56)]
[(20, 1), (57, 1), (78, 408)]
[(20, 50)]
[(20, 350), (78, 1)]
[(102, 49), (113, 1)]
[(21, 1), (102, 275), (113, 1)]
[(113, 65)]
[(86, 12), (113, 187)]
[(5, 1), (41, 1), (90, 4), (105, 54), (154, 3)]
[(94, 1), (98, 1), (110, 38), (132, 1), (135, 3), (159, 2), (162, 1)]
[(110, 81)]
[(110, 77)]
[(119, 2), (174, 81)]
[(61, 1), (119, 91)]
[(61, 122)]
[(138, 229), (146, 89)]
[(138, 274)]
[(149, 84)]
[(149, 126)]
[(183, 253)]
[(183, 71)]
[(183, 67)]
[(48, 1), (64, 132)]
[(5, 6), (45, 1), (108, 4

In [14]:
# Materialise each cluster as its own DiGraph, copied from the matching
# subgraph of `graphx`.
clusters_graphx = [
    nx.DiGraph(graphx.subgraph(cluster.nodes))
    for cluster in clusters
]

# Bundle the full graph together with its per-cluster graphs.
clusterized_brick = {
    'graphx': graphx,
    'clusters': clusters_graphx,
}

[-56340.19,
 -52103.78,
 -53948.31,
 -50816.78,
 -10858.092,
 37482.484,
 -40363.09,
 -3307.6392,
 38430.785,
 -17340.582,
 -50519.4,
 -56805.7,
 -30865.367,
 -15550.672,
 -13486.158,
 17137.67,
 -50344.816,
 41393.254,
 4120.989,
 -56111.688,
 48026.2,
 -58652.836,
 -4838.3823,
 -8488.671,
 5839.2354,
 -45456.367,
 -24487.312,
 22714.348,
 -21271.049,
 39952.37,
 50552.15,
 49212.258,
 36894.703,
 30887.422,
 -15408.68,
 -31029.953,
 -38405.285,
 4700.473,
 -24424.361,
 17417.31,
 -52662.848,
 -57989.617,
 44670.613,
 -53979.97,
 1920.8695,
 42249.656,
 52997.297,
 -54759.652,
 -31839.447,
 -47545.6,
 51518.312,
 38760.19,
 39784.395,
 -5751.718,
 16914.59,
 36067.9,
 -61530.715,
 -24448.842,
 -58205.395,
 -28625.111,
 -24512.215,
 52105.047,
 23771.021,
 -38131.844,
 33711.223,
 -15731.653,
 2230.3223,
 -18483.875,
 -17183.172,
 24293.406,
 34258.227,
 1010.6779,
 31714.963,
 27174.168,
 -5509.5327,
 -32324.094,
 -30632.377,
 -7645.224,
 31726.867,
 -49366.715,
 25117.174,
 -8466.343

In [15]:
# Inspect the x values returned by estimate_start_xyz for the good showers.
x_raw

[-5.638329029083252,
 -5.2319655418396,
 -5.413792610168457,
 -5.078458786010742,
 -1.5044629573822021,
 3.7503678798675537,
 -3.8641765117645264,
 -0.5732431411743164,
 3.9296350479125977,
 -1.6163904666900635,
 -5.182336807250977,
 -5.395615577697754,
 -3.1370179653167725,
 -1.5616650581359863,
 -1.1753793954849243,
 1.7447922229766846,
 -5.064754486083984,
 4.138876438140869,
 0.44147762656211853,
 -5.600687503814697,
 4.737910270690918,
 -5.881694316864014,
 -0.6137437224388123,
 -0.9679712653160095,
 0.596903920173645,
 -4.6352219581604,
 -2.467719554901123,
 2.5289664268493652,
 -2.1215269565582275,
 3.994237184524536,
 5.079256057739258,
 5.216305255889893,
 3.707195281982422,
 2.86393666267395,
 -1.53508722782135,
 -2.9925460815429688,
 -3.6421852111816406,
 0.4161340892314911,
 -2.4413223266601562,
 1.750068187713623,
 -5.1368536949157715,
 -5.7268757820129395,
 4.309061050415039,
 -5.383358955383301,
 -0.15963217616081238,
 4.122994422912598,
 5.282889366149902,
 -5.772327899

In [16]:
# Turn the accumulated Python lists into NumPy arrays for vectorised
# arithmetic. The names are rebound, so they are no longer lists afterwards.
E_raw, E_true = np.array(E_raw), np.array(E_true)

x_raw, x_true = np.array(x_raw), np.array(x_true)
y_raw, y_true = np.array(y_raw), np.array(y_true)
z_raw, z_true = np.array(z_raw), np.array(z_true)

tx_raw, tx_true = np.array(tx_raw), np.array(tx_true)
ty_raw, ty_true = np.array(ty_raw), np.array(ty_true)

In [17]:
# Report each outcome category as a fraction of all showers.
for fraction_template, shower_count in (
    ('Good showers = {}', number_of_good_showers),
    ('Stuck showers = {}', number_of_stucked_showers),
    ('Broken showers = {}', number_of_broken_showers),
    ('Lost showers = {}', number_of_lost_showers),
):
    print(fraction_template.format(shower_count / total_number_of_showers))

Good showers = 0.505
Stuck showers = 0.155
Broken showers = 0.245
Lost showers = 0.095


In [18]:
from sklearn.linear_model import TheilSenRegressor, LinearRegression, HuberRegressor

In [19]:
# Calibrate the raw energy estimate against the reference energy with a
# Huber regression, weighting each sample by 1 / E_true.
energy_column = E_raw.reshape((-1, 1))  # single feature column

r = HuberRegressor()
r.fit(X=energy_column, y=E_true, sample_weight=1/E_true)

E_pred = r.predict(energy_column)

In [20]:
# Inspect x_raw again, now that it has been converted to a NumPy array.
x_raw

array([-5.63832903e+00, -5.23196554e+00, -5.41379261e+00, -5.07845879e+00,
       -1.50446296e+00,  3.75036788e+00, -3.86417651e+00, -5.73243141e-01,
        3.92963505e+00, -1.61639047e+00, -5.18233681e+00, -5.39561558e+00,
       -3.13701797e+00, -1.56166506e+00, -1.17537940e+00,  1.74479222e+00,
       -5.06475449e+00,  4.13887644e+00,  4.41477627e-01, -5.60068750e+00,
        4.73791027e+00, -5.88169432e+00, -6.13743722e-01, -9.67971265e-01,
        5.96903920e-01, -4.63522196e+00, -2.46771955e+00,  2.52896643e+00,
       -2.12152696e+00,  3.99423718e+00,  5.07925606e+00,  5.21630526e+00,
        3.70719528e+00,  2.86393666e+00, -1.53508723e+00, -2.99254608e+00,
       -3.64218521e+00,  4.16134089e-01, -2.44132233e+00,  1.75006819e+00,
       -5.13685369e+00, -5.72687578e+00,  4.30906105e+00, -5.38335896e+00,
       -1.59632176e-01,  4.12299442e+00,  5.28288937e+00, -5.77232790e+00,
       -3.23015833e+00, -4.75450993e+00,  5.13356304e+00,  3.80587697e+00,
        3.99568748e+00, -

In [22]:
# Factor applied to the *_raw coordinates before comparing them with the
# *_true values in the MAE report.
# NOTE(review): the name suggests a millimetre conversion — confirm the units.
scale_mm = 10000

In [23]:
# Energy resolution: spread of the relative error of the calibrated energy.
relative_energy_error = (E_true - E_pred) / E_true
print('Energy resolution = {}'.format(np.std(relative_energy_error)))
print()
print('Track efficiency = {}'.format(selected_tracks / total_tracks))
print()

# Outcome categories as fractions of all showers.
for fraction_template, shower_count in (
    ('Good showers = {}', number_of_good_showers),
    ('Stuck showers = {}', number_of_stucked_showers),
    ('Broken showers = {}', number_of_broken_showers),
    ('Lost showers = {}', number_of_lost_showers),
):
    print(fraction_template.format(shower_count / total_number_of_showers))
print()

# Position errors: the estimate is multiplied by scale_mm before subtracting
# the reference, and the difference is divided by scale_mm again.
for mae_template, estimated, reference in (
    ('MAE for x = {}', x_raw, x_true),
    ('MAE for y = {}', y_raw, y_true),
    ('MAE for z = {}', z_raw, z_true),
):
    print(mae_template.format(np.abs((estimated * scale_mm - reference) / scale_mm).mean()))
print()

# Direction errors are compared without any rescaling.
for mae_template, estimated, reference in (
    ('MAE for tx = {}', tx_raw, tx_true),
    ('MAE for ty = {}', ty_raw, ty_true),
):
    print(mae_template.format(np.abs((estimated - reference)).mean()))

Energy resolution = 0.6370955555006261

Track efficiency = 0.4299886457671805

Good showers = 0.505
Stuck showers = 0.155
Broken showers = 0.245
Lost showers = 0.095

MAE for x = 0.09472677859173591
MAE for y = 0.10557550539805156
MAE for z = 1998.9633063924134

MAE for tx = 0.036689677002752705
MAE for ty = 0.12443466945382688
