In [391]:
import comet_ml
import pandas as pd
#from comet_ml import Experiment
import numpy as np
from collections import deque
from collections import defaultdict
import torch
import torch.nn as nn
import click
from nets import GraphNN_KNN_v1_v1, EdgeClassifier_v3
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, accuracy_score, average_precision_score
from torch_geometric.data import DataLoader
from preprocessing1 import preprocess_dataset
from utils import RunningAverageMeter, plot_aucs
from tqdm import tqdm
import networkx as nx
from hdbscan_ import run_hdbscan_on_brick, run_hdbscan
import clustering_metrics
from st_library_clustering_metrics import class_disbalance, class_disbalance__
from st_library_clustering_metrics import estimate_start_xyz, estimate_txty
from sklearn.linear_model import TheilSenRegressor, LinearRegression, HuberRegressor, RANSACRegressor
from sklearn.model_selection import cross_val_predict

from random import seed
from random import randrange

from sklearn.cluster import MiniBatchKMeans
import hdbscan

from time import time

In [5]:
def predict_one_shower_(shower, graph_embedder, edge_classifier):
    embeddings = graph_embedder(shower)
    edge_labels_true = (shower.y[shower.edge_index[0]] == shower.y[shower.edge_index[1]]).view(-1)
    edge_data = torch.cat([
        embeddings[shower.edge_index[0]],
        embeddings[shower.edge_index[1]]
    ], dim=1)
    
    print(len(edge_data[0]))
    for layer in edge_classifier._layers[:-4]:
        print(layer)
        edge_data = layer(edge_data)
        print(len(edge_data[0]))
    
    
    #edge_labels_predicted = edge_classifier(edge_data).view(-1)

    return edge_labels_true, edge_data.view(-1)



In [9]:
def predict_one_shower(shower, graph_embedder, edge_classifier):
    embeddings = graph_embedder(shower)
    edge_labels_true = (shower.y[shower.edge_index[0]] == shower.y[shower.edge_index[1]]).view(-1)
    edge_data = torch.cat([
        embeddings[shower.edge_index[0]],
        embeddings[shower.edge_index[1]]
    ], dim=1)
    edge_labels_predicted = edge_classifier(edge_data).view(-1)

    return edge_labels_true, edge_labels_predicted

In [210]:
def preprocess_torch_shower_to_nx(shower, graph_embedder, edge_classifier, threshold=0.5):
    node_id = 0
    G = nx.DiGraph()
    nodes_to_add = []
    showers_data = []
    y = shower.y.cpu().detach().numpy()
    x = shower.x.cpu().detach().numpy()
    y_torch = shower.y
    for shower_id in tqdm(np.unique(y)):
        shower_data = shower.shower_data[y_torch == shower_id].unique(dim=0).detach().cpu().numpy()[0]
        showers_data.append(
            {
                'numtracks': shower_data[-2],
                'signal': shower_id,
                'ele_P': shower_data[0],
                'ele_SX': shower_data[1],
                'ele_SY': shower_data[2],
                'ele_SZ': shower_data[3],
                'ele_TX': shower_data[4],
                'ele_TY': shower_data[5]
            }
        )
    print(len(showers_data))
    for k in range(len(y)):
        nodes_to_add.append(
            (
                node_id,
                {
                    'features': {
                        'SX': x[k, 0],
                        'SY': x[k, 1],
                        'SZ': x[k, 2],
                        'TX': x[k, 3],
                        'TY': x[k, 4],
                    },
                    'signal': y[k]
                }
            )
        )
        node_id += 1

    edges_to_add = []
    _, weights = predict_one_shower(shower, graph_embedder=graph_embedder, edge_classifier=edge_classifier)
    weights = weights.detach().cpu().numpy()
    
    edge_index = shower.edge_index.t().detach().cpu().numpy()
    edge_index = edge_index[weights > threshold]
    weights = weights[weights > threshold]
    weights = -np.log(weights) # TODO: which transformation to use?
    print(len(weights))
    for k, (p0, p1) in enumerate(edge_index):
        edges_to_add.append((p0, p1, weights[k]))

    G.add_nodes_from(nodes_to_add)
    G.add_weighted_edges_from(edges_to_add)

    G.graph['showers_data'] = showers_data
    return G



In [384]:
def calc_clustering_metrics(clusterized_brick):
    selected_tracks = 0
    total_tracks = 0

    number_of_lost_showers = 0
    number_of_broken_showers = 0
    number_of_stucked_showers = 0
    total_number_of_showers = 0
    number_of_good_showers = 0
    number_of_survived_showers = 0
    second_to_first_ratios = []

    E_raw = deque()
    E_true = deque()

    x_raw = deque()
    x_true = deque()

    y_raw = deque()
    y_true = deque()

    z_raw = deque()
    z_true = deque()

    tx_raw = deque()
    tx_true = deque()

    ty_raw = deque()
    ty_true = deque()
    
    showers_data = G.graph['showers_data']
    for shower_data in showers_data:
            shower_data['clusters'] = []

    
    for i in range(len(clusters)):
        cluster = clusters[i]
        selected_tracks += len(cluster)
        for label, label_count in class_disbalance(cluster):
                if label_count/showers_data[label]['numtracks'] >= 0.1:
                    showers_data[label]['clusters'].append(cluster)
     

    for i, shower_data in enumerate(showers_data):
        total_tracks += shower_data['numtracks']
        

    for shower_data in showers_data:
        total_number_of_showers += 1

        signals_per_cluster = []
        idx_cluster = []
        for i, cluster in enumerate(shower_data['clusters']):
            labels, counts = class_disbalance__(cluster)
            signals_per_cluster.append(counts[labels == shower_data['signal']][0])
            idx_cluster.append(i)
            
        signals_per_cluster = np.array(signals_per_cluster)
        idx_cluster = np.array(idx_cluster)
        second_to_first_ratio = 0.

        if len(signals_per_cluster) == 0:
            number_of_lost_showers += 1
            continue
        if len(signals_per_cluster) == 1:
            second_to_first_ratio = 0.
            second_to_first_ratios.append(second_to_first_ratio)
        else:
            second_to_first_ratio = np.sort(signals_per_cluster)[-2] / signals_per_cluster.max()
            second_to_first_ratios.append(second_to_first_ratio)

        cluster = shower_data['clusters'][np.argmax(signals_per_cluster)]

            # not enough signal
        if (signals_per_cluster.max() / shower_data['numtracks']) <= 0.1:
            continue

        labels, counts = class_disbalance__(cluster)

        counts = counts / counts.sum()
       
            # high contamination
        if counts[labels == shower_data['signal']] < 0.9:
            number_of_stucked_showers += 1
            continue

        if second_to_first_ratio > 0.3:
            number_of_broken_showers += 1
            continue

        # for good showers
        number_of_good_showers += 1
        
        # E
        E_raw.append(len(cluster))
        E_true.append(shower_data['ele_P'])

            # x, y, z
        x, y, z = estimate_start_xyz(cluster)

        x_raw.append(x)
        x_true.append(shower_data['ele_SX'])

        y_raw.append(y)
        y_true.append(shower_data['ele_SY'])

        z_raw.append(z)
        z_true.append(shower_data['ele_SZ'])

            # tx, ty
        tx, ty = estimate_txty(cluster)

        tx_raw.append(tx)
        tx_true.append(shower_data['ele_TX'])

        ty_raw.append(ty)
        ty_true.append(shower_data['ele_TY'])

    E_raw = np.array(E_raw)
    E_true = np.array(E_true)

    x_raw = np.array(x_raw)
    x_true = np.array(x_true)

    y_raw = np.array(y_raw)
    y_true = np.array(y_true)

    z_raw = np.array(z_raw)
    z_true = np.array(z_true)

    tx_raw = np.array(tx_raw)
    tx_true = np.array(tx_true)

    ty_raw = np.array(ty_raw)
    ty_true = np.array(ty_true)


    # Split a dataset into k folds
    def cross_validation_split(dataset, folds=2):
        dataset_split = list()
        dataset_copy = list(dataset)
        fold_size = int(len(dataset) / folds)
        for i in range(folds):
            fold = list()
            while len(fold) < fold_size:
                index = randrange(len(dataset_copy))
                fold.append(dataset_copy.pop(index))
            dataset_split.append(fold)
        return dataset_split

    # test cross validation split
    seed(1)
    dataset = E_raw.reshape((-1, 1))
    print(dataset.shape)
    folds = cross_validation_split(dataset, 2)
    y = cross_validation_split(E_true, 2)
    print(len(folds))
    
    len_X = 0
    E_pred = []
    for i in range(len(folds)):
        folds_ = folds.copy()
        y_ = y.copy()
        
        r = HuberRegressor()
        X = folds_.pop(i)
        Y = y_.pop(i)
        
        len_X+=len(X)      
        
        
        folds_new = np.array([item for sublist in folds_ for item in sublist])
        y_new = np.array([item for sublist in y_ for item in sublist])

        r.fit(folds_new, y_new, sample_weight=1/(y_new)**6)
        
        Y_pred = r.predict(X)              
        E_pred.append(Y_pred)

   
    E_pred = np.array(E_pred).reshape((-1, 1))
    E_true = np.array(y).reshape((-1, 1)) 
    scale_mm = 10000
    Energy_resolution = np.std((E_true - E_pred) / E_true)
    print('Energy_resolution:', Energy_resolution)
    Track_efficiency = selected_tracks / total_tracks
    Good_showers = number_of_good_showers / total_number_of_showers
    Stuck_showers = number_of_stucked_showers / total_number_of_showers
    Broken_showers = number_of_broken_showers / total_number_of_showers
    Lost_showers =number_of_lost_showers / total_number_of_showers
    MAE_x = np.abs((x_raw * scale_mm - x_true) / scale_mm).mean()
    MAE_y = np.abs((y_raw * scale_mm - y_true) / scale_mm).mean()
    MAE_z = np.abs((z_raw * scale_mm - z_true) / scale_mm).mean()
    MAE_tx = np.abs((tx_raw - tx_true)).mean()
    MAE_ty = np.abs((ty_raw - ty_true)).mean()
    
    return Energy_resolution,Track_efficiency,Good_showers,Stuck_showers,Broken_showers,Lost_showers,MAE_x,MAE_y,MAE_z,MAE_tx,MAE_ty

## Preprocessing

In [2]:
datafile='./data/train.pt'

project_name='em_showers_clustering'
work_space='ketrint'

#experiment = Experiment('6O55PoJt4tkp9LyupIE86eikH', project_name=project_name, workspace=work_space)
device = torch.device('cpu')
showers = preprocess_dataset(datafile)

k = showers[0].x.shape[1]
print(k)

10


In [3]:
dim_out=10
threshold =0.9

In [4]:
graph_embedder = GraphNN_KNN_v1_v1(dim_out=dim_out, k=k).to(device)
edge_classifier = EdgeClassifier_v3(dim_out=dim_out)

graph_embedder.load_state_dict(torch.load('graph_embedder_v1_v1v3.pt', map_location=device))
graph_embedder.eval()
edge_classifier.load_state_dict(torch.load('edge_classifier_v1_v1v3.pt', map_location=device))
edge_classifier.eval()

EdgeClassifier_v3(
  (_layers): ModuleList(
    (0): Linear(in_features=20, out_features=30, bias=True)
    (1): Tanh()
    (2): Dropout(p=0.3)
    (3): Linear(in_features=30, out_features=30, bias=True)
    (4): Tanh()
    (5): Dropout(p=0.3)
    (6): Linear(in_features=30, out_features=10, bias=True)
    (7): Tanh()
    (8): Dropout(p=0.3)
    (9): Linear(in_features=10, out_features=1, bias=True)
    (10): Sigmoid()
  )
)

In [211]:
G = preprocess_torch_shower_to_nx(showers[10],
                                            graph_embedder=graph_embedder,
                                            edge_classifier=edge_classifier,
                                            threshold=threshold)

100%|██████████| 200/200 [00:00<00:00, 210.42it/s]


200
867869


In [212]:
data = showers[10].x.detach().numpy()
y = showers[10].y.detach().numpy().reshape(-1,1)
data.shape

(112305, 10)

In [213]:
node_id = np.arange(data.shape[0]).reshape(-1,1)
node_id.shape

(112305, 1)

## Clusterization

In [37]:
kmeans = MiniBatchKMeans(n_clusters=200, random_state=0).fit(data)

In [58]:
labels = kmeans.predict(data).reshape(-1,1)

In [214]:
whole_data = np.hstack((node_id, data[:,:5],labels, y))
whole_data.shape

(112305, 8)

In [215]:
clusters = defaultdict(list)
for node_id, SX, SY, SZ, TX, TY, key, y in whole_data:

    clusters[key].append(
            
                {    
                    'features': {
                        'SX': SX,
                        'SY': SY,
                        'SZ': SZ,
                        'TX': TX,
                        'TY': TY,
                    },
                    'signal': int(y),
                    'node_id': int(node_id),
                }
        
    )

-----

In [389]:
Energy_resolutions = []
Track_efficiencys = []
Good_showerss = []
Stuck_showerss = []
Broken_showerss = [] 
Lost_showerss = []
MAE_xs = []
MAE_ys = []
MAE_zs = []
MAE_txs = []
MAE_tys = []


for i in range(40):
    
    brick = showers[i]
    
    G = preprocess_torch_shower_to_nx(brick,
                                            graph_embedder=graph_embedder,
                                            edge_classifier=edge_classifier,
                                            threshold=threshold)
    
    
    data = brick.x.detach().numpy()
    y = brick.y.detach().numpy().reshape(-1,1)
    node_id = np.arange(data.shape[0]).reshape(-1,1)
    
    #clusterization    
    kmeans = MiniBatchKMeans(n_clusters=200, random_state=0).fit(data)
    labels = kmeans.predict(data).reshape(-1,1)
    
    #data
    whole_data = np.hstack((node_id, data[:,:5],labels, y))
    
    #add data to clusters information
    clusterized_brick = defaultdict(list)
    
    for node_id, SX, SY, SZ, TX, TY, key, y in whole_data:

        clusterized_brick[key].append(

                    {    
                        'features': {
                            'SX': SX,
                            'SY': SY,
                            'SZ': SZ,
                            'TX': TX,
                            'TY': TY,
                        },
                        'signal': int(y),
                        'node_id': int(node_id),
                    }

        )

    
    print('Brick N:', i)

    Energy_resolution,Track_efficiency,Good_showers,Stuck_showers,Broken_showers,Lost_showers,MAE_x,MAE_y,MAE_z,MAE_tx,MAE_ty = calc_clustering_metrics(clusterized_brick)
    
    Energy_resolutions.append(Energy_resolution)
    Track_efficiencys.append(Track_efficiency)
    Good_showerss.append(Good_showers)
    Stuck_showerss.append(Stuck_showers)
    Broken_showerss.append(Broken_showers)
    Lost_showerss.append(Lost_showers)
    MAE_xs.append(MAE_x)
    MAE_ys.append(MAE_y)
    MAE_zs.append(MAE_z)
    MAE_txs.append(MAE_tx)
    MAE_tys.append(MAE_ty)
    
    

100%|██████████| 200/200 [00:00<00:00, 230.73it/s]


200
797595
Brick N: 0


 11%|█         | 22/200 [00:00<00:00, 219.89it/s]

(2, 1)
2
len(X) 1
folds_new 1 [[368]]
y_new 1 [1.0937959]
0.0
len(X) 1
folds_new 1 [[428]]
y_new 1 [1.812693]
0.0
2
(2, 1)
(2, 1)
Energy_resolution: 0.36156797800514756


100%|██████████| 200/200 [00:00<00:00, 217.84it/s]


200
835667
Brick N: 1


 12%|█▎        | 25/200 [00:00<00:00, 247.24it/s]

(2, 1)
2
len(X) 1
folds_new 1 [[368]]
y_new 1 [11.62014]
0.0
len(X) 1
folds_new 1 [[428]]
y_new 1 [13.702604]
0.0
2
(2, 1)
(2, 1)
Energy_resolution: 0.012878458651186007


100%|██████████| 200/200 [00:00<00:00, 231.56it/s]


200
812690
Brick N: 2


 12%|█▏        | 23/200 [00:00<00:00, 229.11it/s]

(2, 1)
2
len(X) 1
folds_new 1 [[368]]
y_new 1 [3.6352756]
0.0
len(X) 1
folds_new 1 [[428]]
y_new 1 [3.3270884]
0.0
2
(2, 1)
(2, 1)
Energy_resolution: 0.24192488371714027


100%|██████████| 200/200 [00:00<00:00, 250.51it/s]


200
715753
Brick N: 3


 10%|▉         | 19/200 [00:00<00:00, 186.93it/s]

(2, 1)
2
len(X) 1
folds_new 1 [[368]]
y_new 1 [8.396688]
0.0
len(X) 1
folds_new 1 [[428]]
y_new 1 [5.421699]
0.0
2
(2, 1)
(2, 1)
Energy_resolution: 0.6227900303158156


100%|██████████| 200/200 [00:00<00:00, 204.76it/s]


200
881207
Brick N: 4


 10%|█         | 21/200 [00:00<00:00, 209.46it/s]

(2, 1)
2
len(X) 1
folds_new 1 [[368]]
y_new 1 [9.911651]
0.0
len(X) 1
folds_new 1 [[428]]
y_new 1 [2.4051852]
0.0
2
(2, 1)
(2, 1)
Energy_resolution: 2.2904200871533567


100%|██████████| 200/200 [00:00<00:00, 215.87it/s]


200
847880
Brick N: 5


 14%|█▎        | 27/200 [00:00<00:00, 228.91it/s]

(2, 1)
2
len(X) 1
folds_new 1 [[368]]
y_new 1 [18.772476]
0.0
len(X) 1
folds_new 1 [[428]]
y_new 1 [6.613272]
0.0
2
(2, 1)
(2, 1)
Energy_resolution: 0.15072058051307025


100%|██████████| 200/200 [00:00<00:00, 223.21it/s]


200
843864
Brick N: 6


 10%|█         | 21/200 [00:00<00:00, 198.84it/s]

(2, 1)
2
len(X) 1
folds_new 1 [[368]]
y_new 1 [2.6880145]
0.0
len(X) 1
folds_new 1 [[428]]
y_new 1 [2.8149016]
0.0
2
(2, 1)
(2, 1)
Energy_resolution: 0.1051075520582235


100%|██████████| 200/200 [00:00<00:00, 208.29it/s]


200
859376
Brick N: 7


 12%|█▎        | 25/200 [00:00<00:00, 248.90it/s]

(2, 1)
2
len(X) 1
folds_new 1 [[368]]
y_new 1 [5.086825]
0.0
len(X) 1
folds_new 1 [[428]]
y_new 1 [5.4884534]
0.0
2
(2, 1)
(2, 1)
Energy_resolution: 0.07501708009656036


100%|██████████| 200/200 [00:00<00:00, 213.66it/s]


200
884671
Brick N: 8


 10%|█         | 21/200 [00:00<00:00, 205.57it/s]

(2, 1)
2
len(X) 1
folds_new 1 [[368]]
y_new 1 [5.9202957]
0.0
len(X) 1
folds_new 1 [[428]]
y_new 1 [2.6488354]
0.0
2
(2, 1)
(2, 1)
Energy_resolution: 1.1073510628857282


100%|██████████| 200/200 [00:00<00:00, 214.45it/s]


200
853312
Brick N: 9


 12%|█▎        | 25/200 [00:00<00:00, 246.66it/s]

(2, 1)
2
len(X) 1
folds_new 1 [[368]]
y_new 1 [7.451229]
0.0
len(X) 1
folds_new 1 [[428]]
y_new 1 [5.115943]
0.0
2
(2, 1)
(2, 1)
Energy_resolution: 0.5516501547437712


100%|██████████| 200/200 [00:00<00:00, 213.32it/s]


200
867869
Brick N: 10


 13%|█▎        | 26/200 [00:00<00:00, 250.45it/s]

(2, 1)
2
len(X) 1
folds_new 1 [[368]]
y_new 1 [8.817481]
0.0
len(X) 1
folds_new 1 [[428]]
y_new 1 [11.976575]
0.0
2
(2, 1)
(2, 1)
Energy_resolution: 0.1560889176819083


100%|██████████| 200/200 [00:00<00:00, 237.91it/s]


200
778521
Brick N: 11


 12%|█▎        | 25/200 [00:00<00:00, 246.56it/s]

(2, 1)
2
len(X) 1
folds_new 1 [[368]]
y_new 1 [6.806085]
0.0
len(X) 1
folds_new 1 [[428]]
y_new 1 [9.891028]
0.0
2
(2, 1)
(2, 1)
Energy_resolution: 0.2249186578364497


100%|██████████| 200/200 [00:00<00:00, 239.72it/s]


200
778062
Brick N: 12
(2, 1)
2
len(X) 1
folds_new 1 [[368]]
y_new 1 [0.7673867]


ValueError: HuberRegressor convergence failed: l-BFGS-b solver terminated with ABNORMAL_TERMINATION_IN_LNSRCH

In [392]:
metrics = pd.DataFrame(np.column_stack([Energy_resolutions,
    Track_efficiencys,
    Good_showerss,
    Stuck_showerss,
    Broken_showerss,
    Lost_showerss,
    MAE_xs,
    MAE_ys,
    MAE_zs,
    MAE_txs,
    MAE_tys]), columns=['Energy_resolution', 'Track_efficiency', 'Good_showers', 'Stuck_showers',
    'Broken_showers', 'Lost_showers', 'MAE_x', 'MAE_y', 'MAE_z', 'MAE_tx', 'MAE_ty'])

In [393]:
metrics.describe()

Unnamed: 0,Energy_resolution,Track_efficiency,Good_showers,Stuck_showers,Broken_showers,Lost_showers,MAE_x,MAE_y,MAE_z,MAE_tx,MAE_ty
count,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0
mean,0.491703,1.041283,0.01,0.833333,0.01,0.146667,5.180038,1.90485,1996.665746,6.43111,9.756493
std,0.644342,0.063902,1.81186e-18,0.04997,1.81186e-18,0.04997,2.808737,1.331663,0.539777,0.060986,0.093151
min,0.012878,0.974185,0.01,0.79,0.01,0.0,0.04549,0.197429,1995.926593,6.319961,9.635412
25%,0.139317,0.998663,0.01,0.8125,0.01,0.1475,3.074574,0.886123,1996.298148,6.409748,9.693611
50%,0.233422,1.020186,0.01,0.815,0.01,0.165,5.780158,1.632048,1996.648143,6.428668,9.725091
75%,0.569435,1.078338,0.01,0.8325,0.01,0.1675,6.594023,2.475262,1997.008583,6.45392,9.8398
max,2.29042,1.195077,0.01,0.98,0.01,0.19,9.549434,4.429449,1997.634832,6.562537,9.928096
