In [25]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import networkx as nx

import uproot

from create_train_test_dataset import scale_data, gen_bricks
sns.set(font_scale=2)

In [7]:
!cd tools/ && python setup_opera_distance_metric.py build_ext --inplace

running build_ext


In [8]:
from tools.opera_distance_metric import generate_k_nearest_graph, opera_distance_metric_py, generate_radius_graph

In [9]:
a = np.array([0, 0, 0, 0, 0, 0.])
assert np.allclose(opera_distance_metric_py(a, 
                                         a + np.array([0, 0, 0, 3, 0, 0])), 
                   0)
assert np.allclose(opera_distance_metric_py(a, 
                                         a + np.array([0, 0, 2, 3, 0, 0])), 
                   6 / 1293)
assert np.allclose(opera_distance_metric_py(a, 
                                         a + np.array([0, 1, 2, 3, 0, 0])), 
                   np.sqrt(1 + 2**2) * 3 / 1293)
assert np.allclose(opera_distance_metric_py(a, 
                                         a + np.array([0, 0, 0, 3, 1e-1, 0])), 
                   1e-1 * 3**2 / 2 / 1293, atol=1e-3, rtol=1e-4)
assert np.allclose(opera_distance_metric_py(a, 
                                         a + np.array([0, 0, 0, 3, 1e-1, 1e-2])), 
                   np.sqrt(0.1**2 + 0.01**2) * 3**2 / 2 / 1293, atol=1e-3, rtol=1e-4)

In [10]:
import torch
import torch_scatter

In [11]:
import torch_geometric
from torch_geometric.data import Data

In [12]:
# https://blog.sicara.com/fast-custom-knn-sklearn-cython-de92e5a325c
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

In [13]:
def load_mc(filename="./EM_data/mcdata_taue2.root", step=1):
    f = uproot.open(filename)
    mc = f['Data'].pandas.df(["Event_id", "ele_P", "BT_X", "BT_Y",
                              "BT_Z","BT_SX", "BT_SY","ele_x", 
                              "ele_y", "ele_z", "ele_sx", "ele_sy", "chisquare", ], flatten=False)
    pmc = pd.DataFrame(mc)
    pmc['numtracks'] = pmc.BT_X.apply(lambda x: len(x))
    # cuts
    shapechange = [pmc.shape[0]]
    pmc = pmc[pmc.ele_P > 0.1]
    shapechange.append(pmc.shape[0])

    pmc = pmc[pmc.ele_z < 0]
    shapechange.append(pmc.shape[0])

    pmc = pmc[pmc.numtracks > 3]
    shapechange.append(pmc.shape[0])
    print("numtracks reduction by cuts: ", shapechange)
    pmc['m_BT_X'] = pmc.BT_X.apply(lambda x: x.mean())
    pmc['m_BT_Y'] = pmc.BT_Y.apply(lambda x: x.mean())
    pmc['m_BT_Z'] = pmc.BT_Z.apply(lambda x: x.mean())

    print("len(pmc): {len}".format(len=len(pmc)))
    return pmc

In [14]:
pmc = load_mc(filename='./EM_data/mcdata_taue2.root', step=1)

numtracks reduction by cuts:  [18724, 18679, 9616, 9106]
len(pmc): 9106


In [17]:
def pmc_to_ship_format(pmc):
    showers = []
    scale = 10000
    for idx in pmc.index:
        shower = pmc.loc[idx]
        
        showers.append(
            {
                'TX': shower['BT_X'] / scale,
                'TY': shower['BT_Y'] / scale,
                'TZ': shower['BT_Z'] / scale,
                'PX': shower['BT_SX'],
                'PY': shower['BT_SY'],
                'PZ': np.ones_like(shower['BT_X']),
                'ele_P': shower['ele_P'],
                'ele_TX': shower['ele_x'] / scale,
                'ele_TY': shower['ele_y'] / scale,
                'ele_TZ': shower['ele_z']  / scale,
                'ele_PX': shower['ele_sx'],
                'ele_PY': shower['ele_sy'],
                'ele_PZ': 1.
            }
        )
    return showers
selected_showers = pmc_to_ship_format(pmc)

In [20]:
len(pmc)

9106

In [21]:
#selected_showers = scale_data(pmc)
selected_showers = [selected_shower for selected_shower in selected_showers if len(selected_shower['PX']) > 70]
selected_showers = [selected_shower for selected_shower in selected_showers if len(selected_shower['PX']) < 3000]

In [22]:
len(selected_showers)

8019

In [23]:
bricks = []
NUM_SHOWERS_IN_BRICK = 200

In [26]:
%%time
scale = 10000
bricks = []
for i in range(len(selected_showers) // NUM_SHOWERS_IN_BRICK):
    node_id = 0
    graphx = nx.DiGraph()
    nodes_to_add = []
    showers_data = []
    for j in range(NUM_SHOWERS_IN_BRICK):
        selected_shower = selected_showers[i * NUM_SHOWERS_IN_BRICK + j]
        showers_data.append(
            {
            'numtracks': len(selected_shower['PX']),
            'signal': j,
            'ele_P': selected_shower['ele_P'],
            'ele_SX': selected_shower['ele_TX'] * scale,
            'ele_SY': selected_shower['ele_TY'] * scale,
            'ele_SZ': selected_shower['ele_TZ'] * scale,
            'ele_TX': selected_shower['ele_PX'] / selected_shower['ele_PZ'],
            'ele_TY': selected_shower['ele_PY'] / selected_shower['ele_PZ']
            }
        )
        for k in range(len(selected_shower['PX'])):
            nodes_to_add.append(
                (
                    node_id,
                    {
                        'features': {
                            'SX': selected_shower['TX'][k] * scale,
                            'SY': selected_shower['TY'][k] * scale,
                            'SZ': selected_shower['TZ'][k] * scale,
                            'TX': selected_shower['PX'][k] / selected_shower['PZ'][k],
                            'TY': selected_shower['PY'][k] / selected_shower['PZ'][k],
                        },
                        'signal': j
                    }
                )
            )
            node_id += 1
    graphx.add_nodes_from(nodes_to_add)
    graphx.graph['showers_data'] = showers_data
    bricks.append(graphx)

CPU times: user 1min 17s, sys: 2.68 s, total: 1min 20s
Wall time: 1min 20s


In [None]:
#bricks = gen_bricks(selected_showers=selected_showers, NUM_SHOWERS_IN_BRICK=200)

In [27]:
len(bricks)

40

In [35]:
for i, graph in tqdm(enumerate(bricks)):
        
        nodes = graph.nodes()
        
        SX = [node['features']['SX'] for node in nodes.values()]
        SY = [node['features']['SY'] for node in nodes.values()]
        SZ = [node['features']['SZ'] for node in nodes.values()]
        TX = [node['features']['TX'] for node in nodes.values()]
        TY = [node['features']['TY'] for node in nodes.values()]
        
        
        data = graph.graph

        numtracks = [node['numtracks'] for node in data['showers_data']]
        ele_P = [node['ele_P'] for node in data['showers_data']]
        
        
        print(len(SX),len(SY),len(SZ),len(TX),len(TY), len(numtracks),len(ele_P))
        
        print(numtracks)
        
        break

0it [00:00, ?it/s]

103926 103926 103926 103926 103926 200 200
[317, 152, 520, 119, 105, 615, 409, 77, 285, 163, 87, 564, 574, 127, 1508, 989, 144, 895, 294, 134, 685, 539, 1285, 358, 404, 717, 468, 430, 117, 1050, 723, 509, 627, 476, 102, 475, 103, 106, 889, 315, 506, 400, 703, 244, 241, 79, 103, 317, 833, 287, 1495, 1075, 650, 335, 374, 620, 1207, 607, 571, 109, 1588, 378, 1802, 118, 220, 107, 343, 516, 305, 342, 1154, 280, 859, 489, 1122, 506, 545, 152, 1120, 184, 368, 827, 258, 589, 726, 349, 139, 174, 598, 786, 90, 162, 425, 865, 853, 149, 516, 375, 134, 128, 106, 170, 591, 210, 1886, 586, 638, 597, 518, 2127, 520, 87, 227, 568, 406, 213, 321, 160, 1138, 243, 537, 121, 148, 949, 1567, 663, 581, 87, 1379, 753, 416, 166, 497, 623, 1886, 266, 763, 186, 911, 1050, 445, 360, 460, 889, 431, 132, 257, 852, 316, 407, 292, 188, 167, 773, 452, 171, 630, 958, 2428, 389, 183, 310, 123, 230, 367, 109, 75, 139, 99, 724, 1173, 270, 472, 470, 172, 435, 188, 109, 915, 244, 175, 1510, 2405, 1342, 274, 208, 249, 97, 40




In [29]:
def digraph_to_csv(graphs: list):
    df = pd.DataFrame(columns=['brick_id', 'shower_id', 'SX', 'SY', 'SZ', 'TX', 'TY', 
                               "ele_P", "BT_X", "BT_Y",
                               "BT_Z","BT_SX", "BT_SY","ele_x", 
                               "ele_y", "ele_z", "ele_sx", "ele_sy", 'shower_id', 'numtracks'])
    
    Numtracks = []
    Ele_P = []
    Ele_SX = []
    Ele_SY = []
    Ele_SZ = []
    Ele_TX = []
    Ele_TY = []
    
    for i, graph in tqdm(enumerate(graphs)):
        data = graph.graph

        numtracks = [node['numtracks'] for node in data['showers_data']]
        ele_P = [node['ele_P'] for node in data['showers_data']]
        ele_SX = [node['ele_SX'] for node in data['showers_data']]
        ele_SY = [node['ele_SY'] for node in data['showers_data']]
        ele_SZ = [node['ele_SZ'] for node in data['showers_data']]
        ele_TX = [node['ele_TX'] for node in data['showers_data']]
        ele_TY = [node['ele_TY'] for node in data['showers_data']]
        
        Numtracks.append(numtracks)
        Ele_P.append(ele_P)
        Ele_SX.append(ele_SX)
        Ele_SY.append(ele_SY)
        Ele_SZ.append(ele_SZ)
        Ele_TX.append(ele_TX)
        Ele_TY.append(ele_TY)
        
        
        
        nodes = graph.nodes()
        
        SX = [node['features']['SX'] for node in nodes.values()]
        SY = [node['features']['SY'] for node in nodes.values()]
        SZ = [node['features']['SZ'] for node in nodes.values()]
        TX = [node['features']['TX'] for node in nodes.values()]
        TY = [node['features']['TY'] for node in nodes.values()]
        
        
        shower_id = [node['signal'] for node in nodes.values()]
        brick_id = [i for _ in range(len(shower_id))]
        
        df = df.append(
            pd.DataFrame(
                {'brick_id': brick_id, 
                 'shower_id': shower_id, 
                 'SX': SX, 
                 'SY': SY, 
                 'SZ': SZ, 
                 'TX': TX, 
                 'TY': TY,
                 
                 'numtracks': numtracks, 
                 'ele_P': ele_P, 
                 'ele_SX': ele_SX, 
                 'ele_SY': ele_SY, 
                 'ele_SZ': ele_SZ, 
                 'ele_TX': ele_TX, 
                 'ele_TY': ele_TY,                 
                }
            )
        )
    return df
        

In [None]:
for i, graph in tqdm(enumerate(bricks)):
    data = graph.graph
    #print(data.values())
    numtracks = [node['numtracks'] for node in data['showers_data']]
    print(len(numtracks))
    
    ele_P = [node['ele_P'] for node in data['showers_data']]
    print(len(ele_P))
    break

In [30]:
df = digraph_to_csv(bricks)

0it [00:00, ?it/s]


ValueError: arrays must all be same length

In [None]:
data = 

In [None]:
df = np.loadtxt('./EM_data/opera_train.data')
df = pd.DataFrame(df, columns=['brick_id', 'SX', 'SY', 'SZ', 'TX', 'TY'])

df_sol = np.loadtxt('./EM_data/opera_train.solution')
df_sol = pd.DataFrame(df_sol, columns=['brick_id', 'shower_id'])

In [None]:
def gen_torch_showers(df, df_sol, knn=False, r=250, k=5, directed=False, e = 0.00005):
    showers = []

    for brick_id in tqdm(list(df.brick_id.unique())[:]):
        df_brick = df[df.brick_id == brick_id]
        if knn:
            edges_from, edge_to, dist = generate_k_nearest_graph(df_brick.values, 
                                                                 k, e=e,
                                                                 symmetric=directed);
            edges = np.vstack([edges_from, edge_to])
            dist = np.array(dist)
            edge_index = torch.LongTensor(edges)
        else:
            edges_from, edge_to, dist = generate_radius_graph(df_brick.values, 
                                                              r, e=e,
                                                              symmetric=directed);
            edges = np.vstack([edges_from, edge_to])
            dist = np.array(dist)
            edge_index = torch.LongTensor(edges)
            
        x = torch.FloatTensor(df_brick.values[:, 1:] / np.array([1e4, 1e4, 1e4, 1., 1.]))
        edge_attr = torch.log(torch.FloatTensor(dist).view(-1, 1))
        y = torch.LongTensor(df_sol.shower_id.loc[df_sol.brick_id == brick_id].values)
        shower = torch_geometric.data.Data(x=x, edge_index=edge_index, 
                                           pos=x, edge_attr=edge_attr, y=y)
        showers.append(shower)
    
    return showers

In [None]:
showers_train=gen_torch_showers(df=df, df_sol=df_sol, knn=True, k=10, directed=False, e = 10)
torch.save(showers_train, './EM_data/showers_trai7l.pt')