In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import networkx as nx

import uproot

from collections import deque

from create_train_test_dataset import scale_data, gen_bricks
sns.set(font_scale=2)

In [2]:
!cd tools/ && python setup_opera_distance_metric.py build_ext --inplace

running build_ext


In [3]:
from tools.opera_distance_metric import generate_k_nearest_graph, opera_distance_metric_py, generate_radius_graph

In [4]:
a = np.array([0, 0, 0, 0, 0, 0.])
assert np.allclose(opera_distance_metric_py(a, 
                                         a + np.array([0, 0, 0, 3, 0, 0])), 
                   0)
assert np.allclose(opera_distance_metric_py(a, 
                                         a + np.array([0, 0, 2, 3, 0, 0])), 
                   6 / 1293)
assert np.allclose(opera_distance_metric_py(a, 
                                         a + np.array([0, 1, 2, 3, 0, 0])), 
                   np.sqrt(1 + 2**2) * 3 / 1293)
assert np.allclose(opera_distance_metric_py(a, 
                                         a + np.array([0, 0, 0, 3, 1e-1, 0])), 
                   1e-1 * 3**2 / 2 / 1293, atol=1e-3, rtol=1e-4)
assert np.allclose(opera_distance_metric_py(a, 
                                         a + np.array([0, 0, 0, 3, 1e-1, 1e-2])), 
                   np.sqrt(0.1**2 + 0.01**2) * 3**2 / 2 / 1293, atol=1e-3, rtol=1e-4)

In [5]:
import torch
import torch_scatter

In [6]:
import torch_geometric
from torch_geometric.data import Data

In [7]:
# https://blog.sicara.com/fast-custom-knn-sklearn-cython-de92e5a325c
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

In [8]:
def load_mc(filename="./EM_data/mcdata_taue2.root", step=1):
    f = uproot.open(filename)
    mc = f['Data'].pandas.df(["Event_id", "ele_P", "BT_X", "BT_Y",
                              "BT_Z","BT_SX", "BT_SY","ele_x", 
                              "ele_y", "ele_z", "ele_sx", "ele_sy", "chisquare", ], flatten=False)
    pmc = pd.DataFrame(mc)
    pmc['numtracks'] = pmc.BT_X.apply(lambda x: len(x))
    # cuts
    shapechange = [pmc.shape[0]]
    pmc = pmc[pmc.ele_P > 0.1]
    shapechange.append(pmc.shape[0])

    pmc = pmc[pmc.ele_z < 0]
    shapechange.append(pmc.shape[0])

    pmc = pmc[pmc.numtracks > 3]
    shapechange.append(pmc.shape[0])
    print("numtracks reduction by cuts: ", shapechange)
    pmc['m_BT_X'] = pmc.BT_X.apply(lambda x: x.mean())
    pmc['m_BT_Y'] = pmc.BT_Y.apply(lambda x: x.mean())
    pmc['m_BT_Z'] = pmc.BT_Z.apply(lambda x: x.mean())

    print("len(pmc): {len}".format(len=len(pmc)))
    return pmc

In [9]:
pmc = load_mc(filename='./EM_data/mcdata_taue2.root', step=1)

numtracks reduction by cuts:  [18724, 18679, 9616, 9106]
len(pmc): 9106


In [10]:
def pmc_to_ship_format(pmc):
    showers = []
    scale = 10000
    for idx in pmc.index:
        shower = pmc.loc[idx]
        
        showers.append(
            {
                'TX': shower['BT_X'] / scale,
                'TY': shower['BT_Y'] / scale,
                'TZ': shower['BT_Z'] / scale,
                'PX': shower['BT_SX'],
                'PY': shower['BT_SY'],
                'PZ': np.ones_like(shower['BT_X']),
                'ele_P': shower['ele_P'],
                'ele_TX': shower['ele_x'] / scale,
                'ele_TY': shower['ele_y'] / scale,
                'ele_TZ': shower['ele_z']  / scale,
                'ele_PX': shower['ele_sx'],
                'ele_PY': shower['ele_sy'],
                'ele_PZ': 1.
            }
        )
    return showers
selected_showers = pmc_to_ship_format(pmc)

In [11]:
len(pmc)

9106

In [12]:
selected_showers = [selected_shower for selected_shower in selected_showers if len(selected_shower['PX']) > 70]

In [13]:
selected_showers = [selected_shower for selected_shower in selected_showers if len(selected_shower['PX']) < 3000]

In [14]:
len(selected_showers)

8019

In [15]:
bricks = deque()
NUM_SHOWERS_IN_BRICK = 200

In [16]:
%%time
scale = 10000
bricks = deque()
for i in range(len(selected_showers) // NUM_SHOWERS_IN_BRICK):
    node_id = 0
    graphx = nx.DiGraph()
    nodes_to_add = []
    showers_data = []
    for j in range(NUM_SHOWERS_IN_BRICK):
        selected_shower = selected_showers[i * NUM_SHOWERS_IN_BRICK + j]
        showers_data.append(
            {
            'numtracks': len(selected_shower['PX']),
            'signal': j,
            'ele_P': selected_shower['ele_P'],
            'ele_SX': selected_shower['ele_TX'] * scale,
            'ele_SY': selected_shower['ele_TY'] * scale,
            'ele_SZ': selected_shower['ele_TZ'] * scale,
            'ele_TX': selected_shower['ele_PX'] / selected_shower['ele_PZ'],
            'ele_TY': selected_shower['ele_PY'] / selected_shower['ele_PZ']
            }
        )
        for k in range(len(selected_shower['PX'])):
            nodes_to_add.append(
                (
                    node_id,
                    {
                        'features': {
                            'SX': selected_shower['TX'][k] * scale,
                            'SY': selected_shower['TY'][k] * scale,
                            'SZ': selected_shower['TZ'][k] * scale,
                            'TX': selected_shower['PX'][k] / selected_shower['PZ'][k],
                            'TY': selected_shower['PY'][k] / selected_shower['PZ'][k],
                        },
                        'signal': j
                    }
                )
            )
            node_id += 1
    graphx.add_nodes_from(nodes_to_add)
    graphx.graph['showers_data'] = showers_data
    bricks.append(graphx)

CPU times: user 1min 16s, sys: 2.79 s, total: 1min 19s
Wall time: 1min 19s


In [17]:
len(bricks)

40

In [18]:
graphx.nodes(data= True)[0]

{'features': {'SX': 9381.415843963623,
  'SY': -15577.337741851807,
  'SZ': -143.58147978782654,
  'TX': -0.12269463,
  'TY': 0.06926952},
 'signal': 0}

In [19]:
def digraph_to_csv(graphs: list):
    df = pd.DataFrame(columns=['Brick_id', 'shower_id', 'SX', 'SY', 'SZ', 'TX', 'TY', 
                               'numtracks', 'ele_P', 'ele_SX', 'ele_SY', 'ele_SZ',
                               'ele_TX', 'ele_TY' ])
    
    Numtracks = []
    Ele_P = []
    Ele_SX = []
    Ele_SY = []
    Ele_SZ = []
    Ele_TX = []
    Ele_TY = []
    
    for i, graph in tqdm(enumerate(graphs)):
        data = graph.graph

        numtracks = [node['numtracks'] for node in data['showers_data']]
        ele_P = [node['ele_P'] for node in data['showers_data']]
        ele_SX = [node['ele_SX'] for node in data['showers_data']]
        ele_SY = [node['ele_SY'] for node in data['showers_data']]
        ele_SZ = [node['ele_SZ'] for node in data['showers_data']]
        ele_TX = [node['ele_TX'] for node in data['showers_data']]
        ele_TY = [node['ele_TY'] for node in data['showers_data']]
        Numtracks.append(numtracks)
        Ele_P.append(ele_P)
        Ele_SX.append(ele_SX)
        Ele_SY.append(ele_SY)
        Ele_SZ.append(ele_SZ)
        Ele_TX.append(ele_TX)
        Ele_TY.append(ele_TY)
        
        
        
        nodes = graph.nodes()
        
        SX = [node['features']['SX'] for node in nodes.values()]
        SY = [node['features']['SY'] for node in nodes.values()]
        SZ = [node['features']['SZ'] for node in nodes.values()]
        TX = [node['features']['TX'] for node in nodes.values()]
        TY = [node['features']['TY'] for node in nodes.values()]
        signal = [node['signal'] for node in nodes.values()]
        
        
        shower_id = [node['signal'] for node in nodes.values()]
        brick_id = [i for _ in range(len(shower_id))]
        
        numtracks_expanded = [numtrack * [numtrack] for numtrack in numtracks]
        numtracks_expanded = [numtrack for sublist in numtracks_expanded for numtrack in sublist]
    
        def expanded(ele_):
            ele_expanded = [[ele_[i]]*numtracks[i] for i in range(len(numtracks))]
            ele_expanded = [_ for sublist in ele_expanded for _ in sublist]
            return ele_expanded
            
            
            
        ele_P_expanded = expanded(ele_P)
        ele_SX_expanded = expanded(ele_SX)
        ele_SY_expanded = expanded(ele_SY)
        ele_SZ_expanded = expanded(ele_SZ)
        ele_TX_expanded = expanded(ele_TX)
        ele_TY_expanded = expanded(ele_TY)
        

        print(len(ele_TY_expanded) == len(ele_TX_expanded) == len(ele_SZ_expanded) == len(ele_SY_expanded)
              == len(ele_SX_expanded) == len(ele_P_expanded) == len(numtracks_expanded) ==len(TY))
        
        
        df = df.append(
            pd.DataFrame(
                {'Brick_id': brick_id, 
                 'shower_id': shower_id, 
                 'SX': SX, 
                 'SY': SY, 
                 'SZ': SZ, 
                 'TX': TX, 
                 'TY': TY,
                 
                 'numtracks': numtracks_expanded, 
                 'ele_P': ele_P_expanded, 
                 'ele_SX': ele_SX_expanded, 
                 'ele_SY': ele_SY_expanded, 
                 'ele_SZ': ele_SZ_expanded, 
                 'ele_TX': ele_TX_expanded, 
                 'ele_TY': ele_TY_expanded,
                 'signal': signal
                }
            )        
        )

    return df
        

In [20]:
df = digraph_to_csv(bricks)

0it [00:00, ?it/s]

True


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,
1it [00:00,  1.51it/s]

True


2it [00:01,  1.46it/s]

True


3it [00:02,  1.40it/s]

True


4it [00:02,  1.38it/s]

True


5it [00:03,  1.27it/s]

True


6it [00:04,  1.21it/s]

True


7it [00:05,  1.15it/s]

True


8it [00:06,  1.10it/s]

True


9it [00:07,  1.05it/s]

True


10it [00:08,  1.00it/s]

True


11it [00:10,  1.05s/it]

True


12it [00:11,  1.07s/it]

True


13it [00:12,  1.11s/it]

True


14it [00:13,  1.13s/it]

True


15it [00:14,  1.18s/it]

True


16it [00:16,  1.24s/it]

True


17it [00:17,  1.30s/it]

True


18it [00:19,  1.34s/it]

True


19it [00:20,  1.38s/it]

True


20it [00:22,  1.42s/it]

True


21it [00:23,  1.47s/it]

True


22it [00:25,  1.51s/it]

True


23it [00:27,  1.56s/it]

True


24it [00:28,  1.58s/it]

True


25it [00:30,  1.61s/it]

True


26it [00:31,  1.63s/it]

True


27it [00:33,  1.64s/it]

True


28it [00:35,  1.68s/it]

True


29it [00:37,  1.73s/it]

True


30it [00:39,  1.79s/it]

True


31it [00:41,  1.84s/it]

True


32it [00:43,  1.89s/it]

True


33it [00:45,  1.95s/it]

True


34it [00:47,  1.98s/it]

True


35it [00:49,  2.05s/it]

True


36it [00:51,  2.11s/it]

True


37it [00:53,  2.11s/it]

True


38it [00:56,  2.14s/it]

True


39it [00:58,  2.19s/it]

True


40it [01:00,  2.23s/it]


In [21]:
columns = df.columns.tolist()

In [22]:
data = df.values.astype(np.double)

In [23]:
len(data[0])

15

In [24]:
data = pd.DataFrame(data, columns=columns)

In [25]:
len(data)

4314005

In [26]:
sol = data[['Brick_id', 'shower_id']]

data = data.drop(['shower_id'], axis = 1)

In [27]:
sol.head()

Unnamed: 0,Brick_id,shower_id
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [28]:
len(np.unique(np.array(data.signal)))

200

In [29]:
def gen_torch_showers(df, df_sol, knn=False, r=250, k=5, directed=False, e = 0.00005):
    showers = deque()

    for brick_id in tqdm(list(df.Brick_id.unique())[:3]):
        df_brick = df[df.Brick_id == brick_id]
        if knn:
            edges_from, edge_to, dist = generate_k_nearest_graph(df_brick.values, 
                                                                 k, e=e,
                                                                 symmetric=directed);
            edges = np.vstack([edges_from, edge_to])
            dist = np.array(dist)
            edge_index = torch.LongTensor(edges)
        else:
            edges_from, edge_to, dist = generate_radius_graph(df_brick.values, 
                                                              r, e=e,
                                                              symmetric=directed);
            edges = np.vstack([edges_from, edge_to])
            dist = np.array(dist)
            edge_index = torch.LongTensor(edges)
            
        x = torch.FloatTensor(df_brick.values[:, 1:6] / np.array([1e4, 1e4, 1e4, 1., 1.]))
        shower_data = torch.FloatTensor(df_brick.values[:, 6:])
        edge_attr = torch.log(torch.FloatTensor(dist).view(-1, 1))
        y = torch.LongTensor(df_sol.shower_id.loc[df_sol.Brick_id == brick_id].values)
        shower = torch_geometric.data.Data(x=x, edge_index=edge_index,
                                           shower_data = shower_data,
                                           pos=x, edge_attr=edge_attr, y=y)
        showers.append(shower)
    
    return showers

In [None]:
showers_train=gen_torch_showers(df=data, df_sol=sol, knn=True, k=10, directed=False, e = 10)
torch.save(showers_train, './EM_data/train_.pt')

 67%|██████▋   | 2/3 [1:53:29<55:39, 3339.11s/it]