In [1]:
import numpy as np
import pandas as pd
import tables as tb
from sklearn.neighbors import BallTree, KDTree, DistanceMetric
from tqdm import tqdm
import glob

%matplotlib inline

In [2]:
import gzip
import os
import shutil
import tarfile

In [3]:
# Unpack the raw archives only when the extracted files are not there yet.
# The guards look for the *extracted* files: the archives live inside
# data/training and data/testing, so testing for those directories would
# either skip extraction (directory present) or fail to open the archive
# (directory absent).
# Assumes training.tgz unpacks to data/training/open*.h5, which is the
# pattern the training cell globs for -- TODO confirm against the archive.
if not glob.glob('data/training/open*.h5'):
    with tarfile.open('data/training/training.tgz') as my_tar:
        my_tar.extractall('data/')  # specify which folder to extract to

# Pure-Python replacement for the shell `gunzip` call: portable, raises on
# failure instead of returning an ignored exit status, and streams the data
# so the whole file is never held in memory.
# NOTE(review): the test cell further down reads 'data/testing/test.h5',
# not 'testing.h5' -- confirm which file name is the right one.
if not os.path.isfile('data/testing/testing.h5'):
    with gzip.open('data/testing/testing.h5.gz', 'rb') as src, \
            open('data/testing/testing.h5', 'wb') as dst:
        shutil.copyfileobj(src, dst)

# Grouping Close Tracks from Neighbour Plates into Pairs

In [4]:
def add_neighbours(df, k, metric='minkowski'):
    """Pair every track with its nearest tracks on the next Z layer.

    For each value of ``df.data_ind`` the rows are split by ``Z``. For every
    two consecutive ``Z`` values, each track of the lower layer is matched
    with up to ``k`` nearest tracks of the upper layer, with the distance
    measured over the module-level ``feat_XY`` columns. One output row is
    produced per (track, neighbour) combination.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``data_ind``, ``Z`` and the columns listed in the
        module-level ``feat_XY``.
    k : int
        Maximum number of neighbours per track; capped by the number of
        tracks available on the next layer.
    metric : str
        Distance metric forwarded to ``BallTree``.

    Returns
    -------
    pandas.DataFrame
        The matched rows with the neighbour's values in ``<col>_pair`` columns
        and the differences ``<col> - <col>_pair`` in ``d<col>`` columns, for
        every column in ``feat_XY + ['Z']``. Rows keep their original index
        label, so a label is repeated once per neighbour. ``TX`` and ``TY``
        are returned multiplied by 1293. Tracks of the last layer of each
        ``data_ind`` have no following layer; they are appended once, without
        pair values (NaN after concatenation).
    """
    res = []

    for data_ind in tqdm(np.unique(df.data_ind)):
        ind = df.loc[df.data_ind == data_ind].copy()
        # 1293 is the distance between slices along Z.
        ind[['TX', 'TY']] *= 1293
        values = np.unique(ind.Z)

        for z_value, z_next_value in zip(values[:-1], values[1:]):
            z = ind.loc[ind.Z == z_value]
            z_next = ind.loc[ind.Z == z_next_value]

            b_tree = BallTree(z_next[feat_XY], metric=metric)
            # Only the neighbour positions are needed, not the distances.
            _, i = b_tree.query(z[feat_XY], k=min(k, len(z_next)))

            # Column m of `i` holds the m-th nearest neighbour of every track.
            for m in range(i.shape[1]):
                data = z_next.iloc[i[:, m]]
                z_copy = z.copy()
                for col in feat_XY + ['Z']:
                    z_copy[col + '_pair'] = data[col].values
                res.append(z_copy)

        # Select the last layer explicitly instead of reusing the loop
        # variable: with a single Z layer the loop never runs, and the loop
        # variable would be undefined or left over from the previous data_ind.
        res.append(ind.loc[ind.Z == values[-1]])

    res = pd.concat(res)
    for col in feat_XY + ['Z']:
        res['d' + col] = res[col].values - res[col + '_pair'].values
    return res

def balance_train(df, k, random_state=None):
    """Build neighbour pairs and down-sample the noise to the signal size.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw tracks; forwarded to ``add_neighbours``. Must also contain
        ``event_id``, where ``-999`` marks noise rows.
    k : int
        Number of neighbours, forwarded to ``add_neighbours``.
    random_state : int, numpy.random.RandomState or None
        Seed for the noise sampling. ``None`` (the default) keeps the
        original unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        All signal rows followed by a random subset of the noise rows, with
        a fresh 0..n-1 index.
    """
    data = add_neighbours(df, k=k)
    noise = data.event_id == -999
    signal, not_signal = data.loc[np.logical_not(noise)], data.loc[noise]
    # Sampling without replacement cannot draw more rows than exist, so cap
    # the request when there is less noise than signal.
    n_noise = min(len(signal), len(not_signal))
    noise_part = not_signal.sample(n_noise, random_state=random_state)
    return pd.concat([signal, noise_part]).reset_index(drop=True)

In [5]:
# Number of nearest neighbours taken from the next layer for every track;
# passed as `k` to balance_train / add_neighbours below.
n_cluster = 3

In [6]:
# Columns used by add_neighbours both as the coordinates of the
# nearest-neighbour search and (together with 'Z') as the source of the
# `<col>_pair` / `d<col>` features. Read as a global at call time.
feat_XY = ['TX', 'TY', 'X', 'Y']

# Load Datasets

### Train Data

In [7]:
# Build the balanced pair table for every training file, then stack them
# into a single frame.
train_files = glob.glob('data/training/open*.h5')
train_data = pd.concat(
    [balance_train(pd.read_hdf(path), k=n_cluster) for path in train_files]
)

100%|█████████████████████████████████████████████████████████████████████████████| 10/10 [01:01<00:00,  6.13s/it]
100%|█████████████████████████████████████████████████████████████████████████████| 10/10 [00:55<00:00,  5.58s/it]
100%|█████████████████████████████████████████████████████████████████████████████| 10/10 [00:55<00:00,  5.56s/it]
100%|█████████████████████████████████████████████████████████████████████████████| 10/10 [00:57<00:00,  5.76s/it]
100%|█████████████████████████████████████████████████████████████████████████████| 10/10 [00:56<00:00,  5.67s/it]
100%|█████████████████████████████████████████████████████████████████████████████| 10/10 [00:54<00:00,  5.41s/it]
100%|█████████████████████████████████████████████████████████████████████████████| 10/10 [00:58<00:00,  5.86s/it]
100%|█████████████████████████████████████████████████████████████████████████████| 10/10 [00:56<00:00,  5.62s/it]
100%|███████████████████████████████████████████████████████████████████████████

In [8]:
# Keep a fixed column order: track features, labels, the neighbour's
# features, and the track-minus-neighbour differences.
_track_cols = ['TX', 'TY', 'X', 'Y', 'Z']
columns = (
    _track_cols
    + ['event_id', 'signal', 'data_ind']
    + [name + '_pair' for name in _track_cols]
    + ['d' + name for name in _track_cols]
)

train_data = train_data.loc[:, columns]

In [9]:
# Preview the first rows of the prepared training pairs.
train_data.head()

Unnamed: 0,TX,TY,X,Y,Z,event_id,signal,data_ind,TX_pair,TY_pair,X_pair,Y_pair,Z_pair,dTX,dTY,dX,dY,dZ
0,-86.840851,92.664032,47257.558594,11238.640625,5172.0,183825.0,1.0,261,-94.468147,123.360336,47264.585938,11231.140625,5426.527344,7.627296,-30.696304,-7.027344,7.5,-254.527344
1,-94.468147,123.360336,47264.585938,11231.140625,5426.527344,183825.0,1.0,261,-89.869354,99.934273,47168.984375,11332.085938,6465.0,-4.598793,23.426064,95.601562,-100.945312,-1038.472656
2,-94.468147,123.360336,47264.585938,11231.140625,5426.527344,183825.0,1.0,261,339.412476,69.291908,47322.609375,11128.431641,6465.0,-433.880615,54.068428,-58.023438,102.708984,-1038.472656
3,-94.468147,123.360336,47264.585938,11231.140625,5426.527344,183825.0,1.0,261,275.844818,159.851212,46879.84375,11431.775391,6465.0,-370.312958,-36.490875,384.742188,-200.634766,-1038.472656
4,-89.869354,99.934273,47168.984375,11332.085938,6465.0,183825.0,1.0,261,-70.092018,84.584602,47085.210938,11426.46875,7758.0,-19.777336,15.34967,83.773438,-94.382812,-1293.0


In [10]:
# (rows, columns) of the balanced training table.
train_data.shape

(9490034, 18)

In [11]:
# The payload is gzip-compressed, so the file name carries the `.gz` suffix
# (as the test output does). With a bare `.csv` name, readers that infer
# compression from the extension would treat the file as plain text.
train_data.to_csv("prepared_train.csv.gz", compression="gzip", index=False)

### Test Data

In [12]:
# NOTE(review): the extraction cell at the top writes
# 'data/testing/testing.h5', while this cell reads 'data/testing/test.h5'
# -- confirm which file name is the right one.
test_data = add_neighbours(
    pd.read_hdf('data/testing/test.h5').reset_index(drop=True),
    k=n_cluster,
)
# add_neighbours keeps each track's row label and repeats it once per
# neighbour, so storing it as a column preserves the link back to the
# original test row.
test_data['index'] = test_data.index

100%|█████████████████████████████████████████████████████████████████████████████| 11/11 [00:52<00:00,  4.74s/it]


In [13]:
# Preview the first rows of the prepared test pairs.
test_data.head()

Unnamed: 0,TX,TY,X,Y,Z,data_ind,TX_pair,TY_pair,X_pair,Y_pair,Z_pair,dTX,dTY,dX,dY,dZ,index
6,-193.084152,623.975891,37949.0,24967.570312,0.0,0,-350.331818,437.50589,38081.851562,24522.3125,1293.0,157.247665,186.470001,-132.851562,445.257812,-1293.0,6
224,-225.433212,-367.239899,39335.953125,49094.96875,0.0,0,-380.564331,458.562836,39409.578125,49534.917969,1293.0,155.131119,-825.802734,-73.625,-439.949219,-1293.0,224
292,-288.543915,744.821838,51992.125,74045.695312,0.0,0,-166.982605,638.900269,51851.539062,73873.390625,1293.0,-121.56131,105.92157,140.585938,172.304688,-1293.0,292
325,-131.043716,-389.968414,29508.035156,67410.984375,0.0,0,-486.173767,107.317078,29299.15625,67238.296875,1293.0,355.130066,-497.285492,208.878906,172.6875,-1293.0,325
548,-233.201782,573.095337,67685.78125,75164.359375,0.0,0,135.409042,775.31897,67272.015625,75664.359375,1293.0,-368.61084,-202.223633,413.765625,-500.0,-1293.0,548


In [14]:
# (rows, columns) of the test pair table.
test_data.shape

(20179459, 17)

In [15]:
# The original row label is already stored in the 'index' column, so the
# DataFrame index itself is not written.
test_data.to_csv("prepared_test.csv.gz", compression="gzip", index=False)