In [36]:
import os
import numpy as np
import pandas as pd
from trackml.dataset import load_event, load_dataset
from trackml.randomize import shuffle_hits
import extension
from sklearn.neighbors import KDTree
import time

In [None]:
path = './clustering'
data_path = './data/test_data'
#container_df = pd.DataFrame(columns = ['event_id','hit_id','track_id'])
itr = load_dataset(data_path,parts=['hits'])


In [60]:
def extend(submission,hits):

    df = submission.merge(hits,  on=['hit_id'], how='left')
    df = df.assign(d = np.sqrt( df.x**2 + df.y**2 + df.z**2 ))
    df = df.assign(r = np.sqrt( df.x**2 + df.y**2))
    df = df.assign(arctan2 = np.arctan2(df.z, df.r))

    for angle in range(-180,180,1):

        print ('\r %f'%angle, end='',flush=True)
        #df1 = df.loc[(df.arctan2>(angle-0.5)/180*np.pi) & (df.arctan2<(angle+0.5)/180*np.pi)]
        df1 = df.loc[(df.arctan2>(angle-1.0)/180*np.pi) & (df.arctan2<(angle+1.0)/180*np.pi)]

        min_num_neighbours = len(df1)
        if min_num_neighbours<4: continue

        hit_ids = df1.hit_id.values
        x,y,z = df1.as_matrix(columns=['x', 'y', 'z']).T
        r  = (x**2 + y**2)**0.5
        r  = r/1000
        a  = np.arctan2(y,x)
        tree = KDTree(np.column_stack([a,r]), metric='euclidean')

        track_ids = list(df1.track_id.unique())
        num_track_ids = len(track_ids)
        min_length=3
 
        for i in range(num_track_ids):
            p = track_ids[i]
            if p==0: continue

            idx = np.where(df1.track_id==p)[0]
            if len(idx)<min_length: continue

            if angle>0:
                idx = idx[np.argsort( z[idx])]
            else:
                idx = idx[np.argsort(-z[idx])]


            ## start and end points  ##
            idx0,idx1 = idx[0],idx[-1]
            a0 = a[idx0]
            a1 = a[idx1]
            r0 = r[idx0]
            r1 = r[idx1]

            da0 = a[idx[1]] - a[idx[0]]  #direction
            dr0 = r[idx[1]] - r[idx[0]]
            direction0 = np.arctan2(dr0,da0) 

            da1 = a[idx[-1]] - a[idx[-2]]
            dr1 = r[idx[-1]] - r[idx[-2]]
            direction1 = np.arctan2(dr1,da1) 


            ## extend start point
            ns = tree.query([[a0,r0]], k=min(20,min_num_neighbours), return_distance=False)
            ns = np.concatenate(ns)
            direction = np.arctan2(r0-r[ns],a0-a[ns])
            ns = ns[(r0-r[ns]>0.01) &(np.fabs(direction-direction0)<0.04)]

            for n in ns:
                df.loc[ df.hit_id==hit_ids[n],'track_id' ] = p 

            ## extend end point
            ns = tree.query([[a1,r1]], k=min(20,min_num_neighbours), return_distance=False)
            ns = np.concatenate(ns)

            direction = np.arctan2(r[ns]-r1,a[ns]-a1)
            ns = ns[(r[ns]-r1>0.01) &(np.fabs(direction-direction1)<0.04)] 

            for n in ns:
                df.loc[ df.hit_id==hit_ids[n],'track_id' ] = p
    #print ('\r')
    df = df[['event_id', 'hit_id', 'track_id']]
    return df

def extend_threaded(hits,event_id,submission):
    init = time.time()
    extended = extend(submission,hits)
    for i in range(4):
        extended = extend(extended,hits)
    fname = './raw-extended/'+str(event_id)+'.npy'
    np.save(fname,extended.values)
    print('completed extension: ',time.time()-init)

In [66]:
extend_params = []
all_events = load_dataset('./data/test_data/', parts=['hits'])
path = './clustering'
data_path = './data/test_data'

for event in all_events:
    event_id = event[0]
    hits_df = event[1]
    label_path = './clustering/hough-'+str(param[1])+'.npy'
    un_extended = np.load(label_path)
    print(un_extended.shape)
    print(hits_df.hit_id.values.shape)
    full_current = np.column_stack(([event_id]*len(hits_df), hits_df.hit_id.values, un_extended))
    temp_df = pd.DataFrame(data=full_current,columns=['event_id','hit_id','track_id'])
    extend_params.append([hits_df, event_id,temp_df])


(108117,)
(119016,)


ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [64]:
from multiprocessing import Pool
pool = Pool(32)
pool.starmap(extend_threaded,extend_params)
pool.close()

In [67]:
path = './clustering'
data_path = './data/test_data'
container_df = pd.DataFrame(columns = ['event_id','hit_id','track_id'])
itr = load_dataset(data_path, parts=['hits'])

count = 0 


for event in itr:
    
    init = time.time()
    
    event_id = event[0]
    hit_df = event[1]
    print(event_id)
    label_path = './clustering/hough-'+str(event_id)+'.npy'
    current = np.load(label_path)
    full_current = np.column_stack(([event_id]*len(hit_df), hit_df.hit_id.values, current))
    temp_df = pd.DataFrame(data=full_current,columns=['event_id','hit_id','track_id'])
    extended = extend(temp_df,hit_df)
    
    extended = extend(extended,hit_df)
    
    """
        grab just label column here 
    """
    sub_data = np.column_stack(([event_id]*len(hit_df), hit_df.hit_id.values, extended['track_id'].values))
    
    if event_id is 0:
        temp = sub_data
    else:
        #print('hello?')
        temp = np.concatenate((temp,sub_data),axis=0)
    count += 1
    
    if count % 10 == 0 :
        print('\n')
        print (str(count)+'/'+str(125)+' complete!')
        print('time taken: ',time.time()-init)
        
final_submission = pd.DataFrame(data=temp, columns=["event_id", "hit_id", "track_id"]).astype(int)
final_submission.to_csv('./submission-hbdscan-final.csv',index = False)

                    
            
            
            
            
            
            
            
            
            

0
 179.00000001
 179.00000002
 179.00000003
 179.00000004
 82.00000000

KeyboardInterrupt: 

In [None]:
extend_threaded(,parameters)
pool = Pool(32)


In [30]:
event_col = temp[:,2]
hitid_col = temp[:,3]
trackid_col = temp[:,4]
all_np_data = np.column_stack((event_col,hitid_col,trackid_col))

In [31]:
final_submission = pd.DataFrame(data=all_np_data, columns=["event_id", "hit_id", "track_id"]).astype(int)
final_submission.to_csv('./submission-hbdscan-extended.csv',index = False)


In [32]:
view = pd.read_csv('./submission-hbdscan-extended.csv')

In [34]:
view['track_id'].values

array([14308791,        1,   119017, ..., 47506008, 11667716,   108116])

In [20]:
final_submission = pd.DataFrame(data=temp, columns=["event_id", "hit_id", "track_id"]).astype(int)
final_submission.to_csv('./submission-hbdscan.csv',index = False)

In [19]:
temp.shape

(13741466, 3)

In [None]:
container_df = pd.DataFrame(columns = ['event_id','hit_id','track_id'])


In [None]:
container_df

In [None]:
a

In [None]:
np.save('./teststuff/a',a)