In [3]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
import hdbscan
from scipy import stats
from sklearn.cluster import DBSCAN
%run utils.ipynb
from trackml.dataset import load_event, load_dataset
from trackml.score import score_event
import time
import multiprocessing
import extension
from sklearn.neighbors import KDTree

In [None]:
all_events = load_dataset(path, parts=['hits'])
for event in all_events:
    event_id = event[0]
    hits_df = event[1]

In [5]:
def merge(cl1, cl2): # merge cluster 2 to cluster 1
    d = pd.DataFrame(data={'s1':cl1,'s2':cl2})
    d['N1'] = d.groupby('s1')['s1'].transform('count')
    d['N2'] = d.groupby('s2')['s2'].transform('count')
    maxs1 = d['s1'].max()
    cond = np.where((d['N2'].values>d['N1'].values) & (d['N2'].values<25)) # tìm vị trí hit với nhit của cluster mới > nhits cluster cũ
    s1 = d['s1'].values 
    s1[cond] = d['s2'].values[cond]+maxs1 # gán tất cả các hit đó thuộc về track mới (+maxs1 để tăng label cho track để nó khác ban đầu)
    return s1

def extract_good_hits(truth, submission):
    tru = truth[['hit_id', 'particle_id', 'weight']].merge(submission, how='left', on='hit_id')
    tru['count_both'] = tru.groupby(['track_id', 'particle_id']).hit_id.transform('count')    
    tru['count_particle'] = tru.groupby(['particle_id']).hit_id.transform('count')
    tru['count_track'] = tru.groupby(['track_id']).hit_id.transform('count')
    return tru[(tru.count_both > 0.5*tru.count_particle) & (tru.count_both > 0.5*tru.count_track)]

def fast_score(good_hits_df):
    return good_hits_df.weight.sum()


def analyze_truth_perspective(truth, submission):
    tru = truth[['hit_id', 'particle_id', 'weight']].merge(submission, how='left', on='hit_id')
    tru['count_both'] = tru.groupby(['track_id', 'particle_id']).hit_id.transform('count')    
    tru['count_particle'] = tru.groupby(['particle_id']).hit_id.transform('count')
    tru['count_track'] = tru.groupby(['track_id']).hit_id.transform('count')
    good_hits = tru[(tru.count_both > 0.5*tru.count_particle) & (tru.count_both > 0.5*tru.count_track)]
    score = good_hits.weight.sum()
    
    anatru = tru.particle_id.value_counts().value_counts().sort_index().to_frame().rename({'particle_id':'true_particle_counts'}, axis=1)
    #anatru['true_particle_ratio'] = anatru['true_particle_counts'].values*100/np.sum(anatru['true_particle_counts'])

    anatru['good_tracks_counts'] = np.zeros(len(anatru)).astype(int)
    anatru['good_tracks_intersect_nhits_avg'] = np.zeros(len(anatru))
    anatru['best_detect_intersect_nhits_avg'] = np.zeros(len(anatru))
    for nhit in tqdm(range(4,20)):
        particle_list  = tru[(tru.count_particle==nhit)].particle_id.unique()
        intersect_count = 0
        good_tracks_count = 0
        good_tracks_intersect = 0
        for p in particle_list:
            nhit_intersect = tru[tru.particle_id==p].count_both.max()
            intersect_count += nhit_intersect
            corresponding_track = tru.loc[tru[tru.particle_id==p].count_both.idxmax()].track_id
            leng_corresponding_track = len(tru[tru.track_id == corresponding_track])
            
            if (nhit_intersect >= nhit/2) and (nhit_intersect >= leng_corresponding_track/2):
                good_tracks_count += 1
                good_tracks_intersect += nhit_intersect
        intersect_count = intersect_count/len(particle_list)
        anatru.at[nhit,'best_detect_intersect_nhits_avg'] = intersect_count
        anatru.at[nhit,'good_tracks_counts'] = good_tracks_count
        if good_tracks_count > 0:
            anatru.at[nhit,'good_tracks_intersect_nhits_avg'] = good_tracks_intersect/good_tracks_count
    
    return score, anatru, good_hits

def precision(truth, submission,min_hits):
    tru = truth[['hit_id', 'particle_id', 'weight']].merge(submission, how='left', on='hit_id')
    tru['count_both'] = tru.groupby(['track_id', 'particle_id']).hit_id.transform('count')    
    tru['count_particle'] = tru.groupby(['particle_id']).hit_id.transform('count')
    tru['count_track'] = tru.groupby(['track_id']).hit_id.transform('count')
    #print('Analyzing predictions...')
    predicted_list  = tru[(tru.count_track>=min_hits)].track_id.unique()
    good_tracks_count = 0
    ghost_tracks_count = 0
    fp_weights = 0
    tp_weights = 0
    for t in predicted_list:
        nhit_track = tru[tru.track_id==t].count_track.iloc[0]
        nhit_intersect = tru[tru.track_id==t].count_both.max()
        corresponding_particle = tru.loc[tru[tru.track_id==t].count_both.idxmax()].particle_id
        leng_corresponding_particle = len(tru[tru.particle_id == corresponding_particle])
        if (nhit_intersect >= nhit_track/2) and (nhit_intersect >= leng_corresponding_particle/2): #if the predicted track is good
            good_tracks_count += 1
            tp_weights += tru[(tru.track_id==t)&(tru.particle_id==corresponding_particle)].weight.sum()
            fp_weights += tru[(tru.track_id==t)&(tru.particle_id!=corresponding_particle)].weight.sum()
        else: # if the predicted track is bad
                ghost_tracks_count += 1
                fp_weights += tru[(tru.track_id==t)].weight.sum()
    all_weights = tru[(tru.count_track>=min_hits)].weight.sum()
    precision = tp_weights/all_weights*100
    print('Precision: ',precision,', good tracks:', good_tracks_count,', total tracks:',len(predicted_list),
           ', loss:', fp_weights, ', reco:', tp_weights, 'reco/loss', tp_weights/fp_weights)
    return precision


class Clusterer(object):
    def __init__(self, event_id):                        
        self.abc = []
        self.event_id = event_id
          
    def initialize(self,dfhits):
        self.cluster = range(len(dfhits))
        
    def Hough_clustering(self,dfh,coef,epsilon,min_samples=1,n_loop=180,verbose=False): # [phi_coef,phi_coef,zdivrt_coef,zdivr_coef,xdivr_coef,ydivr_coef]
        merged_cluster = self.cluster
        mm = 1
        stepii = 0.000005
        count_ii = 0
        adaptive_eps_coefficient = 1
        for ii in np.arange(0, n_loop*stepii, stepii):
            count_ii += 1
            for jj in range(2):
                mm = mm*(-1)
                eps_new = epsilon + count_ii*adaptive_eps_coefficient*10**(-5)
                dfh['a1'] = dfh['a0'].values - np.nan_to_num(np.arccos(mm*ii*dfh['rt'].values))
                dfh['sina1'] = np.sin(dfh['a1'].values)
                dfh['cosa1'] = np.cos(dfh['a1'].values)
                ss = StandardScaler()
                dfs = ss.fit_transform(dfh[['sina1','cosa1','zdivrt','zdivr','xdivr','ydivr']].values) 
                #dfs = scale_ignore_nan(dfh[['sina1','cosa1','zdivrt','zdivr','xdivr','ydivr']])
                dfs = np.multiply(dfs, coef)
                new_cluster=DBSCAN(eps=eps_new,min_samples=min_samples,metric='euclidean',n_jobs=8).fit(dfs).labels_
                merged_cluster = merge(merged_cluster, new_cluster)
                if verbose == True:
                    sub = create_one_event_submission(0, hits, merged_cluster)
                    good_hits = extract_good_hits(truth, sub)
                    score_1 = fast_score(good_hits)
                    print('2r0_inverse:', ii*mm ,'. Score:', score_1)
                    #clear_output(wait=True)
        self.cluster = merged_cluster
        houghhouse = "./clustering/extend-baseline/hough-" + str(self.event_id) + ".npy"
        

def create_one_event_submission(event_id, hits, labels):
    sub_data = np.column_stack(([event_id]*len(hits), hits.hit_id.values, labels))
    submission = pd.DataFrame(data=sub_data, columns=["event_id", "hit_id", "track_id"]).astype(int)
    return submission  

def preprocess_hits(h,dz):
    h['z'] =  h['z'].values + dz
    h['r'] = np.sqrt(h['x'].values**2+h['y'].values**2+h['z'].values**2)
    h['rt'] = np.sqrt(h['x'].values**2+h['y'].values**2)
    h['a0'] = np.arctan2(h['y'].values,h['x'].values)
    h['zdivrt'] = h['z'].values/h['rt'].values
    h['zdivr'] = h['z'].values/h['r'].values
    h['xdivr'] = h['x'].values / h['r'].values
    h['ydivr'] = h['y'].values / h['r'].values
    return h

In [13]:
def extend(submission,hits):

    df = submission.merge(hits,  on=['hit_id'], how='left')
    df = df.assign(d = np.sqrt( df.x**2 + df.y**2 + df.z**2 ))
    df = df.assign(r = np.sqrt( df.x**2 + df.y**2))
    df = df.assign(arctan2 = np.arctan2(df.z, df.r))

    for angle in range(-180,180,1):

        print ('\r %f'%angle, end='',flush=True)
        #df1 = df.loc[(df.arctan2>(angle-0.5)/180*np.pi) & (df.arctan2<(angle+0.5)/180*np.pi)]
        df1 = df.loc[(df.arctan2>(angle-1.0)/180*np.pi) & (df.arctan2<(angle+1.0)/180*np.pi)]

        min_num_neighbours = len(df1)
        if min_num_neighbours<4: continue

        hit_ids = df1.hit_id.values
        x,y,z = df1.as_matrix(columns=['x', 'y', 'z']).T
        r  = (x**2 + y**2)**0.5
        r  = r/1000
        a  = np.arctan2(y,x)
        tree = KDTree(np.column_stack([a,r]), metric='euclidean')

        track_ids = list(df1.track_id.unique())
        num_track_ids = len(track_ids)
        min_length=3
 
        for i in range(num_track_ids):
            p = track_ids[i]
            if p==0: continue

            idx = np.where(df1.track_id==p)[0]
            if len(idx)<min_length: continue

            if angle>0:
                idx = idx[np.argsort( z[idx])]
            else:
                idx = idx[np.argsort(-z[idx])]


            ## start and end points  ##
            idx0,idx1 = idx[0],idx[-1]
            a0 = a[idx0]
            a1 = a[idx1]
            r0 = r[idx0]
            r1 = r[idx1]

            da0 = a[idx[1]] - a[idx[0]]  #direction
            dr0 = r[idx[1]] - r[idx[0]]
            direction0 = np.arctan2(dr0,da0) 

            da1 = a[idx[-1]] - a[idx[-2]]
            dr1 = r[idx[-1]] - r[idx[-2]]
            direction1 = np.arctan2(dr1,da1) 


            ## extend start point
            ns = tree.query([[a0,r0]], k=min(30,min_num_neighbours), return_distance=False)
            ns = np.concatenate(ns)
            direction = np.arctan2(r0-r[ns],a0-a[ns])
            ns = ns[(r0-r[ns]>0.01) &(np.fabs(direction-direction0)<0.04)]

            for n in ns:
                df.loc[ df.hit_id==hit_ids[n],'track_id' ] = p 

            ## extend end point
            ns = tree.query([[a1,r1]], k=min(30,min_num_neighbours), return_distance=False)
            ns = np.concatenate(ns)

            direction = np.arctan2(r[ns]-r1,a[ns]-a1)
            ns = ns[(r[ns]-r1>0.01) &(np.fabs(direction-direction1)<0.04)] 

            for n in ns:
                df.loc[ df.hit_id==hit_ids[n],'track_id' ] = p
    #print ('\r')
    df = df[['event_id', 'hit_id', 'track_id']]
    return df

In [3]:
def hough_threading(hits_df, event_id):
    hough_threading_init = time.time()
    c = [1.5,1.5,0.73,0.17,0.027,0.027]
    coef = c
    epsilon = 0.0048
    min_samples = 1
    n_loop = 300
    verbose = False
    model = Clusterer(event_id)
    model.initialize(hits_df)
    hits_with_dz = preprocess_hits(hits_df, 0)
    model.Hough_clustering(hits_with_dz, coef, epsilon, min_samples, n_loop, verbose)
    hough_threading_end = time.time()
    print(hough_threading_end - hough_threading_init)

In [4]:
parameters = []
all_events = load_dataset('./data/test_data/', parts=['hits'])
for event in all_events:
    event_id = event[0]
    hits_df = event[1]
    parameters.append([hits_df, event_id])

In [None]:
len(parameters)

In [5]:
start = time.time()
pool = Pool(32)
pool.starmap(hough_threading, parameters)
print(time.time() - start)

















986.7997071743011














































1260.616060256958


















1329.4545073509216










1356.5343644618988


















































1537.0700688362122








1555.1023416519165




























1657.440277814865




1660.3574244976044






1669.5106592178345






1679.6348581314087
















1747.1648530960083




1754.1663217544556








1779.7211291790009






1797.4622647762299




1809.8521256446838




1818.2673902511597




1824.6280913352966






1838.9997630119324




1840.4039707183838




1846.7892925739288








1879.7735879421234






1905.8593618869781








1937.3797585964203








1990.4825475215912






2008.8721151351929




2012.5935740470886








2062.8372225761414








2149.9442200660706






2192.7795300483704




2219.717801809311






2258.067381620407






1318.563306093216




2352.532006263733






1274.784366607666






























1633.9171283245087












1238.1232030391693














































1612.3438506126404
































1848.4600613117218






1620.5775275230408








1631.0182781219482








1923.5724408626556












1823.8435859680176










1546.4085762500763






1803.6067893505096






2241.8156402111053




1748.2427937984467














1849.3136341571808






















1981.5188586711884






2084.957196712494






1827.6615567207336






1878.4229307174683




1950.12717795372






2146.084846973419




1939.6788947582245














1887.6916899681091




2071.867477416992




2078.5774595737457






1938.5876667499542






1785.7254178524017








1512.3061447143555








1904.5687220096588






2118.490692615509




1850.1594038009644




1971.3540964126587




1965.384656906128




2125.3799965381622


























































1917.0476899147034


































1557.1143634319305




1600.3483815193176










2043.3619585037231




1687.9099752902985




1543.3851449489594
1557.9785962104797












1596.184332370758










1937.3084907531738




1804.794883966446








1509.9836640357971


















1898.4441809654236










1561.6816022396088










1882.0743725299835












1849.7792432308197










1749.7216687202454




1778.3671760559082














1674.963809967041






1818.8787441253662




1566.49120926857












1559.5666739940643














1789.954659461975




1844.5001320838928




1971.0245778560638








1662.8659973144531








2022.3120625019073








1713.2023634910583




2203.395210504532








1844.783302307129




2125.2051632404327










1948.255733013153












2201.0867133140564




1221.637820482254


























































1861.890210390091


























1781.8721091747284


















1756.8030631542206




1443.6037232875824






1737.6901876926422












1899.5364847183228














1939.9521260261536






1933.319628238678
















1316.9506578445435




1499.7707834243774




1749.677966117859










1732.4318239688873




2050.964215040207








2062.7243225574493




1523.2218458652496




1918.8485000133514


















1488.005376815796






1566.4554374217987




1704.444857597351






1327.4207866191864




1424.2599875926971








1301.6733417510986




1810.3113486766815






1468.9276568889618




1298.043287038803




1785.2048873901367
7301.189469575882


In [7]:
hits, _,_,_ = load_single_train_event(1050) 

c = [1.5,1.5,0.73,0.17,0.027,0.027] #[phi_coef,phi_coef,zdivrt_coef,zdivr_coef,xdivr_coef,ydivr_coef]
new_c = [0.5455112739039641,1.2952788836731726,0.020644802534625598,0.010872422391297676,0.9692458635889661,0.7413379980415846]
new_c2 = [1.5092941586156554,1.0412878684054139,0.009882745498471901,0.006927533300489619,0.602558504132395,0.740027703894827]
new_ep = 0.004567159537430591
new_ep2 = 0.00558191221045077
orig_ep = 0.0048
min_samples = 1

start = time.time()
model = Clusterer(1050)
model.initialize(hits)

hits_with_dz = preprocess_hits(hits, 0)
model.Hough_clustering(hits_with_dz,coef=c,epsilon=orig_ep,min_samples=min_samples,n_loop=300,verbose=False)
submission = create_one_event_submission(1050, hits, model.cluster)
extended_sub = extend(submission, hits)

print('time for 1 event: ',time.time()-start)









 179.0000000time for 1 event:  524.1420583724976


In [15]:
hits,_,_,truth = load_single_train_event(1050)

for i in range(8):
    extend_init = time.time()
    extended_sub = extend(extended_sub,hits)
    print('time for 1 extension: ',time.time()-extend_init)
score_event(truth,extended_sub)

 179.0000000time for 1 extension:  54.490899324417114
 179.0000000time for 1 extension:  46.205568075180054
 179.0000000time for 1 extension:  31.99036145210266
 179.0000000time for 1 extension:  30.769341468811035
 179.0000000time for 1 extension:  27.19871473312378
 179.0000000time for 1 extension:  20.250677824020386
 179.0000000time for 1 extension:  17.798975229263306
 179.0000000time for 1 extension:  17.177982330322266


0.5662458032909261

In [16]:
hits,_,_,truth = load_single_train_event(1050)
single_extend = extend(submission,hits)
score_event(truth,single_extend)

 179.0000000

0.5633987110551497

In [10]:
hits,_,_,truth = load_single_train_event(1050) 
score_event(truth,submission)

0.5376634851921283

In [None]:
# 512.9022784233093   prev runtime

In [13]:
4500/60

75.0

In [1]:
7301.189469575882/3600

2.0281081859933003