In [11]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
import hdbscan
from scipy import stats
from sklearn.cluster import DBSCAN
%run utils.ipynb
from trackml.dataset import load_event, load_dataset
from trackml.score import score_event
import time
import multiprocessing
from scipy.stats import norm

In [34]:
def merge(cl1, cl2): # merge cluster 2 to cluster 1
    d = pd.DataFrame(data={'s1':cl1,'s2':cl2})
    d['N1'] = d.groupby('s1')['s1'].transform('count')
    d['N2'] = d.groupby('s2')['s2'].transform('count')
    maxs1 = d['s1'].max()
    cond = np.where((d['N2'].values>d['N1'].values) & (d['N2'].values<25)) # tìm vị trí hit với nhit của cluster mới > nhits cluster cũ
    s1 = d['s1'].values 
    s1[cond] = d['s2'].values[cond]+maxs1 # gán tất cả các hit đó thuộc về track mới (+maxs1 để tăng label cho track để nó khác ban đầu)
    return s1

def extract_good_hits(truth, submission):
    tru = truth[['hit_id', 'particle_id', 'weight']].merge(submission, how='left', on='hit_id')
    tru['count_both'] = tru.groupby(['track_id', 'particle_id']).hit_id.transform('count')    
    tru['count_particle'] = tru.groupby(['particle_id']).hit_id.transform('count')
    tru['count_track'] = tru.groupby(['track_id']).hit_id.transform('count')
    return tru[(tru.count_both > 0.5*tru.count_particle) & (tru.count_both > 0.5*tru.count_track)]

def fast_score(good_hits_df):
    return good_hits_df.weight.sum()


def analyze_truth_perspective(truth, submission):
    tru = truth[['hit_id', 'particle_id', 'weight']].merge(submission, how='left', on='hit_id')
    tru['count_both'] = tru.groupby(['track_id', 'particle_id']).hit_id.transform('count')    
    tru['count_particle'] = tru.groupby(['particle_id']).hit_id.transform('count')
    tru['count_track'] = tru.groupby(['track_id']).hit_id.transform('count')
    good_hits = tru[(tru.count_both > 0.5*tru.count_particle) & (tru.count_both > 0.5*tru.count_track)]
    score = good_hits.weight.sum()
    
    anatru = tru.particle_id.value_counts().value_counts().sort_index().to_frame().rename({'particle_id':'true_particle_counts'}, axis=1)
    #anatru['true_particle_ratio'] = anatru['true_particle_counts'].values*100/np.sum(anatru['true_particle_counts'])

    anatru['good_tracks_counts'] = np.zeros(len(anatru)).astype(int)
    anatru['good_tracks_intersect_nhits_avg'] = np.zeros(len(anatru))
    anatru['best_detect_intersect_nhits_avg'] = np.zeros(len(anatru))
    for nhit in tqdm(range(4,20)):
        particle_list  = tru[(tru.count_particle==nhit)].particle_id.unique()
        intersect_count = 0
        good_tracks_count = 0
        good_tracks_intersect = 0
        for p in particle_list:
            nhit_intersect = tru[tru.particle_id==p].count_both.max()
            intersect_count += nhit_intersect
            corresponding_track = tru.loc[tru[tru.particle_id==p].count_both.idxmax()].track_id
            leng_corresponding_track = len(tru[tru.track_id == corresponding_track])
            
            if (nhit_intersect >= nhit/2) and (nhit_intersect >= leng_corresponding_track/2):
                good_tracks_count += 1
                good_tracks_intersect += nhit_intersect
        intersect_count = intersect_count/len(particle_list)
        anatru.at[nhit,'best_detect_intersect_nhits_avg'] = intersect_count
        anatru.at[nhit,'good_tracks_counts'] = good_tracks_count
        if good_tracks_count > 0:
            anatru.at[nhit,'good_tracks_intersect_nhits_avg'] = good_tracks_intersect/good_tracks_count
    
    return score, anatru, good_hits

def precision(truth, submission,min_hits):
    tru = truth[['hit_id', 'particle_id', 'weight']].merge(submission, how='left', on='hit_id')
    tru['count_both'] = tru.groupby(['track_id', 'particle_id']).hit_id.transform('count')    
    tru['count_particle'] = tru.groupby(['particle_id']).hit_id.transform('count')
    tru['count_track'] = tru.groupby(['track_id']).hit_id.transform('count')
    #print('Analyzing predictions...')
    predicted_list  = tru[(tru.count_track>=min_hits)].track_id.unique()
    good_tracks_count = 0
    ghost_tracks_count = 0
    fp_weights = 0
    tp_weights = 0
    for t in predicted_list:
        nhit_track = tru[tru.track_id==t].count_track.iloc[0]
        nhit_intersect = tru[tru.track_id==t].count_both.max()
        corresponding_particle = tru.loc[tru[tru.track_id==t].count_both.idxmax()].particle_id
        leng_corresponding_particle = len(tru[tru.particle_id == corresponding_particle])
        if (nhit_intersect >= nhit_track/2) and (nhit_intersect >= leng_corresponding_particle/2): #if the predicted track is good
            good_tracks_count += 1
            tp_weights += tru[(tru.track_id==t)&(tru.particle_id==corresponding_particle)].weight.sum()
            fp_weights += tru[(tru.track_id==t)&(tru.particle_id!=corresponding_particle)].weight.sum()
        else: # if the predicted track is bad
                ghost_tracks_count += 1
                fp_weights += tru[(tru.track_id==t)].weight.sum()
    all_weights = tru[(tru.count_track>=min_hits)].weight.sum()
    precision = tp_weights/all_weights*100
    print('Precision: ',precision,', good tracks:', good_tracks_count,', total tracks:',len(predicted_list),
           ', loss:', fp_weights, ', reco:', tp_weights, 'reco/loss', tp_weights/fp_weights)
    return precision


class Clusterer(object):
    def __init__(self, event_id):                        
        self.abc = []
        self.event_id = event_id
          
    def initialize(self,dfhits):
        self.cluster = range(len(dfhits))
        
    def Hough_clustering(self,dfh,coef,epsilon,min_samples=1,n_loop=300,verbose=False): # [phi_coef,phi_coef,zdivrt_coef,zdivr_coef,xdivr_coef,ydivr_coef]
        merged_cluster = self.cluster
        mm = 1
        stepii = 0.000005
        count_ii = 0
        adaptive_eps_coefficient = 1
        for ii in np.arange(0, n_loop*stepii, stepii):
            count_ii += 1
            for jj in range(2):
                mm = mm*(-1)
                eps_new = epsilon + count_ii*adaptive_eps_coefficient*10**(-5)
                dfh['a1'] = dfh['a0'].values - np.nan_to_num(np.arccos(mm*ii*dfh['rt'].values))
                dfh['sina1'] = np.sin(dfh['a1'].values)
                dfh['cosa1'] = np.cos(dfh['a1'].values)
                ss = StandardScaler()
                dfs = ss.fit_transform(dfh[['sina1','cosa1','zdivrt','zdivr','xdivr','ydivr']].values) 
                #dfs = scale_ignore_nan(dfh[['sina1','cosa1','zdivrt','zdivr','xdivr','ydivr']])
                dfs = np.multiply(dfs, coef)
                new_cluster=DBSCAN(eps=eps_new,min_samples=min_samples,metric='euclidean',n_jobs=8).fit(dfs).labels_
                merged_cluster = merge(merged_cluster, new_cluster)
                if verbose == True:
                    sub = create_one_event_submission(0, hits, merged_cluster)
                    good_hits = extract_good_hits(truth, sub)
                    score_1 = fast_score(good_hits)
                    print('2r0_inverse:', ii*mm ,'. Score:', score_1)
                    #clear_output(wait=True)
        self.cluster = merged_cluster
        
        #houghhouse = "./clustering/hough-" + str(self.event_id) + ".npy"
        #np.save(houghhouse, self.cluster)

def create_one_event_submission(event_id, hits, labels):
    sub_data = np.column_stack(([event_id]*len(hits), hits.hit_id.values, labels))
    submission = pd.DataFrame(data=sub_data, columns=["event_id", "hit_id", "track_id"]).astype(int)
    return submission  

def preprocess_hits(h,dz):
    h['z'] =  h['z'].values + dz
    h['r'] = np.sqrt(h['x'].values**2+h['y'].values**2+h['z'].values**2)
    h['rt'] = np.sqrt(h['x'].values**2+h['y'].values**2)
    h['a0'] = np.arctan2(h['y'].values,h['x'].values)
    h['zdivrt'] = h['z'].values/h['rt'].values
    h['zdivr'] = h['z'].values/h['r'].values
    h['xdivr'] = h['x'].values / h['r'].values
    h['ydivr'] = h['y'].values / h['r'].values
    return h

In [37]:
def hough_threading(hits_df,truth_df, event_id, c ):
    hough_threading_init = time.time()
    #c = [1.5,1.5,0.73,0.17,0.027,0.027]
    coef = c
    epsilon = 0.0048
    min_samples = 1
    n_loop = 300
    verbose = False
    model = Clusterer(event_id)
    model.initialize(hits_df)
    hits_with_dz = preprocess_hits(hits_df, 0)
    model.Hough_clustering(hits_with_dz, coef, epsilon, min_samples, n_loop, verbose)
    hough_threading_end = time.time()
    print(hough_threading_end - hough_threading_init)
    score = score_event(truth_df,create_one_event_submission(1050,hits_df,model.cluster))
    
    
    return {'score':score, 'coefs':c} 

In [5]:
parameters = []
all_events = load_dataset('./data/test_data/', parts=['hits'])
for event in all_events:
    event_id = event[0]
    hits_df = event[1]
    parameters.append([hits_df, event_id])

In [59]:
param_dist = {'phi_coef':norm.rvs(loc=1.5,size = 32, scale = 0.01),'phi_coef2':norm.rvs(loc=1.5,size=32,scale=0.01),'zdivrt_coef':norm.rvs(loc=0.73,size=32,scale=0.005),'zdivr_coef':norm.rvs(loc=0.17,size=32,scale=0.005),'xdivr_coef':norm.rvs(loc=0.27,size=32,scale=0.001),'ydivr_coef':norm.rvs(loc=0.27,size=32,scale=0.001)}
all_params = []
hits, _,_,truth = load_single_train_event(1050) 
for i in range(32):
    coef_set = []
    coef_set.append(param_dist['phi_coef'][i])
    coef_set.append(param_dist['phi_coef2'][i])
    coef_set.append(param_dist['zdivrt_coef'][i])
    coef_set.append(param_dist['zdivr_coef'][i])
    coef_set.append(param_dist['xdivr_coef'][i])
    coef_set.append(param_dist['ydivr_coef'][i])
    all_params.append([hits,truth,1050,coef_set])
    
results = []    
def collect_result(result):
    results.append(result)
start = time.time()
pool = Pool(28)
results.append(pool.starmap(hough_threading,all_params))
print('elasped time: ',time.time()-start)
pool.close()




































































































































1486.3487775325775




1489.6299483776093




1491.220358133316




1496.0837080478668






1506.9933259487152








1526.2076251506805




1531.6949787139893




















1602.3526365756989










1623.4070491790771








1636.491013288498










1654.4063079357147








1668.7769162654877




1672.6548869609833






1680.490957736969






1689.8093407154083




1693.5769443511963
1694.2143540382385




1694.45383810997




1696.9567210674286




1699.2171747684479




1701.4453217983246




1702.8213036060333




1709.7530937194824




1712.8581488132477




1717.1541867256165




1720.6501343250275




1727.1822276115417




1736.1309492588043






























682.3068730831146




678.6245746612549




682.9569914340973




688.3760356903076
elasped time:  2184.1781935691833


In [60]:
best = 0
best_coefs = 0
for result in results[0]:
    #print (result)
    if result['score'] > best:
        best = result['score']
        best_coefs = result['coefs']
print (best)
print(best_coefs)

0.30349904948103695
[1.5042764492337748, 1.493936388422331, 0.719927837335959, 0.16354386345006816, 0.2704735476413385, 0.2691625247455191]


In [51]:
results[0][0]

{'coefs': [1.5652605901044556,
  1.5420238065159098,
  0.73969690599809,
  0.14729956712533576,
  0.28112948613839195,
  0.269478001337332],
 'score': 0.30035467204610883}

In [56]:
2268.731608390808/60

37.81219347318014

In [None]:
start = time.time()
pool = Pool(32)
pool.starmap(hough_threading, parameters)
print(time.time() - start)

In [None]:
hits, _,_,_ = load_single_train_event(1050) 

c = [1.5,1.5,0.73,0.17,0.027,0.027] #[phi_coef,phi_coef,zdivrt_coef,zdivr_coef,xdivr_coef,ydivr_coef]
new_c = [0.5455112739039641,1.2952788836731726,0.020644802534625598,0.010872422391297676,0.9692458635889661,0.7413379980415846]
new_c2 = [1.5092941586156554,1.0412878684054139,0.009882745498471901,0.006927533300489619,0.602558504132395,0.740027703894827]
new_ep = 0.004567159537430591
new_ep2 = 0.00558191221045077
orig_ep = 0.0048
min_samples = 1

start = time.time()
model = Clusterer(1050)
model.initialize(hits)

hits_with_dz = preprocess_hits(hits, 0)
model.Hough_clustering(hits_with_dz,coef=c,epsilon=orig_ep,min_samples=min_samples,n_loop=300,verbose=False)
submission = create_one_event_submission(1050, hits, model.cluster)

print('time for 1 event: ',time.time()-start)

In [None]:
hits,_,_,truth = load_single_train_event(1050)
score_event(truth,submission)

In [None]:
# 512.9022784233093   prev runtime

In [None]:
4500/60

In [None]:
7301.189469575882/3600