In [1]:
from trackml.dataset import load_event, load_dataset
from trackml.score import score_event

from sklearn.cluster.dbscan_ import dbscan
from sklearn.preprocessing import StandardScaler

import numpy as np
import pandas as pd
import timeit
import multiprocessing
from multiprocessing import Pool

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
def find_labels(params):
    """Run one DBSCAN clustering pass over the hits for a single ``dz`` value.

    Parameters
    ----------
    params : tuple
        ``(hits, dz)``. ``hits`` must provide the columns ``'phi'``, ``'z'``
        and ``'zr'``; ``dz`` is a scalar multiplied with ``|z|`` and added to
        the angle before clustering.

    Returns
    -------
    numpy.ndarray
        The per-hit labels returned by ``dbscan``, shifted up by one.
    """
    hits, dz = params
    phi = hits['phi'].values
    z = hits['z'].values
    z_over_r = hits['zr'].values

    # sign(z) * z == |z|, so the angle is shifted by dz * |z|.
    # NOTE(review): presumably this compensates for track curvature along z
    # -- confirm against the physics the notebook author had in mind.
    shifted_phi = phi + np.sign(z) * dz * z

    # Encode the angle as (cos, sin) and pair it with z/r, then standardise
    # every feature column before clustering.
    features = np.column_stack([np.cos(shifted_phi), np.sin(shifted_phi), z_over_r])
    scaled = StandardScaler().fit_transform(features)

    # dbscan returns a pair; only its second element (the labels) is used.
    cluster_ids = dbscan(scaled, eps=0.0045, min_samples=1, n_jobs=4)[1]
    return cluster_ids + 1

In [3]:
def add_count(l):
    """Pair a label array with the size of the cluster each element belongs to.

    Parameters
    ----------
    l : numpy.ndarray
        One label per element.

    Returns
    -------
    tuple
        ``(l, c)`` where ``l`` is the input array itself and ``c[i]`` is the
        number of elements sharing label ``l[i]``. The size is forced to 0 for
        elements labelled 0 and for clusters with more than 20 members.
    """
    _, inverse, frequencies = np.unique(l, return_inverse=True, return_counts=True)
    # Broadcast each label's frequency back onto the elements carrying it.
    sizes = frequencies[inverse]
    # Label 0 and over-sized clusters (> 20 members) are given size 0.
    sizes[(l == 0) | (sizes > 20)] = 0
    return l, sizes

In [4]:
def do_dbscan_predict(hits):
    """Label every hit by merging the results of many DBSCAN passes.

    ``find_labels`` is run once per ``dz`` value in a 4-process pool and the
    resulting labelings are merged hit by hit: a hit adopts the label from a
    later pass whenever that pass puts it in a larger cluster than the one it
    currently belongs to (cluster sizes as computed by ``add_count``).

    Parameters
    ----------
    hits : pandas.DataFrame
        Must provide the columns ``'x'``, ``'y'`` and ``'z'``. The columns
        ``'r'``, ``'zr'`` and ``'phi'`` are added to this frame in place.

    Returns
    -------
    numpy.ndarray
        One merged label per row of ``hits``.
    """
    start_time = timeit.default_timer()

    # Derived coordinates; written onto the caller's frame (existing side effect).
    hits['r'] = np.sqrt(hits['x'] ** 2 + hits['y'] ** 2)
    hits['zr'] = hits['z'] / hits['r']
    hits['phi'] = np.arctan2(hits['y'], hits['x'])

    # Workers only read these three columns, so ship just them instead of
    # pickling the full hits frame for every task.
    features = hits[['phi', 'z', 'zr']]

    # The order of this list matters: passes are merged in this order below.
    params = []
    for i in range(0, 20):
        dz = i * 0.00001
        params.append((features, dz))
        if i > 0:
            params.append((features, -dz))
    # Kernel time is limited. So we skip some angles.
    for i in range(20, 60):
        dz = i * 0.00001
        if i % 2 == 0:
            params.append((features, dz))
        else:
            params.append((features, -dz))

    # The context manager releases the worker processes even when a task
    # raises; pool.map has collected every result before the block exits.
    with Pool(processes=4) as pool:
        labels_for_all_steps = pool.map(find_labels, params)
    results = [add_count(l) for l in labels_for_all_steps]

    labels, counts = results[0]
    for i in range(1, len(results)):
        l, c = results[i]
        # Hits that sit in a strictly larger cluster in this pass.
        idx = np.where((c - counts > 0))[0]
        # Offset by the current maximum so new ids never collide with old ones.
        labels[idx] = l[idx] + labels.max()
        counts[idx] = c[idx]

    print('time spent:', timeit.default_timer() - start_time)

    return labels

In [5]:
def create_one_event_submission(event_id, hits, labels):
    """Build the three-column submission table for one event.

    Parameters
    ----------
    event_id : int
        Value repeated in the ``event_id`` column for every row.
    hits : array-like
        Values for the ``hit_id`` column (one per row).
    labels : array-like
        Values for the ``track_id`` column, same length as ``hits``.

    Returns
    -------
    pandas.DataFrame
        Integer-typed frame with columns ``event_id``, ``hit_id``, ``track_id``.
    """
    column_names = ["event_id", "hit_id", "track_id"]
    event_column = [event_id] * len(hits)
    table = np.column_stack((event_column, hits, labels))
    frame = pd.DataFrame(data=table, columns=column_names)
    return frame.astype(int)

In [8]:
def run_dbscan(data_dir='./input/train_1', event_ids=('000001000',)):
    """Score the DBSCAN prediction on events with known truth and print the mean.

    Parameters
    ----------
    data_dir : str, optional
        Directory holding the event files; each event is loaded from the
        prefix ``data_dir + '/event' + event_id``.
    event_ids : iterable of str, optional
        Event id strings to evaluate. Defaults to the single event the
        notebook originally hard-coded.

    Raises
    ------
    ValueError
        If ``event_ids`` is empty (there would be nothing to average).
    """
    event_ids = list(event_ids)
    if not event_ids:
        raise ValueError('event_ids must contain at least one event id')

    # Named so the builtin ``sum`` is not shadowed.
    n_events = 0
    sum_score = 0
    for i, event_id in enumerate(event_ids):
        hits, cells, particles, truth = load_event(data_dir + '/event' + event_id)
        labels = do_dbscan_predict(hits)
        # A placeholder event id of 0 is used for the scoring table.
        # NOTE(review): presumably score_event ignores event_id -- confirm.
        submission = create_one_event_submission(0, hits['hit_id'].values, labels)
        score = score_event(truth, submission)
        print('[%2d] score : %0.8f' % (i, score))
        sum_score += score
        n_events += 1

    print('--------------------------------------')
    print(sum_score / n_events)

In [9]:
if __name__ == '__main__':
    # First report the score on the event(s) with known truth.
    print('estimate score by known events')
    run_dbscan()

    path_to_test = "./input/test"
    test_dataset_submissions = []

    create_submission = True  # True for submission
    if create_submission:
        print('process test events')
        for event_id, hits in load_dataset(path_to_test, parts=['hits']):
            print('Event ID: ', event_id)
            labels = do_dbscan_predict(hits)
            # Prepare submission for an event
            one_submission = create_one_event_submission(event_id, hits['hit_id'].values, labels)
            test_dataset_submissions.append(one_submission)

        # Create submission file
        submission = pd.concat(test_dataset_submissions, axis=0)
        # index=False: the concatenated frame carries a repeated per-event
        # row index that must not be written as an extra CSV column.
        submission.to_csv('submission_final.csv', index=False)

estimate score by known events
time spent: 46.299787277006544
[ 0] score : 0.40945479
--------------------------------------
0.40945479215932146
process test events
Event ID:  0
time spent: 40.742337266972754
Event ID:  1
time spent: 40.91561421396909
Event ID:  2
time spent: 35.10932774405228
Event ID:  3
time spent: 38.44468945002882
Event ID:  4
time spent: 42.547842786007095
Event ID:  5
time spent: 35.273274248000234
Event ID:  6
time spent: 36.36160474701319
Event ID:  7
time spent: 42.175417025981005
Event ID:  8
time spent: 44.08930184098426
Event ID:  9
time spent: 40.06115578103345
Event ID:  10
time spent: 42.621015796030406
Event ID:  11
time spent: 36.87787714099977
Event ID:  12
time spent: 40.7131386710098
Event ID:  13
time spent: 43.89875564101385
Event ID:  14
time spent: 36.98866864602314
Event ID:  15
time spent: 30.275080284976866
Event ID:  16
time spent: 30.573880069016013
Event ID:  17
time spent: 47.91493080701912
Event ID:  18
time spent: 34.379522314004134
Ev