In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

from trackml.dataset import load_event, load_dataset
from trackml.score import score_event

In [2]:
# Directory that holds the training event files; adjust it to match your local data layout.
path_to_train = "./input/train_5"

## Working on one event

In [5]:
# File-name prefix of the single event inspected below. Its files must exist
# under path_to_train, because the load cell joins the two to locate the event.
# NOTE(review): this comment used to say "Train_1", but path_to_train points at
# ./input/train_5 - confirm which directory actually holds this event.
event_prefix = "event000008180"

### Read and Look

In [6]:
# Locate the event inside the training directory, then read its four tables.
event_path = os.path.join(path_to_train, event_prefix)
hits, cells, particles, truth = load_event(event_path)

In [7]:
# Preview the first rows of the hits table (position columns plus volume/layer/module ids).
hits.head()

Unnamed: 0,hit_id,x,y,z,volume_id,layer_id,module_id
0,1,-64.895897,-9.09471,-1502.5,7,2,1
1,2,-62.720901,-5.21209,-1502.5,7,2,1
2,3,-76.973999,-6.18337,-1502.5,7,2,1
3,4,-83.017097,-1.09181,-1502.5,7,2,1
4,5,-73.857498,-10.6526,-1502.5,7,2,1


Identify tracks
In this example, track pattern recognition is treated as a clustering problem: each cluster corresponds to one track. First, we preprocess the hit coordinates to highlight the fact that a track is (approximately) an arc of a helix.

$$r_1=\sqrt{x^2+y^2+z^2}$$
 
$$x_2=x/r_1$$
 
$$y_2=y/r_1$$
 
$$r_2=\sqrt{x^2+y^2}$$
 
$$z_2=z/r_2$$
 
Then, DBSCAN is used to recognize hit clusters.

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from collections import Sequence

"""
updated - added self.rz_scale
"""
class Clusterer(object):
    """Group detector hits into track candidates with DBSCAN.

    Every hit is mapped to three features, ``x / r3``, ``y / r3`` and
    ``z / rt``, where ``r3 = sqrt(x^2 + y^2 + z^2)`` and
    ``rt = sqrt(x^2 + y^2)``. The features are standardized, the third one is
    multiplied by ``rz_scale``, and DBSCAN assigns one cluster label per hit.
    """

    def __init__(self, eps):
        """Store the clustering radius.

        Parameters
        ----------
        eps : float
            Forwarded to ``DBSCAN(eps=...)`` by :meth:`predict`.
        """
        self.eps = eps
        # Weight applied to the standardized z/rt feature; every predict()
        # call overwrites it with that call's ``rz_scale`` argument.
        self.rz_scale = 1

    def _preprocess(self, hits):
        """Build the standardized, re-weighted feature matrix for clustering.

        The input frame is only read: the features are assembled in a local
        array, so no ``x2``/``y2``/``z2`` columns are added to the caller's
        ``hits`` object.

        Parameters
        ----------
        hits : pandas.DataFrame
            Must expose ``x``, ``y`` and ``z`` columns.

        Returns
        -------
        numpy.ndarray
            Array with one row per hit and three columns; the third column is
            scaled by ``self.rz_scale`` after standardization.
        """
        x = hits.x.values
        y = hits.y.values
        z = hits.z.values

        # Distance from the origin (3-D) and from the z axis (transverse).
        r3 = np.sqrt(x**2 + y**2 + z**2)
        rt = np.sqrt(x**2 + y**2)

        features = np.column_stack((x / r3, y / r3, z / rt))

        X = StandardScaler().fit_transform(features)
        X[:, 2] = X[:, 2] * self.rz_scale

        return X

    def predict(self, hits, rz_scale=1):
        """Cluster the hits and return one label per hit.

        Parameters
        ----------
        hits : pandas.DataFrame
            Hits of one event, with ``x``, ``y`` and ``z`` columns.
        rz_scale : float, optional
            Weight of the standardized z/rt feature. It is stored on the
            instance, replacing any previous value.

        Returns
        -------
        numpy.ndarray
            Cluster labels from ``DBSCAN.fit_predict``, in the row order of
            ``hits``.
        """
        self.rz_scale = rz_scale
        X = self._preprocess(hits)

        # min_samples=1 makes every hit a core point, so DBSCAN emits no
        # noise label (-1); isolated hits become single-hit clusters.
        cl = DBSCAN(eps=self.eps, min_samples=1, algorithm='kd_tree')
        labels = cl.fit_predict(X)

        return labels

In [13]:
"""
updated - added to predict: rz_scale=1.5
"""
# Cluster the hits of the single loaded event: DBSCAN radius 0.008, with the
# z/r feature weighted by 1.5.
model = Clusterer(eps=0.008)
labels = model.predict(hits=hits, rz_scale=1.5)

In [14]:
# Show the cluster label assigned to each hit (long arrays are printed abbreviated).
print(labels)

[    0     1     2 ... 72039 72040 72041]


## Score
Compute the score for this event. The dummy submission output of create_one_event_submission is created only to be the second parameter of the score_event function. It should not be confused with a well-behaved submission for the test set.

In [17]:
def create_one_event_submission(event_id, hits, labels):
    """Assemble the per-hit track assignment table for one event.

    Parameters
    ----------
    event_id : int
        Identifier repeated on every row of the result.
    hits : pandas.DataFrame
        Hits of the event; only its ``hit_id`` column is read.
    labels : array-like
        One track label per row of ``hits``, in the same order.

    Returns
    -------
    pandas.DataFrame
        Integer columns ``event_id``, ``hit_id`` and ``track_id``, one row
        per hit.
    """
    column_names = ["event_id", "hit_id", "track_id"]

    hit_ids = hits.hit_id.values
    event_ids = [event_id] * len(hits)

    # Lay the three sequences side by side as the columns of one 2-D array.
    table = np.column_stack((event_ids, hit_ids, labels))

    return pd.DataFrame(data=table, columns=column_names).astype(int)

In [18]:
# Score the single-event clustering. The event id 0 is a placeholder: this
# frame only serves as the second argument of score_event.
submission = create_one_event_submission(event_id=0, hits=hits, labels=labels)
score = score_event(truth, submission)

In [19]:
# Report the score obtained for the single event.
print("Your score: ", score)

Your score:  0.19775939010865784


## Recognize tracks in all events of a dataset

Recognize tracks in all events of a dataset
In this example, the dataset is the training set found at `path_to_train`; the cell below restricts it to the first 5 events.
This is a simple loop over the one-event actions: because DBSCAN is used, there is no actual training.

This may take a very long time. To run on only a subset, use

``` python
load_dataset(path_to_train, skip=1000, nevents=5)
```

This skips the first 1000 events and loads the next 5.

In [21]:
dataset_submissions = []
dataset_scores = []

# One clusterer serves every event: predict() re-sets rz_scale and builds a
# fresh DBSCAN on each call, so nothing carries over between events and the
# instance does not need to be rebuilt inside the loop.
model = Clusterer(eps=0.008)

# NOTE: the loop rebinds hits, cells, particles, truth, labels and score, so
# after this cell those names refer to the last event processed here, not to
# the single event loaded earlier in the notebook.
for event_id, hits, cells, particles, truth in load_dataset(path_to_train, skip=0, nevents=5):

    # Track pattern recognition
    labels = model.predict(hits, rz_scale=1.5)

    # Prepare submission for an event
    one_submission = create_one_event_submission(event_id, hits, labels)
    dataset_submissions.append(one_submission)

    # Score for the event
    score = score_event(truth, one_submission)
    dataset_scores.append(score)

    print("Score for event %d: %.3f" % (event_id, score))

print('Mean score: %.3f' % (np.mean(dataset_scores)))

Score for event 8180: 0.198
Score for event 8181: 0.212
Score for event 8182: 0.222
Score for event 8183: 0.210
Score for event 8184: 0.205
Mean score: 0.209


## Create a submission

In [22]:
path_to_test = "./input/test"
test_dataset_submissions = []

# Set to False to skip clustering the test set and writing submission.csv.
create_submission = True

if create_submission:
    # One clusterer serves every event: predict() re-sets rz_scale and builds
    # a fresh DBSCAN on each call, so the instance is created once, not per event.
    model = Clusterer(eps=0.008)

    # NOTE(review): `cells` is requested and unpacked but never read in this
    # loop - presumably parts=['hits'] would be enough; confirm before changing,
    # since it also changes which names the loop binds.
    for event_id, hits, cells in load_dataset(path_to_test, parts=['hits', 'cells']):

        # Track pattern recognition
        labels = model.predict(hits, rz_scale=1.5)

        # Prepare submission for an event
        one_submission = create_one_event_submission(event_id, hits, labels)
        test_dataset_submissions.append(one_submission)

        print('Event ID: ', event_id)

    # Create submission file: stack the per-event frames and write them without the index.
    # NOTE(review): `submussion` is a misspelling of "submission"; the name is
    # kept so that any later cell reading it keeps working.
    submussion = pd.concat(test_dataset_submissions, axis=0)
    submussion.to_csv('submission.csv', index=False)

Event ID:  0
Event ID:  1
Event ID:  2
Event ID:  3
Event ID:  4
Event ID:  5
Event ID:  6
Event ID:  7
Event ID:  8
Event ID:  9
Event ID:  10
Event ID:  11
Event ID:  12
Event ID:  13
Event ID:  14
Event ID:  15
Event ID:  16
Event ID:  17
Event ID:  18
Event ID:  19
Event ID:  20
Event ID:  21
Event ID:  22
Event ID:  23
Event ID:  24
Event ID:  25
Event ID:  26
Event ID:  27
Event ID:  28
Event ID:  29
Event ID:  30
Event ID:  31
Event ID:  32
Event ID:  33
Event ID:  34
Event ID:  35
Event ID:  36
Event ID:  37
Event ID:  38
Event ID:  39
Event ID:  40
Event ID:  41
Event ID:  42
Event ID:  43
Event ID:  44
Event ID:  45
Event ID:  46
Event ID:  47
Event ID:  48
Event ID:  49
Event ID:  50
Event ID:  51
Event ID:  52
Event ID:  53
Event ID:  54
Event ID:  55
Event ID:  56
Event ID:  57
Event ID:  58
Event ID:  59
Event ID:  60
Event ID:  61
Event ID:  62
Event ID:  63
Event ID:  64
Event ID:  65
Event ID:  66
Event ID:  67
Event ID:  68
Event ID:  69
Event ID:  70
Event ID:  71
Ev