In [1]:
!nvidia-smi

Tue Nov  7 15:20:01 2017       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.90                 Driver Version: 384.90                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 1080    Off  | 00000000:04:00.0 Off |                  N/A |
| 27%   28C    P8     9W / 180W |    197MiB /  8114MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce GTX 1080    Off  | 00000000:05:00.0 Off |                  N/A |
| 27%   29C    P8     9W / 180W |     10MiB /  8114MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce GTX 1080    Off  | 00000000:06:00.0 Off |                  N/A |
| 27%   

In [2]:
################################################################################
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import importlib
import numpy as np
import pandas as pd
import json
import sys
from IPython.display import display
from typing import Iterable, List, Sequence
from tracker import extractor
pd.options.display.max_columns = 0

In [3]:
################################################################################
def flatten(
        iterable: Iterable
        ) -> Iterable:
    """
    Return a flattened iterable from a nested iterable.
    [[3, [4, 5]], 6, [[[7]]]] -> [3, 4, 5, 6, 7]
    
    Arguments:
        iterable
            Some Iterable object that may or may not contain more Iterable
            objects.
            
    Yields the elements from each Iterable or single element from iterable.
    """
    for item in iterable:
        if  isinstance(item, Iterable) and not isinstance(item, (str, bytes)):
            yield from flatten(item)
        else:
            yield item


def parse_file(
        filename         : str,
        initial_event_id : int      = 0,
        ignored_columns  : Sequence = (),
        ) -> Iterable[Iterable]:
    """ 
    Parses the lines in the file from 'filename' to a format
    appropriate for passing into a pandas DataFrame constructor.
        
    Arguments:
        filename
            The name of the file to parse.
        initial_event_id
            The event ID that the first event extracted has.
            Event ID is incremented by 1 after finishing the
            parsing of an event.
        ignored_columns
            The list of indices of the columns to delete from each line.
    
    For each line, yields a generator that yield the elements from the line.
    """
    event_id = initial_event_id
    with open(filename) as file:
        lines = filter(None, (line.strip() for line in file))
        for line in lines:
            if line.startswith("#"):
                event_id += 1
            else:
                j_list = json.loads("[{0}]".format(line))
                for column in ignored_columns:
                    del j_list[column]
                j_list.append(event_id)
                yield flatten(j_list)

In [4]:
################################################################################
clusters_columns = [
    "hit_nr", "barcode", "volume_id", "layer_id",
    "lx",     "ly",      "elx",       "ely",    
    "gx",     "gy",      "gz",        "phi",    
    "theta",  "ephi",    "etheta",    "event_id",
]
particles_columns = [
    "barcode",  "vertex_x", "vertex_y",
    "vertex_z", "momentum", "theta",
    "phi",      "charge",   "event_id",
]

# Extraction from a single file.

In [5]:
################################################################################
number = 1
base_directory     = "/inputdata/ACTS/prod_mu200_pt500_2017_07_26"
clusters_filename  = base_directory + "/clusters_{0}.csv".format(number)
particles_filename = base_directory + "/particles_{0}.csv".format(number)

In [6]:
################################################################################
# Example of how the file looks.
with open(clusters_filename, "r") as file:
    for _ in range(5):
        print(file.readline())



### Format hit_nr, barcode, volume_id, layer_id , [lx, ly], [elx, ely], [ gx, gy, gz ],[[fch0,fch1,fchdata]], [phi,theta], [ephi,etheta]

1, 752102443440930816, 7, 2, [ 0.625, 28.0611], [ 0.15, 0.15],[ -166.363, -23.8377,-1498], [[ 180, 1138, 0.144688], [ 180, 1139, 0.0828125]], [ -3.00299, 1.5708], [ 0.1, 0.01]

2, 58548238264827904, 7, 2, [ -2.575, 14.1469], [ 0.15, 0.15],[ -146.974, -46.545,-1498], [[ 116, 891, 0.268438]], [ -2.8349, 3.03904], [ 0.1, 0.01]

3, 477381629220749312, 7, 2, [ -1.225, 13.6928], [ 0.15, 0.15],[ -146.115, -47.6808,-1498], [[ 143, 882, 0.0209375], [ 143, 883, 0.268438]], [ -2.81819, 1.5708], [ 0.1, 0.01]



In [7]:
################################################################################
# Quick note: I am ignoring the 7th column within each line.
# This column contains [[fch0, fch1, fchdata]]. 
clusters_lines = parse_file(clusters_filename, ignored_columns=[7])
clusters_frame = pd.DataFrame(clusters_lines, columns=clusters_columns)
clusters_frame.head(6)

Unnamed: 0,hit_nr,barcode,volume_id,layer_id,lx,ly,elx,ely,gx,gy,gz,phi,theta,ephi,etheta,event_id
0,1,752102443440930816,7,2,0.625,28.0611,0.15,0.15,-166.363,-23.8377,-1498.0,-3.00299,1.5708,0.1,0.01,1
1,2,58548238264827904,7,2,-2.575,14.1469,0.15,0.15,-146.974,-46.545,-1498.0,-2.8349,3.03904,0.1,0.01,1
2,3,477381629220749312,7,2,-1.225,13.6928,0.15,0.15,-146.115,-47.6808,-1498.0,-2.81819,1.5708,0.1,0.01,1
3,4,414331990351806464,7,2,-2.675,12.5208,0.15,0.15,-134.547,-71.8804,-1498.0,-2.63339,1.5708,0.1,0.01,1
4,5,756604874837196800,7,2,0.375,19.2616,0.15,0.15,-131.178,-90.3139,-1502.0,-2.54099,1.5708,0.1,0.01,1
5,6,405336335969157120,7,2,-1.07079,33.3094,0.15,0.15,-134.017,-109.894,-1498.0,-1.72193,1.5708,0.1,0.01,1


In [8]:
################################################################################
clusters_frame = clusters_frame.rename(columns={
    "hit_nr": "hit_number", "barcode": "cluster_id", "lx": "local_x",
    "ly": "local_y", "gx": "x", "gy": "y", "gz": "z", "elx": "local_x_error",
    "ely": "local_y_error", "ephi": "phi_error", "etheta": "theta_error"
})
clusters_frame.head(6)

Unnamed: 0,hit_number,cluster_id,volume_id,layer_id,local_x,local_y,local_x_error,local_y_error,x,y,z,phi,theta,phi_error,theta_error,event_id
0,1,752102443440930816,7,2,0.625,28.0611,0.15,0.15,-166.363,-23.8377,-1498.0,-3.00299,1.5708,0.1,0.01,1
1,2,58548238264827904,7,2,-2.575,14.1469,0.15,0.15,-146.974,-46.545,-1498.0,-2.8349,3.03904,0.1,0.01,1
2,3,477381629220749312,7,2,-1.225,13.6928,0.15,0.15,-146.115,-47.6808,-1498.0,-2.81819,1.5708,0.1,0.01,1
3,4,414331990351806464,7,2,-2.675,12.5208,0.15,0.15,-134.547,-71.8804,-1498.0,-2.63339,1.5708,0.1,0.01,1
4,5,756604874837196800,7,2,0.375,19.2616,0.15,0.15,-131.178,-90.3139,-1502.0,-2.54099,1.5708,0.1,0.01,1
5,6,405336335969157120,7,2,-1.07079,33.3094,0.15,0.15,-134.017,-109.894,-1498.0,-1.72193,1.5708,0.1,0.01,1


In [9]:
################################################################################
events = clusters_frame.groupby(["event_id"])["cluster_id"]
tracks_per_event = [len(event.unique()) for (_, event) in events]

print("Hits:", len(clusters_frame))
print("Events:", len(clusters_frame["event_id"].unique()))
print("Min Tracks:", min(tracks_per_event))
print("Max Tracks:", max(tracks_per_event))

Hits: 3029037
Events: 100
Min Tracks: 1732
Max Tracks: 3252


In [10]:
################################################################################
particles_lines = parse_file(particles_filename)
particles_frame = pd.DataFrame(particles_lines, columns=particles_columns)
particles_frame.head(6)

Unnamed: 0,barcode,vertex_x,vertex_y,vertex_z,momentum,theta,phi,charge,event_id
0,68719476736,0.022137,0.006326,23.7941,1.07009,2.30747,1.25342,-1,1
1,824633720832,0.022137,0.006326,23.7941,0.651495,1.76023,2.61439,-1,1
2,4503874505277440,-0.010566,-0.011323,-28.9527,0.819525,1.79714,-2.2263,1,1
3,4503943224754176,-0.010566,-0.011323,-28.9527,1.71963,2.54697,-0.783333,-1,1
4,4504080663707648,-0.010566,-0.011323,-28.9527,1.16629,2.62599,-0.390961,-1,1
5,4504561700044800,-0.010566,-0.011323,-28.9527,3.50855,2.79646,2.29379,-1,1


In [11]:
################################################################################
particles_frame = particles_frame.rename(columns={
    "barcode": "cluster_id", "theta": "momentum_theta", "phi": "momentum_phi"
})
particles_frame.head(6)

Unnamed: 0,cluster_id,vertex_x,vertex_y,vertex_z,momentum,momentum_theta,momentum_phi,charge,event_id
0,68719476736,0.022137,0.006326,23.7941,1.07009,2.30747,1.25342,-1,1
1,824633720832,0.022137,0.006326,23.7941,0.651495,1.76023,2.61439,-1,1
2,4503874505277440,-0.010566,-0.011323,-28.9527,0.819525,1.79714,-2.2263,1,1
3,4503943224754176,-0.010566,-0.011323,-28.9527,1.71963,2.54697,-0.783333,-1,1
4,4504080663707648,-0.010566,-0.011323,-28.9527,1.16629,2.62599,-0.390961,-1,1
5,4504561700044800,-0.010566,-0.011323,-28.9527,3.50855,2.79646,2.29379,-1,1


In [12]:
################################################################################
events = particles_frame.groupby(["event_id"])["cluster_id"]
tracks_per_event = [len(event.unique()) for (_, event) in events]

print("Hits:", len(particles_frame))
print("Events:", len(particles_frame["event_id"].unique()))
print("Min Tracks:", min(tracks_per_event))
print("Max Tracks:", max(tracks_per_event))

Hits: 248386
Events: 100
Min Tracks: 1732
Max Tracks: 3252


In [13]:
################################################################################
combined_frame = clusters_frame.merge(
    particles_frame,
    on=["event_id", "cluster_id"])
combined_frame.head(6)

Unnamed: 0,hit_number,cluster_id,volume_id,layer_id,local_x,local_y,local_x_error,local_y_error,x,y,z,phi,theta,phi_error,theta_error,event_id,vertex_x,vertex_y,vertex_z,momentum,momentum_theta,momentum_phi,charge
0,1,752102443440930816,7,2,0.625,28.0611,0.15,0.15,-166.363,-23.8377,-1498.0,-3.00299,1.5708,0.1,0.01,1,0.004919,-0.018601,-65.7336,16.704,3.02477,-3.02523,-1
1,61,752102443440930816,7,4,-0.425,-18.8719,0.15,0.15,-120.025,-16.3137,-1098.0,-3.0065,3.03172,0.1,0.01,1,0.004919,-0.018601,-65.7336,16.704,3.02477,-3.02523,-1
2,298,752102443440930816,7,6,-0.625,-35.2969,0.15,0.15,-103.785,-13.8464,-958.0,-3.00896,3.03273,0.1,0.01,1,0.004919,-0.018601,-65.7336,16.704,3.02477,-3.02523,-1
3,597,752102443440930816,7,8,4.575,22.6969,0.15,0.15,-88.0645,-11.52,-822.5,-3.01152,3.03403,0.1,0.01,1,0.004919,-0.018601,-65.7336,16.704,3.02477,-3.02523,-1
4,937,752102443440930816,7,10,3.725,8.63437,0.15,0.15,-74.112,-9.56926,-702.5,-3.01318,3.03562,0.1,0.01,1,0.004919,-0.018601,-65.7336,16.704,3.02477,-3.02523,-1
5,954,752102443440930816,7,10,-7.975,7.77453,0.15,0.15,-73.5979,-9.46767,-698.0,-2.90597,1.5708,0.1,0.01,1,0.004919,-0.018601,-65.7336,16.704,3.02477,-3.02523,-1


In [14]:
################################################################################
x     = combined_frame["x"]
y     = combined_frame["y"]
frame = combined_frame.assign(r=np.sqrt(x**2 + y**2), phi=np.arctan2(y, x))
frame.head(6)

Unnamed: 0,hit_number,cluster_id,volume_id,layer_id,local_x,local_y,local_x_error,local_y_error,x,y,z,phi,theta,phi_error,theta_error,event_id,vertex_x,vertex_y,vertex_z,momentum,momentum_theta,momentum_phi,charge,r
0,1,752102443440930816,7,2,0.625,28.0611,0.15,0.15,-166.363,-23.8377,-1498.0,-2.999274,1.5708,0.1,0.01,1,0.004919,-0.018601,-65.7336,16.704,3.02477,-3.02523,-1,168.062142
1,61,752102443440930816,7,4,-0.425,-18.8719,0.15,0.15,-120.025,-16.3137,-1098.0,-3.006501,3.03172,0.1,0.01,1,0.004919,-0.018601,-65.7336,16.704,3.02477,-3.02523,-1,121.128599
2,298,752102443440930816,7,6,-0.625,-35.2969,0.15,0.15,-103.785,-13.8464,-958.0,-3.008962,3.03273,0.1,0.01,1,0.004919,-0.018601,-65.7336,16.704,3.02477,-3.02523,-1,104.70458
3,597,752102443440930816,7,8,4.575,22.6969,0.15,0.15,-88.0645,-11.52,-822.5,-3.011518,3.03403,0.1,0.01,1,0.004919,-0.018601,-65.7336,16.704,3.02477,-3.02523,-1,88.814788
4,937,752102443440930816,7,10,3.725,8.63437,0.15,0.15,-74.112,-9.56926,-702.5,-3.013184,3.03562,0.1,0.01,1,0.004919,-0.018601,-65.7336,16.704,3.02477,-3.02523,-1,74.727233
5,954,752102443440930816,7,10,-7.975,7.77453,0.15,0.15,-73.5979,-9.46767,-698.0,-3.013655,1.5708,0.1,0.01,1,0.004919,-0.018601,-65.7336,16.704,3.02477,-3.02523,-1,74.204364


In [15]:
################################################################################
# Eliminate duplicate hits that were caused by imperfections in the detector.
frame = frame.sort_values("r")
frame = frame.drop_duplicates(["event_id", "cluster_id", "layer_id"])

In [16]:
################################################################################
# Specify the volume to use. Each volume is a different detector configuration.
frame = frame[frame["volume_id"] == 8]

In [17]:
################################################################################
# Set radiuses to be the same for each layer.
for layer_id in frame["layer_id"].unique():
    ind = frame["layer_id"] == layer_id
    rs  = frame[ind]["r"]
    med = rs.median()
    frame.loc[ind, "r"] = med

In [18]:
################################################################################
# Clean up the frame a bit.
frame = frame.sort_values(["event_id", "cluster_id", "r"])
frame.head(6)

Unnamed: 0,hit_number,cluster_id,volume_id,layer_id,local_x,local_y,local_x_error,local_y_error,x,y,z,phi,theta,phi_error,theta_error,event_id,vertex_x,vertex_y,vertex_z,momentum,momentum_theta,momentum_phi,charge,r
11000,3018,68719476736,8,2,-3.625,28.9125,0.15,0.15,9.91091,31.4262,-6.0875,1.265298,2.79998,0.1,0.01,1,0.022137,0.006326,23.7941,1.07009,2.30747,1.25342,-1,31.959887
11001,5601,68719476736,8,4,0.306232,-6.69028,0.15,0.15,20.6705,69.1861,-41.6903,1.280471,2.96565,0.1,0.01,1,0.022137,0.006326,23.7941,1.07009,2.30747,1.25342,-1,71.842084
11002,7429,68719476736,8,6,3.275,24.1594,0.15,0.15,31.2039,111.037,-80.8406,1.29684,3.14159,0.1,0.01,1,0.022137,0.006326,23.7941,1.07009,2.30747,1.25342,-1,115.820577
11003,9198,68719476736,8,8,-1.87219,-27.3375,0.15,0.15,42.965,166.569,-132.338,1.318358,2.96565,0.1,0.01,1,0.022137,0.006326,23.7941,1.07009,2.30747,1.25342,-1,171.805748
17628,3754,824633720832,8,2,2.425,-17.2458,0.15,0.15,-27.4519,15.4548,17.7542,2.628841,3.14159,0.1,0.01,1,0.022137,0.006326,23.7941,0.651495,1.76023,2.61439,-1,31.959887
17629,6006,824633720832,8,4,-0.220786,-24.9796,0.15,0.15,-63.202,34.0301,10.0204,2.647673,2.41493,0.1,0.01,1,0.022137,0.006326,23.7941,0.651495,1.76023,2.61439,-1,71.842084


In [19]:
################################################################################
events = frame.groupby(["event_id"])["cluster_id"]
tracks_per_event = [len(event.unique()) for (_, event) in events]

print("Hits:", len(frame))
print("Events:", len(frame["event_id"].unique()))
print("Min Tracks:", min(tracks_per_event))
print("Max Tracks:", max(tracks_per_event))

Hits: 795033
Events: 100
Min Tracks: 1731
Max Tracks: 3249


# Extraction from multiple files.

In [20]:
################################################################################
def get_clusters_frame(
        clusters_filename : str,
        initial_event_id  : int,
        ) -> pd.DataFrame:
    clusters_lines = parse_file(
        clusters_filename,
        ignored_columns=[7],
        initial_event_id=initial_event_id)
    clusters_frame = pd.DataFrame(clusters_lines, columns=clusters_columns)
    clusters_frame = clusters_frame.rename(columns={
        "hit_nr": "hit_number", "barcode": "cluster_id", "lx": "local_x",
        "ly": "local_y", "gx": "x", "gy": "y", "gz": "z", "elx":
        "local_x_error", "ely": "local_y_error", "ephi": "phi_error",
        "etheta": "theta_error"})
    return clusters_frame

def get_particles_frame(
        particles_filename : str,
        initial_event_id   : int,
        ) -> pd.DataFrame:
    particles_lines = parse_file(
        particles_filename, 
        initial_event_id=initial_event_id)
    particles_frame = pd.DataFrame(particles_lines, columns=particles_columns)
    particles_frame = particles_frame.rename(columns={
        "barcode": "cluster_id", "theta": "momentum_theta",
        "phi": "momentum_phi"})
    return particles_frame

def extract(
        clusters_filename  : str, 
        particles_filename : str,
        volume_id          : int = 8,
        initial_event_id   : int = 0,
        ) -> pd.DataFrame:
    """
    Everything in one function.
    Depending on the size of the file, this function could take a long
    time. Most of the time is spent parsing the csv files within the
    first 4 lines.
    """
    clusters  = get_clusters_frame(clusters_filename, initial_event_id)
    particles = get_particles_frame(particles_filename, initial_event_id)
    combined  = clusters.merge(particles, on=["event_id", "cluster_id"])
    volume    = combined[combined["volume_id"] == volume_id]
    x, y      = volume["x"], volume["y"]
    
    return (
        volume.assign(r=np.sqrt(x**2 + y**2))
              .assign(phi=np.arctan2(y, x))
              .sort_values(["event_id", "cluster_id", "r"])
              .drop_duplicates(["event_id", "cluster_id", "layer_id"])
    )

In [21]:
%%time
################################################################################
frames = []
initial_event_id = 0
base_directory = "/inputdata/ACTS/prod_mu10_pt1000_2017_07_29"
for i in range(1, 1 + 100):
    print("Extracting from file {0}. Initial Event ID is {1}".format(i, initial_event_id))
    try:
        clusters_filename  = base_directory + "/clusters_{0}.csv".format(i)
        particles_filename = base_directory + "/particles_{0}.csv".format(i)
        frame = extract(
            clusters_filename=clusters_filename, 
            particles_filename=particles_filename,
            initial_event_id=initial_event_id,)
        initial_event_id = frame["event_id"].max() + 1
        frames.append(frame)
    except FileNotFoundError as error:
        print(error)
print("All done. Concatenating frames.")
frame = pd.concat(frames)
for layer_id in frame["layer_id"].unique():
    ind = frame["layer_id"] == layer_id
    rs  = frame[ind]["r"]
    med = rs.median()
    frame.loc[ind, "r"] = med

Extracting from file 1. Initial Event ID is 0
Extracting from file 2. Initial Event ID is 1001
Extracting from file 3. Initial Event ID is 2002
Extracting from file 4. Initial Event ID is 3003
Extracting from file 5. Initial Event ID is 4004
Extracting from file 6. Initial Event ID is 5005
Extracting from file 7. Initial Event ID is 6006
Extracting from file 8. Initial Event ID is 7007
Extracting from file 9. Initial Event ID is 8008
Extracting from file 10. Initial Event ID is 9009
Extracting from file 11. Initial Event ID is 10010
Extracting from file 12. Initial Event ID is 11011
Extracting from file 13. Initial Event ID is 12012
Extracting from file 14. Initial Event ID is 13013
Extracting from file 15. Initial Event ID is 14014
Extracting from file 16. Initial Event ID is 15015
Extracting from file 17. Initial Event ID is 16016
Extracting from file 18. Initial Event ID is 17017
Extracting from file 19. Initial Event ID is 18018
Extracting from file 20. Initial Event ID is 19019
Ex

In [22]:
%%time
################################################################################
print("Number of Hits: {}".format(len(frame)))
print("Number of Events: {}".format(len(frame["event_id"].unique())))
tracks  = [value for (_, value) in frame.groupby(["event_id"])]
lengths = [len(value["cluster_id"].unique()) for value in tracks]
print("Min Number of Tracks: {}".format(min(lengths)))
print("Max Number of Tracks: {}".format(max(lengths)))

Number of Hits: 13929190
Number of Events: 99615
Min Number of Tracks: 1
Max Number of Tracks: 246
CPU times: user 27.8 s, sys: 16.9 s, total: 44.7 s
Wall time: 44.6 s


In [23]:
%%time
################################################################################
savepath = "data/sets/ACTS-MU10-PT1000-COMPLETE.gz"
frame.to_csv(savepath, compression="gzip")

CPU times: user 12min 50s, sys: 532 ms, total: 12min 50s
Wall time: 12min 50s


# Preparing the extracted frame.

In [24]:
%%time
################################################################################
max_tracks = 50
max_len    = lambda event: len(event["cluster_id"].unique()) <= max_tracks
filtered   = frame.groupby("event_id").filter(max_len)

CPU times: user 34.8 s, sys: 45 s, total: 1min 19s
Wall time: 1min 19s


In [28]:
print(len(filtered["event_id"].unique()))
print(len(frame["event_id"].unique()))

65407
99615


In [29]:
%%time
################################################################################
prepared = extractor.prepare_frame(
    frame    = filtered,
    n_tracks = max_tracks,
    n_rows   = 200,
    n_noise  = 0,
)
prepared = prepared.sort_values(["event_id", "cluster_id", "r"])

CPU times: user 12min 17s, sys: 29.5 s, total: 12min 46s
Wall time: 12min 46s


In [30]:
%%time
################################################################################
savepath = "data/sets/ACTS-MU10-PT1000-PREPARED.gz"
prepared.to_csv(savepath, compression="gzip")

CPU times: user 9min 55s, sys: 296 ms, total: 9min 55s
Wall time: 9min 55s
