# Playing with the ACTS dataset

This notebook continues the work in ActsExample1. Here I add some utility code for loading and cleaning the ACTS data, and then demonstrate how to bin it into 3D images.

In [1]:
# System
from __future__ import print_function
import os

# Externals
import numpy as np
import matplotlib.pyplot as plt

# Locals
from drawing import draw_3d_event

# Magic
%matplotlib notebook

In [2]:
# Print numpy floats with 3 digits of precision to keep cell outputs compact
np.set_printoptions(precision=3)

## Utilities

In [3]:
def calc_eta(theta):
    """
    Compute eta = -ln(tan(theta / 2)).

    Accepts a single theta value or a flat numpy array of theta values
    and returns the result of the same numpy expression applied to it.
    """
    half_angle_tangent = np.tan(theta / 2.)
    return -1. * np.log(half_angle_tangent)

def calc_phi(rphi, r):
    """Return phi, obtained by dividing the r*phi coordinate by r"""
    phi = rphi / r
    return phi
# Rebind the name to an element-wise wrapper with object outputs, so that
# the division is applied to each inner array of an array of arrays.
calc_phi = np.vectorize(calc_phi, otypes='O')

def filter_samples(idx, *arrays):
    """
    Apply a filter index to a list of arrays.

    Parameters
    ----------
    idx : anything usable as a numpy index (e.g. a boolean mask)
        The same index is applied to every array.
    *arrays : indexable objects
        Each one is indexed as ``x[idx]``.

    Returns
    -------
    list
        The filtered arrays, in the order given. A concrete list is returned
        (rather than a bare ``map``) so that the result can be indexed,
        measured with ``len`` and iterated more than once on Python 3, where
        ``map`` is a lazy one-shot iterator.
    """
    return [x[idx] for x in arrays]

def filter_objects(idx, *arrays):
    """
    Apply array of filter indices to some object arrays.
    Each input array should be an array of arrays (dtype='O').

    The index ``idx`` is applied to every inner array, and the selected
    pieces of each input are stacked into one regular numpy array.

    Returns a list with one stacked array per input, in the order given.
    A concrete list is returned (rather than a bare ``map``) so the result
    can be indexed, measured with ``len`` and iterated more than once on
    Python 3, where ``map`` is a lazy one-shot iterator.
    """
    # Element-wise selector: picks positions `idx` out of each inner array
    filt_func = np.vectorize(lambda x: x[idx], otypes='O')
    return [np.stack(filt_func(a)) for a in arrays]

In [4]:
class ActsData:
    """
    Bare attribute container.

    Defines nothing itself; the loading and cleaning functions attach
    numpy arrays to instances as plain attributes.
    """

def load_data(filenames):
    """
    Retrieve data from some input files.
    Returns a data object with attributes for each numpy array.

    Parameters
    ----------
    filenames : iterable of str
        Paths of .npy files. Their contents are concatenated in the
        order given.

    Returns
    -------
    ActsData
        Track-level truth attributes (true_theta, true_eta, true_phi,
        true_qop, true_pt) and hit-level measurement attributes
        (nstep, rphi, z, r, phi).

    Raises
    ------
    ValueError
        From np.concatenate when `filenames` is empty.
    """
    d = ActsData()
    # allow_pickle is spelled out because recent NumPy releases default it to
    # False and then refuse to load files holding pickled object arrays
    # (presumably the case here, given encoding='bytes' -- TODO confirm).
    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    f = np.concatenate([
        np.load(fname, encoding='bytes', allow_pickle=True)
        for fname in filenames])
    # Track level truth quantities
    d.true_theta = f['truth_Theta']
    d.true_eta = calc_eta(d.true_theta)
    d.true_phi = f['truth_Phi']
    d.true_qop = f['truth_QoverP']
    # NOTE(review): |1/(q/p)| looks like the momentum magnitude rather than
    # a transverse momentum -- confirm the intended meaning of `true_pt`.
    d.true_pt = np.abs(1/d.true_qop)
    # Detector hit measurements
    d.nstep = f['Filter_nSteps']
    d.rphi = f['Meas_RPHI']
    d.z = f['Meas_z']
    d.r = f['Cyl_R']
    # phi is recovered from the r*phi measurement, element-wise per track
    d.phi = calc_phi(d.rphi, d.r)
    return d

def clean_data(data, fix_phi=False):
    """
    Select barrel tracks and the main detector-layer hits.

    Tracks are kept when |true_eta| < 1. For the surviving tracks, nine
    fixed step indices are picked out of the per-track hit arrays, and a
    hit-level theta and eta are derived from r and z.

    When `fix_phi` is true, phi is additionally rescaled and shifted layer
    by layer so that each layer's values span -pi to pi.

    Returns a new data object; the attributes of `data` are only read.
    """
    in_barrel = np.abs(data.true_eta) < 1
    cleaned = ActsData()

    # Drop every track outside the barrel selection, first for the
    # track-level truth arrays and then for the hit-level arrays.
    track_level = ('true_theta', 'true_eta', 'true_phi', 'true_qop', 'true_pt')
    hit_level = ('nstep', 'rphi', 'z', 'r', 'phi')
    for names in (track_level, hit_level):
        selected = filter_samples(
            in_barrel, *[getattr(data, name) for name in names])
        for name, values in zip(names, selected):
            setattr(cleaned, name, values)

    # The layer hits are chosen by step index: the middle step of each
    # detector layer triplet, skipping the apparent "auxiliary" steps.
    # This relies on every track having exactly 31 steps, so it is fragile
    # and must be revisited if the data changes.
    assert np.all(cleaned.nstep == 31)
    # Earlier candidate selection: [1, 4, 9, 11, 14, 17, 20, 24, 27]
    layer_steps = np.array([2, 5, 8, 11, 15, 18, 21, 25, 28])
    cleaned.rphi, cleaned.z, cleaned.r, cleaned.phi = filter_objects(
        layer_steps, cleaned.rphi, cleaned.z, cleaned.r, cleaned.phi)

    # The current data shows artifacts in phi. This is an ad hoc
    # correction; the real fix belongs upstream.
    if fix_phi:
        for layer in range(cleaned.phi.shape[1]):
            column = cleaned.phi[:, layer]
            rescaled = column * np.pi * 2 / (column.max() - column.min())
            cleaned.phi[:, layer] = rescaled - rescaled.min() - np.pi

    # Hit-level theta from r and z; negative arctan results are shifted
    # by pi so that theta lies in (0, pi).
    theta = np.arctan(cleaned.r / cleaned.z)
    theta[theta < 0] += np.pi
    cleaned.theta = theta
    cleaned.eta = calc_eta(theta)

    return cleaned

In [5]:
def hist2d_r_phi(data, det_shape, phi_range=(-3.1416, 3.1416)):
    """
    Bin the phi of every track hit into per-track 2D detector images.

    The result has shape (number of tracks,) + det_shape, where
    det_shape[0] is the number of layers looped over and det_shape[1]
    the number of phi bins spanning `phi_range`.
    """
    n_tracks = len(data.true_pt)
    # Start from an all-zero image stack
    images = np.zeros((n_tracks,) + det_shape)
    for trk in range(n_tracks):
        for lay in range(det_shape[0]):
            # One hit per (track, layer): histogram that single phi value
            hit_phi = data.phi[trk, lay:lay+1]
            counts, _ = np.histogram(
                hit_phi, bins=det_shape[1], range=phi_range)
            images[trk, lay] = counts
    # Every track must have landed exactly one hit in every layer
    assert np.all(images.sum(axis=2) == 1.)
    return images

def hist3d_r_phi_z(data, det_shape, phi_range, z_range):
    """
    Takes the track hit data and bins it in 3D histogram detector images.

    Parameters
    ----------
    data : object with `true_pt`, `phi` and `z` attributes
        `phi` and `z` are indexed as [track, layer].
    det_shape : tuple (num_layers, num_phi_bins, num_z_bins)
        Shape of a single-track image. The bin counts are taken from this
        argument, not from notebook-level globals, so the function does not
        depend on which cells have been run.
    phi_range, z_range : (low, high)
        Histogram ranges for phi and z.

    Returns
    -------
    numpy array of shape (number of tracks,) + det_shape.

    Raises
    ------
    AssertionError
        If any track does not have exactly one hit in every layer, for
        example because a hit falls outside the given ranges.
    """
    num_tracks = len(data.true_pt)
    # Initial structure with zeros
    tracks = np.zeros((num_tracks,) + det_shape)
    # Loop over tracks and layers
    for itrk in range(num_tracks):
        for jlay in range(det_shape[0]):
            # Convert coordinates for this layer to a histogram
            tracks[itrk, jlay] = np.histogram2d(
                data.phi[itrk, jlay:jlay+1],
                data.z[itrk, jlay:jlay+1],
                bins=[det_shape[1], det_shape[2]],
                range=[phi_range, z_range])[0]
    # Check that all tracks have one hit in every layer
    assert np.all(tracks.sum(axis=3).sum(axis=2) == 1.)
    return tracks

def combine_events(events, num_combine):
    """
    Merge consecutive detector images in groups of `num_combine`.

    Each group of `num_combine` images along the first axis is summed into
    one image. Images left over after the last full group are dropped.
    """
    num_groups = int(events.shape[0]/num_combine)
    merged = [
        sum(events[num_combine * grp:num_combine * (grp + 1)])
        for grp in range(num_groups)
    ]
    return np.stack(merged)

## Load the data
Multiple input files

In [6]:
# Directory holding the input .npy files. Set the ACTS_DATA_DIR environment
# variable to point somewhere else; the original local path is the fallback.
input_dir = os.environ.get(
    'ACTS_DATA_DIR', '/Users/sfarrell/Atlas/TrackML/acts_dev/data')

In [7]:
# List the data directory. The line magic is written explicitly so the cell
# does not depend on automagic being enabled or on `ls` not being shadowed.
%ls $input_dir

KFTest_0000.npy    KFTest_0003.npy    KFTest_0006.npy    KFTest_0009.npy
KFTest_0001.npy    KFTest_0004.npy    KFTest_0007.npy    KFfatras_0000.npy
KFTest_0002.npy    KFTest_0005.npy    KFTest_0008.npy


In [8]:
# Collect the KFTest .npy files. os.listdir returns entries in arbitrary
# order, so sort the paths to make the concatenated track order reproducible.
input_files = sorted(
    os.path.join(input_dir, fname) for fname in os.listdir(input_dir)
    if fname.startswith('KFTest') and fname.endswith('.npy'))

In [9]:
# Read all input files into one raw data object, then apply the cleaning
# selection with the phi rescaling switched on
raw_data = load_data(input_files)
data = clean_data(raw_data, fix_phi=True)

# Report how many tracks survive the cleaning
num_raw_tracks, num_tracks = len(raw_data.true_theta), len(data.true_theta)
print('Number of raw tracks: %i, cleaned tracks: %i' % (num_raw_tracks, num_tracks))

Number of raw tracks: 100000, cleaned tracks: 26674


In [10]:
# Check the shape of the cleaned hit array: one row per track, one column
# per selected hit
data.phi.shape

(26674, 9)

## Inspect the data

In [11]:
# Plot the distribution of hits in three projections.
# Labels containing LaTeX are raw strings so that '\p' is not parsed as an
# (invalid) string escape sequence.
plt.figure(figsize=(9.5,3))

# R vs Z
plt.subplot(131)
plt.scatter(data.r.flatten(), data.z.flatten(), s=1, marker=',')
plt.xlabel('R [mm]')
plt.ylabel('Z [mm]')

# R vs R*phi
plt.subplot(132)
plt.scatter(data.r.flatten(), data.rphi.flatten(), s=1, marker=',')
plt.xlabel('R [mm]')
plt.ylabel(r'R*$\phi$ [mm]')

# R*phi vs Z
plt.subplot(133)
plt.scatter(data.rphi.flatten(), data.z.flatten(), s=1, marker=',')
plt.xlabel(r'R*$\phi$ [mm]')
plt.ylabel('Z [mm]')
plt.tight_layout()

<IPython.core.display.Javascript object>

In [12]:
# Plot the distribution of hits, this time with eta in the middle panel.
# Labels containing LaTeX are raw strings so that '\e' and '\p' are not
# parsed as (invalid) string escape sequences.
plt.figure(figsize=(9.5,3))

# R vs Z
plt.subplot(131)
plt.scatter(data.r.flatten(), data.z.flatten(), s=1, marker=',')
plt.xlabel('R [mm]')
plt.ylabel('Z [mm]')

# R vs eta
plt.subplot(132)
plt.scatter(data.r.flatten(), data.eta.flatten(), s=1, marker=',')
plt.xlabel('R [mm]')
plt.ylabel(r'$\eta$')

# R*phi vs Z
plt.subplot(133)
plt.scatter(data.rphi.flatten(), data.z.flatten(), s=1, marker=',')
plt.xlabel(r'R*$\phi$ [mm]')
plt.ylabel('Z [mm]')
plt.tight_layout()

<IPython.core.display.Javascript object>

In [13]:
# Draw some tracks in 2D: the first `num_draw` cleaned tracks, one line per
# track. The LaTeX label is a raw string so that '\p' is not parsed as an
# (invalid) string escape sequence.
plt.figure(figsize=(9,4))
num_draw = 10

# R vs Z
plt.subplot(121)
for i in range(num_draw):
    plt.plot(data.r[i], data.z[i], 'b.-')
plt.xlabel('R [mm]')
plt.ylabel('Z [mm]')

# R vs phi
plt.subplot(122)
for i in range(num_draw):
    plt.plot(data.r[i], data.phi[i], 'b.-')
plt.xlabel('R [mm]')
plt.ylabel(r'$\phi$')

plt.tight_layout()

<IPython.core.display.Javascript object>

With the ad hoc phi rescaling applied in `clean_data` (`fix_phi=True`), the phi distribution _seems_ well behaved.

In [14]:
# Draw some tracks in 2D with eta: the first `num_draw` cleaned tracks, one
# line per track. The LaTeX label is a raw string so that '\e' is not parsed
# as an (invalid) string escape sequence.
plt.figure(figsize=(9,4))
num_draw = 10

# R vs Z
plt.subplot(121)
for i in range(num_draw):
    plt.plot(data.r[i], data.z[i], 'b.-')
plt.xlabel('R [mm]')
plt.ylabel('Z [mm]')

# R vs eta
plt.subplot(122)
for i in range(num_draw):
    plt.plot(data.r[i], data.eta[i], 'b.-')
plt.xlabel('R [mm]')
plt.ylabel(r'$\eta$')

plt.tight_layout()

<IPython.core.display.Javascript object>

Unfortunately, it looks like the hit-level $\eta$ is messed up, too. Perhaps we should avoid using $\eta$ as calculated this way (from the hit $r$ and $z$ via $\theta$) for now.

## Discretizing the data

Now let's demonstrate how to bin the data into 3D "images" for ML models.

In [15]:
# Print the observed extremes of phi and z over all cleaned hits; these
# guide the choice of histogram ranges for the binning
print('Phi range: (%.5f, %.5f)' % (data.phi.min(), data.phi.max()))
print('Z range: (%.2f, %.2f)' % (data.z.min(), data.z.max()))

Phi range: (-3.14159, 3.14159)
Z range: (-1151.84, 1153.63)


In [16]:
# Binning config
# Number of layers: one per hit kept for each cleaned track
num_det_layers = 9
# Granularity of each layer image in phi and z
num_phi_bins = 16
num_z_bins = 16
# Histogram ranges; these must enclose every hit, otherwise the
# one-hit-per-layer assertion in the binning function fails
phi_range = (-3.1416, 3.1416)
z_range = (-1155, 1155)

# Shape of a single-track image: (layers, phi bins, z bins)
det_shape = (num_det_layers, num_phi_bins, num_z_bins)

In [17]:
# Bin the data into discrete detector images, one image of shape det_shape
# per cleaned track
tracks = hist3d_r_phi_z(data, det_shape, phi_range=phi_range, z_range=z_range)

In [18]:
# Now we have to try 3D visualizations: draw the first track image, passing
# it as the signal track as well. The LaTeX label is a raw string so that
# '\p' is not parsed as an (invalid) string escape sequence.
fig, _ = draw_3d_event(tracks[0], sig_track=tracks[0], ylabel=r'$\phi$ bin', zlabel='$z$ bin')

<IPython.core.display.Javascript object>

In [19]:
# Plot multiple tracks: sum the first ten single-track images into one
# combined event image
event = sum(tracks[0:10])

# The combined event keeps the single-image shape
event.shape

(9, 16, 16)

In [20]:
# Draw the combined event, passing the first track as the signal track.
# The LaTeX label is a raw string so that '\p' is not parsed as an (invalid)
# string escape sequence.
fig2, _ = draw_3d_event(event, sig_track=tracks[0], ylabel=r'$\phi$ bin', zlabel='$z$ bin')

<IPython.core.display.Javascript object>