# Track candidate tree search with LSTMs

In [1]:
# Select a GPU first
import os
#os.environ['CUDA_VISIBLE_DEVICES'] = '6'
cuda = False

In [2]:
from __future__ import print_function

# System imports
import ast
import multiprocessing as mp
from functools import partial
from timeit import default_timer as timer

# Data libraries
import numpy as np
import pandas as pd

# Torch imports
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

# Visualization
import matplotlib.pyplot as plt

# Local imports
from data import load_data_events

# Magic
%matplotlib notebook

## Utilities

In [3]:
def process_hits_data(df, copy_keys=['evtid', 'barcode', 'volid', 'layid']):
    """Split columns and calculate some derived variables"""
    x = df.gpos.apply(lambda pos: pos[0])
    y = df.gpos.apply(lambda pos: pos[1])
    z = df.gpos.apply(lambda pos: pos[2])
    r = np.sqrt(x**2 + y**2)
    phi = np.arctan2(y, x)
    return df[copy_keys].assign(z=z.astype(np.float32), r=r.astype(np.float32),
                                phi=phi.astype(np.float32))

def read_worker(hits_file):
    hits_columns = ['hitid', 'barcode', 'volid', 'layid', 'lpos',
                    'lerr', 'gpos', 'chans', 'dir', 'direrr']
    return process_hits_data(load_data_events(hits_file, columns=hits_columns))

def process_files(hits_files, num_workers):
    """Load and process a set of hits files with MP"""
    pool = mp.Pool(processes=num_workers)
    hits = pool.map(read_worker, hits_files)
    pool.close()
    pool.join()
    # Fix the evtid to be consecutive
    for i in range(1, len(hits)):
        hits[i].evtid += hits[i-1].evtid.iloc[-1] + 1
    return pd.concat(hits, ignore_index=True)

In [4]:
def calc_eta(r, z):
    theta = np.arctan2(r, z)
    return -1. * np.log(np.tan(theta / 2.))

def calc_dphi(phi1, phi2):
    dphi = np.abs(phi1 - phi2)
    idx = dphi > np.pi
    dphi[idx] = 2*np.pi - dphi[idx]
    return dphi

def calc_dR(eta1, eta2, phi1, phi2):
    deta = np.abs(eta1 - eta2)
    dphi = calc_dphi(phi1, phi2)
    return np.sqrt(deta*deta + dphi*dphi)

In [5]:
# PyTorch memory allocations and conversions
np_to_torch_cpu = lambda x: Variable(torch.from_numpy(x))
np_to_torch_gpu = lambda x: Variable(torch.from_numpy(x)).cuda()
torch_zeros_cpu = lambda *size: Variable(torch.FloatTensor(*size).zero_())
torch_zeros_gpu = lambda *size: Variable(torch.cuda.FloatTensor(*size).zero_())

np_to_torch = np_to_torch_gpu if cuda else np_to_torch_cpu
torch_zeros = torch_zeros_gpu if cuda else torch_zeros_cpu

In [6]:
data_dir = '/global/cscratch1/sd/sfarrell/ACTS/prod_mu10_pt1000_2017_07_29'

In [7]:
n_files = 10

all_files = os.listdir(data_dir)
hits_files = sorted(f for f in all_files if f.startswith('clusters'))
hits_files = [os.path.join(data_dir, f) for f in hits_files[:n_files]]

In [8]:
n_workers = 5
hits = process_files(hits_files, num_workers=n_workers)

Loading /global/cscratch1/sd/sfarrell/ACTS/prod_mu10_pt1000_2017_07_29/clusters_100.csv
Loading /global/cscratch1/sd/sfarrell/ACTS/prod_mu10_pt1000_2017_07_29/clusters_12.csv
Loading /global/cscratch1/sd/sfarrell/ACTS/prod_mu10_pt1000_2017_07_29/clusters_11.csv
Loading /global/cscratch1/sd/sfarrell/ACTS/prod_mu10_pt1000_2017_07_29/clusters_10.csv
Loading /global/cscratch1/sd/sfarrell/ACTS/prod_mu10_pt1000_2017_07_29/clusters_1.csv
Loading /global/cscratch1/sd/sfarrell/ACTS/prod_mu10_pt1000_2017_07_29/clusters_13.csv
Loading /global/cscratch1/sd/sfarrell/ACTS/prod_mu10_pt1000_2017_07_29/clusters_14.csv
Loading /global/cscratch1/sd/sfarrell/ACTS/prod_mu10_pt1000_2017_07_29/clusters_15.csv
Loading /global/cscratch1/sd/sfarrell/ACTS/prod_mu10_pt1000_2017_07_29/clusters_16.csv
Loading /global/cscratch1/sd/sfarrell/ACTS/prod_mu10_pt1000_2017_07_29/clusters_17.csv


## Single-step classification

For the first iteration of a tree search algorithm, we will develop a model which reads a fixed number of steps (maybe 3) and classifies individual hits as being the next step of that track.

Input:
- A list of coordinates for the track hits
- A list of next-hit candidate coordinates

Target:
- Binary labels for the candidate coordinates

The track hits tensor will have shape (n_events, n_hits, n_features).
The candidate hits tensor will have the same shape, just with a different number of hits.

### Data preparation

In [10]:
def select_hits(hits):
    # Select the inner pixel layers
    hits = hits[hits.volid == 8]
    # Enumerate layers 0 to 3
    layer = (hits.layid / 2 - 1).astype(np.int8)
    # Select just the fields we need
    hits = hits[['evtid', 'barcode', 'phi', 'r', 'z']].assign(layer=layer)
    # Average all duplicate hits together
    return hits.groupby(['evtid', 'barcode', 'layer'], as_index=False).mean()

def select_signal_hits(hits):
    """Select tracks that hit every layer and have sufficient number of bkg hits"""
    # Require at least 5 event hits on the candidate layer
    hits = hits.groupby('evtid').filter(lambda x: (x.layer == 3).sum() >= 5)
    # Require at least 1 signal hit on every layer
    return (hits.groupby(['evtid', 'barcode'])
            .filter(lambda x: len(x) >= 4 and x.layer.unique().size == 4))

def extract_features(x, feature_names, scale_factors):
    return x[feature_names].values / scale_factors

In [9]:
%%time
sel_hits = select_hits(hits)
sig_hits = select_signal_hits(sel_hits)

evt_groups = sel_hits.groupby('evtid')
sig_groups = sig_hits.groupby(['evtid', 'barcode'])
sig_keys = sig_groups.groups.keys()

CPU times: user 1min 38s, sys: 723 ms, total: 1min 39s
Wall time: 1min 39s


In [15]:
feature_names = ['r', 'phi', 'z']
feature_scale_factors = [100., np.pi, 200.]
get_features = partial(extract_features, feature_names=feature_names,
                       scale_factors=feature_scale_factors)

n_samples = len(sig_keys)
n_trk_hits = 3
n_cand_hits = 5
n_features = len(feature_names)

In [11]:
# Create the data structures
trk_input = np.zeros((n_samples, n_trk_hits, n_features), dtype=np.float32)
hit_input = np.zeros((n_samples, n_cand_hits, n_features), dtype=np.float32)
hit_labels = np.zeros((n_samples, n_cand_hits), dtype=np.float32)

# No need to shuffle; true hit is always first
hit_labels[:,0] = 1

In [12]:
%%time

# Loop over samples
for i in xrange(n_samples):
    eid, pid = sig_keys[i]
    # Select the signal and background hits
    trk_hits = sig_groups.get_group((eid, pid))
    evt_hits = evt_groups.get_group(eid)
    bkg_hits = evt_hits[(evt_hits.layer == n_trk_hits) & (evt_hits.barcode != pid)]
    
    # Select closest background hits in eta-phi
    sig_cand_hit = trk_hits.iloc[-1]
    sig_cand_eta = calc_eta(sig_cand_hit.r, sig_cand_hit.z)
    bkg_hits_eta = calc_eta(bkg_hits.r, bkg_hits.z)
    bkg_dr = calc_dR(sig_cand_eta, bkg_hits_eta, sig_cand_hit.phi, bkg_hits.phi)
    bkg_cand_hits = bkg_hits.loc[bkg_dr.sort_values().head(n_cand_hits - 1).index]
    
    # Extract the features
    trk_features = get_features(trk_hits)
    bkg_features = get_features(bkg_cand_hits)

    # Fill the model inputs
    trk_input[i] = trk_features[:n_trk_hits]
    hit_input[i] = np.concatenate([trk_features[None, n_trk_hits], bkg_features])

CPU times: user 23min 10s, sys: 4.31 s, total: 23min 14s
Wall time: 23min 14s


### Model definition

In [15]:
def time_distributed(module, x):
    """Applies a module across both batch and 'time' dimensions"""
    s = x.size()
    y = module(x.view((-1,) + s[2:]))
    return y.view(s[:2] + y.size()[1:])

In [16]:
class TrackHitScorer(nn.Module):
    """
    A track-hit binary classifier model.
    
    This model embeds a sequence of hits using an LSTM into a state estimate.
    It then classifies candidate hits conditional on that state.
    """
    
    def __init__(self, input_dim, state_dim, hidden_dims):
        """Initialize the model"""
        super(TrackHitScorer, self).__init__()
        
        # Use an LSTM for the encoder
        self.encoder = nn.LSTM(input_dim, state_dim, batch_first=True)
        
        # Fully-connected classifier hidden layers
        clf_layers = [nn.Linear(input_dim + state_dim, hidden_dims[0]), nn.ReLU()]
        for i in range(len(hidden_dims) - 1):
            clf_layers += [nn.Linear(hidden_dims[i], hidden_dims[i+1]), nn.ReLU()]
        # Classifier final layer
        clf_layers += [nn.Linear(hidden_dims[-1], 1)]
        self.classifier = nn.Sequential(*clf_layers)

    def forward(self, inputs):
        trk_inputs, hit_inputs = inputs
        trk_input_size = trk_inputs.size()
        
        # Initialize the lstm hidden state
        var_args = [trk_input_size[0], self.encoder.hidden_size]
        #var_args = [self.encoder.num_layers, trk_input_size[0], self.encoder.hidden_size]
        h0, c0 = torch_zeros(*var_args), torch_zeros(*var_args)
        
        # Encode the track hits into a state estimate
        _, (h, c) = self.encoder(trk_inputs, (h0, c0))
        
        # Broadcast state from shape (batch, state) to (batch, hits, state).
        expanded_size = (h.size(1), hit_inputs.size(1), h.size(2))
        states = h.squeeze(0)[:, None, :].expand(*expanded_size)
        # Attach state estimate onto every hit candidate for classification.
        x = torch.cat([hit_inputs, states], dim=-1)
        
        # Apply classifier head to every candidate
        return time_distributed(self.classifier, x).squeeze(-1)

In [17]:
def predict_prob(model, inputs):
    return F.sigmoid(model(inputs))

def training_step(model, inputs, targets, loss_func, optimizer):
    model.zero_grad()
    outputs = model(inputs)
    loss = loss_func(outputs, targets)
    loss.backward()
    optimizer.step()
    return loss

def accuracy(probs, target, threshold=0.5):
    return ((probs.data.numpy() > threshold) == (target.data.numpy() > 0.5)).mean()

In [23]:
# Model config
state_size = 16
hidden_sizes = [16, 16, 16]

# Training config
batch_size = 64
n_epochs = 10
test_frac = 0.1

In [24]:
n_train = int(n_samples * (1 - test_frac))
n_batches = (n_train + batch_size - 1) // batch_size

# Split data into train and test sets
train_trk_input = np_to_torch(trk_input[:n_train])
train_hit_input = np_to_torch(hit_input[:n_train])
train_hit_labels = np_to_torch(hit_labels[:n_train])
test_trk_input = np_to_torch(trk_input[n_train:])
test_hit_input = np_to_torch(hit_input[n_train:])
test_hit_labels = np_to_torch(hit_labels[n_train:])

print('Training samples:', n_train)
print('Batches per epoch:', n_batches)
print('Test samples:', n_samples - n_train)

Training samples: 187698
Batches per epoch: 2933
Test samples: 20856


In [25]:
# Construct the model
model = TrackHitScorer(input_dim=n_features, state_dim=state_size, hidden_dims=hidden_sizes)
optimizer = torch.optim.Adam(model.parameters())
loss_func = nn.BCEWithLogitsLoss()

print(model)
print('Parameters:', sum(param.numel() for param in model.parameters()))

TrackHitScorer (
  (encoder): LSTM(3, 16, batch_first=True)
  (classifier): Sequential (
    (0): Linear (19 -> 16)
    (1): ReLU ()
    (2): Linear (16 -> 16)
    (3): ReLU ()
    (4): Linear (16 -> 16)
    (5): ReLU ()
    (6): Linear (16 -> 1)
  )
)
Parameters: 2225


In [26]:
batch_idxs = np.arange(0, n_train, batch_size)
train_losses, test_losses = [], []

for i in range(n_epochs):
    print('Epoch', i)
    start_time = timer()
    sum_loss = 0

    for j in batch_idxs:
        batch_trk_input = train_trk_input[j:j+batch_size]
        batch_hit_input = train_hit_input[j:j+batch_size]
        batch_inputs = [batch_trk_input, batch_hit_input]
        batch_target = train_hit_labels[j:j+batch_size]
        sum_loss += training_step(model, batch_inputs, batch_target, loss_func, optimizer)

    end_time = timer()
    avg_loss = sum_loss.cpu().data[0] / n_batches
    train_losses.append(avg_loss)
    print('  training loss %.3g' % avg_loss, 'time %gs' % (end_time - start_time))
    
    # Evaluate the model on the test set
    test_output = model([test_trk_input, test_hit_input])
    test_loss = loss_func(test_output, test_hit_labels).cpu().data[0]
    test_losses.append(test_loss)
    print('  test loss %.3g' % test_loss)

Epoch 0
  training loss 0.251 time 19.8075s
  test loss 0.116
Epoch 1
  training loss 0.0807 time 19.6965s
  test loss 0.059
Epoch 2
  training loss 0.0469 time 19.8309s
  test loss 0.0335
Epoch 3
  training loss 0.0403 time 20.6198s
  test loss 0.033
Epoch 4
  training loss 0.0383 time 20.0436s
  test loss 0.0301
Epoch 5
  training loss 0.0375 time 19.3751s
  test loss 0.0348
Epoch 6
  training loss 0.0331 time 19.8245s
  test loss 0.022
Epoch 7
  training loss 0.0323 time 19.8075s
  test loss 0.0228
Epoch 8
  training loss 0.0311 time 20.4844s
  test loss 0.0202
Epoch 9
  training loss 0.0287 time 19.2873s
  test loss 0.0211


In [27]:
# Evaluate accuracy on the test set
test_probs = F.sigmoid(test_output)
accuracy(test_probs, test_hit_labels)

0.99431338703490602

### Discussion

This works pretty well so far. What next?
- make classifications along the entire length of a track
- make it work with holes

## Develop full track classification data

Now I want to demonstrate that the model can learn to classify hits at any point along a track.

Let's select candidate hits along the whole sequence of a track (maybe after a seed).

The seed structure will be the same as before. The hit input will be of shape (batch, step, cand, features).

I will use the same hit selection as was used in the track filter notebook: barrel hits, signal tracks that hit every layer.

In [16]:
def select_hits(hits):
    # Select all barrel hits
    vids = [8, 13, 17]
    hits = hits[np.logical_or.reduce([hits.volid == v for v in vids])]
    # Re-enumerate the volume and layer numbers for convenience
    volume = pd.Series(-1, index=hits.index, dtype=np.int8)
    vid_groups = hits.groupby('volid')
    for i, v in enumerate(vids):
        volume[vid_groups.get_group(v).index] = i
    # This assumes 4 layers per volume (except last volume)
    layer = (hits.layid / 2 - 1 + volume * 4).astype(np.int8)
    hits = hits[['evtid', 'barcode', 'r', 'phi', 'z']].assign(layer=layer)
    # Average all duplicate hits together
    return hits.groupby(['evtid', 'barcode', 'layer'], as_index=False).mean()

def select_signal_hits(hits):
    """Select signal hits from tracks that hit all barrel layers"""
    sel_func = lambda x: len(x) >= 10 and x.layer.unique().size == 10
    return hits.groupby(['evtid', 'barcode']).filter(sel_func)

def extract_features(x, feature_names, scale_factors):
    return x[feature_names].values / scale_factors

def select_cand_hits(sig_hits, sel_hits, seed_length, feature_names, n_cand_hits):
    # Drop seed layers before matching
    sig_hits = sig_hits[sig_hits.layer >= seed_length]
    sel_hits = sel_hits[sel_hits.layer >= seed_length]

    # Calculate dR for all hits in every sample
    paired_hits = sig_hits.merge(sel_hits, on=['evtid', 'layer'], suffixes=('_sig', ''))
    eta = calc_eta(paired_hits.r, paired_hits.z)
    sig_eta = calc_eta(paired_hits.r_sig, paired_hits.z_sig)
    dR = calc_dR(eta, sig_eta, paired_hits.phi, paired_hits.phi_sig)
    
    # Select closest candidate hits
    cand_group_cols = ['evtid', 'barcode_sig', 'layer']
    return (paired_hits[cand_group_cols + feature_names].assign(dR=dR)
            .groupby(cand_group_cols, as_index=False)
            .apply(lambda x: x.nsmallest(n_cand_hits, 'dR')))

In [None]:
%%time
sel_hits = select_hits(hits)
sig_hits = select_signal_hits(sel_hits)

sig_groups = sig_hits.groupby(['evtid', 'barcode'])
sig_keys = sig_groups.groups.keys()

CPU times: user 1min 48s, sys: 1.04 s, total: 1min 49s
Wall time: 1min 49s


In [17]:
feature_names = ['r', 'phi', 'z']
feature_scale_factors = [1000., np.pi, 1000.]
get_features = partial(extract_features, feature_names=feature_names,
                       scale_factors=feature_scale_factors)

n_samples = len(sig_keys)
track_length = 10
seed_length = 3
cand_length = track_length - seed_length
n_cand_hits = 5
n_features = len(feature_names)

In [18]:
# Create the data structures
trk_input = np.zeros((n_samples, track_length, n_features), dtype=np.float32)
hit_input = np.zeros((n_samples, cand_length, n_cand_hits, n_features), dtype=np.float32)
hit_labels = np.zeros((n_samples, cand_length, n_cand_hits), dtype=np.float32)

# True hit is always first due to our sorting; no need to shuffle.
hit_labels[:,0] = 1

In [None]:
%%time
cand_hits = select_cand_hits(sig_hits, sel_hits, seed_length, feature_names, n_cand_hits)
cand_groups = cand_hits.groupby(['evtid', 'barcode_sig'])

CPU times: user 18min 43s, sys: 17.9 s, total: 19min 1s
Wall time: 19min 1s


In [None]:
%%time

# Loop over samples
for i in xrange(n_samples):
    # Extract the hits
    key = tuple(sig_keys[i])
    sig_evt_hits = sig_groups.get_group(key)
    cand_evt_hits = cand_groups.get_group(key)
    
    # Fill track features
    trk_input[i] = get_features(sig_evt_hits)
    
    # Loop over candidate hit layers
    for layer, lay_hits in cand_evt_hits.groupby('layer'):
        # Translate layer number to layer index in tensor
        l = layer - seed_length
        # Fill the hit input features
        hit_input[i, l, :lay_hits.shape[0]] = get_features(lay_hits)

CPU times: user 17min 6s, sys: 8.46 s, total: 17min 14s
Wall time: 17min 14s


### Define the model

In [25]:
def time_distributed(module, x):
    """Applies a module across both batch and 'time' dimensions"""
    s = x.size()
    y = module(x.view((-1,) + s[2:]))
    return y.view(s[:2] + y.size()[1:])

In [26]:
class TrackHitScorer2(nn.Module):
    """
    A track-hit binary classifier model.
    
    This model embeds a sequence of hits using an LSTM into a state estimate.
    It then classifies candidate hits conditional on that state.
    """
    
    def __init__(self, input_dim, state_dim, hidden_dims, seed_length):
        """Initialize the model"""
        super(TrackHitScorer2, self).__init__()
        
        self.seed_length = seed_length
        
        # Use an LSTM for the encoder
        self.encoder = nn.LSTM(input_dim, state_dim, batch_first=True)
        
        # Fully-connected classifier hidden layers
        clf_layers = [nn.Linear(input_dim + state_dim, hidden_dims[0]), nn.ReLU()]
        for i in range(len(hidden_dims) - 1):
            clf_layers += [nn.Linear(hidden_dims[i], hidden_dims[i+1]), nn.ReLU()]
        # Classifier final layer
        clf_layers += [nn.Linear(hidden_dims[-1], 1)]
        self.classifier = nn.Sequential(*clf_layers)

    def forward(self, inputs):
        trk_inputs, hit_inputs = inputs
        trk_input_size = trk_inputs.size()
        
        # Initialize the lstm hidden state
        var_args = [trk_input_size[0], self.encoder.hidden_size]
        h0, c0 = torch_zeros(*var_args), torch_zeros(*var_args)
        
        # Encode the track hits into a state estimate
        states, _ = self.encoder(trk_inputs, (h0, c0))
        # Drop the seed layer outputs
        states = states[:, seed_length:]
        
        # Broadcast state from shape (batch, layer, state) to (batch, layer, hits, state).
        expanded_size = (states.size(0), states.size(1), hit_inputs.size(2), states.size(2))
        hit_states = states[:, :, None, :].expand(*expanded_size)
        # Attach state estimates onto the hit candidates for classification.
        clf_inputs = torch.cat([hit_inputs, hit_states], dim=-1)
        
        # Apply classifier head to every candidate
        return time_distributed(self.classifier, clf_inputs).squeeze(-1)

In [27]:
def predict_prob(model, inputs):
    return F.sigmoid(model(inputs))

def training_step(model, inputs, targets, loss_func, optimizer):
    model.zero_grad()
    outputs = model(inputs)
    loss = loss_func(outputs, targets)
    loss.backward()
    optimizer.step()
    return loss

def accuracy(probs, target, threshold=0.5):
    return ((probs.data.numpy() > threshold) == (target.data.numpy() > 0.5)).mean()

### Train the model

In [28]:
# Model config
state_size = 16
hidden_sizes = [16, 16, 16]

# Training config
batch_size = 64
n_epochs = 10
test_frac = 0.1

In [29]:
n_train = int(n_samples * (1 - test_frac))
n_batches = (n_train + batch_size - 1) // batch_size

# Split data into train and test sets
train_trk_input = np_to_torch(trk_input[:n_train])
train_hit_input = np_to_torch(hit_input[:n_train])
train_hit_labels = np_to_torch(hit_labels[:n_train])
test_trk_input = np_to_torch(trk_input[n_train:])
test_hit_input = np_to_torch(hit_input[n_train:])
test_hit_labels = np_to_torch(hit_labels[n_train:])

print('Training samples:', n_train)
print('Batches per epoch:', n_batches)
print('Test samples:', n_samples - n_train)

Training samples: 122388
Batches per epoch: 1913
Test samples: 13599


In [30]:
# Construct the model
model = TrackHitScorer2(input_dim=n_features, state_dim=state_size,
                        hidden_dims=hidden_sizes, seed_length=seed_length)
optimizer = torch.optim.Adam(model.parameters())
loss_func = nn.BCEWithLogitsLoss()

print(model)
print('Parameters:', sum(param.numel() for param in model.parameters()))

TrackHitScorer2 (
  (encoder): LSTM(3, 16, batch_first=True)
  (classifier): Sequential (
    (0): Linear (19 -> 16)
    (1): ReLU ()
    (2): Linear (16 -> 16)
    (3): ReLU ()
    (4): Linear (16 -> 16)
    (5): ReLU ()
    (6): Linear (16 -> 1)
  )
)
Parameters: 2225


In [None]:
batch_idxs = np.arange(0, n_train, batch_size)
train_losses, test_losses = [], []

for i in range(n_epochs):
    print('Epoch', i)
    start_time = timer()
    sum_loss = 0

    for j in batch_idxs:
        batch_trk_input = train_trk_input[j:j+batch_size]
        batch_hit_input = train_hit_input[j:j+batch_size]
        batch_inputs = [batch_trk_input, batch_hit_input]
        batch_target = train_hit_labels[j:j+batch_size]
        sum_loss += training_step(model, batch_inputs, batch_target, loss_func, optimizer)

    end_time = timer()
    avg_loss = sum_loss.cpu().data[0] / n_batches
    train_losses.append(avg_loss)
    print('  training loss %.3g' % avg_loss, 'time %gs' % (end_time - start_time))
    
    # Evaluate the model on the test set
    test_output = model([test_trk_input, test_hit_input])
    test_loss = loss_func(test_output, test_hit_labels).cpu().data[0]
    test_losses.append(test_loss)
    print('  test loss %.3g' % test_loss)

Epoch 0
  training loss 0.0408 time 28.4336s
  test loss 1.48e-05
Epoch 1
  training loss 6.39e-06 time 29.2816s
  test loss 2.45e-06
Epoch 2
  training loss 1.34e-06 time 28.6127s
  test loss 6.57e-07
Epoch 3
  training loss 3.84e-07 time 37.0491s
  test loss 2.02e-07
Epoch 4
  training loss 1.22e-07 time 30.6693s
  test loss 6.53e-08
Epoch 5
  training loss 4.13e-08 time 30.5814s
  test loss 1.72e-08
Epoch 6
  training loss 9.13e-09 time 30.3763s
  test loss 3.61e-11
Epoch 7
  training loss 2.75e-11 time 29.8686s
  test loss 2.3e-11
Epoch 8
  training loss 2.35e-11 time 29.6829s
  test loss 2.3e-11
Epoch 9
  training loss 2.34e-11 time 30.2621s
  test loss 2.3e-11


In [38]:
# Evaluate accuracy on the entire training set
train_output = model([train_trk_input, train_hit_input])
train_probs = F.sigmoid(train_output)
accuracy(train_probs, train_hit_labels)

1.0

In [33]:
# Evaluate accuracy on the test set
test_probs = F.sigmoid(test_output)
accuracy(test_probs, test_hit_labels)

1.0

### Discussion

Wow, this works almost suspiciously well. Assuming there are no bugs, what is the next step?
- figure out how to deal with holes?
- move off-barrel?