In [1]:
import pandas as pd
import numpy as np
import torch
torch.set_default_dtype(torch.float64)

In [2]:
# Without considerations for the auxiliary label and no explicit features consisting of sensor geometry
from torch.utils.data import Dataset, DataLoader

# Class for a dataset generated from a dataframe and data from the sensor geometry file
class NeutrinoDataset(Dataset):
    def __init__(self, filename):
        self.sensor_geom = pd.read_csv('sensor_geometry.csv')
        self.vals_df = pd.read_parquet('batch_104_directions.parquet')
        self.dataframe = pd.read_parquet(filename)
        sensor_loc = np.array(sg.iloc[:])[:, 1:]
        self.num_features = 5160*3
        self.num_events = self.dataframe.index.nunique()
        self.unique_indices = np.unique(self.dataframe.index)
        
    def __len__(self):
        return self.num_events
    
    # Replaces sensor ID with sensor coordinates 
    def __getitem__(self, i):
        df = self.dataframe
        sg = self.sensor_geom
        meta_vals = np.array(
            self.vals_df.loc[self.vals_df['event_id'] == df.index[i]])[0].astype(float)
        
        pulse_array = np.array(df.loc[df.index[i]])
        pulse_array_sensors = np.concatenate((np.expand_dims(np.arange(5160), axis=1), np.zeros([5160, 3])), 1)

        for pulse in pulse_array:
            if(pulse_array_sensors[pulse[0]][1] == 0):
                pulse_array_sensors[pulse[0]][1] = pulse[1] - meta_vals[2] # first time
            else:
                # possible last time, will be the last time for the actual last one
                pulse_array_sensors[pulse[0]][2] = pulse[1] - meta_vals[2]
            # Add charge
            pulse_array_sensors[pulse[0]][3] += pulse[2]
        
        flattened_pulse = (pulse_array_sensors[:, 1:]).flatten()
        # print(flattened_pulse.shape)
                
        return (torch.from_numpy(flattened_pulse), 
                                 torch.from_numpy(meta_vals[-2:]))
    
    # Finds the first event with multiple pulses at the same sensors
    # here we ask for at least num_min_total_repeats repetitions
    def get_multi_pulse_event(self, num_min_total_repeats):
        for i in range(self.num_events):
            pulses = np.array(df.loc[unique_indices[i]])
            if(pulses[:,0].shape[0] - np.unique(pulses[:,0]).shape[0] >= num_min_total_repeats):
                return self.unique_indices[i]
            
    # Finds all events in a range with multiple pulses at the same sensors
    # here we ask for at least num_min_total_repeats repetitions
    def get_multi_pulse_events(self, num_min_total_repeats, start_index, end_index):
        list_multi_pulse = []
        for i in range(start_index, min(self.num_events, end_index)):
            pulses = np.array(df.loc[unique_indices[i]])
            if(pulses[:,0].shape[0] - np.unique(pulses[:,0]).shape[0] >= num_min_total_repeats):
                list_multi_pulse.append(self.unique_indices[i])
        return list_multi_pulse


In [3]:
# Checking torch device
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [4]:
# Set up Dataset and DataLoader, build NN

import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)
import torch
import torch.nn as nn
from torch.nn import Linear
from torch.utils.data import Dataset, DataLoader

sg = pd.read_csv('sensor_geometry.csv')

dataset = NeutrinoDataset('batch_104.parquet')

dataloader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=0)

class NNPredictor(torch.nn.Module):
    def __init__(self, use_activation = True):
        super().__init__()
        torch.manual_seed(1234)
        self.layers = nn.ModuleList()
        self.layer_norms = nn.ModuleList()
        self.use_activation = use_activation
        
        self.layers.append(nn.Linear(dataset.num_features, 100))
        self.layers.append(nn.Linear(100, 50))
        self.layers.append(nn.Linear(50, 10))
        self.classifier = (nn.Linear(10,2))

    def forward(self, x):
        new_x = x
        if(self.use_activation):
            for layer in self.layers:
                # print(layer, new_x.shape)
                new_x = layer(new_x)
                new_x = nn.ReLU()(new_x)
        else:
            for layer in self.layers:
                new_x = layer(new_x)
        
        # Apply a final (linear) classifier.

        return self.classifier(new_x)

model = NNPredictor()
print(model)


2.0.1+cpu
NNPredictor(
  (layers): ModuleList(
    (0): Linear(in_features=15480, out_features=100, bias=True)
    (1): Linear(in_features=100, out_features=50, bias=True)
    (2): Linear(in_features=50, out_features=10, bias=True)
  )
  (layer_norms): ModuleList()
  (classifier): Linear(in_features=10, out_features=2, bias=True)
)


In [5]:
learning_rate = 1e-3
batch_size = 64
epochs = 5

In [6]:
batch_104_vals_df = pd.read_parquet('batch_104_directions.parquet')

In [7]:

loss_fn = nn.MSELoss()

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    # Set the model to training mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # print(X)
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
            


In [None]:
train_loop(dataloader, model, loss_fn, optimizer)

loss: 496818.836474  [    4/200000]
loss: 2441206878135286540055189913600.000000  [  404/200000]
loss: 1998491153300646868079484076032.000000  [  804/200000]


In [None]:
# i=0
# meta_vals = np.array(batch_104_vals_df.loc[batch_104_vals_df['event_id'] 
#                                            == df.index[0]])[0].astype(float)
        
# pulse_array = np.array(df.loc[df.index[i]])
# pulse_array_sensors = np.concatenate((np.expand_dims(np.arange(5160), axis=1), np.zeros([5160, 3])), 1)

# for pulse in pulse_array:
#     if(pulse_array_sensors[pulse[0]][1] == 0):
#         pulse_array_sensors[pulse[0]][1] = pulse[1] - meta_vals[2] # first time
#     else:
#         # possible last time, will be the last time for the actual last one
#         pulse_array_sensors[pulse[0]][2] = pulse[1] - meta_vals[2]
#     # Add charge
#     pulse_array_sensors[pulse[0]][3] += pulse[2]

# print(torch.from_numpy(np.concatenate(
#     (np.ndarray.flatten(pulse_array_sensors[:, 1:]), meta_vals[-2:]))))