In [11]:
# This cell imports necessary modules for anomaly detection in HEP data.
import pandas as pd
import numpy as np
import os
import gdown
from zipfile import ZipFile
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt


# Load and preprocess the data
- Unzip the training sample
- Process the data in chunks
- Select the relevant features of the data
- Normalize the data

In [12]:
# Google Drive file ID of the training sample and the local path to store it at
file_id = "1PVQTx8l5Pdqws9-AIMLsPm0P8jslOz2r"
file_name = "dataset/train_sample.zip"

# Construct the download URL
url = f"https://drive.google.com/uc?id={file_id}"

# Make sure the target directory exists so the download does not fail on a
# fresh checkout where "dataset/" has not been created yet.
os.makedirs(os.path.dirname(file_name), exist_ok=True)

# Download the archive only when it is not already on disk; this keeps
# "Restart Kernel -> Run All" from re-fetching the large file every time.
if not os.path.exists(file_name):
    gdown.download(url, file_name, quiet=False)

# Show the archive path as the cell output
file_name

Downloading...
From (original): https://drive.google.com/uc?id=1PVQTx8l5Pdqws9-AIMLsPm0P8jslOz2r
From (redirected): https://drive.google.com/uc?id=1PVQTx8l5Pdqws9-AIMLsPm0P8jslOz2r&confirm=t&uuid=8f326d89-fc43-49e9-a757-d131e20a92d3
To: /home/hero/projects/masters/AI Models for Physics/anomaly-detection-HEP/dataset/train_sample.zip
100%|██████████| 903M/903M [00:43<00:00, 20.5MB/s] 


'dataset/train_sample.zip'

In [13]:
# Extract every member of the downloaded archive into dataset/train_sample
with ZipFile('dataset/train_sample.zip') as archive:
    archive.extractall(path='dataset/train_sample')

In [25]:
# Function to process data in chunks
def process_chunks(event_prefix, chunk_size=10000):
    """Yield merged hit/truth feature tables for the events in a directory.

    Every ``<event>-hits.csv`` file in ``event_prefix`` is paired with the
    ``<event>-truth.csv`` file that shares the same ``<event>`` prefix. The two
    tables are inner-joined on ``hit_id`` and reduced to the columns
    ``x, y, z, tpx, tpy, tpz, volume_id, layer_id, module_id``.

    Events are processed in sorted name order so the output is deterministic.
    Hits files without a matching truth file (and vice versa) are skipped.

    Parameters
    ----------
    event_prefix : str
        Directory containing the per-event ``-hits.csv`` / ``-truth.csv`` files.
    chunk_size : int, default 10000
        Weight given to each buffered event when deciding to flush: a chunk is
        yielded as soon as ``buffered_events * chunk_size > 100000``. With the
        default this means one chunk per 11 events.

    Yields
    ------
    pandas.DataFrame
        Concatenated features of the buffered events, with a fresh integer
        index. Nothing is yielded when the directory holds no complete event.
    """
    hits_suffix = '-hits.csv'
    truth_suffix = '-truth.csv'
    feature_columns = ['x', 'y', 'z', 'tpx', 'tpy', 'tpz', 'volume_id', 'layer_id', 'module_id']

    file_names = set(os.listdir(event_prefix))

    # Pair files by their shared event prefix instead of by list position:
    # os.listdir returns names in arbitrary order, so zipping two separately
    # filtered lists can join the hits of one event with the truth of another.
    event_names = sorted(
        name[:-len(hits_suffix)]
        for name in file_names
        if name.endswith(hits_suffix)
        and name[:-len(hits_suffix)] + truth_suffix in file_names
    )

    features_list = []

    for event_name in event_names:
        hits_df = pd.read_csv(
            os.path.join(event_prefix, event_name + hits_suffix),
            usecols=['hit_id', 'x', 'y', 'z', 'volume_id', 'layer_id', 'module_id'],
        )
        truth_df = pd.read_csv(
            os.path.join(event_prefix, event_name + truth_suffix),
            usecols=['hit_id', 'particle_id', 'tpx', 'tpy', 'tpz'],
        )

        merged_df = pd.merge(hits_df, truth_df, on='hit_id', suffixes=('_hit', '_truth'))
        features_list.append(merged_df[feature_columns])

        # Flush the buffer to bound memory usage (the 100000 limit is arbitrary)
        if len(features_list) * chunk_size > 100000:
            yield pd.concat(features_list, ignore_index=True)
            features_list = []

    # Emit whatever is left after the last full chunk
    if features_list:
        yield pd.concat(features_list, ignore_index=True)


In [26]:
# Materialize every chunk produced for the 100-event training sample
features_list = list(process_chunks('dataset/train_sample/train_100_events'))

# Stack the chunks into one table with a fresh integer index
all_features = pd.concat(features_list, ignore_index=True)


In [27]:
# Preview the first rows of the combined feature table
all_features.head()


Unnamed: 0,x,y,z,tpx,tpy,tpz,volume_id,layer_id,module_id
0,-91.5941,-2.51915,-1502.5,-0.46199,-0.023119,-7.26105,7,2,1
1,-79.3265,-7.12083,-1502.5,-0.407713,-0.100549,-8.31968,7,2,1
2,-78.2882,-1.67258,-1502.5,-0.56605,-0.023969,-8.94242,7,2,1
3,-66.8071,-11.3372,-1502.5,-0.379724,-0.02369,-7.30623,7,2,1
4,-78.4016,-14.5712,-1502.5,-0.756905,0.067117,-18.8848,7,2,1


In [28]:
# Show the (rows, columns) dimensions of the combined feature table
all_features.shape

(10350837, 9)

In [29]:
# Clean the table: discard rows containing NaN, then discard duplicate rows
all_features = all_features.dropna().drop_duplicates()


In [30]:
# Show the (rows, columns) dimensions again to see whether any rows were removed
all_features.shape

(10350837, 9)

In [31]:
# Renumber the rows from 0 and replace the column names with positional
# integers (0 .. n_columns - 1); no rows are removed here.
n_columns = all_features.shape[1]
all_features = all_features.reset_index(drop=True)
all_features.columns = range(n_columns)


In [32]:
# Standardize the features with StandardScaler, then convert the result to a
# float32 tensor.
scaler = StandardScaler()
scaler.fit(all_features)
features_scaled = torch.tensor(scaler.transform(all_features), dtype=torch.float32)
features_scaled

tensor([[-0.3279, -0.0070, -1.4202,  ..., -1.1444, -1.1566, -0.7123],
        [-0.2843, -0.0233, -1.4202,  ..., -1.1444, -1.1566, -0.7123],
        [-0.2807, -0.0040, -1.4202,  ..., -1.1444, -1.1566, -0.7123],
        ...,
        [-3.1056,  0.4135,  2.8956,  ...,  2.3150,  1.7786, -0.5141],
        [-3.2991,  0.2800,  2.8956,  ...,  2.3150,  1.7786, -0.5141],
        [-3.3021,  0.2313,  2.8956,  ...,  2.3150,  1.7786, -0.5141]])