In [7]:
# ⚙️ Configuration

# Path to original file_labels.csv and dataset
file_labels_csv = '~/wsu-grid/hm_jetscapeml_source/data/jet_ml_benchmark_config_01_to_09_alpha_0.2_0.3_0.4_q0_1.5_2.0_2.5_MMAT_MLBT_size_1000_balanced_unshuffled/file_labels.csv'  # ← CHANGE
dataset_root = '~/wsu-grid/hm_jetscapeml_source/data/jet_ml_benchmark_config_01_to_09_alpha_0.2_0.3_0.4_q0_1.5_2.0_2.5_MMAT_MLBT_size_1000_balanced_unshuffled/'             # ← CHANGE

# Aggregation settings
group_size = 5
agg_csv_out = 'aggregated_file_labels.csv'

In [8]:
# 🧱 Build Aggregation CSV from file_labels.csv

import pandas as pd
from collections import defaultdict
import random

df = pd.read_csv(file_labels_csv)

label_to_paths = defaultdict(list)
for _, row in df.iterrows():
    label = (row['energy_loss'], row['alpha'], row['q0'])
    label_to_paths[label].append(row['file_path'])

agg_entries = []
agg_id_counter = 0
for label, paths in label_to_paths.items():
    random.shuffle(paths)
    for i in range(0, len(paths) - group_size + 1, group_size):
        group = paths[i:i + group_size]
        if len(group) == group_size:
            agg_entries.append({
                'agg_id': f'agg_{agg_id_counter:06d}',
                'file_paths': '|'.join(group),
                'energy_loss': label[0],
                'alpha': label[1],
                'q0': label[2]
            })
            agg_id_counter += 1

agg_df = pd.DataFrame(agg_entries)
agg_df.to_csv(agg_csv_out, index=False)
print(f"✅ Aggregated CSV saved: {agg_csv_out}, samples: {len(agg_df)}")

✅ Aggregated CSV saved: aggregated_file_labels.csv, samples: 192


In [9]:
# 📦 Dataset Class with GPU Aggregation

import torch
from torch.utils.data import Dataset
import numpy as np
import os

class AggregatedJetDataset(Dataset):
    def __init__(self, agg_csv_path, dataset_root, global_max, device='cuda'):
        self.df = pd.read_csv(agg_csv_path)
        self.dataset_root = dataset_root
        self.global_max = global_max
        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        file_paths = row['file_paths'].split('|')
        energy_loss, alpha, q0 = int(row['energy_loss']), int(row['alpha']), int(row['q0'])

        tensors = []
        for path in file_paths:
            full_path = os.path.join(self.dataset_root, path)
            arr = np.load(full_path).astype(np.float32) / self.global_max
            tensors.append(torch.tensor(arr, device=self.device).unsqueeze(0))

        avg = torch.stack(tensors).mean(dim=0).unsqueeze(0)

        labels = {
            'energy_loss_output': torch.tensor([energy_loss], dtype=torch.long, device=self.device),
            'alpha_output': torch.tensor([alpha], dtype=torch.long, device=self.device),
            'q0_output': torch.tensor([q0], dtype=torch.long, device=self.device)
        }

        return avg, labels

In [10]:
# 📊 Stratified Split and DataLoader Builder

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

def stratified_split(df, test_size=0.2, val_size=0.1, seed=42):
    y = df[['energy_loss', 'alpha', 'q0']].astype(str).agg('_'.join, axis=1)
    df_train, df_temp = train_test_split(df, test_size=test_size + val_size, stratify=y, random_state=seed)
    y_temp = df_temp[['energy_loss', 'alpha', 'q0']].astype(str).agg('_'.join, axis=1)
    df_val, df_test = train_test_split(df_temp, test_size=0.5, stratify=y_temp, random_state=seed)
    return df_train, df_val, df_test

def build_dataloaders(agg_csv_path, dataset_root, global_max, batch_size=512):
    full_df = pd.read_csv(agg_csv_path)
    df_train, df_val, df_test = stratified_split(full_df)

    df_train.to_csv('agg_train.csv', index=False)
    df_val.to_csv('agg_val.csv', index=False)
    df_test.to_csv('agg_test.csv', index=False)

    train_ds = AggregatedJetDataset('agg_train.csv', dataset_root, global_max)
    val_ds = AggregatedJetDataset('agg_val.csv', dataset_root, global_max)
    test_ds = AggregatedJetDataset('agg_test.csv', dataset_root, global_max)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

    return train_loader, val_loader, test_loader


In [11]:
# ✅ Test DataLoader Pipeline

train_loader, val_loader, test_loader = build_dataloaders(
    agg_csv_out,
    dataset_root,
    global_max=121.79151153564453,
    batch_size=32
)

# Check a batch
x, y = next(iter(train_loader))
print("Batch shape:", x.shape)
for k, v in y.items():
    print(f"{k}: {v.shape}")

FileNotFoundError: [Errno 2] No such file or directory: '~/wsu-grid/hm_jetscapeml_source/data/jet_ml_benchmark_config_01_to_09_alpha_0.2_0.3_0.4_q0_1.5_2.0_2.5_MMAT_MLBT_size_1000_balanced_unshuffled/MLBT_0.4_1.5/event_0000538.npy'