In [5]:
from pathlib import Path
import os
import pandas as pd

GCD_RESCUE = "/project/6008051/pone_simulation/GCD_Library/PONE_800mGrid.i3.gz"
DATA_DIR   = "/project/def-nahee/kbas/POM_Response_GZ"

OUT_DIR = Path("/project/def-nahee/kbas/pone_parquet")
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [6]:
import sys, os

repo_root = os.path.abspath("..")
sys.path.append(repo_root)
from Helpers.readers import PONEI3Reader
from Helpers.readers import GraphNeTFileReader
from Helpers.extractors import PONE_TruthExtractor
from Helpers.extractors import PONE_FeatureExtractor



In [7]:
ALL_EVENTS_DIR  = Path("/project/def-nahee/kbas/Graphnet-Applications")
ALL_EVENTS_DIR.mkdir(parents=True, exist_ok=True)
ALL_EVENTS_PATH = ALL_EVENTS_DIR / "all_events.csv"

In [8]:
all_events_exists = ALL_EVENTS_PATH.exists()

reader = PONEI3Reader(gcd_rescue=GCD_RESCUE)
reader.set_extractors([PONE_FeatureExtractor(), PONE_TruthExtractor()])

filesets = reader.find_files(DATA_DIR)

for fs in filesets:
    i3_path = fs.i3_file
    fname = os.path.basename(i3_path)  

    batch_id = int(fname.split("_")[-1].split(".")[0])

    pulses_out = OUT_DIR / f"pulses_batch_{batch_id}.parquet"
    truth_out  = OUT_DIR / f"truth_batch_{batch_id}.parquet"

    if pulses_out.exists() and truth_out.exists():
        print(f"[SKIP] batch {batch_id} (parquet already exists)")
        continue

    print(f"[PROCESS] batch {batch_id}  ({i3_path})")
    events = reader(fs)

    pulses_list = []
    truth_list  = []
    events_list = []  

    for ev in events:
        pulses_df = ev["pone_pulses"].copy()
        truth     = ev["pone_truth"]

        if pulses_df.empty:
            continue

        pulses_df["batch_id"] = batch_id
        truth["batch_id"]     = batch_id

        pulses_list.append(pulses_df)
        truth_list.append(truth)

        events_list.append(
            {
                "batch_id":     batch_id,
                "run_id":       truth["run_id"],
                "sub_run_id":   truth["sub_run_id"],
                "event_id":     truth["event_id"],
                "sub_event_id": truth["sub_event_id"],
            }
        )

    if pulses_list:
        pulses_table = pd.concat(pulses_list, ignore_index=True)
        truth_table  = pd.DataFrame(truth_list)

        pulses_table.to_parquet(pulses_out)
        truth_table.to_parquet(truth_out)
        print(f"  -> wrote {pulses_out.name}, {truth_out.name}")
    else:
        print("  -> no events with pulses in this batch, skipping parquet write")

    if events_list:
        batch_events_df = pd.DataFrame(events_list)

        batch_events_df.to_csv(
            ALL_EVENTS_PATH,
            mode="a",
            index=False,
            header=not all_events_exists,
        )

        all_events_exists = True

        print(f"  -> appended {len(batch_events_df)} rows to {ALL_EVENTS_PATH}")
    else:
        print("  -> no events with pulses for all_events.csv")


Assuming list of directories.
[SKIP] batch 1718 (parquet already exists)
[SKIP] batch 1580 (parquet already exists)
[SKIP] batch 2919 (parquet already exists)
[SKIP] batch 2015 (parquet already exists)
[SKIP] batch 2887 (parquet already exists)
[SKIP] batch 1939 (parquet already exists)
[SKIP] batch 2540 (parquet already exists)
[SKIP] batch 1653 (parquet already exists)
[SKIP] batch 3667 (parquet already exists)
[SKIP] batch 2100 (parquet already exists)
[SKIP] batch 1819 (parquet already exists)
[SKIP] batch 3076 (parquet already exists)
[SKIP] batch 1944 (parquet already exists)
[SKIP] batch 2226 (parquet already exists)
[SKIP] batch 2981 (parquet already exists)
[SKIP] batch 3245 (parquet already exists)
[SKIP] batch 3625 (parquet already exists)
[PROCESS] batch 4071  (/project/def-nahee/kbas/POM_Response_GZ/pom_response_batch_4071.i3.gz)
  -> wrote pulses_batch_4071.parquet, truth_batch_4071.parquet
  -> appended 44 rows to /project/def-nahee/kbas/Graphnet-Applications/all_events.

  -> wrote pulses_batch_404.parquet, truth_batch_404.parquet
  -> appended 26 rows to /project/def-nahee/kbas/Graphnet-Applications/all_events.csv
[SKIP] batch 1909 (parquet already exists)
[SKIP] batch 3861 (parquet already exists)
[SKIP] batch 2042 (parquet already exists)
[SKIP] batch 2329 (parquet already exists)
[SKIP] batch 180 (parquet already exists)
[SKIP] batch 3720 (parquet already exists)
[SKIP] batch 1556 (parquet already exists)
[SKIP] batch 2467 (parquet already exists)
[SKIP] batch 277 (parquet already exists)
[SKIP] batch 2266 (parquet already exists)
[SKIP] batch 3426 (parquet already exists)
[SKIP] batch 174 (parquet already exists)
[SKIP] batch 1627 (parquet already exists)
[SKIP] batch 1116 (parquet already exists)
[SKIP] batch 2505 (parquet already exists)
[SKIP] batch 2859 (parquet already exists)
[SKIP] batch 1678 (parquet already exists)
[SKIP] batch 3054 (parquet already exists)
[SKIP] batch 1852 (parquet already exists)
[SKIP] batch 1616 (parquet already exis

  -> wrote pulses_batch_3989.parquet, truth_batch_3989.parquet
  -> appended 40 rows to /project/def-nahee/kbas/Graphnet-Applications/all_events.csv
[SKIP] batch 2117 (parquet already exists)
[SKIP] batch 1513 (parquet already exists)
[SKIP] batch 2271 (parquet already exists)
[SKIP] batch 1937 (parquet already exists)
[PROCESS] batch 4072  (/project/def-nahee/kbas/POM_Response_GZ/pom_response_batch_4072.i3.gz)
  -> wrote pulses_batch_4072.parquet, truth_batch_4072.parquet
  -> appended 26 rows to /project/def-nahee/kbas/Graphnet-Applications/all_events.csv
[SKIP] batch 2065 (parquet already exists)
[SKIP] batch 3623 (parquet already exists)
[SKIP] batch 2535 (parquet already exists)
[SKIP] batch 3186 (parquet already exists)
[SKIP] batch 1021 (parquet already exists)
[SKIP] batch 2030 (parquet already exists)
[SKIP] batch 1308 (parquet already exists)
[SKIP] batch 1344 (parquet already exists)
[PROCESS] batch 4088  (/project/def-nahee/kbas/POM_Response_GZ/pom_response_batch_4088.i3.gz

  -> wrote pulses_batch_4037.parquet, truth_batch_4037.parquet
  -> appended 38 rows to /project/def-nahee/kbas/Graphnet-Applications/all_events.csv
[SKIP] batch 1305 (parquet already exists)
[SKIP] batch 3118 (parquet already exists)
[SKIP] batch 2230 (parquet already exists)
[SKIP] batch 2228 (parquet already exists)
[SKIP] batch 3514 (parquet already exists)
[SKIP] batch 3085 (parquet already exists)
[SKIP] batch 1186 (parquet already exists)
[SKIP] batch 364 (parquet already exists)
[SKIP] batch 2247 (parquet already exists)
[SKIP] batch 2012 (parquet already exists)
[SKIP] batch 1963 (parquet already exists)
[SKIP] batch 158 (parquet already exists)
[SKIP] batch 3800 (parquet already exists)
[SKIP] batch 2830 (parquet already exists)
[SKIP] batch 2704 (parquet already exists)
[SKIP] batch 1147 (parquet already exists)
[SKIP] batch 269 (parquet already exists)
[SKIP] batch 1872 (parquet already exists)
[SKIP] batch 1304 (parquet already exists)
[PROCESS] batch 4087  (/project/def-n

UnboundLocalError: cannot access local variable 'frame' where it is not associated with a value

## To Shuffle

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np



OUT_DIR = Path("/project/def-nahee/kbas/pone_parquet")
ALL_EVENTS_PATH = Path("/project/def-nahee/kbas/Graphnet-Applications/all_events.csv")

print("Loading all_events.csv...")
all_events = pd.read_csv(ALL_EVENTS_PATH)

print(f"Total events: {len(all_events)}")

rng = np.random.default_rng(42)
r = rng.random(len(all_events))
all_events["split"] = np.where(
    r < 0.8, "train",
    np.where(r < 0.9, "val", "test")
)

all_events = all_events.sample(frac=1, random_state=42).reset_index(drop=True)

all_events.to_parquet(OUT_DIR / "event_index.parquet")
print("Wrote shuffled event_index.parquet")
