In [49]:
import os
import glob
import pandas as pd
import numpy as np

In [50]:
BASE_DIR = "/project/def-nahee/kbas/POM_Response_Parquet"  

FEATURES_GLOB = os.path.join(BASE_DIR, "features", "*.parquet")
TRUTH_GLOB    = os.path.join(BASE_DIR, "truth", "*.parquet")

feature_files = sorted(glob.glob(FEATURES_GLOB))
truth_files   = sorted(glob.glob(TRUTH_GLOB))

print("FEATURE FILES:", len(feature_files))
print("TRUTH FILES  :", len(truth_files))
print()

if not feature_files:
    raise RuntimeError(f"No feature parquet found in {FEATURES_GLOB}")
if not truth_files:
    raise RuntimeError(f"No truth parquet found in {TRUTH_GLOB}")



FEATURE FILES: 20
TRUTH FILES  : 20



In [38]:
feat_path = feature_files[0]
truth_path = truth_files[0]

print("Using:")
print("  features:", feat_path)
print("  truth   :", truth_path)
print()

Using:
  features: /project/def-nahee/kbas/POM_Response_Parquet/features/pom_response_batch_000_features.parquet
  truth   : /project/def-nahee/kbas/POM_Response_Parquet/truth/pom_response_batch_000_truth.parquet



In [39]:
df_feat = pd.read_parquet(feat_path)
df_truth = pd.read_parquet(truth_path)

# Basic info
print("=== FEATURES ===")
print("rows:", len(df_feat), "cols:", len(df_feat.columns))
print("columns:", list(df_feat.columns))
print("event_no unique:", df_feat["event_no"].nunique() if "event_no" in df_feat.columns else "NO event_no")
print(df_feat.head(5))
print()

print("=== TRUTH ===")
print("rows:", len(df_truth), "cols:", len(df_truth.columns))
print("columns:", list(df_truth.columns))
print("event_no unique:", df_truth["event_no"].nunique() if "event_no" in df_truth.columns else "NO event_no")
print(df_truth.head(5))
print()



=== FEATURES ===
rows: 253445 cols: 5
columns: ['charge', 'dom_time', 'dom_x', 'dom_y', 'dom_z']
event_no unique: NO event_no
            charge     dom_time  dom_x       dom_y  dom_z
event_no                                                 
0         1.200854  2016.263500 -120.0 -730.717968 -350.0
0         1.701354  8889.985068 -120.0 -730.717968 -100.0
0         1.548579  7497.074484 -120.0 -730.717968  -50.0
0         1.270504  9321.680460 -120.0 -730.717968  -50.0
0         1.201895  7498.248672 -120.0 -730.717968  -50.0

=== TRUTH ===
rows: 56 cols: 15
columns: ['energy', 'position_x', 'position_y', 'position_z', 'azimuth', 'zenith', 'pid', 'event_time', 'sim_type', 'interaction_type', 'elasticity', 'RunID', 'SubrunID', 'EventID', 'SubEventID']
event_no unique: NO event_no
                 energy  position_x  position_y  position_z   azimuth  \
event_no                                                                
0            119.300665   46.768787  344.606761  276.394571  1.3

In [40]:
print(df_feat.index.name, df_truth.index.name)

event_no event_no


In [41]:
feat_event_nunique = df_feat.index.nunique() if df_feat.index.name=="event_no" else df_feat["event_no"].nunique()
truth_event_nunique = df_truth.index.nunique() if df_truth.index.name=="event_no" else df_truth["event_no"].nunique()

print("feat events:", feat_event_nunique)
print("truth events:", truth_event_nunique)


feat events: 56
truth events: 56


In [42]:
feat_events = set(df_feat.index.unique())
truth_events = set(df_truth.index.unique())

missing_in_feat = sorted(list(truth_events - feat_events))
missing_in_truth = sorted(list(feat_events - truth_events))

print("missing_in_feat:", missing_in_feat)
print("missing_in_truth:", missing_in_truth)


missing_in_feat: []
missing_in_truth: []


## Sanity Checks

In [43]:
assert df_feat.index.name == "event_no"
assert df_truth.index.name == "event_no"

print("features rows:", len(df_feat))
print("truth rows   :", len(df_truth))
print("n events feat:", df_feat.index.nunique())
print("n events tru :", df_truth.index.nunique())


features rows: 253445
truth rows   : 56
n events feat: 56
n events tru : 56


In [44]:
pulse_counts = df_feat.groupby(level=0).size()

print("Pulse count summary:")
print(pulse_counts.describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))

print("\nMin pulses event_no:", int(pulse_counts.idxmin()), "pulses:", int(pulse_counts.min()))
print("Max pulses event_no:", int(pulse_counts.idxmax()), "pulses:", int(pulse_counts.max()))


Pulse count summary:
count      56.000000
mean     4525.803571
std       250.728931
min      3755.000000
1%       3828.150000
5%       4127.500000
50%      4523.500000
95%      4989.000000
99%      5259.650000
max      5489.000000
dtype: float64

Min pulses event_no: 46 pulses: 3755
Max pulses event_no: 32 pulses: 5489


In [45]:
truth_counts = df_truth.groupby(level=0).size()
print(truth_counts.value_counts().sort_index())

bad = truth_counts[truth_counts != 1]
print("\nEvents with truth_rows != 1:", len(bad))
if len(bad) > 0:
    print(bad.head(10))


1    56
Name: count, dtype: int64

Events with truth_rows != 1: 0


In [46]:
import numpy as np

def nan_inf_report(df, name):
    nan_counts = df.isna().sum().sort_values(ascending=False)
    inf_counts = pd.Series(
        {c: np.isinf(df[c].to_numpy(dtype=float, copy=False)).sum()
         for c in df.columns if pd.api.types.is_numeric_dtype(df[c])}
    ).sort_values(ascending=False)

    print(f"\n=== {name} NaN top ===")
    print(nan_counts.head(10))
    print(f"\n=== {name} INF top ===")
    print(inf_counts.head(10))

nan_inf_report(df_feat, "FEATURES")
nan_inf_report(df_truth, "TRUTH")



=== FEATURES NaN top ===
charge      0
dom_time    0
dom_x       0
dom_y       0
dom_z       0
dtype: int64

=== FEATURES INF top ===
charge      0
dom_time    0
dom_x       0
dom_y       0
dom_z       0
dtype: int64

=== TRUTH NaN top ===
energy              0
position_x          0
position_y          0
position_z          0
azimuth             0
zenith              0
pid                 0
event_time          0
sim_type            0
interaction_type    0
dtype: int64

=== TRUTH INF top ===
energy              0
position_x          0
position_y          0
position_z          0
azimuth             0
zenith              0
pid                 0
event_time          0
interaction_type    0
elasticity          0
dtype: int64


In [47]:
def range_check(df, cols, name):
    print(f"\n=== {name} ranges ===")
    for c in cols:
        if c in df.columns and pd.api.types.is_numeric_dtype(df[c]):
            s = df[c]
            print(f"{c:>12}  min={s.min():.6g}  max={s.max():.6g}")

range_check(df_feat, ["charge","dom_time","dom_x","dom_y","dom_z","rde","pmt_area","event_time"], "FEATURES")
range_check(df_truth, ["energy","zenith","azimuth","position_x","position_y","position_z","elasticity","inelasticity"], "TRUTH")



=== FEATURES ranges ===
      charge  min=0.250012  max=9.41821
    dom_time  min=0.105712  max=9999.82
       dom_x  min=-760  max=760
       dom_y  min=-730.718  max=724.205
       dom_z  min=-450  max=500

=== TRUTH ranges ===
      energy  min=102.098  max=484550
      zenith  min=0.167123  max=2.81997
     azimuth  min=0.0637432  max=6.12145
  position_x  min=-2691.5  max=1916.81
  position_y  min=-1228.77  max=9080.78
  position_z  min=-2208.44  max=1724.87
  elasticity  min=0.0268716  max=0.989243


In [48]:
common_events = sorted(set(df_feat.index.unique()) & set(df_truth.index.unique()))
sample_events = common_events[:3]

for e in sample_events:
    n_p = int((df_feat.index == e).sum())
    t = df_truth.loc[e]
    print("\n==============================")
    print("event_no:", e, "| pulses:", n_p, "| energy:", float(t["energy"]))
    print("truth (subset):")
    print(t[["energy","zenith","azimuth","pid","interaction_type","elasticity","inelasticity","is_starting"]]
          if "is_starting" in t.index else t.head(10))
    print("\nfirst 5 pulses:")
    print(df_feat.loc[e, ["charge","dom_time","dom_x","dom_y","dom_z"]].head(5))



event_no: 0 | pulses: 4472 | energy: 119.30066500027469
truth (subset):
energy                  119.300665
position_x               46.768787
position_y              344.606761
position_z              276.394571
azimuth                   1.378987
zenith                    2.129418
pid                             14
event_time                       0
sim_type            LeptonInjector
interaction_type                 1
Name: 0, dtype: object

first 5 pulses:
            charge     dom_time  dom_x       dom_y  dom_z
event_no                                                 
0         1.200854  2016.263500 -120.0 -730.717968 -350.0
0         1.701354  8889.985068 -120.0 -730.717968 -100.0
0         1.548579  7497.074484 -120.0 -730.717968  -50.0
0         1.270504  9321.680460 -120.0 -730.717968  -50.0
0         1.201895  7498.248672 -120.0 -730.717968  -50.0

event_no: 1 | pulses: 4604 | energy: 103.47168607572203
truth (subset):
energy                  103.471686
position_x             

### Check the "Merged" Folder

In [54]:
from pathlib import Path
import re
import pandas as pd

# =========================
# CONFIG
# =========================
MERGED_DIR = BASE_DIR  
MERGED_DIR = Path("/project/def-nahee/kbas/POM_Response_Parquet/merged")  # örnek
TRUTH_TABLE = "truth"

# features tablonun adı sende "features" idi; değilse aşağıyı değiştir
FEATURE_TABLE = "features"

# kaç batch dosyası inceleyelim?
N_FILES = 3


In [55]:

# =========================
# HELPERS
# =========================
def list_tables(root: Path):
    tables = sorted([p.name for p in root.iterdir() if p.is_dir()])
    print("Tables in merged:", tables)
    return tables

def list_parquets(table_dir: Path):
    files = sorted(table_dir.glob("*.parquet"))
    return files

def get_batch_id(fname: str):
    # truth_12.parquet / features_12.parquet gibi isimlerden 12’yi çekmeye çalışır
    m = re.search(r"_(\d+)\.parquet$", fname)
    return int(m.group(1)) if m else None

def quick_df_info(df: pd.DataFrame, name: str):
    print(f"\n--- {name} ---")
    print("rows:", len(df), "cols:", df.shape[1])
    print("columns:", list(df.columns))
    if "event_no" in df.columns:
        print("event_no unique:", df["event_no"].nunique())
        # truth için genelde 1 row/event bekleriz
        vc = df["event_no"].value_counts()
        print("event_no value_counts head:")
        print(vc.head(5))
    else:
        print("NO event_no column in columns (maybe it's index?)")


In [56]:

# =========================
# MAIN
# =========================
assert MERGED_DIR.exists(), f"MERGED_DIR not found: {MERGED_DIR}"

tables = list_tables(MERGED_DIR)

truth_dir = MERGED_DIR / TRUTH_TABLE
feat_dir  = MERGED_DIR / FEATURE_TABLE

assert truth_dir.exists(), f"truth dir not found: {truth_dir}"
assert feat_dir.exists(), f"feature dir not found: {feat_dir}"

truth_files = list_parquets(truth_dir)
feat_files  = list_parquets(feat_dir)

print("\nTruth parquet count:", len(truth_files))
print("Feature parquet count:", len(feat_files))


Tables in merged: ['features', 'truth']

Truth parquet count: 1
Feature parquet count: 1


In [57]:

# batch id setleri
truth_ids = {get_batch_id(f.name) for f in truth_files}
feat_ids  = {get_batch_id(f.name) for f in feat_files}
truth_ids.discard(None); feat_ids.discard(None)

print("\nBatch IDs (truth)  :", sorted(list(truth_ids))[:20], "...")
print("Batch IDs (feature):", sorted(list(feat_ids))[:20], "...")
print("IDs in both:", len(truth_ids & feat_ids))
print("Only truth:", sorted(list(truth_ids - feat_ids))[:20])
print("Only feat :", sorted(list(feat_ids - truth_ids))[:20])



Batch IDs (truth)  : [0] ...
Batch IDs (feature): [0] ...
IDs in both: 1
Only truth: []
Only feat : []


In [58]:

# ortak batch’lerden ilk N tanesini inceleyelim
common_ids = sorted(list(truth_ids & feat_ids))
if not common_ids:
    raise RuntimeError("No common batch ids between truth and feature!")

inspect_ids = common_ids[:N_FILES]
print("\nInspecting batch ids:", inspect_ids)

for bid in inspect_ids:
    tpath = truth_dir / f"{TRUTH_TABLE}_{bid}.parquet"
    fpath = feat_dir  / f"{FEATURE_TABLE}_{bid}.parquet"

    tdf = pd.read_parquet(tpath)
    fdf = pd.read_parquet(fpath)

    quick_df_info(tdf, f"TRUTH batch {bid} ({tpath.name})")
    quick_df_info(fdf, f"FEATURE batch {bid} ({fpath.name})")

    # ---- Sanity checks ----
    # 1) truth: idealde 1 row/event_no
    if "event_no" in tdf.columns:
        truth_bad = (tdf["event_no"].value_counts() != 1)
        n_bad = truth_bad.sum()
        print("truth rows != 1 per event:", int(n_bad))
        if n_bad > 0:
            bad_ids = truth_bad[truth_bad].index[:10].tolist()
            print("example bad truth event_no:", bad_ids)

    # 2) feature eventleri truth eventlerinin alt kümesi mi?
    if "event_no" in tdf.columns and "event_no" in fdf.columns:
        tset = set(tdf["event_no"].unique())
        fset = set(fdf["event_no"].unique())
        missing_in_truth = sorted(list(fset - tset))[:10]
        missing_in_feat  = sorted(list(tset - fset))[:10]
        print("feature events not in truth (sample):", missing_in_truth)
        print("truth events not in feature (sample):", missing_in_feat)

    # 3) hızlı göz at: ilk 5 satır
    print("\nTRUTH head:")
    print(tdf.head(5))
    print("\nFEATURE head:")
    print(fdf.head(5))
    print("\n" + "="*80)



Inspecting batch ids: [0]

--- TRUTH batch 0 (truth_0.parquet) ---
rows: 746 cols: 15
columns: ['energy', 'position_x', 'position_y', 'position_z', 'azimuth', 'zenith', 'pid', 'event_time', 'sim_type', 'interaction_type', 'elasticity', 'RunID', 'SubrunID', 'EventID', 'SubEventID']
NO event_no column in columns (maybe it's index?)

--- FEATURE batch 0 (features_0.parquet) ---
rows: 3372308 cols: 5
columns: ['charge', 'dom_time', 'dom_x', 'dom_y', 'dom_z']
NO event_no column in columns (maybe it's index?)

TRUTH head:
                energy   position_x  position_y  position_z   azimuth  \
event_no                                                                
696         639.466672 -1046.671099 -178.810131  775.750951  3.668017   
651         345.979393  -134.821341 -156.781099  511.837625  0.036637   
672       25579.431266   -12.274139 -366.356952  476.605146  3.963338   
695         376.114768   298.849338  526.693845 -363.415712  5.155793   
653         292.086474   226.158901  63