In [1]:
import os
import glob
import pandas as pd
import numpy as np

In [2]:
# Root of the parquet export and the id of the batch to inspect.
BASE_DIR = "/project/def-nahee/kbas/POM_Response_Parquet"
bid = 1189

# The features and truth files of one batch share the same zero-padded id.
feat_path, truth_path = (
    f"{BASE_DIR}/features/pom_response_batch_{bid:03d}_features.parquet",
    f"{BASE_DIR}/truth/pom_response_batch_{bid:03d}_truth.parquet",
)
df_feat, df_truth = map(pd.read_parquet, (feat_path, truth_path))

for label, frame in (("features shape:", df_feat), ("truth shape   :", df_truth)):
    print(label, frame.shape)



features shape: (8554, 11)
truth shape   : (31, 15)


In [3]:

# Basic info
# Print the same overview for both tables: size, column names, number of
# distinct event_no values, and the first five rows.
# NOTE: event_no is only looked up among the columns here; when it is stored
# as the index instead, the "NO event_no" fallback is printed.
for title, frame in (("=== FEATURES ===", df_feat), ("=== TRUTH ===", df_truth)):
    n_rows, n_cols = frame.shape
    print(title)
    print("rows:", n_rows, "cols:", n_cols)
    print("columns:", frame.columns.tolist())
    if "event_no" in frame.columns:
        print("event_no unique:", frame["event_no"].nunique())
    else:
        print("event_no unique:", "NO event_no")
    print(frame.head(5))
    print()



=== FEATURES ===
rows: 8554 cols: 11
columns: ['charge', 'dom_time', 'dom_x', 'dom_y', 'dom_z', 'string', 'pmt_number', 'dom_number', 'pmt_x', 'pmt_y', 'pmt_z']
event_no unique: NO event_no
            charge     dom_time  dom_x       dom_y  dom_z  string  pmt_number  \
event_no                                                                        
43719     1.264909  1945.716981  680.0  239.230485  150.0     246          16   
43719     1.058583  1866.558264  680.0  239.230485  200.0     246           7   
43719     0.864575  2017.284672  680.0  239.230485  250.0     246           6   
43719     1.383859  1915.451898  680.0  239.230485  250.0     246          15   
43719     0.967552  2106.463855  640.0  308.512517  150.0     263           6   

          dom_number       pmt_x       pmt_y       pmt_z  
event_no                                                  
43719             13  680.138361  239.139241  149.861639  
43719             14  680.138361  239.321728  199.861639  
43719 

In [4]:
# Show the name carried by each table's index (features first, then truth).
index_names = [frame.index.name for frame in (df_feat, df_truth)]
print(*index_names)

event_no event_no


In [5]:
def _n_events(frame):
    """Count distinct event_no values, read from the index if it is named event_no, else from the column."""
    if frame.index.name == "event_no":
        return frame.index.nunique()
    return frame["event_no"].nunique()


feat_event_nunique = _n_events(df_feat)
truth_event_nunique = _n_events(df_truth)

print("feat events:", feat_event_nunique)
print("truth events:", truth_event_nunique)


feat events: 31
truth events: 31


In [6]:
# Distinct index values of each table, compared through set differences.
feat_events, truth_events = (
    set(frame.index.unique()) for frame in (df_feat, df_truth)
)

# Ids present on one side only, in ascending order.
missing_in_feat = sorted(truth_events.difference(feat_events))
missing_in_truth = sorted(feat_events.difference(truth_events))

print("missing_in_feat:", missing_in_feat)
print("missing_in_truth:", missing_in_truth)


missing_in_feat: []
missing_in_truth: []


## Sanity Checks

In [7]:
# Both tables must be indexed by event_no before any per-event grouping.
for frame in (df_feat, df_truth):
    assert frame.index.name == "event_no"

# Row totals next to the number of distinct index values per table.
print("features rows:", df_feat.shape[0])
print("truth rows   :", df_truth.shape[0])
print("n events feat:", df_feat.index.nunique())
print("n events tru :", df_truth.index.nunique())


features rows: 8554
truth rows   : 31
n events feat: 31
n events tru : 31


In [8]:
# Number of feature rows per index value (one row is treated as one pulse here).
pulse_counts = df_feat.groupby(level=0).size()

summary_percentiles = [0.01, 0.05, 0.5, 0.95, 0.99]
print("Pulse count summary:")
print(pulse_counts.describe(percentiles=summary_percentiles))

# Events holding the fewest and the most rows.
fewest_event, most_event = pulse_counts.idxmin(), pulse_counts.idxmax()
print("\nMin pulses event_no:", int(fewest_event), "pulses:", int(pulse_counts.min()))
print("Max pulses event_no:", int(most_event), "pulses:", int(pulse_counts.max()))


Pulse count summary:
count      31.000000
mean      275.935484
std       527.410462
min         5.000000
1%          5.900000
5%          9.500000
50%        88.000000
95%      1328.000000
99%      2179.600000
max      2488.000000
dtype: float64

Min pulses event_no: 43733 pulses: 5
Max pulses event_no: 43735 pulses: 2488


In [9]:
# Truth rows per index value, then how often each row count occurs.
truth_counts = df_truth.groupby(level=0).size()
count_histogram = truth_counts.value_counts().sort_index()
print(count_histogram)

# Collect the events whose truth row count is not exactly one.
bad = truth_counts.loc[truth_counts.ne(1)]
n_bad = len(bad)
print("\nEvents with truth_rows != 1:", n_bad)
if n_bad:
    print(bad.head(10))


1    31
Name: count, dtype: int64

Events with truth_rows != 1: 0


In [10]:
import numpy as np

def nan_inf_report(df, name):
    """Print the ten largest per-column NaN counts and infinite-value counts of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    name : str
        Label inserted into the two section headers.

    Notes
    -----
    Infinite values are counted only for columns that pandas reports as
    numeric; all other columns are left out of the INF section.
    Nothing is returned; the report is written with ``print``.
    """
    missing_per_col = df.isna().sum().sort_values(ascending=False)

    # Walk the columns in their original order so the counts keep that
    # order before sorting.
    inf_by_col = {}
    for col in df.columns:
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue
        as_float = df[col].to_numpy(dtype=float, copy=False)
        inf_by_col[col] = np.isinf(as_float).sum()
    inf_per_col = pd.Series(inf_by_col).sort_values(ascending=False)

    print(f"\n=== {name} NaN top ===")
    print(missing_per_col.head(10))
    print(f"\n=== {name} INF top ===")
    print(inf_per_col.head(10))

# Run the NaN / inf report on the features table, then on the truth table.
for frame, label in ((df_feat, "FEATURES"), (df_truth, "TRUTH")):
    nan_inf_report(frame, label)



=== FEATURES NaN top ===
charge        0
dom_time      0
dom_x         0
dom_y         0
dom_z         0
string        0
pmt_number    0
dom_number    0
pmt_x         0
pmt_y         0
dtype: int64

=== FEATURES INF top ===
charge        0
dom_time      0
dom_x         0
dom_y         0
dom_z         0
string        0
pmt_number    0
dom_number    0
pmt_x         0
pmt_y         0
dtype: int64

=== TRUTH NaN top ===
energy              0
position_x          0
position_y          0
position_z          0
azimuth             0
zenith              0
pid                 0
event_time          0
sim_type            0
interaction_type    0
dtype: int64

=== TRUTH INF top ===
energy              0
position_x          0
position_y          0
position_z          0
azimuth             0
zenith              0
pid                 0
event_time          0
interaction_type    0
elasticity          0
dtype: int64


In [11]:
def range_check(df, cols, name):
    """Print the minimum and maximum of each requested numeric column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    cols : iterable of str
        Column names to report, in the order they should be printed.
    name : str
        Label inserted into the section header.

    Notes
    -----
    Names in ``cols`` that are not columns of ``df``, or whose column is not
    numeric, are skipped without any message. Nothing is returned.
    """
    print(f"\n=== {name} ranges ===")
    numeric_present = (
        col for col in cols
        if col in df.columns and pd.api.types.is_numeric_dtype(df[col])
    )
    for col in numeric_present:
        lo, hi = df[col].min(), df[col].max()
        print(f"{col:>12}  min={lo:.6g}  max={hi:.6g}")

# Columns to range-check per table; range_check silently skips any name that
# is not a numeric column of the table it is given.
feature_range_cols = ["charge","dom_time","dom_x","dom_y","dom_z","rde","pmt_area","event_time"]
truth_range_cols = ["energy","zenith","azimuth","position_x","position_y","position_z","elasticity","inelasticity"]

range_check(df_feat, feature_range_cols, "FEATURES")
range_check(df_truth, truth_range_cols, "TRUTH")



=== FEATURES ranges ===
      charge  min=0.252694  max=11.2153
    dom_time  min=1400.16  max=9930.32
       dom_x  min=-760  max=760
       dom_y  min=-730.718  max=724.205
       dom_z  min=-450  max=500

=== TRUTH ranges ===
      energy  min=132.555  max=1.38235e+06
      zenith  min=0.278003  max=2.64809
     azimuth  min=0.126747  max=6.15757
  position_x  min=-1302.38  max=1034.67
  position_y  min=-1079.86  max=2372.3
  position_z  min=-1956.6  max=1973.95
  elasticity  min=0.0742871  max=0.99552


In [12]:
# Spot-check the first three event ids that occur in both tables.
common_events = sorted(set(df_feat.index.unique()) & set(df_truth.index.unique()))
sample_events = common_events[:3]

# Truth fields shown when all of them exist; otherwise the first ten fields
# of the truth row are shown instead.
truth_subset_cols = ["energy","zenith","azimuth","pid","interaction_type","elasticity","inelasticity","is_starting"]
pulse_cols = ["charge","dom_time","dom_x","dom_y","dom_z"]

for e in sample_events:
    n_p = int((df_feat.index == e).sum())
    t = df_truth.loc[e]
    print("\n==============================")
    print("event_no:", e, "| pulses:", n_p, "| energy:", float(t["energy"]))
    print("truth (subset):")
    # Guard on every requested label, not only "is_starting": selecting a
    # list that contains a missing label raises KeyError.
    if all(col in t.index for col in truth_subset_cols):
        print(t[truth_subset_cols])
    else:
        print(t.head(10))
    print("\nfirst 5 pulses:")
    # The list selector [e] keeps the result a DataFrame even when the event
    # has a single row (a scalar label would return a Series in that case).
    print(df_feat.loc[[e], pulse_cols].head(5))



event_no: 43719 | pulses: 15 | energy: 3401.4438557625645
truth (subset):
energy                 3401.443856
position_x              766.099159
position_y              272.415879
position_z              204.561746
azimuth                   5.217458
zenith                    1.250895
pid                             14
event_time                       0
sim_type            LeptonInjector
interaction_type                 1
Name: 43719, dtype: object

first 5 pulses:
            charge     dom_time  dom_x       dom_y  dom_z
event_no                                                 
43719     1.264909  1945.716981  680.0  239.230485  150.0
43719     1.058583  1866.558264  680.0  239.230485  200.0
43719     0.864575  2017.284672  680.0  239.230485  250.0
43719     1.383859  1915.451898  680.0  239.230485  250.0
43719     0.967552  2106.463855  640.0  308.512517  150.0

event_no: 43720 | pulses: 111 | energy: 1419.953960325708
truth (subset):
energy                  1419.95396
position_x     