# Investigate: Parquets inside "merged" Folder

In [1]:
import os
import re
from pathlib import Path

import numpy as np
import pandas as pd

## CONFIG


In [2]:
# Root folder that contains the merged parquet tables. The default is the
# original cluster location; set the POM_MERGED_DIR environment variable to
# point the notebook at another copy without editing this cell.
MERGED_DIR = Path(
    os.environ.get(
        "POM_MERGED_DIR",
        "/project/def-nahee/kbas/POM_Response_Parquet/merged",
    )
)
# Name of the sub-folder of MERGED_DIR that holds the truth parquet files.
TRUTH_FOLDER = "truth"
# Name of the sub-folder of MERGED_DIR that holds the feature parquet files.
FEATURE_FOLDER = "features"

## HELPERS

In [3]:
def list_folders(root: Path):
    """Print and return the sorted names of the directories directly under ``root``."""
    names = []
    for entry in root.iterdir():
        # Plain files are ignored; only sub-directories are reported.
        if entry.is_dir():
            names.append(entry.name)
    names.sort()
    print("Folders in merged:", names)
    return names

def list_parquets(table_dir: Path):
    """Return the ``*.parquet`` files directly inside ``table_dir`` as a sorted list of paths."""
    parquet_paths = list(table_dir.glob("*.parquet"))
    parquet_paths.sort()
    return parquet_paths

def get_batch_id(fname: str):
    """Extract the trailing numeric batch id from a parquet file name.

    Example: ``truth_12.parquet`` / ``features_12.parquet`` -> ``12``.
    Returns ``None`` when the name does not end in ``_<digits>.parquet``.
    """
    match = re.search(r"_(\d+)\.parquet$", fname)
    if match is None:
        return None
    return int(match.group(1))



def quick_df_info(df: pd.DataFrame, name: str):
    """Print a quick overview of ``df`` and of its ``event_no`` key.

    Prints a header containing ``name``, the row/column counts and the column
    list. It then looks for ``event_no`` first among the columns and then
    among the index level names. When found, it prints where it was found,
    the number of distinct values (missing values counted) and the five most
    frequent values. When not found, it prints the index type, the index
    name(s) and the first index entries.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to describe.
    name : str
        Label used in the printed header.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print(f"\n--- {name} ---")
    print("rows:", len(df), "cols:", df.shape[1])
    print("columns:", list(df.columns))

    def _print_event_stats(s: pd.Series, where: str):
        """Print cardinality and the most frequent values of an event_no series."""
        print(f"event_no found in {where}")
        print("event_no unique:", s.nunique(dropna=False))
        vc = s.value_counts(dropna=False)
        print("event_no value_counts head:")
        print(vc.head(5))

    # 1) event_no stored as a regular column.
    if "event_no" in df.columns:
        _print_event_stats(df["event_no"], "columns")
        return

    # 2) event_no stored as the index, or as one level of a MultiIndex.
    idx = df.index
    idx_names = list(idx.names)
    if "event_no" in idx_names:
        if isinstance(idx, pd.MultiIndex):
            where = "index level 'event_no'"
        else:
            where = "index"
        # get_level_values also works on a flat Index whose name matches.
        s = pd.Series(idx.get_level_values("event_no"), name="event_no")
        _print_event_stats(s, where)
        return

    # 3) Not present. No reset_index() probe is done here: reset_index() can
    # only produce column names that are already in df.columns or idx.names
    # (both checked above), it copies the whole frame, and it raises
    # ValueError when an index name duplicates a column name.
    print("NO event_no in columns or index.")
    print("index type:", type(idx).__name__, "index name(s):", idx_names)
    try:
        print("index head:", list(idx[:5]))
    except Exception as exc:
        # Best effort only: report why the preview failed instead of hiding it.
        print(f"index head unavailable: {exc!r}")


def _quantile_label(q: float) -> str:
    """Build the summary column label for quantile ``q``.

    Whole percentiles keep the two-digit form (0.05 -> "p05", 0.5 -> "p50");
    fractional percentiles keep their fraction (0.999 -> "p99.9") so that
    they do not collide with a neighbouring whole percentile.
    """
    # Rounding absorbs float noise such as 0.29 * 100 == 28.999999999999996,
    # which plain int() truncation would turn into the wrong label "p28".
    pct = round(q * 100, 6)
    if pct == int(pct):
        return f"p{int(pct):02d}"
    return f"p{pct:g}"


def summarize_features(df: pd.DataFrame, quantiles=(0.01, 0.05, 0.50, 0.95, 0.99)) -> pd.DataFrame:
    """Build a one-row-per-column summary table of ``df``.

    Every column gets its name, dtype, total row count and null / non-null
    counts. Numeric columns additionally get mean, std, min, max and one
    ``pXX`` column per requested quantile. Non-numeric columns get the number
    of distinct non-null values (``nunique``) and their five most frequent
    values with counts (``top5``, formatted as ``value:count``).

    Parameters
    ----------
    df : pd.DataFrame
        Frame to summarise. Duplicated column names are summarised one by one.
    quantiles : iterable of float
        Quantiles in [0, 1] to report for numeric columns.

    Returns
    -------
    pd.DataFrame
        Summary with the columns ordered as: basic info, numeric statistics,
        non-numeric statistics, then anything else.
    """
    quantiles = list(quantiles)
    labels = [_quantile_label(q) for q in quantiles]
    n = len(df)
    rows = []

    # items() yields each column as a Series even when names are duplicated,
    # whereas df[col] would return a DataFrame for a duplicated name.
    for col, s in df.items():
        row = {
            "col": col,
            "dtype": str(s.dtype),
            "rows": n,
            "non_null": int(s.notna().sum()),
            "null": int(s.isna().sum()),
        }

        if pd.api.types.is_numeric_dtype(s):
            # Numeric summary.
            row.update({
                "mean": float(s.mean()),
                "std": float(s.std()),
                "min": float(s.min()),
                "max": float(s.max()),
            })
            # quantile() returns the values in the order requested, so they
            # can be paired with the labels positionally.
            for label, value in zip(labels, s.quantile(quantiles)):
                row[label] = float(value)
        else:
            # Non-numeric: basic cardinality and most frequent values.
            row["nunique"] = int(s.nunique(dropna=True))
            vc = s.value_counts(dropna=True).head(5)
            row["top5"] = ", ".join([f"{idx}:{cnt}" for idx, cnt in vc.items()])

        rows.append(row)

    out = pd.DataFrame(rows)

    # Preferred column order; dict.fromkeys drops repeated labels while
    # keeping the first occurrence.
    base = ["col", "dtype", "rows", "non_null", "null"]
    num_cols = ["mean", "std", "min"] + labels + ["max"]
    other = ["nunique", "top5"]
    preferred = list(dict.fromkeys(base + num_cols + other))
    ordered = [c for c in preferred if c in out.columns]
    ordered += [c for c in out.columns if c not in preferred]
    return out[ordered]


## MAIN

In [4]:
# Fail early if the root is missing. is_dir() (rather than exists()) also
# rejects a regular file at that path, which could not be listed afterwards.
assert MERGED_DIR.is_dir(), f"MERGED_DIR not found or not a directory: {MERGED_DIR}"

In [5]:
# Print the sub-folders found directly under MERGED_DIR and keep their names.
folders = list_folders(MERGED_DIR)

Folders in merged: ['features', 'truth']


In [6]:
# Locations of the two tables inside the merged folder.
truth_dir = MERGED_DIR / TRUTH_FOLDER
feat_dir = MERGED_DIR / FEATURE_FOLDER

# is_dir() instead of exists(): a regular file at either path would pass an
# exists() check and then silently yield no parquet files when globbed.
assert truth_dir.is_dir(), f"truth dir not found or not a directory: {truth_dir}"
assert feat_dir.is_dir(), f"feature dir not found or not a directory: {feat_dir}"

In [7]:
# Sorted lists of the *.parquet files found directly in each table folder.
truth_files = list_parquets(truth_dir)
feat_files  = list_parquets(feat_dir)

# Report how many files each table has.
print("\nTruth parquet count:", len(truth_files))
print("Feature parquet count:", len(feat_files))


Truth parquet count: 1
Feature parquet count: 1


In [8]:
# batch id sets
truth_ids = {get_batch_id(f.name) for f in truth_files}
feat_ids  = {get_batch_id(f.name) for f in feat_files}
truth_ids.discard(None); feat_ids.discard(None)

print("\nBatch IDs (truth)  :", sorted(list(truth_ids))[:20], "...")
print("Batch IDs (feature):", sorted(list(feat_ids))[:20], "...")
print("IDs in both:", len(truth_ids & feat_ids))
print("Only truth:", sorted(list(truth_ids - feat_ids))[:20])
print("Only feat :", sorted(list(feat_ids - truth_ids))[:20])



Batch IDs (truth)  : [0] ...
Batch IDs (feature): [0] ...
IDs in both: 1
Only truth: []
Only feat : []


In [9]:
# Batch ids that have both a truth file and a feature file, in ascending order.
common_ids = sorted(truth_ids.intersection(feat_ids))
if len(common_ids) == 0:
    # Nothing can be compared without at least one batch present on both sides.
    raise RuntimeError("No common batch ids between truth and feature!")
common_ids

[0]

In [10]:
# Inspect the first entry of common_ids (the smallest id, since the list is sorted).
inspect_id = common_ids[0]
print("\nInspecting batch id:", inspect_id)


Inspecting batch id: 0


In [11]:
# Use the files that were actually listed for this batch id instead of
# rebuilding "<folder>_<id>.parquet": get_batch_id accepts any prefix and
# zero-padded numbers, so a rebuilt name is not guaranteed to exist on disk.
# inspect_id comes from the ids found on both sides, so each lookup matches.
tpath = next(p for p in truth_files if get_batch_id(p.name) == inspect_id)
fpath = next(p for p in feat_files if get_batch_id(p.name) == inspect_id)
print(tpath, fpath)

/project/def-nahee/kbas/POM_Response_Parquet/merged/truth/truth_0.parquet /project/def-nahee/kbas/POM_Response_Parquet/merged/features/features_0.parquet


In [12]:
# Load the truth and feature tables of the selected batch into memory.
tdf = pd.read_parquet(tpath)
fdf = pd.read_parquet(fpath)

## Analysis

In [13]:
# Shape, columns and event_no statistics of the truth table.
quick_df_info(tdf, f"TRUTH batch {inspect_id} ({tpath.name})")


--- TRUTH batch 0 (truth_0.parquet) ---
rows: 746 cols: 15
columns: ['energy', 'position_x', 'position_y', 'position_z', 'azimuth', 'zenith', 'pid', 'event_time', 'sim_type', 'interaction_type', 'elasticity', 'RunID', 'SubrunID', 'EventID', 'SubEventID']
event_no found in index
event_no unique: 746
event_no value_counts head:
event_no
599    1
696    1
651    1
672    1
695    1
Name: count, dtype: int64


In [14]:
# Shape, columns and event_no statistics of the feature table.
quick_df_info(fdf, f"FEATURE batch {inspect_id} ({fpath.name})")


--- FEATURE batch 0 (features_0.parquet) ---
rows: 3372308 cols: 5
columns: ['charge', 'dom_time', 'dom_x', 'dom_y', 'dom_z']
event_no found in index
event_no unique: 746
event_no value_counts head:
event_no
403    9367
361    7531
443    7362
493    6577
405    5891
Name: count, dtype: int64


In [15]:
# Show the first rows of the truth table.
print("\n=== TRUTH head ===")
display(tdf.head())


=== TRUTH head ===


Unnamed: 0_level_0,energy,position_x,position_y,position_z,azimuth,zenith,pid,event_time,sim_type,interaction_type,elasticity,RunID,SubrunID,EventID,SubEventID
event_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
696,639.466672,-1046.671099,-178.810131,775.750951,3.668017,0.46026,-14,0,LeptonInjector,1,0.659847,1016,4294967295,187,0
651,345.979393,-134.821341,-156.781099,511.837625,0.036637,1.365456,14,0,LeptonInjector,1,0.484912,1016,4294967295,7,0
672,25579.431266,-12.274139,-366.356952,476.605146,3.963338,1.014043,14,0,LeptonInjector,1,0.570786,1016,4294967295,81,0
695,376.114768,298.849338,526.693845,-363.415712,5.155793,1.609414,-14,0,LeptonInjector,1,0.148769,1016,4294967295,186,0
653,292.086474,226.158901,631.034313,305.997852,5.312811,2.784032,14,0,LeptonInjector,1,0.153314,1016,4294967295,16,0


In [16]:
# Show the first rows of the feature table.
print("\n=== FEATURE head ===")
display(fdf.head())



=== FEATURE head ===


Unnamed: 0_level_0,charge,dom_time,dom_x,dom_y,dom_z
event_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
696,1.432395,1864.761098,-120.0,-730.717968,-300.0
696,1.257238,3782.905879,-120.0,-730.717968,-150.0
696,1.14719,9111.458881,-120.0,-730.717968,-150.0
696,0.946087,1736.127698,-120.0,-730.717968,200.0
696,0.666936,4235.702576,-120.0,-730.717968,250.0


In [17]:
# Take the first index entry of the truth table as an example event id and
# print up to ten of its rows from the feature table.
ev = tdf.index[0]
print("example event_no:", ev)
# Selecting with a list keeps the result a DataFrame even when the event has a
# single row; fdf.loc[ev] would collapse that case to a Series of the columns.
print(fdf.loc[[ev]].head(10))

example event_no: 696
            charge     dom_time  dom_x       dom_y  dom_z
event_no                                                 
696       1.432395  1864.761098 -120.0 -730.717968 -300.0
696       1.257238  3782.905879 -120.0 -730.717968 -150.0
696       1.147190  9111.458881 -120.0 -730.717968 -150.0
696       0.946087  1736.127698 -120.0 -730.717968  200.0
696       0.666936  4235.702576 -120.0 -730.717968  250.0
696       1.682696  4565.021736 -120.0 -730.717968  300.0
696       0.776496  3689.833058 -120.0 -730.717968  350.0
696       0.924859  3333.453982 -120.0 -730.717968  400.0
696       1.101063  8022.926487 -120.0 -730.717968  400.0
696       0.291259  1170.097076 -120.0 -730.717968  450.0


In [18]:
# Per-column summary of the feature table using the default quantiles.
summary = summarize_features(fdf)
summary

Unnamed: 0,col,dtype,rows,non_null,null,mean,std,min,p01,p05,p50,p95,p99,max
0,charge,float64,3372308,3372308,0,1.00898,0.308825,0.25,0.357754,0.522785,1.003308,1.500683,1.721955,11.132623
1,dom_time,float64,3372308,3372308,0,5027.118276,2845.896947,0.004003,118.025216,576.735888,4982.351576,9500.953163,9899.824083,9999.999022
2,dom_x,float64,3372308,3372308,0,-0.370417,389.104032,-760.0,-720.0,-640.0,0.0,640.0,720.0,760.0
3,dom_y,float64,3372308,3372308,0,-1.316477,385.656532,-730.717968,-730.717968,-592.153903,31.384388,654.922678,724.204711,724.204711
4,dom_z,float64,3372308,3372308,0,25.899117,288.492284,-450.0,-450.0,-450.0,50.0,500.0,500.0,500.0
