In [1]:
# CELL: KFold split builder
import os, json, math, numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import StratifiedKFold

In [2]:
import os
import torch
import torch.nn as nn
import json

from config import get_config
# Load the experiment configuration from the k-fold cross-validation YAML.
# The path pieces are joined by implicit string-literal concatenation.
cfg = get_config(
    config_path=(
        "/home/arsalan/wsu-grid/ml-jet-param-predictor/"
        "experiments/exp_kfold_pretrained_models/config/"
        "cross_validation_kfold_datasetrcalc.yml"
    )
)

# Alternative kept for reference (no explicit path):
# cfg=get_config()

# Dump every attribute of cfg as indented JSON for a quick visual check.
print(json.dumps(vars(cfg), indent=2))


[INFO] Config Path: /home/arsalan/wsu-grid/ml-jet-param-predictor/experiments/exp_kfold_pretrained_models/config/cross_validation_kfold_datasetrcalc.yml
[INFO] Detected native Ubuntu host: DS044955
[INFO] Using dataset root: /home/arsalan/Projects/110_JetscapeML/hm_jetscapeml_source/data/jet_ml_benchmark_config_01_to_09_alpha_0.2_0.3_0.4_q0_1.5_2.0_2.5_MMAT_MLBT_size_7200000_balanced_unshuffled
[INFO] Using dataset_size from config: 1008
{
  "model_tag": "exp_kfold_pretrained_models",
  "backbone": "vit_gaussian",
  "batch_size": 32,
  "epochs": 2,
  "learning_rate": 0.0001,
  "patience": 60,
  "input_shape": [
    1,
    32,
    32
  ],
  "global_max": 121.79151153564453,
  "dataset_root_dir": "/home/arsalan/Projects/110_JetscapeML/hm_jetscapeml_source/data/jet_ml_benchmark_config_01_to_09_alpha_0.2_0.3_0.4_q0_1.5_2.0_2.5_MMAT_MLBT_size_7200000_balanced_unshuffled",
  "train_csv": "/home/arsalan/Projects/110_JetscapeML/hm_jetscapeml_source/data/jet_ml_benchmark_config_01_to_09_alpha_0

In [3]:
# ---- config hooks you likely already have in your notebook ----
# cfg: your argparse/omegaconf-like object
# cfg.dataset_root_dir, cfg.train_csv, cfg.val_csv, cfg.test_csv
# cfg.group_size, cfg.global_max

# Utilities from your loader (we reuse your CSV schema)
from data.loader import load_split_from_csv, save_split_to_csv  # same columns as before

# Fold outputs live in a "folds" directory directly under the dataset root.
ROOT = Path(cfg.dataset_root_dir)
FOLDS_DIR = ROOT.joinpath("folds")
# exist_ok makes re-running this cell harmless; the root itself must already exist.
FOLDS_DIR.mkdir(exist_ok=True)

In [4]:
# Load the three base splits (train/val are later merged for CV; test stays as is).
# Each split is read from its CSV with the dataset root passed to the loader.
train_base, val_base, test_base = (
    load_split_from_csv(split_csv, cfg.dataset_root_dir)
    for split_csv in (cfg.train_csv, cfg.val_csv, cfg.test_csv)
)

In [None]:


# Pool train and val into one sequence for cross-validation; test is left untouched.
trainval = train_base + val_base
# First item of every (file_path, label) pair; only used as the sample axis for splitting.
X = np.array([path for path, _label in trainval])

# Pack tuple-label -> single class id for stratification
def pack_label(lbl_tuple, n_alpha=3, n_q0=4):
    """Collapse an (energy, alpha, q0) label tuple into a single class id.

    The id is a mixed-radix encoding ``e * (n_alpha * n_q0) + a * n_q0 + q``.
    With the defaults (3 alpha classes, 4 q0 classes) this is exactly
    ``e*12 + a*4 + q``; with 2 energy classes that yields 24 ids at most.

    Args:
        lbl_tuple: 3-item iterable ``(e, a, q)`` of class indices.
            Assumed to be 0-based ints with ``a < n_alpha`` and ``q < n_q0``;
            this is not checked here, and out-of-range values would make
            distinct label tuples share an id.
        n_alpha: Number of alpha classes (radix of the ``a`` digit).
        n_q0: Number of q0 classes (radix of the ``q`` digit).

    Returns:
        The packed class id, used as the stratification target.
    """
    e, a, q = lbl_tuple
    # Multipliers are derived from the class counts instead of hard-coded 12 and 4.
    return e * (n_alpha * n_q0) + a * n_q0 + q

# One packed class id per sample; this is the stratification target.
y = np.array([pack_label(label) for _path, label in trainval])

# Fold count and seed come from cfg when present, otherwise fall back to 5 / 42.
n_splits = getattr(cfg, "n_splits", 5)
random_state = getattr(cfg, "random_seed", 42)

skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=random_state,
)

fold_paths = []
for k, (tr_idx, va_idx) in enumerate(skf.split(X, y), start=1):
    # Each fold gets its own directory: <FOLDS_DIR>/fold_<k>
    fold_dir = FOLDS_DIR / f"fold_{k}"
    fold_dir.mkdir(parents=True, exist_ok=True)

    # Index back into the merged list so every row keeps its (path, label) pair.
    tr_list = [trainval[idx] for idx in tr_idx]
    va_list = [trainval[idx] for idx in va_idx]

    # Output CSV locations; the test split is the same data in every fold.
    tr_csv, va_csv, te_csv = (
        fold_dir / csv_name for csv_name in ("train.csv", "val.csv", "test.csv")
    )

    # Write the three splits, passing the dataset root through to the saver.
    for _rows, _csv in ((tr_list, tr_csv), (va_list, va_csv), (test_base, te_csv)):
        save_split_to_csv(_rows, str(_csv), cfg.dataset_root_dir)

    fold_paths.append(
        {
            "k": k,
            "train_csv": str(tr_csv),
            "val_csv": str(va_csv),
            "test_csv": str(te_csv),
        }
    )

print(f"[OK] Built {n_splits} folds at {FOLDS_DIR}")

In [None]:
# CELL 0 — Setup (paths & params)
import os
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

# ==== CONFIG: EDIT THESE ====
# Absolute dataset root; both aggregated label CSVs are resolved against it.
DATA_ROOT = Path("/wsu/home/gy/gy40/gy4065/hm_jetscapeml_source/data/jet_ml_benchmark_config_01_to_09_alpha_0.2_0.3_0.4_q0_1.5_2.0_2.5_MMAT_MLBT_size_7200000_balanced_unshuffled")

# Aggregated train / validation label files.
TRAIN_CSV = DATA_ROOT.joinpath("file_labels_aggregated_ds1008_g500_train.csv")
VAL_CSV = DATA_ROOT.joinpath("file_labels_aggregated_ds1008_g500_val.csv")

# Where the generated folds are written.
FOLDS_DIR = DATA_ROOT.joinpath("folds_agg_ds1008_g500")
N_SPLITS = 5
RANDOM_SEED = 42
# True: shuffle the combined rows (seeded) before splitting; False: keep incoming order.
SHUFFLE_BEFORE_SPLIT = True
# ============================

# Create the output directory (and any missing parents); safe to re-run.
FOLDS_DIR.mkdir(parents=True, exist_ok=True)
print(f"[INFO] Folds will be written to: {FOLDS_DIR}")


In [None]:
# CELL 1 — Load aggregated CSVs (train + val) and sanity-check
def _assert_columns(df: pd.DataFrame, file:str):
    required = {"agg_id", "file_paths", "energy_loss", "alpha", "q0"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"{file} is missing columns: {sorted(missing)}")

def _coerce_types(df: pd.DataFrame) -> pd.DataFrame:
    # Ensure label dtypes are ints (robust to str)
    for c in ["energy_loss", "alpha", "q0"]:
        df[c] = pd.to_numeric(df[c], downcast="integer")
    # Ensure agg_id is string-like (safe to keep as string)
    df["agg_id"] = df["agg_id"].astype(str)
    # file_paths must remain the pipe-separated string
    df["file_paths"] = df["file_paths"].astype(str)
    return df

# Read both aggregated label tables.
train_df = pd.read_csv(TRAIN_CSV)
val_df = pd.read_csv(VAL_CSV)

# Schema check comes first so a bad file fails before any dtype work.
for _frame, _csv in ((train_df, TRAIN_CSV), (val_df, VAL_CSV)):
    _assert_columns(_frame, str(_csv))

# Normalise label / id dtypes on each table.
train_df = _coerce_types(train_df)
val_df = _coerce_types(val_df)

print(f"[INFO] Loaded aggregated CSVs: train={len(train_df)}, val={len(val_df)}")

# Stack train on top of val with a fresh 0..n-1 index.
all_df = pd.concat([train_df, val_df], ignore_index=True)

if SHUFFLE_BEFORE_SPLIT:
    # Seeded full-frame permutation, then renumber the rows.
    all_df = (
        all_df
        .sample(frac=1.0, random_state=RANDOM_SEED)
        .reset_index(drop=True)
    )

print(f"[INFO] Combined rows (train+val): {len(all_df)}")
all_df.head()
