# Phase 1 — Data Assembly (Manifests + Splits)

This notebook builds canonical CSV **manifests** and **train/val/test splits** for the MRI MVP.

**Key rule:** `/kaggle/working` is *ephemeral*. After any Kaggle runtime restart, rerun this notebook **top-to-bottom** to regenerate artifacts.

## Outputs (written to `/kaggle/working/data_artifacts`)
### Manifests
- `manifest_4class_images.csv` — 4-class image paths + labels (folder datasets)
- `manifest_4class_npz.csv` — NPZ index manifest (external dataset)
- `manifest_challenge_non_tumor.csv` — near-domain “challenge” pool (non-tumor / OOD-ish)
- `run_metadata_manifests.json` — build timestamp (UTC) + row count per manifest

### Splits
- `split_train_images.csv`, `split_val_images.csv`, `split_test_images.csv`
- `split_external_test_npz.csv`
- `split_challenge_sampled.csv`


## 0) Configuration

In [None]:
# CELL: 01_CONFIG — Imports + canonical paths + dataset map (single source of truth)
from pathlib import Path
from datetime import datetime, timezone
import json
import subprocess

import numpy as np
import pandas as pd

# Read-only dataset mount root and the artifact directory every later cell writes to.
DATASETS = Path("/kaggle/input/datasets")
OUT = Path("/kaggle/working/data_artifacts")
OUT.mkdir(parents=True, exist_ok=True)

# Dataset slugs ("owner/name"), in the order the inventory cell reports them.
# Each root is DATASETS/<owner>/<name>, derived from the slug itself.
_DATASET_SLUGS = (
    # 4-class image folders (primary training pool)
    "masoudnickparvar/brain-tumor-mri-dataset",
    "sabersakin/brainmri",
    # 4-class NPZ (external test pool)
    "muazalzoubi/brain-tumor-gliomameningiomapituitary-not-tumors",
    # near-domain challenge pools (non-tumor / OOD-ish)
    "mitangshu11/brain-stroke-mri-images",
    "ninadaithal/imagesoasis",
    "trainingdatapro/dicom-brain-dataset",
)
DATASET_ROOTS = {slug: DATASETS.joinpath(*slug.split("/")) for slug in _DATASET_SLUGS}

# Folder name -> class id. Three spellings of the "no tumor" folder share id 3.
LABELS = {"glioma": 0, "meningioma": 1, "pituitary": 2}
LABELS.update({alias: 3 for alias in ("notumor", "no_tumor", "no tumor")})

# Lower-case file suffixes accepted as images.
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"}

for tag, location in (("DATASETS:", DATASETS), ("OUT:", OUT)):
    print(tag, location, "exists:", location.exists())


In [None]:
# Show up to the first 200 entries under /kaggle/input, one per line.
!ls -1 /kaggle/input | sed -n '1,200p'


## 1) Dataset inventory (existence + size)

In [None]:
# CELL: 02_DATASET_INVENTORY — Validate dataset roots and print quick size (du -sh)
def _du_sh(p: Path) -> str:
    try:
        out = subprocess.check_output(["du", "-sh", str(p)], stderr=subprocess.DEVNULL).decode().strip()
        return out.split()[0]
    except Exception:
        return "n/a"

# Walk every configured dataset root: report presence, size and a few top-level
# entries; collect the slugs whose root directory is absent.
missing = []
for slug, root in DATASET_ROOTS.items():
    present = root.exists()
    status = "OK " if present else "MISS"
    size = _du_sh(root) if present else "-"
    print(status, f"{slug:65s}", "size:", f"{size:>6s}", "path:", root)
    if not present:
        missing.append(slug)
        continue
    # show a few top-level entries (cheap)
    try:
        names = sorted(child.name for child in root.iterdir())
        print("   -> top entries:", names[:8])
    except Exception as err:
        print("   -> (could not list entries):", repr(err))

# Stop here if any dataset is not attached; later cells need all of them.
assert not missing, f"Missing required dataset roots: {missing}"


## 2) Build manifests (canonical CSVs)

In [None]:
# CELL: 03_BUILD_MANIFESTS — Write manifest CSVs to /kaggle/working/data_artifacts
from pathlib import Path

def infer_label(path: Path, labels=None):
    """Infer the class label of *path* from the names of its path components.

    Each component is lower-cased and compared for exact equality with the
    label names. Components are scanned from the file upwards, so when more
    than one component matches, the folder nearest to the file wins (rather
    than whichever label happens to come first in the label table).

    Args:
        path: Path whose components are inspected.
        labels: Optional mapping ``label_name -> label_id``. Keys must be
            lower-case to be matchable. Defaults to the module-level ``LABELS``.

    Returns:
        ``(label_name, label_id)`` for the matched component, or
        ``(None, None)`` when no component is a known label name.
    """
    table = LABELS if labels is None else labels
    for part in reversed(path.parts):
        key = part.lower()
        if key in table:
            return key, table[key]
    return None, None

def build_image_manifest(dataset_slug: str, root: Path) -> pd.DataFrame:
    """Index every labelled image file below *root*.

    A file is kept when its lower-cased suffix is in ``IMAGE_EXTS`` and
    ``infer_label`` recognises a label in its path; everything else is skipped.

    Args:
        dataset_slug: Value stored in the ``source`` column of every row.
        root: Directory that is searched recursively.

    Returns:
        DataFrame with columns ``source``, ``path``, ``label_name``,
        ``label_id``. The columns are present even when no file qualifies.
    """
    columns = ["source", "path", "label_name", "label_id"]
    rows = []
    # rglob() yields entries in filesystem order; sorting makes the manifest
    # (and any seeded shuffle applied to it later) reproducible across runs.
    for p in sorted(root.rglob("*")):
        # Cheap suffix test first so is_file() is only called on image candidates.
        if p.suffix.lower() not in IMAGE_EXTS or not p.is_file():
            continue
        name, lid = infer_label(p)
        if lid is None:
            continue
        rows.append({"source": dataset_slug, "path": str(p), "label_name": name, "label_id": lid})
    return pd.DataFrame(rows, columns=columns)

def build_npz_manifest(split_name: str, npz_path: Path) -> pd.DataFrame:
    """Build a per-sample index for one NPZ archive.

    Only the ``"y"`` array of the archive is read. Labels may be stored either
    as a 2-D score/one-hot matrix (class id = argmax over axis 1) or as
    integer class ids of shape ``(n,)`` / ``(n, 1)``.

    Args:
        split_name: Value stored in the ``split`` column of every row.
        npz_path: Path of the ``.npz`` archive.

    Returns:
        DataFrame with columns ``source``, ``container``, ``split``,
        ``index`` (position inside the archive) and ``label_id``.

    Raises:
        ValueError: If ``y`` has a shape that cannot be mapped to one label
            per sample.
    """
    # NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
    # can execute arbitrary code. Kept because the archive may need it; only use
    # with trusted archives and drop the flag if "y" is a plain numeric array.
    with np.load(npz_path, allow_pickle=True) as d:  # closes the archive handle
        y = np.asarray(d["y"])

    if y.ndim == 2 and y.shape[1] > 1:
        lids = np.argmax(y, axis=1).astype(int)
    elif y.ndim == 1 or (y.ndim == 2 and y.shape[1] == 1):
        # Already class ids; argmax over a single column would return all zeros.
        lids = y.reshape(-1).astype(int)
    else:
        raise ValueError(f"Unsupported label array shape {y.shape} in {npz_path}")

    return pd.DataFrame({
        "source": "muazalzoubi/brain-tumor-gliomameningiomapituitary-not-tumors",
        "container": str(npz_path),
        "split": split_name,
        "index": np.arange(len(lids), dtype=int),
        "label_id": lids,
    })

def list_images(root: Path, source: str, domain: str, exts=None) -> pd.DataFrame:
    """List every file below *root* whose suffix is an accepted extension.

    Args:
        root: Directory that is searched recursively.
        source: Value stored in the ``source`` column of every row.
        domain: Value stored in the ``domain`` column of every row.
        exts: Optional iterable of suffixes (with leading dot, matched
            case-insensitively). Defaults to the module-level ``IMAGE_EXTS``.

    Returns:
        DataFrame with columns ``source``, ``domain``, ``path``, ordered by
        path. The columns are present even when nothing matches.
    """
    wanted = IMAGE_EXTS if exts is None else {e.lower() for e in exts}
    rows = [
        {"source": source, "domain": domain, "path": str(p)}
        # rglob() order is filesystem-dependent; sort so the listing is stable.
        for p in sorted(root.rglob("*"))
        if p.suffix.lower() in wanted and p.is_file()
    ]
    return pd.DataFrame(rows, columns=["source", "domain", "path"])

# --- 2.1 4-class image manifests (folder datasets)
df_img = pd.concat([
    build_image_manifest("masoudnickparvar/brain-tumor-mri-dataset", DATASET_ROOTS["masoudnickparvar/brain-tumor-mri-dataset"]),
    build_image_manifest("sabersakin/brainmri", DATASET_ROOTS["sabersakin/brainmri"]),
], ignore_index=True)

p_img = OUT/"manifest_4class_images.csv"
df_img.to_csv(p_img, index=False)

# --- 2.2 4-class NPZ manifest (external dataset)
MUAZ = DATASET_ROOTS["muazalzoubi/brain-tumor-gliomameningiomapituitary-not-tumors"]
print("MUAZ exists:", MUAZ.exists(), "->", MUAZ)
print("MUAZ files:", [p.name for p in MUAZ.glob("*.npz")])

df_npz = pd.concat([
    build_npz_manifest("train", MUAZ/"training.npz"),
    build_npz_manifest("val", MUAZ/"validation.npz"),
    build_npz_manifest("test", MUAZ/"test.npz"),
], ignore_index=True)

p_npz = OUT/"manifest_4class_npz.csv"
df_npz.to_csv(p_npz, index=False)

# --- 2.3 Challenge manifest (near-domain / non-tumor pools)
# The DICOM pool is listed by hand because its files use the ".dcm" suffix.
# Paths are sorted so the manifest row order does not depend on filesystem
# enumeration order.
df_chal = pd.concat([
    list_images(DATASET_ROOTS["mitangshu11/brain-stroke-mri-images"], "mitangshu11/brain-stroke-mri-images", "stroke"),
    list_images(DATASET_ROOTS["ninadaithal/imagesoasis"], "ninadaithal/imagesoasis", "oasis"),
    pd.DataFrame([{
        "source": "trainingdatapro/dicom-brain-dataset",
        "domain": "normal_dicom",
        "path": str(p),
    } for p in sorted(DATASET_ROOTS["trainingdatapro/dicom-brain-dataset"].rglob("*"))
      if p.suffix.lower() == ".dcm" and p.is_file()])
], ignore_index=True)

p_chal = OUT/"manifest_challenge_non_tumor.csv"
df_chal.to_csv(p_chal, index=False)

# --- hard assertions + metadata snapshot
# A file-size check alone is not enough: an empty DataFrame still writes a
# non-empty CSV, so the row count is checked explicitly as well.
for p, df in [(p_img, df_img), (p_npz, df_npz), (p_chal, df_chal)]:
    assert p.exists() and p.stat().st_size > 0, f"Failed to write {p}"
    assert len(df) > 0, f"Manifest has no rows: {p}"

meta = {
    "utc_built_at": datetime.now(timezone.utc).isoformat(),
    "out_dir": str(OUT),
    "rows": {
        p_img.name: int(len(df_img)),
        p_npz.name: int(len(df_npz)),
        p_chal.name: int(len(df_chal)),
    },
}
(OUT/"run_metadata_manifests.json").write_text(json.dumps(meta, indent=2))
print("OK wrote:", p_img.name, "rows=", len(df_img))
print("OK wrote:", p_npz.name, "rows=", len(df_npz))
print("OK wrote:", p_chal.name, "rows=", len(df_chal))
print("OK manifests present in:", OUT)


## 3) Verify manifests (counts + label distributions)

In [None]:
# CELL: 04_VERIFY_MANIFESTS — Read manifests and print sanity summaries
# Reload the three manifests from disk so the summaries reflect what was written.
df_img, df_npz, df_chal = (
    pd.read_csv(OUT / fname)
    for fname in (
        "manifest_4class_images.csv",
        "manifest_4class_npz.csv",
        "manifest_challenge_non_tumor.csv",
    )
)

# Row count followed by per-column value counts for each manifest.
print("manifest_4class_images rows:", len(df_img))
for heading, column in (("  sources:\n", "source"), ("  labels:\n", "label_name")):
    print(heading, df_img[column].value_counts())

print("\nmanifest_4class_npz rows:", len(df_npz))
for heading, column in (("  splits:\n", "split"), ("  label_id:\n", "label_id")):
    print(heading, df_npz[column].value_counts())

print("\nmanifest_challenge_non_tumor rows:", len(df_chal))
print("  domains:\n", df_chal["domain"].value_counts())

# Every CSV currently in the artifact directory, with its size in MB.
print("\nArtifacts in OUT:")
for csv_path in sorted(OUT.glob("*.csv")):
    size_mb = csv_path.stat().st_size / 1e6
    print(" -", csv_path.name, f"({size_mb:.2f} MB)")


## 4) Build splits (train/val/test + external test + challenge sample)

In [None]:
# CELL: 05_BUILD_SPLITS — Write split CSVs (future-safe groupby apply)
OUT = Path("/kaggle/working/data_artifacts")

df_img  = pd.read_csv(OUT/"manifest_4class_images.csv")
df_npz  = pd.read_csv(OUT/"manifest_4class_npz.csv")
df_chal = pd.read_csv(OUT/"manifest_challenge_non_tumor.csv")

print("loaded:", len(df_img), "images;", len(df_npz), "npz rows;", len(df_chal), "challenge rows")
# Fail with a clear message instead of an obscure concat error further down.
assert len(df_img) > 0, "manifest_4class_images.csv has no rows; cannot build splits"

# 4.1 image-folder split (stratified by label_id)
TEST_FRAC = 0.10
VAL_FRAC  = 0.10  # of remaining after test
SEED = 42

# A seeded shuffle is only reproducible when the input order is. Put the rows
# into a canonical order first so split membership does not depend on the row
# order the manifest happened to be written in.
df_img = df_img.sort_values("path", kind="mergesort").reset_index(drop=True)
df_img = df_img.sample(frac=1.0, random_state=SEED).reset_index(drop=True)

train_parts, val_parts, test_parts = [], [], []
for lid, g in df_img.groupby("label_id"):
    g = g.sample(frac=1.0, random_state=SEED)
    n = len(g)
    # Per class: the first n_test rows go to test, then VAL_FRAC of the
    # remainder goes to val, and everything left is train.
    n_test = int(round(n * TEST_FRAC))
    test = g.iloc[:n_test]
    rem = g.iloc[n_test:]
    n_val = int(round(len(rem) * VAL_FRAC))
    val = rem.iloc[:n_val]
    train = rem.iloc[n_val:]
    train_parts.append(train)
    val_parts.append(val)
    test_parts.append(test)

# Shuffle again so the classes are interleaved rather than stored in blocks.
df_train = pd.concat(train_parts).sample(frac=1.0, random_state=SEED).reset_index(drop=True)
df_val   = pd.concat(val_parts).sample(frac=1.0, random_state=SEED).reset_index(drop=True)
df_test  = pd.concat(test_parts).sample(frac=1.0, random_state=SEED).reset_index(drop=True)

# 4.2 external test from NPZ: test split only (kept OUT-OF-TRAINING by default)
df_external_npz = df_npz[df_npz["split"] == "test"].copy().reset_index(drop=True)

# 4.3 challenge sampled (fast iteration; keep small but representative)
N_PER_DOMAIN = 2000
cols = ["source", "domain", "path"]
df_chal_sample = (
    # Canonical order before sampling, for the same reproducibility reason as above.
    df_chal.sort_values(["domain", "path"])
          .groupby("domain", group_keys=False)[cols]
          .apply(lambda g: g.sample(min(len(g), N_PER_DOMAIN), random_state=SEED))
          .reset_index(drop=True)
)

# write
p_train = OUT/"split_train_images.csv"
p_val   = OUT/"split_val_images.csv"
p_test  = OUT/"split_test_images.csv"
p_ext   = OUT/"split_external_test_npz.csv"
p_chal  = OUT/"split_challenge_sampled.csv"

df_train.to_csv(p_train, index=False)
df_val.to_csv(p_val, index=False)
df_test.to_csv(p_test, index=False)
df_external_npz.to_csv(p_ext, index=False)
df_chal_sample.to_csv(p_chal, index=False)

for p in [p_train, p_val, p_test, p_ext, p_chal]:
    assert p.exists() and p.stat().st_size > 0, f"Missing/empty after write: {p}"

print("\nImage splits:")
print("train:", len(df_train), "val:", len(df_val), "test:", len(df_test))
print("train label dist:\n", df_train["label_name"].value_counts())
print("val label dist:\n", df_val["label_name"].value_counts())
print("test label dist:\n", df_test["label_name"].value_counts())

print("\nExternal NPZ test:", len(df_external_npz))
print("npz label_id dist:\n", df_external_npz["label_id"].value_counts())

print("\nChallenge sampled:", len(df_chal_sample))
print("domain dist:\n", df_chal_sample["domain"].value_counts())

print("\nOK Split artifacts present.")


## 5) Verify outputs (quick peek)

In [None]:
from IPython.display import display

# CELL: 06_VERIFY_OUTPUTS — List artifacts and show sample rows
# Split artifacts this cell is concerned with.
paths = [
    OUT / fname
    for fname in (
        "split_train_images.csv",
        "split_val_images.csv",
        "split_test_images.csv",
        "split_external_test_npz.csv",
        "split_challenge_sampled.csv",
    )
]

# Every CSV currently in the artifact directory, with its size in MB.
print("Artifacts in OUT:")
for csv_path in sorted(OUT.glob("*.csv")):
    size_mb = csv_path.stat().st_size / 1e6
    print(" -", csv_path.name, f"({size_mb:.2f} MB)")

# First three rows of one split from each family (images, NPZ index, challenge).
print("\nHeads:")
for title, fname in (
    ("\ntrain head:", "split_train_images.csv"),
    ("\nexternal npz head:", "split_external_test_npz.csv"),
    ("\nchallenge head:", "split_challenge_sampled.csv"),
):
    print(title)
    display(pd.read_csv(OUT / fname).head(3))


## Notes / Known limitations (honest engineering)

- These datasets are mostly **2D slices** and may not have patient IDs. True patient-level splitting is not possible without metadata.
- We treat the NPZ dataset as **external test** to measure domain shift (kept out of training by default).
- The challenge pool is for **validity/OOD behavior checks** (we expect conservative confidence once gating exists).


### 6) Health Check

In [None]:
from pathlib import Path
import pandas as pd

OUT = Path("/kaggle/working/data_artifacts")
print("OUT:", OUT, "exists:", OUT.exists())

# Every artifact the data phase is expected to have produced.
expected = [
    "manifest_4class_images.csv",
    "manifest_4class_npz.csv",
    "manifest_challenge_non_tumor.csv",
    "split_train_images.csv",
    "split_val_images.csv",
    "split_test_images.csv",
    "split_external_test_npz.csv",
    "split_challenge_sampled.csv",
]

# An artifact counts as healthy only when it exists and is non-empty.
print("\nFiles:")
missing = []
for fname in expected:
    artifact = OUT / fname
    nbytes = artifact.stat().st_size if artifact.exists() else None
    healthy = nbytes is not None and nbytes > 0
    print("OK  " if healthy else "MISS", f"{fname:35s}", "size:", "-" if nbytes is None else nbytes)
    if not healthy:
        missing.append(fname)

assert not missing, f"Missing artifacts: {missing}"

# Load + validate splits
df_train, df_val, df_test, df_ext, df_chal = (
    pd.read_csv(OUT / fname)
    for fname in (
        "split_train_images.csv",
        "split_val_images.csv",
        "split_test_images.csv",
        "split_external_test_npz.csv",
        "split_challenge_sampled.csv",
    )
)

print("\nCounts:")
print("train:", len(df_train), "val:", len(df_val), "test:", len(df_test))
print("external_npz:", len(df_ext), "challenge_sampled:", len(df_chal))

# Sanity: required columns
image_schema = {"path", "label_id", "label_name", "source"}
required = {
    "train": (df_train, image_schema),
    "val": (df_val, image_schema),
    "test": (df_test, image_schema),
    "ext": (df_ext, {"container", "split", "index", "label_id", "source"}),
    "chal": (df_chal, {"path", "domain", "source"}),
}
for split_name, (frame, needed) in required.items():
    absent = needed - set(frame.columns)
    assert not absent, f"{split_name} missing columns: {absent}"
# Sanity: files actually exist on disk (sample a few)
def sample_exists(df, col, n=5):
    """Spot-check that a random sample of paths in ``df[col]`` exists on disk.

    Draws ``min(n, len(df))`` values with a fixed random state (42) and raises
    ``AssertionError`` naming the first sampled path that does not exist.
    """
    sample_size = min(n, len(df))
    picked = df[col].sample(sample_size, random_state=42).tolist()
    for candidate in picked:
        assert Path(candidate).exists(), f"Missing file on disk: {candidate}"

# Spot-check that sampled image paths from each split exist on disk.
sample_exists(df_train, "path", 8)
sample_exists(df_val, "path", 5)
sample_exists(df_test, "path", 5)

# Label sanity
labels = sorted(df_train["label_id"].unique().tolist())
print("\nLabel ids present in train:", labels)
assert set(labels) <= {0,1,2,3}, "Unexpected label ids"

print("\nLabel distribution (train):\n", df_train["label_name"].value_counts())
print("\nChallenge domains:\n", df_chal["domain"].value_counts())

# Leakage check: no path may appear in more than one image split. Uses the
# split frames loaded earlier in this cell instead of re-reading the CSVs.
train_set = set(df_train["path"])
val_set   = set(df_val["path"])
test_set  = set(df_test["path"])

assert train_set.isdisjoint(val_set), "Path overlap between train and val"
assert train_set.isdisjoint(test_set), "Path overlap between train and test"
assert val_set.isdisjoint(test_set), "Path overlap between val and test"
print("✅ No path overlap between train/val/test")

# Report success only after every check above, including the overlap check, has run.
print("\n✅ DATA PHASE HEALTHCHECK PASSED")
