# MRI Phase 2 — Training Baseline v1 (Kaggle)

This notebook trains a **baseline 4‑class brain MRI tumor classifier** using CSV split artifacts created in the Data Phase.

## How to run (important)

1) Attach inputs in Kaggle (right panel → **Add Input**):
- The raw datasets (image folders + any OOD/non-tumor challenge datasets you used)
- The split artifacts dataset (e.g. `mri-data-artifacts-v1`) containing:
  - `split_train_images.csv`, `split_val_images.csv`, `split_test_images.csv`
  - `split_external_test_npz.csv`
  - `split_challenge_sampled.csv`

2) **Turn on GPU right before training**
- Settings → Accelerator → **GPU**
- Restart session
- Run top → bottom.

If you run on CPU, it will work but will be slow.

In [3]:
# CELL: 00_ARTIFACTS_PATHS - Resolve split CSV paths from Kaggle inputs

from pathlib import Path

# --- 1) Helpful visibility: what is actually mounted? ---
INPUT_ROOT = Path("/kaggle/input")
print("Mounted under /kaggle/input:")
if INPUT_ROOT.exists():
    for p in sorted(INPUT_ROOT.iterdir()):
        if p.is_dir():
            print(" -", p.name)

# --- 2) Candidate locations (explicit + fallback search) ---
CANDIDATES = [
    Path("/kaggle/working/data_artifacts"),
    Path("/kaggle/input/mri-data-artifacts-v1/data_artifacts"),  # <-- explicit artifacts dataset path
    Path("/kaggle/input"),  # fallback: search under mounted inputs
]

def find_file(name: str) -> Path | None:
    # Prefer direct hits first
    for base in CANDIDATES:
        if not base.exists():
            continue
        direct = base / name
        if direct.exists():
            return direct

    # Fallback: recursive search (can be slow, so last resort)
    for base in CANDIDATES:
        if not base.exists():
            continue
        hits = list(base.rglob(name))
        if hits:
            hits = sorted(hits, key=lambda p: (len(p.parts), str(p)))
            return hits[0]
    return None

split_train = find_file("split_train_images.csv")
split_val   = find_file("split_val_images.csv")
split_test  = find_file("split_test_images.csv")
split_ext   = find_file("split_external_test_npz.csv")
split_chal  = find_file("split_challenge_sampled.csv")

print("\nResolved split paths:")
print("split_train:", split_train)
print("split_val:  ", split_val)
print("split_test: ", split_test)
print("split_ext:  ", split_ext)
print("split_chal: ", split_chal)

assert all([split_train, split_val, split_test, split_ext, split_chal]), (
    "Missing one or more split CSVs.\n"
    "Make sure the dataset 'kabomolefe/mri-data-artifacts-v1' (Version 2) is attached as an Input.\n"
    "Expected folder: /kaggle/input/mri-data-artifacts-v1/data_artifacts"
)


Mounted under /kaggle/input:
 - datasets
 - mri-demo-artifacts-bundle

Resolved split paths:
split_train: /kaggle/input/datasets/kabomolefe/mri-data-artifacts-v1/data_artifacts/split_train_images.csv
split_val:   /kaggle/input/datasets/kabomolefe/mri-data-artifacts-v1/data_artifacts/split_val_images.csv
split_test:  /kaggle/input/datasets/kabomolefe/mri-data-artifacts-v1/data_artifacts/split_test_images.csv
split_ext:   /kaggle/input/datasets/kabomolefe/mri-data-artifacts-v1/data_artifacts/split_external_test_npz.csv
split_chal:  /kaggle/input/datasets/kabomolefe/mri-data-artifacts-v1/data_artifacts/split_challenge_sampled.csv


In [5]:
# CELL: 01_CONFIG - Reproducibility, device selection, output dirs, env snapshot

from pathlib import Path
import os, random, json
import numpy as np
import pandas as pd
import torch

SEED = 42

def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Deterministic is slower but stable for baselines
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)

USE_CUDA = torch.cuda.is_available()
DEVICE = "cuda" if USE_CUDA else "cpu"
print("DEVICE:", DEVICE)

OUT = Path("/kaggle/working/train_artifacts")
OUT.mkdir(parents=True, exist_ok=True)
print("OUT:", OUT)

# Environment snapshot (useful for anti-regression)
env = {
    "python": os.sys.version,
    "torch": torch.__version__,
    "cuda_available": USE_CUDA,
}
try:
    import torchvision
    env["torchvision"] = torchvision.__version__
except Exception as e:
    env["torchvision"] = f"IMPORT_ERROR: {repr(e)}"

with open(OUT/"env_snapshot.json", "w") as f:
    json.dump(env, f, indent=2)

print("✅ wrote env snapshot:", OUT/"env_snapshot.json")

DEVICE: cpu
OUT: /kaggle/working/train_artifacts
✅ wrote env snapshot: /kaggle/working/train_artifacts/env_snapshot.json


In [15]:
# RESTORE: copy trained model artifacts from the model-bundle dataset into OUT

from pathlib import Path
import shutil

OUT = Path("/kaggle/working/train_artifacts")
OUT.mkdir(parents=True, exist_ok=True)

CANDIDATES = [
    Path("/kaggle/input/mri-demo-artifacts-bundle/train_artifacts"),
    Path("/kaggle/input/mri-demo-artifacts-bundle"),
    Path("/kaggle/input/datasets/mri-demo-artifacts-bundle/train_artifacts"),
    Path("/kaggle/input/datasets/mri-demo-artifacts-bundle"),
]

src = None
for c in CANDIDATES:
    if c.exists():
        # accept either direct train_artifacts folder or dataset root containing train_artifacts
        if c.name == "train_artifacts":
            src = c
            break
        if (c / "train_artifacts").exists():
            src = c / "train_artifacts"
            break

print("Model artifact candidates:")
for c in CANDIDATES:
    print(" -", c, "exists=", c.exists())

assert src is not None, "Could not find model artifacts dataset mount. Make sure 'mri-demo-artifacts-bundle' is attached as Input."

print("Using:", src)

# copy everything (idempotent)
for p in src.glob("*"):
    if p.is_file():
        shutil.copy2(p, OUT / p.name)

print("Restored into:", OUT)
print("Files now in OUT:")
for p in sorted(OUT.iterdir()):
    if p.is_file():
        print(" -", p.name, f"{p.stat().st_size/1e6:.2f}MB")


Model artifact candidates:
 - /kaggle/input/mri-demo-artifacts-bundle/train_artifacts exists= True
 - /kaggle/input/mri-demo-artifacts-bundle exists= True
 - /kaggle/input/datasets/mri-demo-artifacts-bundle/train_artifacts exists= False
 - /kaggle/input/datasets/mri-demo-artifacts-bundle exists= False
Using: /kaggle/input/mri-demo-artifacts-bundle/train_artifacts
Restored into: /kaggle/working/train_artifacts
Files now in OUT:
 - best_model.pth 44.79MB
 - calib_test_logits.pt 0.05MB
 - calib_val_logits.pt 0.05MB
 - calibration_metrics.json 0.00MB
 - calibration_summary_before.csv 0.00MB
 - calibration_summary_before_after.csv 0.00MB
 - challenge_conf_by_pred.csv 0.00MB
 - challenge_confidence_summary.csv 0.00MB
 - challenge_policy_outputs.csv 0.85MB
 - challenge_policy_outputs_demo.csv 1.13MB
 - challenge_policy_outputs_v2.csv 0.95MB
 - challenge_policy_outputs_v3.csv 1.10MB
 - challenge_pred_distribution.csv 0.00MB
 - challenge_predictions.csv 0.77MB
 - domain_guard_tau_sweep.csv 0.00

In [16]:
# LOAD CHECKPOINT + build feat_extractor (robust to filename differences)

import torch
import torch.nn as nn
from pathlib import Path

OUT = Path("/kaggle/working/train_artifacts")

# Prefer best_model.pth, fall back to other known names
cands = [
    OUT/"best_model.pth",
    OUT/"mri_resnet18_baseline_best.pth",
]
ckpt_path = next((p for p in cands if p.exists()), None)

# last-resort: any .pth
if ckpt_path is None:
    any_pth = sorted(OUT.glob("*.pth"))
    ckpt_path = any_pth[0] if any_pth else None

assert ckpt_path is not None, f"No .pth checkpoint found in {OUT}. Run the RESTORE cell above."

print("Loading checkpoint:", ckpt_path)

ckpt = torch.load(ckpt_path, map_location=DEVICE)

# training notebook checkpoints are dicts with model_state_dict
if isinstance(ckpt, dict) and "model_state_dict" in ckpt:
    model.load_state_dict(ckpt["model_state_dict"])
else:
    # fallback: plain state_dict
    model.load_state_dict(ckpt)

model.eval()

feat_extractor = nn.Sequential(*list(model.children())[:-1]).to(DEVICE)
feat_extractor.eval()

print("✅ checkpoint loaded + feat_extractor ready")


Loading checkpoint: /kaggle/working/train_artifacts/best_model.pth
✅ checkpoint loaded + feat_extractor ready


In [5]:
# CELL: 01B_DEVICE_FLAGS - Ensure USE_CUDA + DEVICE are defined for downstream cells

import torch

USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")

print("USE_CUDA:", USE_CUDA)
print("DEVICE:", DEVICE)
if USE_CUDA:
    print("GPU:", torch.cuda.get_device_name(0))


USE_CUDA: False
DEVICE: cpu


In [17]:
# CELL: 01C_RESTORE_TRAIN_ARTIFACTS - Restore trained artifacts into OUT (robust mount detection)

from pathlib import Path
import shutil

OUT = Path("/kaggle/working/train_artifacts")
OUT.mkdir(parents=True, exist_ok=True)

CANDIDATES = [
    Path("/kaggle/input/mri-demo-artifacts-bundle/train_artifacts"),
    Path("/kaggle/input/mri_demo_artifacts_bundle/train_artifacts"),
    Path("/kaggle/input/datasets/mri-demo-artifacts-bundle/train_artifacts"),
    Path("/kaggle/input/datasets/mri_demo_artifacts_bundle/train_artifacts"),
]

SRC = next((p for p in CANDIDATES if p.exists()), None)

print("Restore candidates:")
for p in CANDIDATES:
    print(" -", p, "exists=", p.exists())

assert SRC is not None, "Could not find train_artifacts in mounted inputs. Confirm 'mri-demo-artifacts-bundle' is attached."

print("Using:", SRC)

for p in SRC.glob("*"):
    if p.is_file():
        shutil.copy2(p, OUT / p.name)

print("✅ Restored into:", OUT)
print("Files now in OUT:")
for p in sorted(OUT.glob("*")):
    if p.is_file():
        print(" -", p.name, f"{p.stat().st_size/1e6:.2f}MB")


Restore candidates:
 - /kaggle/input/mri-demo-artifacts-bundle/train_artifacts exists= True
 - /kaggle/input/mri_demo_artifacts_bundle/train_artifacts exists= False
 - /kaggle/input/datasets/mri-demo-artifacts-bundle/train_artifacts exists= False
 - /kaggle/input/datasets/mri_demo_artifacts_bundle/train_artifacts exists= False
Using: /kaggle/input/mri-demo-artifacts-bundle/train_artifacts
✅ Restored into: /kaggle/working/train_artifacts
Files now in OUT:
 - best_model.pth 44.79MB
 - calib_test_logits.pt 0.05MB
 - calib_val_logits.pt 0.05MB
 - calibration_metrics.json 0.00MB
 - calibration_summary_before.csv 0.00MB
 - calibration_summary_before_after.csv 0.00MB
 - challenge_conf_by_pred.csv 0.00MB
 - challenge_confidence_summary.csv 0.00MB
 - challenge_policy_outputs.csv 0.85MB
 - challenge_policy_outputs_demo.csv 1.13MB
 - challenge_policy_outputs_v2.csv 0.95MB
 - challenge_policy_outputs_v3.csv 1.10MB
 - challenge_pred_distribution.csv 0.00MB
 - challenge_predictions.csv 0.77MB
 - dom

In [7]:
from pathlib import Path

# 1) Define the dataset roots we actually have in THIS notebook
ROOTS = {
    "brainmri": Path("/kaggle/input/datasets/sabersakin/brainmri"),
    "masoud":  Path("/kaggle/input/datasets/masoudnickparvar/brain-tumor-mri-dataset"),
    # add challenge roots here ONLY if df_chal points to them
    # "stroke": Path("/kaggle/input/datasets/mitangshu11/brain-stroke-mri-images"),
    # "oasis":  Path("/kaggle/input/datasets/ninadaithal/imagesoasis"),
    # "dicom":  Path("/kaggle/input/datasets/trainingdatapro/dicom-brain-dataset"),
}

for k,v in ROOTS.items():
    print(k, "exists:", v.exists(), "->", v)

def canon(p: str) -> str:
    p = str(p)
    # already valid
    if Path(p).exists():
        return p

    # If the path contains an identifiable dataset segment, re-root it.
    # Masoud dataset segment
    seg = "/masoudnickparvar/brain-tumor-mri-dataset/"
    if seg in p:
        tail = p.split(seg, 1)[1]
        cand = ROOTS["masoud"] / tail
        return str(cand)

    # Sabersakin brainmri segment
    seg = "/sabersakin/brainmri/"
    if seg in p:
        tail = p.split(seg, 1)[1]
        cand = ROOTS["brainmri"] / tail
        return str(cand)

    # fallback: keep original (debug)
    return p

def apply_canon(df, name):
    df = df.copy()
    df["path"] = df["path"].apply(canon)
    missing = (~df["path"].apply(lambda x: Path(x).exists())).sum()
    print(f"{name}: rows={len(df)} missing={missing}")
    if missing:
        print("Examples:", df.loc[~df["path"].apply(lambda x: Path(x).exists()), "path"].head(3).tolist())
    return df

df_train = apply_canon(df_train, "df_train")
df_val   = apply_canon(df_val,   "df_val")
df_test  = apply_canon(df_test,  "df_test")
df_chal  = apply_canon(df_chal,  "df_chal")


brainmri exists: True -> /kaggle/input/datasets/sabersakin/brainmri
masoud exists: True -> /kaggle/input/datasets/masoudnickparvar/brain-tumor-mri-dataset


NameError: name 'df_train' is not defined

In [6]:
# CELL: 02_LOAD_SPLITS - Load split CSVs into DataFrames

from pathlib import Path
import pandas as pd

split_train = Path(split_train)
split_val   = Path(split_val)
split_test  = Path(split_test)
split_ext   = Path(split_ext)
split_chal  = Path(split_chal)

ARTIFACTS_DIR = split_train.parent
print("ARTIFACTS_DIR:", ARTIFACTS_DIR)

df_train = pd.read_csv(split_train)
df_val   = pd.read_csv(split_val)
df_test  = pd.read_csv(split_test)
df_ext   = pd.read_csv(split_ext)
df_chal  = pd.read_csv(split_chal)

print("loaded: train/val/test:", len(df_train), len(df_val), len(df_test))
print("loaded: external_npz:", len(df_ext), "challenge:", len(df_chal))

# Schema sanity (fail early)
REQ_COLS = {"path", "label_id", "label_name"}
for name, df in [("train", df_train), ("val", df_val), ("test", df_test)]:
    missing = REQ_COLS - set(df.columns)
    assert not missing, f"{name} missing columns: {missing}"

print("✅ split schemas OK")

ARTIFACTS_DIR: /kaggle/input/datasets/kabomolefe/mri-data-artifacts-v1/data_artifacts
loaded: train/val/test: 16646 1850 2055
loaded: external_npz: 1311 challenge: 4275
✅ split schemas OK


In [3]:
from pathlib import Path

def missing_count(df, name):
    m = ~df["path"].apply(lambda p: Path(p).exists())
    print(f"{name}: rows={len(df)} missing={m.sum()}")
    if m.sum():
        print("Examples:", df.loc[m, "path"].head(3).tolist())

missing_count(df_train, "df_train")
missing_count(df_val, "df_val")
missing_count(df_test, "df_test")
missing_count(df_chal, "df_chal")


df_train: rows=16646 missing=0
df_val: rows=1850 missing=0
df_test: rows=2055 missing=0
df_chal: rows=4275 missing=0


In [None]:
# CELL: 03_PREFLIGHT - Strong checks (paths exist, labels valid, distributions)

from pathlib import Path
import numpy as np

def assert_paths_exist(df, n=50, seed=SEED, label="df"):
    sample = df["path"].sample(min(n, len(df)), random_state=seed).tolist()
    missing = [p for p in sample if not Path(p).exists()]
    assert not missing, f"{label}: Missing {len(missing)} files; example: {missing[0]}"

assert_paths_exist(df_train, n=50, label="train")
assert_paths_exist(df_val,   n=30, label="val")
assert_paths_exist(df_test,  n=30, label="test")
print("✅ sample paths exist")

# Label sanity
for name, df in [("train", df_train), ("val", df_val), ("test", df_test)]:
    bad = df[~df["label_id"].isin([0,1,2,3])]
    assert len(bad) == 0, f"{name}: found label_id outside [0,1,2,3]"

print("\nTrain label distribution:\n", df_train["label_name"].value_counts())
print("\nVal label distribution:\n", df_val["label_name"].value_counts())
print("\nTest label distribution:\n", df_test["label_name"].value_counts())


## Data pipeline (datasets, transforms, loaders)

We use CSV-backed datasets so training is fully driven by the saved split artifacts.

In [None]:
# CELL: 03B_VERIFY_LABEL_MAPPING - ensure label_id <-> label_name is 1:1 and consistent

import pandas as pd

def verify(df, name):
    assert "label_id" in df.columns, f"{name} missing label_id"
    assert "label_name" in df.columns, f"{name} missing label_name"

    pairs = df[["label_id", "label_name"]].drop_duplicates().sort_values(["label_id", "label_name"])
    print(f"\n== {name} label pairs ==")
    display(pairs)

    # 1-to-1 checks
    id_to_names = df.groupby("label_id")["label_name"].nunique()
    name_to_ids = df.groupby("label_name")["label_id"].nunique()
    assert (id_to_names <= 1).all(), f"{name}: a label_id maps to multiple label_names: {id_to_names[id_to_names>1].to_dict()}"
    assert (name_to_ids <= 1).all(), f"{name}: a label_name maps to multiple label_ids: {name_to_ids[name_to_ids>1].to_dict()}"

    # counts
    print(f"{name} label_id dist:\n", df["label_id"].value_counts().sort_index())
    print(f"{name} label_name dist:\n", df["label_name"].value_counts())

verify(df_train, "train")
verify(df_val,   "val")
verify(df_test,  "test")

print("\n✅ label_id/label_name mapping is consistent across splits")


In [None]:
# CELL: 03C_VISUALIZE_LABELS - show examples per label_id so we stop guessing

import matplotlib.pyplot as plt
from pathlib import Path
from PIL import Image

K = 4
N_PER_CLASS = 4

pairs = df_train[["label_id","label_name"]].drop_duplicates().sort_values("label_id")
print("label map:\n", pairs)

for lid in sorted(df_train["label_id"].unique()):
    name = df_train.loc[df_train["label_id"]==lid, "label_name"].iloc[0]
    samp = df_train[df_train["label_id"]==lid].sample(min(N_PER_CLASS, (df_train["label_id"]==lid).sum()), random_state=0)

    fig, axes = plt.subplots(1, len(samp), figsize=(3*len(samp), 3))
    if len(samp) == 1:
        axes = [axes]

    for ax, (_, row) in zip(axes, samp.iterrows()):
        img = Image.open(Path(row["path"])).convert("RGB")
        ax.imshow(img)
        ax.axis("off")
    fig.suptitle(f"label_id={lid}  label_name={name}", fontsize=14)
    plt.show()


In [None]:
# CELL: 03C_LOCK_LABEL_MAP - hard assert canonical mapping

EXPECTED = {
    0: "glioma",
    1: "meningioma",
    2: "pituitary",
    3: "notumor",
}

for name, df in [("train", df_train), ("val", df_val), ("test", df_test)]:
    pairs = dict(df[["label_id","label_name"]].drop_duplicates().values.tolist())
    assert pairs == EXPECTED, f"{name} mapping changed!\nGot: {pairs}\nExpected: {EXPECTED}"

print("✅ canonical label map locked:", EXPECTED)


In [7]:
# CELL: 04_TRAIN_CONFIG - Central training hyperparameters (CPU/GPU aware)

from dataclasses import dataclass

NUM_CLASSES = 4
LABEL_MAP = {0:"glioma", 1:"meningioma", 2:"pituitary", 3:"notumor"}

@dataclass(frozen=True)
class TrainConfig:
    img_size: int = 224
    batch_size_cpu: int = 32
    batch_size_gpu: int = 128
    num_workers_cpu: int = 2
    num_workers_gpu: int = 4
    lr: float = 3e-4
    weight_decay: float = 1e-4
    epochs_smoke: int = 1
    epochs_full: int = 10

CFG = TrainConfig()

BATCH_SIZE = CFG.batch_size_gpu if USE_CUDA else CFG.batch_size_cpu
NUM_WORKERS = CFG.num_workers_gpu if USE_CUDA else CFG.num_workers_cpu

print("CFG:", CFG)
print("BATCH_SIZE:", BATCH_SIZE, "| NUM_WORKERS:", NUM_WORKERS)

CFG: TrainConfig(img_size=224, batch_size_cpu=32, batch_size_gpu=128, num_workers_cpu=2, num_workers_gpu=4, lr=0.0003, weight_decay=0.0001, epochs_smoke=1, epochs_full=10)
BATCH_SIZE: 32 | NUM_WORKERS: 2


In [8]:
# CELL: 05_DATASET_AND_TRANSFORMS - CSV dataset + torchvision transforms

from pathlib import Path
import pandas as pd
import torch
from torch.utils.data import Dataset
from PIL import Image
import torchvision.transforms as T

train_tfms = T.Compose([
    T.Resize((CFG.img_size, CFG.img_size)),
    T.RandomHorizontalFlip(p=0.5),
    T.ToTensor(),
    # Baseline: ImageNet stats. We'll revisit for MRI-specific normalization later.
    T.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
])

eval_tfms = T.Compose([
    T.Resize((CFG.img_size, CFG.img_size)),
    T.ToTensor(),
    T.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
])

class CSVDataset(Dataset):
    def __init__(self, df: pd.DataFrame, transforms):
        self.df = df.reset_index(drop=True)
        self.transforms = transforms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx: int):
        row = self.df.iloc[idx]
        img_path = Path(row["path"])
        y = int(row["label_id"])
        img = Image.open(img_path).convert("RGB")
        x = self.transforms(img)
        return x, y

train_ds = CSVDataset(df_train, train_tfms)
val_ds   = CSVDataset(df_val, eval_tfms)
test_ds  = CSVDataset(df_test, eval_tfms)

print("✅ datasets ready:", len(train_ds), len(val_ds), len(test_ds))

✅ datasets ready: 16646 1850 2055


In [9]:
# CELL: 06_DATALOADERS - Build PyTorch DataLoaders

from torch.utils.data import DataLoader

train_loader = DataLoader(
    train_ds, batch_size=BATCH_SIZE, shuffle=True,
    num_workers=NUM_WORKERS, pin_memory=USE_CUDA
)
val_loader = DataLoader(
    val_ds, batch_size=BATCH_SIZE, shuffle=False,
    num_workers=NUM_WORKERS, pin_memory=USE_CUDA
)
test_loader = DataLoader(
    test_ds, batch_size=BATCH_SIZE, shuffle=False,
    num_workers=NUM_WORKERS, pin_memory=USE_CUDA
)

xb, yb = next(iter(train_loader))
print("batch x:", xb.shape, xb.dtype, "batch y:", yb.shape, yb.dtype)
print("labels in batch:", sorted(set(yb.tolist())))
print("✅ loaders ready")


batch x: torch.Size([32, 3, 224, 224]) torch.float32 batch y: torch.Size([32]) torch.int64
labels in batch: [0, 1, 2, 3]
✅ loaders ready


In [15]:
!find /kaggle/input/datasets/masoudnickparvar/brain-tumor-mri-dataset -name "Tr-no_0747.jpg" | head


## Model

Baseline is **ResNet‑18** (ImageNet pretrained) with a 4‑way classification head.


In [10]:
# CELL: 07_MODEL_OPT - ResNet18, loss, optimizer (pretrained)

import torch
import torch.nn as nn
import torchvision.models as models

assert "CFG" in globals(), "CFG not defined — run Cell 01_CONFIG / 04_TRAIN_CONFIG first."
assert "DEVICE" in globals(), "DEVICE not defined — run Cell 01_CONFIG first."

# Single source of truth
NUM_CLASSES = int(getattr(CFG, "num_classes", 4))  # default 4 if not present

# Build model
weights = models.ResNet18_Weights.DEFAULT
model = models.resnet18(weights=weights)
print("✅ loaded ImageNet pretrained weights")

# Replace head
in_features = model.fc.in_features
model.fc = nn.Linear(in_features, NUM_CLASSES)

model = model.to(DEVICE)

# Loss + optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=float(getattr(CFG, "lr", 3e-4)), weight_decay=float(getattr(CFG, "weight_decay", 1e-4)))

print(f"✅ model ready | num_classes={NUM_CLASSES} | device={DEVICE} | lr={optimizer.param_groups[0]['lr']}")


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 145MB/s]

✅ loaded ImageNet pretrained weights
✅ model ready | num_classes=4 | device=cpu | lr=0.0003





In [18]:
import torch
import torch.nn as nn
from pathlib import Path

OUT = Path("/kaggle/working/train_artifacts")

ckpt = torch.load(OUT/"best_model.pth", map_location=DEVICE)
model.load_state_dict(ckpt["model_state_dict"])
model.eval()

feat_extractor = nn.Sequential(*list(model.children())[:-1]).to(DEVICE)
feat_extractor.eval()

print("✅ Loaded checkpoint and built feat_extractor")


✅ Loaded checkpoint and built feat_extractor


## Metrics

We track:
- loss
- confusion matrix
- macro‑F1 (class-balanced)


In [None]:
# CELL: 08_METRICS - Confusion matrix + macro-F1 helpers

import torch

@torch.no_grad()
def confusion_matrix_from_logits(logits: torch.Tensor, y_true: torch.Tensor, num_classes: int) -> torch.Tensor:
    preds = torch.argmax(logits, dim=1)
    cm = torch.zeros((num_classes, num_classes), dtype=torch.int64)
    for t, p in zip(y_true.view(-1).cpu(), preds.view(-1).cpu()):
        cm[int(t), int(p)] += 1
    return cm

def macro_f1_from_cm(cm: torch.Tensor) -> float:
    # cm[t, p]
    cm = cm.to(torch.float32)
    f1s = []
    for k in range(cm.shape[0]):
        tp = cm[k, k]
        fp = cm[:, k].sum() - tp
        fn = cm[k, :].sum() - tp
        denom = (2*tp + fp + fn)
        f1 = (2*tp / denom) if denom > 0 else torch.tensor(0.0)
        f1s.append(f1)
    return float(torch.stack(f1s).mean().item())

print("✅ metrics helpers loaded")


## Training

Run a 1‑epoch smoke test first, then the full baseline training.

**Tip:** turn GPU on in Kaggle Settings before running this section.

In [None]:
# CELL: 09_TRAIN - Train loop with AMP on GPU, save best checkpoint by val macro-F1

import time
import json
import torch

use_cuda = USE_CUDA
scaler = torch.cuda.amp.GradScaler(enabled=use_cuda)

best_f1 = -1.0
best_path = OUT / "best_model.pth"
history = []

def run_epoch(train: bool):
    if train:
        model.train()
    else:
        model.eval()

    total_loss, n = 0.0, 0
    cm = torch.zeros((NUM_CLASSES, NUM_CLASSES), dtype=torch.int64)

    loader = train_loader if train else val_loader

    for xb, yb in loader:
        xb = xb.to(DEVICE, non_blocking=True)
        yb = yb.to(DEVICE, non_blocking=True)

        with torch.set_grad_enabled(train):
            with torch.cuda.amp.autocast(enabled=use_cuda):
                logits = model(xb)
                loss = criterion(logits, yb)

            if train:
                optimizer.zero_grad(set_to_none=True)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()

        total_loss += float(loss.item()) * xb.size(0)
        n += xb.size(0)

        if not train:
            cm += confusion_matrix_from_logits(logits.detach(), yb.detach(), NUM_CLASSES)

    avg_loss = total_loss / max(n, 1)
    return avg_loss, cm

def train_for(epochs: int, tag: str):
    global best_f1
    for epoch in range(1, epochs + 1):
        t0 = time.time()
        tr_loss, _ = run_epoch(train=True)
        va_loss, cm = run_epoch(train=False)
        va_f1 = macro_f1_from_cm(cm)

        rec = {
            "phase": tag,
            "epoch": epoch,
            "train_loss": tr_loss,
            "val_loss": va_loss,
            "val_macro_f1": va_f1,
            "elapsed_s": round(time.time() - t0, 2),
        }
        history.append(rec)

        improved = va_f1 > best_f1
        if improved:
            best_f1 = va_f1
            torch.save(
                {
                    "model_state_dict": model.state_dict(),
                    "label_map": LABEL_MAP,
                    "num_classes": NUM_CLASSES,
                    "config": CFG.__dict__,
                    "best_val_macro_f1": best_f1,
                },
                best_path,
            )

        print(f"[{tag}] epoch {epoch:02d} | train_loss={tr_loss:.4f} val_loss={va_loss:.4f} val_f1={va_f1:.4f} {'✅ best' if improved else ''}")

# 1 epoch smoke test
train_for(CFG.epochs_smoke, tag="smoke")

# full training
train_for(CFG.epochs_full, tag="full")

with open(OUT/"train_history.json", "w") as f:
    json.dump(history, f, indent=2)

print("\n✅ Training done. best_val_macro_f1:", best_f1)
print("best checkpoint:", best_path)


In [17]:
# CHECK: best_model.pth exists and is the right size

from pathlib import Path
p = OUT / "best_model.pth"
print("Looking for:", p)
assert p.exists(), "best_model.pth not found after training"
print("✅ size:", p.stat().st_size/1e6, "MB")
assert p.stat().st_size > 20, "Checkpoint too small — invalid"


Looking for: /kaggle/working/train_artifacts/best_model.pth
✅ size: 44.794315 MB


## Evaluation

Evaluate the best checkpoint on the held-out **image test split**.


In [20]:
# CELL: 10_EVAL_TEST - Evaluate best checkpoint on test split

import torch

ckpt = torch.load(OUT/"best_model.pth", map_location=DEVICE)
model.load_state_dict(ckpt["model_state_dict"])
model.eval()

test_cm = torch.zeros((NUM_CLASSES, NUM_CLASSES), dtype=torch.int64)
test_loss_total, n = 0.0, 0

with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(DEVICE, non_blocking=True)
        yb = yb.to(DEVICE, non_blocking=True)
        logits = model(xb)
        loss = criterion(logits, yb)
        test_loss_total += float(loss.item()) * xb.size(0)
        n += xb.size(0)
        test_cm += confusion_matrix_from_logits(logits, yb, NUM_CLASSES)

test_loss = test_loss_total / max(n, 1)
test_f1 = macro_f1_from_cm(test_cm)

print("test_loss:", round(test_loss, 5))
print("test_macro_f1:", round(test_f1, 5))
print("confusion matrix (rows=true, cols=pred):\n", test_cm)


NameError: name 'confusion_matrix_from_logits' is not defined

In [20]:
# CELL: 10A_COLLECT_LOGITS_VAL_TEST - Collect logits/labels for calibration (val + test)

import torch
import torch.nn.functional as F

@torch.no_grad()
def collect_logits(loader, model):
    model.eval()
    all_logits, all_y = [], []
    for xb, yb in loader:
        xb = xb.to(DEVICE, non_blocking=True)
        yb = yb.to(DEVICE, non_blocking=True)
        logits = model(xb)
        all_logits.append(logits.detach().cpu())
        all_y.append(yb.detach().cpu())

    logits = torch.cat(all_logits, dim=0)
    y = torch.cat(all_y, dim=0)

    assert logits.ndim == 2 and logits.shape[1] == NUM_CLASSES, logits.shape
    assert y.ndim == 1 and y.shape[0] == logits.shape[0], (y.shape, logits.shape)
    assert torch.isfinite(logits).all(), "Found NaN/Inf in logits"

    return logits, y

val_logits, val_y = collect_logits(val_loader, model)
test_logits, test_y = collect_logits(test_loader, model)

torch.save({"logits": val_logits, "y": val_y}, OUT / "calib_val_logits.pt")
torch.save({"logits": test_logits, "y": test_y}, OUT / "calib_test_logits.pt")

print("✅ Collected logits:")
print("  val :", val_logits.shape, val_y.shape)
print("  test:", test_logits.shape, test_y.shape)
print("  saved ->", OUT)


✅ Collected logits:
  val : torch.Size([1850, 4]) torch.Size([1850])
  test: torch.Size([2055, 4]) torch.Size([2055])
  saved -> /kaggle/working/train_artifacts


In [None]:
# CELL: 10B_CALIBRATION_METRICS_BEFORE - ECE + reliability bins + plots (before scaling)

import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F

def _onehot(y, k):
    return F.one_hot(y.to(torch.int64), num_classes=k).to(torch.float32)

def compute_reliability_bins(probs: torch.Tensor, y_true: torch.Tensor, n_bins: int = 15):
    """
    probs: (N, C) softmax probs
    y_true: (N,)
    Returns (ece, bins_df)
    """
    assert probs.ndim == 2 and y_true.ndim == 1
    conf, preds = probs.max(dim=1)
    correct = (preds == y_true).to(torch.float32)

    bin_edges = torch.linspace(0.0, 1.0, n_bins + 1)
    rows = []
    ece = 0.0
    N = probs.shape[0]

    for i in range(n_bins):
        lo = float(bin_edges[i].item())
        hi = float(bin_edges[i + 1].item())

        # (lo, hi] except first bin includes 0
        if i == 0:
            in_bin = (conf >= lo) & (conf <= hi)
        else:
            in_bin = (conf > lo) & (conf <= hi)

        idx = torch.where(in_bin)[0]
        cnt = int(idx.numel())
        if cnt == 0:
            rows.append({"bin_lo": lo, "bin_hi": hi, "count": 0, "avg_conf": None, "avg_acc": None})
            continue

        avg_conf = float(conf[idx].mean().item())
        avg_acc = float(correct[idx].mean().item())

        frac = cnt / N
        ece += frac * abs(avg_acc - avg_conf)

        rows.append({"bin_lo": lo, "bin_hi": hi, "count": cnt, "avg_conf": avg_conf, "avg_acc": avg_acc})

    return float(ece), pd.DataFrame(rows)

def plot_reliability(bins_df: pd.DataFrame, title: str, out_path):
    df = bins_df.dropna(subset=["avg_conf", "avg_acc"]).copy()
    plt.figure(figsize=(6, 6))
    plt.plot([0, 1], [0, 1])
    plt.plot(df["avg_conf"], df["avg_acc"], marker="o")
    plt.xlabel("Confidence (avg per bin)")
    plt.ylabel("Accuracy (avg per bin)")
    plt.title(title)
    plt.grid(True, alpha=0.3)
    plt.savefig(out_path, bbox_inches="tight")
    plt.show()

def summarize_calibration(logits: torch.Tensor, y_true: torch.Tensor, split_name: str, tag: str, n_bins: int = 15):
    probs = torch.softmax(logits, dim=1)
    preds = probs.argmax(dim=1)

    acc = float((preds == y_true).to(torch.float32).mean().item())
    nll = float(F.cross_entropy(logits, y_true).item())

    oh = _onehot(y_true, NUM_CLASSES)
    brier = float(((probs - oh) ** 2).sum(dim=1).mean().item())

    cm = confusion_matrix_from_logits(logits, y_true, NUM_CLASSES)
    macro_f1 = macro_f1_from_cm(cm)

    ece, bins_df = compute_reliability_bins(probs, y_true, n_bins=n_bins)

    bins_csv = OUT / f"reliability_bins_{split_name}_{tag}.csv"
    bins_df.to_csv(bins_csv, index=False)

    plot_path = OUT / f"reliability_{split_name}_{tag}.png"
    plot_reliability(bins_df, f"{split_name.upper()} reliability ({tag}) | ECE={ece:.4f}", plot_path)

    return {
        "split": split_name,
        "tag": tag,
        "n": int(y_true.shape[0]),
        "acc": acc,
        "macro_f1": float(macro_f1),
        "nll": nll,
        "brier": brier,
        "ece": ece,
        "bins_csv": str(bins_csv),
        "plot_png": str(plot_path),
    }

N_BINS = 15

val_before = summarize_calibration(val_logits, val_y, "val", "before", n_bins=N_BINS)
test_before = summarize_calibration(test_logits, test_y, "test", "before", n_bins=N_BINS)

print("✅ Calibration BEFORE:")
print(val_before)
print(test_before)

pd.DataFrame([val_before, test_before]).to_csv(OUT / "calibration_summary_before.csv", index=False)
print("saved ->", OUT / "calibration_summary_before.csv")


In [21]:
# CELL: 10C_TEMPERATURE_SCALING_FIT - Fit temperature on VAL by minimizing NLL (no test leakage)

import json
import torch
import torch.nn as nn
import torch.nn.functional as F

class TemperatureScaler(nn.Module):
    def __init__(self):
        super().__init__()
        self.log_T = nn.Parameter(torch.zeros(1))  # T=exp(0)=1

    def temperature(self):
        # Clamp to avoid degenerate values
        return torch.exp(self.log_T).clamp(0.05, 100.0)

    def forward(self, logits):
        return logits / self.temperature()

ts = TemperatureScaler().to(DEVICE)

val_logits_d = val_logits.to(DEVICE)
val_y_d = val_y.to(DEVICE)

opt = torch.optim.LBFGS([ts.log_T], lr=0.5, max_iter=50, line_search_fn="strong_wolfe")

def closure():
    opt.zero_grad()
    loss = F.cross_entropy(ts(val_logits_d), val_y_d)
    loss.backward()
    return loss

opt.step(closure)

T = float(ts.temperature().detach().cpu().item())
val_nll_after_fit = float(F.cross_entropy((val_logits / T), val_y).item())

print(f"✅ Fitted temperature T = {T:.6f}")
print(f"VAL NLL (after applying T): {val_nll_after_fit:.6f}")

with open(OUT / "temperature_scaling.json", "w") as f:
    json.dump({"temperature": T, "val_nll_after_fit": val_nll_after_fit}, f, indent=2)

print("saved ->", OUT / "temperature_scaling.json")


✅ Fitted temperature T = 0.925916
VAL NLL (after applying T): 0.012948
saved -> /kaggle/working/train_artifacts/temperature_scaling.json


In [None]:
# CELL: 10D_CALIBRATION_METRICS_AFTER - Recompute metrics after temperature scaling (val + test)

import pandas as pd
import json

val_logits_cal = val_logits / T
test_logits_cal = test_logits / T

val_after = summarize_calibration(val_logits_cal, val_y, "val", "after", n_bins=N_BINS)
test_after = summarize_calibration(test_logits_cal, test_y, "test", "after", n_bins=N_BINS)

print("✅ Calibration AFTER:")
print(val_after)
print(test_after)

df_all = pd.DataFrame([val_before, test_before, val_after, test_after])
df_all.to_csv(OUT / "calibration_summary_before_after.csv", index=False)

with open(OUT / "calibration_metrics.json", "w") as f:
    json.dump(
        {
            "n_bins": N_BINS,
            "temperature": T,
            "val": {"before": val_before, "after": val_after},
            "test": {"before": test_before, "after": test_after},
        },
        f,
        indent=2,
    )

print("saved ->", OUT / "calibration_summary_before_after.csv")
print("saved ->", OUT / "calibration_metrics.json")


In [None]:
# CELL: 10E_PICK_CONFIDENCE_THRESHOLD_FROM_VAL - Pick τ_conf from VAL (calibrated)

import torch

val_probs_cal = torch.softmax(val_logits / T, dim=1)
val_maxprob_cal = val_probs_cal.max(dim=1).values

# Choose coverage target: keep 95% of in-domain samples
KEEP_RATE = 0.95
tau_conf = float(torch.quantile(val_maxprob_cal, 1.0 - KEEP_RATE).item())

print("✅ tau_conf chosen from VAL (calibrated)")
print("KEEP_RATE:", KEEP_RATE)
print("tau_conf:", tau_conf)

# Sanity: what fraction would abstain on val/test under this threshold?
test_probs_cal = torch.softmax(test_logits / T, dim=1)
test_maxprob_cal = test_probs_cal.max(dim=1).values

val_abstain_rate = float((val_maxprob_cal < tau_conf).to(torch.float32).mean().item())
test_abstain_rate = float((test_maxprob_cal < tau_conf).to(torch.float32).mean().item())

print("VAL abstain rate:", val_abstain_rate)
print("TEST abstain rate:", test_abstain_rate)


In [None]:
# CELL: 11_EVAL_EXTERNAL_NPZ - evaluate MUAZ external NPZ test (domain shift)

import numpy as np
import pandas as pd
from pathlib import Path
import torch
import torch.nn.functional as F

# ---- Locate split CSV ----
# Prefer variables from earlier cells if they exist; otherwise auto-find.
ARTIFACTS_DIR = Path(split_train).parent if "split_train" in globals() else None
if ARTIFACTS_DIR is None:
    # fallback: search common location if you didn't keep split_train variable
    ARTIFACTS_DIR = Path("/kaggle/input").rglob("split_external_test_npz.csv").__iter__().__next__().parent

split_ext_path = ARTIFACTS_DIR / "split_external_test_npz.csv"
assert split_ext_path.exists(), f"Can't find split_external_test_npz.csv at: {split_ext_path}"

df_ext = pd.read_csv(split_ext_path)
print("split_external_test_npz.csv rows:", len(df_ext))
print("columns:", df_ext.columns.tolist())
display(df_ext.head(3))

# ---- Load checkpoint (best) into model ----
best_path = Path("/kaggle/working/train_artifacts/best_model.pth")
assert best_path.exists(), f"Missing checkpoint: {best_path}"

ckpt = torch.load(best_path, map_location="cpu")
model.load_state_dict(ckpt["model_state_dict"])
model.eval()
model.to(DEVICE)

# ---- Helpers: tensor normalization consistent with eval_tfms ----
MEAN = torch.tensor([0.485,0.456,0.406]).view(1,3,1,1)
STD  = torch.tensor([0.229,0.224,0.225]).view(1,3,1,1)

def prep_npz_batch(x_np: np.ndarray) -> torch.Tensor:
    """
    x_np: (B,H,W,3) float32. Values might be 0..1 or 0..255.
    Returns: (B,3,224,224) float32 normalized.
    """
    x = torch.from_numpy(x_np).permute(0,3,1,2).float()  # B,C,H,W
    # scale if looks like 0..255
    if x.max().item() > 1.5:
        x = x / 255.0
    x = F.interpolate(x, size=(224,224), mode="bilinear", align_corners=False)
    x = (x - MEAN) / STD
    return x

@torch.no_grad()
def eval_npz_arrays(x: np.ndarray, y_onehot: np.ndarray, batch_size: int = 256):
    y = np.argmax(y_onehot, axis=1).astype(np.int64)
    cm = torch.zeros((NUM_CLASSES, NUM_CLASSES), dtype=torch.int64)
    total = 0

    for i in range(0, len(x), batch_size):
        xb = prep_npz_batch(x[i:i+batch_size]).to(DEVICE)
        yb = torch.from_numpy(y[i:i+batch_size]).to(DEVICE)
        logits = model(xb)
        cm += confusion_matrix_from_logits(logits.detach().cpu(), yb.detach().cpu(), NUM_CLASSES)
        total += len(xb)

    f1 = macro_f1_from_cm(cm)
    return f1, cm, total

# ---- Decide how to evaluate based on CSV structure ----
# Case A: CSV is per-sample with npz_path + idx
cols = set(df_ext.columns)
per_sample = (("npz_path" in cols or "npz_file" in cols) and ("idx" in cols or "index" in cols or "i" in cols))

if per_sample:
    npz_col = "npz_path" if "npz_path" in cols else "npz_file"
    idx_col = "idx" if "idx" in cols else ("index" if "index" in cols else "i")

    # Load each npz file once, then gather samples
    # (works even if there are multiple npz files)
    grouped = df_ext.groupby(npz_col)

    all_logits = []
    all_y = []
    for npz_path, g in grouped:
        npz_path = Path(npz_path)
        assert npz_path.exists(), f"Missing npz: {npz_path}"
        data = np.load(npz_path)
        x = data["x"]
        y = data["y"]
        idxs = g[idx_col].astype(int).to_numpy()

        x_sel = x[idxs]
        y_sel = y[idxs]
        f1, cm, n = eval_npz_arrays(x_sel, y_sel)
        print(f"NPZ file {npz_path.name} n={n} macro_f1={f1:.4f}")
        print("cm:\n", cm)

else:
    # Case B: CSV is just metadata; evaluate directly from MUAZ test.npz
    # (This matches what you previously printed: files = training/validation/test.npz)
    MUAZ_DIR = Path("/kaggle/input/datasets/muazalzoubi/brain-tumor-gliomameningiomapituitary-not-tumors")
    if not MUAZ_DIR.exists():
        # fallback: locate by searching
        MUAZ_DIR = next(Path("/kaggle/input/datasets").rglob("brain-tumor-gliomameningiomapituitary-not-tumors"))
    test_npz = MUAZ_DIR / "test.npz"
    assert test_npz.exists(), f"Missing expected file: {test_npz}"

    data = np.load(test_npz)
    x = data["x"]
    y = data["y"]
    f1, cm, n = eval_npz_arrays(x, y)
    print(f"✅ MUAZ external test.npz n={n} macro_f1={f1:.4f}")
    print("confusion matrix (rows=true, cols=pred):\n", cm)


In [None]:
# CELL: 11B_MUAZ_LABEL_VISUAL_CHECK - confirm what MUAZ label_id means

import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

MUAZ_DIR = Path("/kaggle/input/datasets/muazalzoubi/brain-tumor-gliomameningiomapituitary-not-tumors")
test_npz = MUAZ_DIR / "test.npz"
data = np.load(test_npz)
x = data["x"]          # (N,150,150,3)
y = data["y"]          # (N,4) one-hot
label_id = y.argmax(axis=1)

def show_grid(lbl, n=12):
    idx = np.where(label_id == lbl)[0][:n]
    fig, axes = plt.subplots(3, 4, figsize=(10, 7))
    fig.suptitle(f"MUAZ label_id = {lbl}", fontsize=14)
    for ax, i in zip(axes.ravel(), idx):
        img = x[i]
        if img.max() <= 1.5:  # likely 0..1
            img = (img * 255).clip(0,255).astype("uint8")
        ax.imshow(img)
        ax.axis("off")
    plt.tight_layout()
    plt.show()

for lbl in [0,1,2,3]:
    show_grid(lbl)


In [None]:
# CELL: 11C_MUAZ_FIND_LABEL_MAPPING - find best MUAZ->OURS label_id mapping (no eyeballing)

import numpy as np
import itertools
import torch
import torch.nn.functional as F
from pathlib import Path

assert "model" in globals(), "model not defined yet — run model load cell first"
assert "DEVICE" in globals(), "DEVICE not defined yet — run device setup cell first"

# --- Load MUAZ test ---
MUAZ_DIR = Path("/kaggle/input/datasets/muazalzoubi/brain-tumor-gliomameningiomapituitary-not-tumors")
test_npz = MUAZ_DIR / "test.npz"
data = np.load(test_npz)
x = data["x"]
y_onehot = data["y"]
y_muaz = y_onehot.argmax(axis=1).astype(np.int64)
MUAZ_TO_OURS = np.array([0, 1, 3, 2], dtype=np.int64)
y_true = MUAZ_TO_OURS[y_muaz]

print("counts y_muaz:", np.bincount(y_muaz, minlength=4))
print("counts y_true:", np.bincount(y_true, minlength=4))


# --- Prepare like eval ---
MEAN = torch.tensor([0.485,0.456,0.406]).view(1,3,1,1)
STD  = torch.tensor([0.229,0.224,0.225]).view(1,3,1,1)

def prep_npz_batch(x_np):
    x_t = torch.from_numpy(x_np).permute(0,3,1,2).float()  # B,C,H,W
    if x_t.max().item() > 1.5:
        x_t = x_t / 255.0
    x_t = F.interpolate(x_t, size=(224,224), mode="bilinear", align_corners=False)
    x_t = (x_t - MEAN) / STD
    return x_t

@torch.no_grad()
def predict_all(batch_size=256):
    preds = []
    for i in range(0, len(x), batch_size):
        xb = prep_npz_batch(x[i:i+batch_size]).to(DEVICE)
        logits = model(xb)
        pred = logits.argmax(dim=1).detach().cpu().numpy()
        preds.append(pred)
    return np.concatenate(preds)

pred_ours = predict_all(batch_size=256 if DEVICE=="cuda" else 64)

def cm_from_ints(y_true, y_pred, k=4):
    cm = np.zeros((k,k), dtype=np.int64)
    for t,p in zip(y_true, y_pred):
        cm[t,p] += 1
    return cm

def macro_f1_from_cm(cm):
    f1s = []
    for c in range(cm.shape[0]):
        tp = cm[c,c]
        fp = cm[:,c].sum() - tp
        fn = cm[c,:].sum() - tp
        denom = (2*tp + fp + fn)
        f1 = (2*tp / denom) if denom > 0 else 0.0
        f1s.append(f1)
    return float(np.mean(f1s))

# Try all MUAZ->OURS mappings: mapped_true = perm[muaz_id]
best = None
for perm in itertools.permutations(range(4)):
    mapped_true = np.array([perm[t] for t in y_muaz], dtype=np.int64)
    cm = cm_from_ints(mapped_true, pred_ours, k=4)
    f1 = macro_f1_from_cm(cm)
    if (best is None) or (f1 > best["f1"]):
        best = {"perm": perm, "f1": f1, "cm": cm}

print("✅ Best MUAZ->OURS mapping (muaz_id -> ours_id):", best["perm"])
print("✅ macro_f1 after remap:", round(best["f1"], 4))
print("confusion matrix after remap (rows=true, cols=pred):\n", best["cm"])


In [None]:
# CELL: 11D_MUAZ_EXTERNAL_EVAL_MAPPED - Evaluate MUAZ test.npz with fixed label remap (muaz->ours)

import numpy as np
import torch
import torch.nn.functional as F
from pathlib import Path

# --- Guards: model + device must exist ---
assert "model" in globals(), "model not defined — run the model init/load cell first."
DEVICE = globals().get("DEVICE", "cuda" if torch.cuda.is_available() else "cpu")
model = model.to(DEVICE).eval()

IMG_SIZE = getattr(globals().get("CFG", object()), "img_size", 224)

# --- MUAZ paths ---
MUAZ_DIR = Path("/kaggle/input/datasets/muazalzoubi/brain-tumor-gliomameningiomapituitary-not-tumors")
test_npz = MUAZ_DIR / "test.npz"
assert test_npz.exists(), f"Missing MUAZ test file: {test_npz}"

# --- Load MUAZ test ---
data = np.load(test_npz)
x = data["x"]                     # (N, H, W, 3)
y_onehot = data["y"]              # (N, 4) one-hot
y_muaz = y_onehot.argmax(axis=1).astype(np.int64)

# --- Fixed mapping discovered earlier: (0, 1, 3, 2) ---
# Meaning: ours_id = MUAZ_TO_OURS[muaz_id]
MUAZ_TO_OURS = np.array([0, 1, 3, 2], dtype=np.int64)
y_true = MUAZ_TO_OURS[y_muaz]

print("counts y_muaz:", np.bincount(y_muaz, minlength=4).tolist())
print("counts y_true:", np.bincount(y_true, minlength=4).tolist())
print("✅ Using MUAZ_TO_OURS:", MUAZ_TO_OURS.tolist(), "(muaz_id -> ours_id)")

# --- Preprocess: match training (ImageNet mean/std + resize) ---
MEAN = torch.tensor([0.485, 0.456, 0.406], device=DEVICE).view(1, 3, 1, 1)
STD  = torch.tensor([0.229, 0.224, 0.225], device=DEVICE).view(1, 3, 1, 1)

def prep_npz_batch(x_np: np.ndarray) -> torch.Tensor:
    x_t = torch.from_numpy(x_np).permute(0, 3, 1, 2).float().to(DEVICE)  # B,C,H,W
    # If MUAZ pixels are 0..255, scale to 0..1
    if x_t.max().item() > 1.5:
        x_t = x_t / 255.0
    x_t = F.interpolate(x_t, size=(IMG_SIZE, IMG_SIZE), mode="bilinear", align_corners=False)
    x_t = (x_t - MEAN) / STD
    return x_t

@torch.no_grad()
def predict_all(batch_size: int = 256) -> np.ndarray:
    preds = []
    for i in range(0, len(x), batch_size):
        xb = prep_npz_batch(x[i:i+batch_size])
        # autocast speeds up on GPU; safe no-op on CPU
        if DEVICE == "cuda":
            with torch.amp.autocast(device_type="cuda"):
                logits = model(xb)
        else:
            logits = model(xb)
        pred = logits.argmax(dim=1).detach().cpu().numpy()
        preds.append(pred)
    return np.concatenate(preds)

pred = predict_all(batch_size=256 if DEVICE == "cuda" else 64)

# --- Metrics (no sklearn dependency) ---
def cm_from_ints(y_t: np.ndarray, y_p: np.ndarray, k: int = 4) -> np.ndarray:
    cm = np.zeros((k, k), dtype=np.int64)
    for t, p in zip(y_t, y_p):
        cm[int(t), int(p)] += 1
    return cm

def macro_f1_from_cm(cm: np.ndarray) -> float:
    f1s = []
    for c in range(cm.shape[0]):
        tp = cm[c, c]
        fp = cm[:, c].sum() - tp
        fn = cm[c, :].sum() - tp
        denom = (2 * tp + fp + fn)
        f1 = (2 * tp / denom) if denom > 0 else 0.0
        f1s.append(f1)
    return float(np.mean(f1s))

cm = cm_from_ints(y_true, pred, k=4)
macro_f1 = macro_f1_from_cm(cm)

print(f"✅ MUAZ external test.npz n={len(x)} macro_f1={macro_f1:.4f}")
print("confusion matrix (rows=true, cols=pred):\n", cm)


In [None]:
# CELL: 11E_EXTERNAL_MUAZ_POLICY_EVAL - Build MUAZ test loader from NPZ + eval baseline vs policy

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from PIL import Image

# ----------------------------
# Preconditions
# ----------------------------
need = ["DEVICE", "OUT", "NUM_CLASSES", "model", "feat_extractor", "eval_tfms", "T", "tau_conf", "clf"]
missing = [k for k in need if k not in globals()]
assert not missing, f"Missing prereqs: {missing}. Run: 05_DATASET_AND_TRANSFORMS, 06_DATALOADERS, calibration (10A-10D), domain guard (12E)."

# Use your locked tau_domain (from sweep recommendation)
TAU_DOMAIN = float(globals().get("TAU_DOMAIN", 0.580288))

print("Using policy thresholds:")
print("  tau_conf  :", float(tau_conf))
print("  tau_domain:", TAU_DOMAIN)
print("  T         :", float(T))

# ----------------------------
# Locate MUAZ NPZ file
# ----------------------------
# NOTE: this requires the MUAZ dataset to be attached as Kaggle input
# Dataset: muazalzoubi/brain-tumor-gliomameningiomapituitary-not-tumors

MUAZ_ROOT_CANDIDATES = [
    Path("/kaggle/input/datasets/muazalzoubi/brain-tumor-gliomameningiomapituitary-not-tumors"),
    Path("/kaggle/input/brain-tumor-gliomameningiomapituitary-not-tumors"),
]

muaz_root = None
for p in MUAZ_ROOT_CANDIDATES:
    if p.exists():
        muaz_root = p
        break

assert muaz_root is not None, (
    "MUAZ dataset not found in /kaggle/input. Attach Kaggle dataset:\n"
    "muazalzoubi/brain-tumor-gliomameningiomapituitary-not-tumors"
)

npz_files = sorted(muaz_root.rglob("*.npz"))
assert len(npz_files) > 0, f"No .npz files found under {muaz_root}"

# Prefer a file with 'test' in the name
def score_npz(p: Path) -> int:
    name = p.name.lower()
    if "test" in name:
        return 0
    if "valid" in name or "val" in name:
        return 1
    if "train" in name:
        return 2
    return 3

npz_files = sorted(npz_files, key=lambda p: (score_npz(p), len(p.name), p.name.lower()))
npz_path = npz_files[0]

print("\n✅ Using MUAZ NPZ:", npz_path)

# ----------------------------
# Load NPZ robustly (handle different key names)
# ----------------------------
npz = np.load(npz_path, allow_pickle=True)
keys = list(npz.keys())
print("NPZ keys:", keys)

def pick_first(keys, options):
    for k in options:
        if k in keys:
            return k
    return None

x_key = pick_first(keys, ["images", "image", "x", "X", "data", "arr_0"])
y_key = pick_first(keys, ["labels", "label", "y", "Y", "targets", "target", "arr_1"])

assert x_key is not None, f"Could not find image key in NPZ. Keys: {keys}"
assert y_key is not None, f"Could not find label key in NPZ. Keys: {keys}"

X = npz[x_key]
y_muaz = npz[y_key]

# Ensure arrays are numpy arrays
X = np.asarray(X)
y_muaz = np.asarray(y_muaz).astype(np.int64)

print("y raw shape:", y_muaz.shape, "unique values:", np.unique(y_muaz))

# Convert one-hot labels -> class ids if needed
if y_muaz.ndim == 2:
    # Expect shape (N,4)
    assert y_muaz.shape[1] == 4, f"Expected one-hot with 4 columns, got {y_muaz.shape}"
    y_muaz = y_muaz.argmax(axis=1).astype(np.int64)
elif y_muaz.ndim == 1:
    y_muaz = y_muaz.astype(np.int64)
else:
    raise AssertionError(f"Unexpected label array shape: {y_muaz.shape}")

print("y class shape:", y_muaz.shape, "classes:", np.unique(y_muaz))

assert X.shape[0] == y_muaz.shape[0], "Mismatch between X and y length"


# ----------------------------
# MUAZ label-id -> OUR label-id mapping (from your permutation discovery)
# muaz_id -> ours_id
# ----------------------------
MUAZ_TO_OURS = {0: 0, 1: 1, 2: 3, 3: 2}

def map_labels_muaz_to_ours(y):
    y2 = np.vectorize(MUAZ_TO_OURS.get)(y)
    return y2.astype(np.int64)

y_ours = map_labels_muaz_to_ours(y_muaz)

# ----------------------------
# Dataset: convert each slice to PIL RGB then apply eval_tfms
# ----------------------------
def to_uint8_img(arr):
    arr = np.asarray(arr)
    # common shapes: (H,W), (H,W,1), (H,W,3)
    if arr.ndim == 3 and arr.shape[-1] == 1:
        arr = arr[..., 0]
    if arr.ndim == 3 and arr.shape[-1] == 3:
        # already 3-channel
        pass

    # scale to uint8
    if arr.dtype != np.uint8:
        arr = arr.astype(np.float32)
        mn, mx = float(arr.min()), float(arr.max())
        if mx > mn:
            arr = (arr - mn) / (mx - mn)
        else:
            arr = np.zeros_like(arr)
        arr = (arr * 255.0).clip(0, 255).astype(np.uint8)

    # make PIL
    if arr.ndim == 2:
        img = Image.fromarray(arr).convert("RGB")
    else:
        img = Image.fromarray(arr).convert("RGB")
    return img

class MUAZNPZDataset(Dataset):
    def __init__(self, X, y, tfms):
        self.X = X
        self.y = y
        self.tfms = tfms

    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, idx):
        img = to_uint8_img(self.X[idx])
        x = self.tfms(img)
        return x, int(self.y[idx]), int(idx), int(y_muaz[idx])  # x, y_ours, row_idx, y_muaz

muaz_ds = MUAZNPZDataset(X, y_ours, eval_tfms)
muaz_loader = DataLoader(muaz_ds, batch_size=128, shuffle=False, num_workers=2, pin_memory=True)

print("✅ MUAZ loader ready. rows:", len(muaz_ds))

# ----------------------------
# Metrics helpers
# ----------------------------
def confusion_matrix(preds, y_true, k):
    cm = torch.zeros((k, k), dtype=torch.int64)
    for t, p in zip(y_true.tolist(), preds.tolist()):
        cm[t, p] += 1
    return cm

def macro_f1_from_cm(cm):
    k = cm.shape[0]
    f1s = []
    for c in range(k):
        tp = cm[c, c].item()
        fp = cm[:, c].sum().item() - tp
        fn = cm[c, :].sum().item() - tp
        denom = (2*tp + fp + fn)
        f1 = 0.0 if denom == 0 else (2*tp / denom)
        f1s.append(f1)
    return float(sum(f1s) / k)

# ----------------------------
# Inference: baseline vs policy
# ----------------------------
model.eval()
feat_extractor.eval()

rows = []
y_true_all = []
pred_all = []
abstain_all = []

with torch.no_grad():
    for xb, yb, ridx, yb_muaz in muaz_loader:
        xb = xb.to(DEVICE, non_blocking=True)
        yb = yb.to(torch.int64)

        logits = model(xb)
        probs_cal = torch.softmax(logits / T, dim=1)
        max_prob, pred_id = probs_cal.max(dim=1)

        # domain score from embeddings
        z = feat_extractor(xb).view(xb.shape[0], -1).detach().cpu().numpy()  # (B,512)
        p_in = clf.predict_proba(z)[:, 1]

        # abstain rule
        max_prob_np = max_prob.detach().cpu().numpy()
        abstain = (p_in < TAU_DOMAIN) | (max_prob_np < float(tau_conf))

        # collect
        y_true_all.append(yb)
        pred_all.append(pred_id.detach().cpu())
        abstain_all.append(torch.tensor(abstain, dtype=torch.bool))

        # row-level outputs
        ridx = ridx.detach().cpu().numpy()
        yb_muaz = yb_muaz.detach().cpu().numpy()
        yb_np = yb.detach().cpu().numpy()
        pred_np = pred_id.detach().cpu().numpy()

        for j in range(len(ridx)):
            rows.append({
                "row_idx": int(ridx[j]),
                "y_muaz": int(yb_muaz[j]),
                "y_ours": int(yb_np[j]),
                "pred_id": int(pred_np[j]),
                "max_prob_cal": float(max_prob_np[j]),
                "p_in_domain": float(p_in[j]),
                "abstain": bool(abstain[j]),
            })

y_true = torch.cat(y_true_all, dim=0)
pred = torch.cat(pred_all, dim=0)
abstain_mask = torch.cat(abstain_all, dim=0)

# Baseline metrics (no abstain)
acc = float((pred == y_true).to(torch.float32).mean().item())
cm = confusion_matrix(pred, y_true, NUM_CLASSES)
macro_f1 = macro_f1_from_cm(cm)

# Policy metrics (accepted only)
accepted = ~abstain_mask
coverage = float(accepted.to(torch.float32).mean().item())
abstain_rate = 1.0 - coverage

if accepted.any():
    y_acc = y_true[accepted]
    p_acc = pred[accepted]
    acc_acc = float((p_acc == y_acc).to(torch.float32).mean().item())
    cm_acc = confusion_matrix(p_acc, y_acc, NUM_CLASSES)
    macro_f1_acc = macro_f1_from_cm(cm_acc)
else:
    acc_acc, macro_f1_acc = None, None

print("\n✅ MUAZ BASELINE (mapped labels)")
print("rows:", int(y_true.shape[0]))
print("acc:", acc)
print("macro_f1:", macro_f1)

print("\n✅ MUAZ WITH POLICY (tau_conf + tau_domain)")
print("coverage:", coverage)
print("abstain_rate:", abstain_rate)
print("acc_on_accepted:", acc_acc)
print("macro_f1_on_accepted:", macro_f1_acc)

# Save artifacts
df_rows = pd.DataFrame(rows)
df_rows.to_csv(OUT / "muaz_policy_outputs.csv", index=False)

pd.DataFrame([{
    "npz_path": str(npz_path),
    "rows": int(y_true.shape[0]),
    "baseline_acc": acc,
    "baseline_macro_f1": macro_f1,
    "policy_coverage": coverage,
    "policy_abstain_rate": abstain_rate,
    "policy_acc_on_accepted": acc_acc,
    "policy_macro_f1_on_accepted": macro_f1_acc,
    "tau_conf": float(tau_conf),
    "tau_domain": float(TAU_DOMAIN),
    "T": float(T),
}]).to_csv(OUT / "muaz_policy_report.csv", index=False)

print("\nsaved ->", OUT / "muaz_policy_outputs.csv")
print("saved ->", OUT / "muaz_policy_report.csv")


In [None]:
# CELL: 12_AUDIT_CHALLENGE_CONFIDENCE - confidence + prediction audit on challenge domains (handles JPG/PNG + DICOM)

from __future__ import annotations

from pathlib import Path
from typing import List, Tuple, Dict, Any
import numpy as np
import pandas as pd
import torch
from PIL import Image
import pydicom

# ---- display prefs (once) ----
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 160)
pd.set_option("display.max_colwidth", 80)

# ---- guards ----
assert "model" in globals(), "model not defined — run model init/load cells first."
assert "DEVICE" in globals(), "DEVICE not defined — run device cell first."
assert "eval_tfms" in globals(), "eval_tfms not defined — run transforms cell first."

# ---- locate challenge split ----
ARTIFACTS_DIR = Path(split_train).parent if "split_train" in globals() else globals().get("ARTIFACTS_DIR")
assert ARTIFACTS_DIR is not None, "ARTIFACTS_DIR not found — run Cell 00 that locates artifacts first."

split_chal_path = Path(ARTIFACTS_DIR) / "split_challenge_sampled.csv"
assert split_chal_path.exists(), f"Missing: {split_chal_path}"

df_chal = pd.read_csv(split_chal_path)
print("challenge rows:", len(df_chal))
print("domains:", df_chal["domain"].value_counts().to_dict())
display(df_chal.head(3))

# ---- load checkpoint (best) ----
best_path = Path("/kaggle/working/train_artifacts/best_model.pth")
assert best_path.exists(), f"Missing checkpoint: {best_path}"

ckpt = torch.load(best_path, map_location="cpu")
model.load_state_dict(ckpt["model_state_dict"])
model.to(DEVICE).eval()

# ---- label map (OURS) ----
ID2NAME = {0: "glioma", 1: "meningioma", 2: "pituitary", 3: "notumor"}

def load_any_image(path: Path) -> Image.Image:
    """Load JPG/PNG or DICOM and return RGB PIL."""
    suffix = path.suffix.lower()
    if suffix == ".dcm":
        ds = pydicom.dcmread(str(path), force=True)
        arr = ds.pixel_array.astype(np.float32)

        # min-max normalize per slice for stable conversion
        arr = arr - np.min(arr)
        denom = np.max(arr) + 1e-6
        arr = (arr / denom * 255.0).clip(0, 255).astype(np.uint8)

        return Image.fromarray(arr).convert("RGB")

    return Image.open(path).convert("RGB")

@torch.no_grad()
def infer_rows(
    rows: pd.DataFrame,
    batch_size: int = 128,
) -> Tuple[pd.DataFrame, Dict[str, int]]:
    """
    Runs inference over df rows (expects `path` col).
    Returns (df_scored, stats) where df_scored contains only successfully read rows.
    """
    ok_rows = []
    failed = 0

    # We'll build batches of tensors and keep their corresponding row dicts.
    for i in range(0, len(rows), batch_size):
        chunk = rows.iloc[i:i+batch_size]
        xs = []
        meta = []

        for _, r in chunk.iterrows():
            p = Path(r["path"])
            try:
                img = load_any_image(p)
                x = eval_tfms(img)
                xs.append(x)
                meta.append(r.to_dict())
            except Exception:
                failed += 1

        if not xs:
            continue

        xb = torch.stack(xs).to(DEVICE)
        logits = model(xb)
        prob = torch.softmax(logits, dim=1)
        max_prob, pred_id = torch.max(prob, dim=1)

        pred_id = pred_id.detach().cpu().numpy().astype(np.int64)
        max_prob = max_prob.detach().cpu().numpy().astype(np.float32)

        for m, pid, conf in zip(meta, pred_id, max_prob):
            m["pred_id"] = int(pid)
            m["max_prob"] = float(conf)
            m["pred_name"] = ID2NAME[int(pid)]
            ok_rows.append(m)

    df_scored = pd.DataFrame(ok_rows)
    stats = {"failed_reads": failed, "scored_rows": len(df_scored)}
    return df_scored, stats

batch = 256 if DEVICE == "cuda" else 32
df_out, stats = infer_rows(df_chal, batch_size=batch)
print(f"✅ scored_rows={stats['scored_rows']}  failed_reads={stats['failed_reads']}  batch={batch}")

assert len(df_out) > 0, "No rows scored — check paths and loaders."

# ---- confidence summary per domain ----
def q(x: np.ndarray, quant: float) -> float:
    return float(np.quantile(x, quant))

summary = []
for dom, g in df_out.groupby("domain"):
    conf = g["max_prob"].to_numpy()
    top_pred = g["pred_name"].value_counts().idxmax()
    top_pred_frac = float((g["pred_name"] == top_pred).mean())

    summary.append({
        "domain": dom,
        "n": int(len(g)),
        "mean_conf": float(conf.mean()),
        "p50": q(conf, 0.50),
        "p90": q(conf, 0.90),
        "p95": q(conf, 0.95),
        "p99": q(conf, 0.99),
        "frac_ge_0.90": float((conf >= 0.90).mean()),
        "frac_ge_0.95": float((conf >= 0.95).mean()),
        "top_pred": top_pred,
        "top_pred_frac": top_pred_frac,
    })

df_summary = pd.DataFrame(summary).sort_values("domain")
display(df_summary)

# ---- NEW: prediction distribution per domain (counts + %) ----
df_pred_dist = (
    df_out
    .groupby(["domain", "pred_name"])
    .size()
    .reset_index(name="count")
)
df_pred_dist["pct"] = df_pred_dist.groupby("domain")["count"].transform(lambda s: s / s.sum())
df_pred_dist = df_pred_dist.sort_values(["domain", "count"], ascending=[True, False])
display(df_pred_dist.head(20))

# ---- NEW: mean confidence by (domain, pred_name) ----
df_conf_by_pred = (
    df_out
    .groupby(["domain", "pred_name"])["max_prob"]
    .agg(n="count", mean_conf="mean", p50=lambda s: float(np.quantile(s, 0.50)), p95=lambda s: float(np.quantile(s, 0.95)))
    .reset_index()
    .sort_values(["domain", "n"], ascending=[True, False])
)
display(df_conf_by_pred.head(20))

# ---- save artifacts ----
OUT = Path("/kaggle/working/train_artifacts")
OUT.mkdir(parents=True, exist_ok=True)

df_out.to_csv(OUT / "challenge_predictions.csv", index=False)
df_summary.to_csv(OUT / "challenge_confidence_summary.csv", index=False)
df_pred_dist.to_csv(OUT / "challenge_pred_distribution.csv", index=False)
df_conf_by_pred.to_csv(OUT / "challenge_conf_by_pred.csv", index=False)

print("✅ wrote:", OUT / "challenge_predictions.csv")
print("✅ wrote:", OUT / "challenge_confidence_summary.csv")
print("✅ wrote:", OUT / "challenge_pred_distribution.csv")
print("✅ wrote:", OUT / "challenge_conf_by_pred.csv")


In [22]:
# CELL: 12B_OOD_GUARD_POLICY - OOD guard + abstain policy on challenge (diag-Mahalanobis + calibrated confidence)

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pydicom
from pathlib import Path

# ----------------------------
# Preconditions / guardrails
# ----------------------------
need = ["DEVICE", "OUT", "NUM_CLASSES", "model", "val_loader", "eval_tfms", "df_chal", "T"]
missing = [k for k in need if k not in globals()]
assert not missing, f"Missing prereqs: {missing}. Run: 01_CONFIG, 05_DATASET_AND_TRANSFORMS, 06_DATALOADERS, 09/10, 10A-10D, and 12."

# ----------------------------
# 1) Pick tau_conf from VAL (calibrated)
# ----------------------------
KEEP_RATE = 0.95  # keep 95% of in-domain samples; abstain ~5%
assert "val_logits" in globals() and "val_y" in globals(), "Run calibration Cell 10A first (val_logits/val_y missing)."

val_probs_cal = torch.softmax(val_logits / T, dim=1)
val_maxprob_cal = val_probs_cal.max(dim=1).values
tau_conf = float(torch.quantile(val_maxprob_cal, 1.0 - KEEP_RATE).item())

print("✅ tau_conf chosen from VAL (calibrated)")
print("KEEP_RATE:", KEEP_RATE)
print("tau_conf:", tau_conf)

# ----------------------------
# 2) Robust loader for JPG/PNG + DICOM
# ----------------------------
def load_any_image(path: str) -> Image.Image:
    p = Path(path)
    suf = p.suffix.lower()

    if suf == ".dcm":
        dcm = pydicom.dcmread(str(p), force=True)
        arr = dcm.pixel_array.astype(np.float32)

        # demo-safe min-max per slice (consistent with your existing challenge audit)
        mn, mx = float(arr.min()), float(arr.max())
        if mx > mn:
            arr = (arr - mn) / (mx - mn)
        else:
            arr = np.zeros_like(arr)

        arr = (arr * 255.0).clip(0, 255).astype(np.uint8)
        return Image.fromarray(arr).convert("RGB")

    return Image.open(str(p)).convert("RGB")

class ChallengeDataset(Dataset):
    def __init__(self, df, tfms):
        self.df = df.reset_index(drop=True)
        self.tfms = tfms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img = load_any_image(row["path"])
        x = self.tfms(img)
        return x, idx

# ----------------------------
# 3) Build feature extractor + in-domain embedding stats (VAL)
#    Using ResNet18: penultimate features = model without final FC
# ----------------------------
feat_extractor = torch.nn.Sequential(*list(model.children())[:-1]).to(DEVICE)
feat_extractor.eval()
model.eval()

@torch.no_grad()
def collect_embeddings(loader):
    embs = []
    for xb, _ in loader:
        xb = xb.to(DEVICE, non_blocking=True)
        z = feat_extractor(xb)          # (B,512,1,1)
        z = z.view(z.shape[0], -1)      # (B,512)
        embs.append(z.detach().cpu())
    return torch.cat(embs, dim=0)       # (N,512)

# Collect in-domain embeddings from VAL
# NOTE: this iterates val_loader once; acceptable for demo-grade, reproducible
val_embs = collect_embeddings(val_loader)  # (N,512)

mu = val_embs.mean(dim=0)
var = val_embs.var(dim=0, unbiased=False) + 1e-6  # diagonal covariance (stability)

# diag Mahalanobis distance for VAL, then set tau_ood at 99th percentile
val_dist = (((val_embs - mu) ** 2) / var).sum(dim=1)
OOD_Q = 0.99
tau_ood = float(torch.quantile(val_dist, OOD_Q).item())

print("\n✅ OOD threshold tau_ood (diag-Mahalanobis)")
print("OOD_Q:", OOD_Q)
print("tau_ood:", tau_ood)

# ----------------------------
# 4) Run challenge inference + policy
# ----------------------------
chal_ds = ChallengeDataset(df_chal, eval_tfms)

# num_workers=0 is the safest for pydicom; avoids worker crashes
chal_loader = DataLoader(chal_ds, batch_size=64, shuffle=False, num_workers=0, pin_memory=True)

rows = []

@torch.no_grad()
def run_policy(loader):
    for xb, idxs in loader:
        xb = xb.to(DEVICE, non_blocking=True)

        # logits -> calibrated probs for confidence
        logits = model(xb)
        probs_cal = torch.softmax(logits / T, dim=1)
        max_prob, pred_id = probs_cal.max(dim=1)

        # embeddings for OOD score
        z = feat_extractor(xb).view(xb.shape[0], -1).detach().cpu()
        dist = (((z - mu) ** 2) / var).sum(dim=1)

        max_prob = max_prob.detach().cpu()
        pred_id = pred_id.detach().cpu()

        for j in range(xb.shape[0]):
            ridx = int(idxs[j].item())
            # domain column name differs sometimes; try common ones
            dom = None
            for k in ["domain", "source", "dataset", "origin"]:
                if k in df_chal.columns:
                    dom = df_chal.iloc[ridx][k]
                    break
            if dom is None:
                dom = "unknown"

            rows.append({
                "row_idx": ridx,
                "path": df_chal.iloc[ridx]["path"],
                "domain": dom,
                "pred_id": int(pred_id[j].item()),
                "max_prob_cal": float(max_prob[j].item()),
                "ood_dist": float(dist[j].item()),
            })

run_policy(chal_loader)
df_out = pd.DataFrame(rows)

# pred_id -> pred_name mapping
if "ID2NAME" in globals():
    id2name = ID2NAME
else:
    id2name = {0: "glioma", 1: "meningioma", 2: "pituitary", 3: "notumor"}

df_out["pred_name"] = df_out["pred_id"].map(id2name)

# Policy:
# 1) if OOD -> ABSTAIN
# 2) else if low confidence -> ABSTAIN
# 3) else -> predicted class
df_out["is_ood"] = df_out["ood_dist"] > tau_ood
df_out["abstain_lowconf"] = df_out["max_prob_cal"] < tau_conf
df_out["final_action"] = np.where(df_out["is_ood"] | df_out["abstain_lowconf"], "ABSTAIN", df_out["pred_name"])

tumor_set = {"glioma", "meningioma", "pituitary"}
df_out["is_tumor_pred"] = df_out["pred_name"].isin(tumor_set)

pre_tumor = int(df_out["is_tumor_pred"].sum())
post_tumor = int(df_out["final_action"].isin(tumor_set).sum())
abstain_total = int((df_out["final_action"] == "ABSTAIN").sum())

print("\n✅ Challenge policy summary:")
print("Total rows:", len(df_out))
print("Tumor preds BEFORE policy:", pre_tumor)
print("Tumor preds AFTER policy:", post_tumor)
print("Total ABSTAINS:", abstain_total)

# Save artifacts
df_out.to_csv(OUT / "challenge_policy_outputs.csv", index=False)

summary = pd.DataFrame([{
    "keep_rate_in_domain": KEEP_RATE,
    "tau_conf": tau_conf,
    "ood_quantile": OOD_Q,
    "tau_ood": tau_ood,
    "challenge_rows": len(df_out),
    "tumor_preds_before": pre_tumor,
    "tumor_preds_after": post_tumor,
    "abstain_total": abstain_total,
}])
summary.to_csv(OUT / "policy_summary.csv", index=False)

print("saved ->", OUT / "challenge_policy_outputs.csv")
print("saved ->", OUT / "policy_summary.csv")

# Show worst offenders (highest-confidence tumor preds) and whether policy catches them
try:
    from IPython.display import display
    df_tumor = df_out[df_out["is_tumor_pred"]].sort_values("max_prob_cal", ascending=False)
    print("\nTop 10 tumor preds by confidence (with policy flags):")
    display(df_tumor[["domain","pred_name","max_prob_cal","is_ood","abstain_lowconf","final_action","path"]].head(10))
except Exception as e:
    print("Display skipped:", e)


✅ tau_conf chosen from VAL (calibrated)
KEEP_RATE: 0.95
tau_conf: 0.9921115636825562

✅ OOD threshold tau_ood (diag-Mahalanobis)
OOD_Q: 0.99
tau_ood: 1089.089599609375





✅ Challenge policy summary:
Total rows: 4275
Tumor preds BEFORE policy: 563
Tumor preds AFTER policy: 62
Total ABSTAINS: 1282
saved -> /kaggle/working/train_artifacts/challenge_policy_outputs.csv
saved -> /kaggle/working/train_artifacts/policy_summary.csv

Top 10 tumor preds by confidence (with policy flags):


Unnamed: 0,domain,pred_name,max_prob_cal,is_ood,abstain_lowconf,final_action,path
4008,stroke,meningioma,0.999789,False,False,meningioma,/kaggle/input/datasets/mitangshu11/brain-strok...
2808,stroke,meningioma,0.999732,False,False,meningioma,/kaggle/input/datasets/mitangshu11/brain-strok...
115,normal_dicom,meningioma,0.9996,False,False,meningioma,/kaggle/input/datasets/trainingdatapro/dicom-b...
2371,stroke,meningioma,0.999562,False,False,meningioma,/kaggle/input/datasets/mitangshu11/brain-strok...
4254,stroke,meningioma,0.999487,False,False,meningioma,/kaggle/input/datasets/mitangshu11/brain-strok...
2633,stroke,meningioma,0.999435,False,False,meningioma,/kaggle/input/datasets/mitangshu11/brain-strok...
3454,stroke,meningioma,0.999348,False,False,meningioma,/kaggle/input/datasets/mitangshu11/brain-strok...
206,normal_dicom,meningioma,0.999341,False,False,meningioma,/kaggle/input/datasets/trainingdatapro/dicom-b...
3999,stroke,meningioma,0.999204,False,False,meningioma,/kaggle/input/datasets/mitangshu11/brain-strok...
165,normal_dicom,meningioma,0.99887,False,False,meningioma,/kaggle/input/datasets/trainingdatapro/dicom-b...


In [None]:
# CELL: 12C_CLASS_CONDITIONAL_INLIER_GUARD - Catch high-confidence OOD tumor preds via class-conditional distance

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader

# Preconditions: 12B must have run
need = ["DEVICE", "OUT", "NUM_CLASSES", "model", "val_loader", "feat_extractor", "tau_conf", "tau_ood", "T", "df_out"]
missing = [k for k in need if k not in globals()]
assert not missing, f"Missing prereqs: {missing}. Run 12B first."

model.eval()
feat_extractor.eval()

@torch.no_grad()
def collect_embeddings_and_labels(loader):
    Zs, Ys = [], []
    for xb, yb in loader:
        xb = xb.to(DEVICE, non_blocking=True)
        z = feat_extractor(xb).view(xb.shape[0], -1).detach().cpu()
        Zs.append(z)
        Ys.append(yb.detach().cpu())
    return torch.cat(Zs, dim=0), torch.cat(Ys, dim=0)

# 1) Build class-conditional stats from in-domain VAL embeddings
Z_val, y_val = collect_embeddings_and_labels(val_loader)  # (N, D), (N,)
D = Z_val.shape[1]

mu_c = torch.zeros((NUM_CLASSES, D))
var_c = torch.zeros((NUM_CLASSES, D))
tau_c = torch.zeros((NUM_CLASSES,))

Q_CLASS = 0.99  # class-conditional inlier threshold

for c in range(NUM_CLASSES):
    idx = torch.where(y_val == c)[0]
    assert idx.numel() > 10, f"Too few val examples for class {c}: {idx.numel()}"
    Zc = Z_val[idx]
    mu = Zc.mean(dim=0)
    var = Zc.var(dim=0, unbiased=False) + 1e-6

    # class-conditional diag Mahalanobis dist for in-domain examples of class c
    dist_c = (((Zc - mu) ** 2) / var).sum(dim=1)
    tau = float(torch.quantile(dist_c, Q_CLASS).item())

    mu_c[c] = mu
    var_c[c] = var
    tau_c[c] = tau

print("✅ Built class-conditional thresholds (tau_c) at quantile:", Q_CLASS)
print("tau_c:", [float(x) for x in tau_c])

# 2) Compute embedding distances for each challenge row to its predicted class cluster
# We need embeddings for challenge again (df_out contains row_idx in original df_chal order)
# We'll reuse the cached chal embeddings approach by reloading from paths.

from PIL import Image
import pydicom
from pathlib import Path
from torch.utils.data import Dataset

def load_any_image(path: str) -> Image.Image:
    p = Path(path)
    suf = p.suffix.lower()
    if suf == ".dcm":
        dcm = pydicom.dcmread(str(p), force=True)
        arr = dcm.pixel_array.astype(np.float32)
        mn, mx = float(arr.min()), float(arr.max())
        if mx > mn:
            arr = (arr - mn) / (mx - mn)
        else:
            arr = np.zeros_like(arr)
        arr = (arr * 255.0).clip(0,255).astype(np.uint8)
        return Image.fromarray(arr).convert("RGB")
    return Image.open(str(p)).convert("RGB")

class ChallengeEmbedDataset(Dataset):
    def __init__(self, df_out, tfms):
        self.df = df_out.reset_index(drop=True)
        self.tfms = tfms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        path = self.df.iloc[idx]["path"]
        img = load_any_image(path)
        x = self.tfms(img)
        return x, idx

chal_ds2 = ChallengeEmbedDataset(df_out, eval_tfms)
chal_loader2 = DataLoader(chal_ds2, batch_size=64, shuffle=False, num_workers=0, pin_memory=True)

ood_dist_predclass = np.zeros(len(df_out), dtype=np.float32)

@torch.no_grad()
def compute_predclass_dist(loader):
    for xb, idxs in loader:
        xb = xb.to(DEVICE, non_blocking=True)
        z = feat_extractor(xb).view(xb.shape[0], -1).detach().cpu()  # (B,D)

        for j in range(z.shape[0]):
            i = int(idxs[j].item())
            pred = int(df_out.iloc[i]["pred_id"])  # predicted class from 12B
            mu = mu_c[pred]
            var = var_c[pred]
            d = (((z[j] - mu) ** 2) / var).sum().item()
            ood_dist_predclass[i] = d

compute_predclass_dist(chal_loader2)
df_out["predclass_dist"] = ood_dist_predclass

# 3) New guard: if far from predicted class cluster => abstain
df_out["is_outlier_predclass"] = df_out["predclass_dist"] > df_out["pred_id"].map(lambda c: float(tau_c[c]))

tumor_set = {"glioma","meningioma","pituitary"}
df_out["is_tumor_pred"] = df_out["pred_name"].isin(tumor_set)

# Policy v2:
# ABSTAIN if (global OOD) OR (low confidence) OR (outlier for predicted class)
df_out["final_action_v2"] = np.where(
    df_out["is_ood"] | df_out["abstain_lowconf"] | df_out["is_outlier_predclass"],
    "ABSTAIN",
    df_out["pred_name"]
)

pre_tumor = int(df_out["is_tumor_pred"].sum())
post_tumor_v1 = int(df_out["final_action"].isin(tumor_set).sum())
post_tumor_v2 = int(df_out["final_action_v2"].isin(tumor_set).sum())
abstain_v1 = int((df_out["final_action"] == "ABSTAIN").sum())
abstain_v2 = int((df_out["final_action_v2"] == "ABSTAIN").sum())

print("\n✅ Policy comparison on challenge:")
print("Tumor preds BEFORE policy:", pre_tumor)
print("Tumor preds AFTER policy v1:", post_tumor_v1)
print("Tumor preds AFTER policy v2:", post_tumor_v2)
print("ABSTAINS v1:", abstain_v1)
print("ABSTAINS v2:", abstain_v2)

# Save artifacts
df_out.to_csv(OUT / "challenge_policy_outputs_v2.csv", index=False)

summary2 = pd.DataFrame([{
    "tau_conf": float(tau_conf),
    "tau_ood": float(tau_ood),
    "q_class": float(Q_CLASS),
    "tau_c": [float(x) for x in tau_c],
    "challenge_rows": int(len(df_out)),
    "tumor_before": pre_tumor,
    "tumor_after_v1": post_tumor_v1,
    "tumor_after_v2": post_tumor_v2,
    "abstain_v1": abstain_v1,
    "abstain_v2": abstain_v2,
}])
summary2.to_csv(OUT / "policy_summary_v2.csv", index=False)

print("saved ->", OUT / "challenge_policy_outputs_v2.csv")
print("saved ->", OUT / "policy_summary_v2.csv")

# Show top remaining tumor preds after v2 (if any)
try:
    from IPython.display import display
    df_remain = df_out[df_out["final_action_v2"].isin(tumor_set)].sort_values("max_prob_cal", ascending=False)
    print("\nTop 10 remaining tumor preds AFTER v2:")
    display(df_remain[["domain","pred_name","max_prob_cal","is_ood","abstain_lowconf","is_outlier_predclass","final_action_v2","path"]].head(10))
except Exception as e:
    print("Display skipped:", e)



In [23]:
# CELL: 12D_DOMAIN_CLASSIFIER_OOD_GUARD - Train a domain (in-domain vs OOD) guard on embeddings + apply abstain policy

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from PIL import Image
import pydicom

# Preconditions
need = ["DEVICE","OUT","model","feat_extractor","eval_tfms","df_chal","val_loader","T","tau_conf"]
missing = [k for k in need if k not in globals()]
assert not missing, f"Missing prereqs: {missing}. Run 12B first (and calibration)."

# --- robust image loader ---
def load_any_image(path: str) -> Image.Image:
    p = Path(path)
    suf = p.suffix.lower()
    if suf == ".dcm":
        dcm = pydicom.dcmread(str(p), force=True)
        arr = dcm.pixel_array.astype(np.float32)
        mn, mx = float(arr.min()), float(arr.max())
        if mx > mn:
            arr = (arr - mn) / (mx - mn)
        else:
            arr = np.zeros_like(arr)
        arr = (arr * 255.0).clip(0,255).astype(np.uint8)
        return Image.fromarray(arr).convert("RGB")
    return Image.open(str(p)).convert("RGB")

class PathDataset(Dataset):
    def __init__(self, paths, tfms):
        self.paths = list(paths)
        self.tfms = tfms
    def __len__(self):
        return len(self.paths)
    def __getitem__(self, idx):
        img = load_any_image(self.paths[idx])
        x = self.tfms(img)
        return x

@torch.no_grad()
def embed_loader(loader):
    feat_extractor.eval()
    Z = []
    for xb in loader:
        if isinstance(xb, (tuple, list)):
            xb = xb[0]
        xb = xb.to(DEVICE, non_blocking=True)
        z = feat_extractor(xb).view(xb.shape[0], -1).detach().cpu()
        Z.append(z)
    return torch.cat(Z, dim=0).numpy()  # (N,D)

# -------------------------
# 1) Get in-domain embeddings (VAL) and OOD embeddings (challenge)
#    Keep it tight + reproducible (VAL only) to start
# -------------------------
@torch.no_grad()
def embed_from_val_loader(val_loader):
    Z = []
    for xb, _ in val_loader:
        xb = xb.to(DEVICE, non_blocking=True)
        z = feat_extractor(xb).view(xb.shape[0], -1).detach().cpu()
        Z.append(z)
    return torch.cat(Z, dim=0).numpy()

Z_in = embed_from_val_loader(val_loader)   # in-domain
paths_ood = df_chal["path"].tolist()
ood_ds = PathDataset(paths_ood, eval_tfms)
ood_loader = DataLoader(ood_ds, batch_size=64, shuffle=False, num_workers=0, pin_memory=True)
Z_ood = embed_loader(ood_loader)

y_in = np.ones((Z_in.shape[0],), dtype=np.int64)
y_ood = np.zeros((Z_ood.shape[0],), dtype=np.int64)

X = np.vstack([Z_in, Z_ood])
y = np.concatenate([y_in, y_ood])

print("Embeddings shapes:")
print("  in-domain (val):", Z_in.shape)
print("  ood (challenge):", Z_ood.shape)
print("  X:", X.shape, "y:", y.shape)

# -------------------------
# 2) Train a simple domain classifier on embeddings
# -------------------------
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

clf = LogisticRegression(max_iter=2000, class_weight="balanced", n_jobs=-1)
clf.fit(X, y)

p_in_train = clf.predict_proba(X)[:, 1]
auc = roc_auc_score(y, p_in_train)
print("✅ Domain guard AUC (val vs challenge):", auc)

# -------------------------
# 3) Choose tau_domain to keep 99% of in-domain val (low false abstain)
# -------------------------
KEEP_IN_DOMAIN = 0.99
p_in_val = clf.predict_proba(Z_in)[:, 1]
tau_domain = float(np.quantile(p_in_val, 1.0 - KEEP_IN_DOMAIN))

print("✅ tau_domain chosen from VAL")
print("KEEP_IN_DOMAIN:", KEEP_IN_DOMAIN)
print("tau_domain:", tau_domain)

# -------------------------
# 4) Apply domain guard to challenge + compare to your v1 results
# -------------------------
# We reuse df_out if it exists (from 12B/12C). Otherwise, we rebuild a minimal df_out using your existing challenge predictions.
assert "df_out" in globals(), "df_out not found. Run 12B first (it creates df_out)."

# Compute p_in_domain for each challenge row using embeddings we already computed (Z_ood aligns with df_chal order)
# df_out uses df_chal row order; we rebuild embeddings aligned to df_out rows:
# safest: re-embed df_out paths (small enough, 4275)
paths_out = df_out["path"].tolist()
out_ds = PathDataset(paths_out, eval_tfms)
out_loader = DataLoader(out_ds, batch_size=64, shuffle=False, num_workers=0, pin_memory=True)
Z_out = embed_loader(out_loader)

p_in_out = clf.predict_proba(Z_out)[:, 1]
df_out["p_in_domain"] = p_in_out

df_out["abstain_domain"] = df_out["p_in_domain"] < tau_domain

tumor_set = {"glioma","meningioma","pituitary"}
df_out["final_action_v3"] = np.where(
    df_out["abstain_domain"] | df_out["abstain_lowconf"],
    "ABSTAIN",
    df_out["pred_name"]
)

pre_tumor = int(df_out["pred_name"].isin(tumor_set).sum())
post_tumor_v1 = int(df_out["final_action"].isin(tumor_set).sum())
post_tumor_v3 = int(df_out["final_action_v3"].isin(tumor_set).sum())
abstain_v1 = int((df_out["final_action"] == "ABSTAIN").sum())
abstain_v3 = int((df_out["final_action_v3"] == "ABSTAIN").sum())

print("\n✅ Policy comparison (challenge):")
print("Tumor preds BEFORE:", pre_tumor)
print("Tumor preds AFTER v1:", post_tumor_v1)
print("Tumor preds AFTER v3 (domain guard):", post_tumor_v3)
print("ABSTAINS v1:", abstain_v1)
print("ABSTAINS v3:", abstain_v3)

# Save artifacts
df_out.to_csv(OUT / "challenge_policy_outputs_v3.csv", index=False)
pd.DataFrame([{
    "tau_conf": float(tau_conf),
    "keep_in_domain": float(KEEP_IN_DOMAIN),
    "tau_domain": float(tau_domain),
    "auc_val_vs_challenge": float(auc),
    "tumor_before": pre_tumor,
    "tumor_after_v1": post_tumor_v1,
    "tumor_after_v3": post_tumor_v3,
    "abstain_v1": abstain_v1,
    "abstain_v3": abstain_v3,
}]).to_csv(OUT / "policy_summary_v3.csv", index=False)

print("saved ->", OUT / "challenge_policy_outputs_v3.csv")
print("saved ->", OUT / "policy_summary_v3.csv")

# Show top remaining tumor preds after v3
try:
    from IPython.display import display
    remain = df_out[df_out["final_action_v3"].isin(tumor_set)].sort_values("max_prob_cal", ascending=False)
    print("\nTop 10 remaining tumor preds AFTER v3:")
    display(remain[["domain","pred_name","max_prob_cal","p_in_domain","abstain_domain","final_action_v3","path"]].head(10))
except Exception as e:
    print("Display skipped:", e)




Embeddings shapes:
  in-domain (val): (1850, 512)
  ood (challenge): (4275, 512)
  X: (6125, 512) y: (6125,)
✅ Domain guard AUC (val vs challenge): 1.0
✅ tau_domain chosen from VAL
KEEP_IN_DOMAIN: 0.99
tau_domain: 0.9011013984593388





✅ Policy comparison (challenge):
Tumor preds BEFORE: 563
Tumor preds AFTER v1: 62
Tumor preds AFTER v3 (domain guard): 0
ABSTAINS v1: 1282
ABSTAINS v3: 4275
saved -> /kaggle/working/train_artifacts/challenge_policy_outputs_v3.csv
saved -> /kaggle/working/train_artifacts/policy_summary_v3.csv

Top 10 remaining tumor preds AFTER v3:


Unnamed: 0,domain,pred_name,max_prob_cal,p_in_domain,abstain_domain,final_action_v3,path


In [None]:
import numpy as np
from pathlib import Path
from PIL import Image
import pydicom
from torch.utils.data import Dataset

def load_any_image(path: str) -> Image.Image:
    p = Path(path)
    suf = p.suffix.lower()
    if suf == ".dcm":
        dcm = pydicom.dcmread(str(p), force=True)
        arr = dcm.pixel_array.astype(np.float32)
        mn, mx = float(arr.min()), float(arr.max())
        if mx > mn:
            arr = (arr - mn) / (mx - mn)
        else:
            arr = np.zeros_like(arr)
        arr = (arr * 255.0).clip(0,255).astype(np.uint8)
        return Image.fromarray(arr).convert("RGB")
    return Image.open(str(p)).convert("RGB")

class PathDataset(Dataset):
    def __init__(self, paths, tfms):
        self.paths = list(paths)
        self.tfms = tfms
    def __len__(self):
        return len(self.paths)
    def __getitem__(self, idx):
        img = load_any_image(self.paths[idx])
        return self.tfms(img)

print("✅ PathDataset + load_any_image ready")


In [25]:
# CELL: 12E_DOMAIN_GUARD_HELDOUT_AND_SWEEP - held-out AUC + tau_domain sweep (val/test/challenge)

import numpy as np
import pandas as pd
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Preconditions
need = ["DEVICE","feat_extractor","eval_tfms","val_loader","test_loader","df_chal"]
missing = [k for k in need if k not in globals()]
assert not missing, f"Missing prereqs: {missing}. Run 12D prerequisites first."

feat_extractor.eval()

@torch.no_grad()
def embed_from_labeled_loader(loader):
    Z = []
    for xb, _ in loader:
        xb = xb.to(DEVICE, non_blocking=True)
        z = feat_extractor(xb).view(xb.shape[0], -1).detach().cpu()
        Z.append(z)
    return torch.cat(Z, dim=0).numpy()

# reuse the PathDataset + load_any_image from 12D if present
assert "PathDataset" in globals() and "load_any_image" in globals(), "Run 12D first (expects PathDataset/load_any_image)."
from torch.utils.data import DataLoader

@torch.no_grad()
def embed_from_paths(paths, batch_size=64):
    ds = PathDataset(paths, eval_tfms)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)
    Z = []
    for xb in dl:
        xb = xb.to(DEVICE, non_blocking=True)
        z = feat_extractor(xb).view(xb.shape[0], -1).detach().cpu()
        Z.append(z)
    return torch.cat(Z, dim=0).numpy()

# 1) Embeddings
Z_val = embed_from_labeled_loader(val_loader)   # in-domain
Z_test = embed_from_labeled_loader(test_loader) # in-domain (holdout)
Z_chal = embed_from_paths(df_chal["path"].tolist())

y_val = np.ones((Z_val.shape[0],), dtype=np.int64)
y_chal = np.zeros((Z_chal.shape[0],), dtype=np.int64)

# 2) Held-out evaluation (split within val+challenge)
X = np.vstack([Z_val, Z_chal])
y = np.concatenate([y_val, y_chal])

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf = LogisticRegression(max_iter=2000, class_weight="balanced", n_jobs=-1)
clf.fit(X_tr, y_tr)

p_in_te = clf.predict_proba(X_te)[:, 1]
auc_heldout = roc_auc_score(y_te, p_in_te)

print("✅ Held-out domain AUC (val vs challenge):", auc_heldout)

# 3) Sweep tau_domain based on "keep rate" on VAL, report abstain rates on TEST + CHALLENGE
p_in_val = clf.predict_proba(Z_val)[:, 1]
p_in_test = clf.predict_proba(Z_test)[:, 1]
p_in_chal = clf.predict_proba(Z_chal)[:, 1]

keep_rates = [0.999, 0.995, 0.99, 0.98, 0.95, 0.90]

rows = []
for keep in keep_rates:
    tau = float(np.quantile(p_in_val, 1.0 - keep))
    abst_val = float((p_in_val < tau).mean())
    abst_test = float((p_in_test < tau).mean())
    abst_chal = float((p_in_chal < tau).mean())
    rows.append({
        "keep_in_val_target": keep,
        "tau_domain": tau,
        "abstain_val": abst_val,
        "abstain_test": abst_test,
        "abstain_challenge": abst_chal,
    })

df_sweep = pd.DataFrame(rows).sort_values("keep_in_val_target", ascending=False)
print("\n✅ tau_domain sweep:")
display(df_sweep)

df_sweep.to_csv(OUT / "domain_guard_tau_sweep.csv", index=False)
print("saved ->", OUT / "domain_guard_tau_sweep.csv")

# --- export domain guard weights for MVP (no sklearn dependency) ---
import numpy as np
np.savez(
    OUT / "domain_guard_lr.npz",
    coef=clf.coef_.astype(np.float32),
    intercept=clf.intercept_.astype(np.float32),
)
print("saved ->", OUT / "domain_guard_lr.npz")




✅ Held-out domain AUC (val vs challenge): 0.9999157380295338

✅ tau_domain sweep:


Unnamed: 0,keep_in_val_target,tau_domain,abstain_val,abstain_test,abstain_challenge
0,0.999,0.428545,0.001081,0.007786,0.998363
1,0.995,0.800385,0.004865,0.023844,0.999766
2,0.99,0.848166,0.01027,0.026764,0.999766
3,0.98,0.913868,0.02,0.039416,1.0
4,0.95,0.964167,0.05027,0.0618,1.0
5,0.9,0.988675,0.1,0.112895,1.0


saved -> /kaggle/working/train_artifacts/domain_guard_tau_sweep.csv
saved -> /kaggle/working/train_artifacts/domain_guard_lr.npz


In [26]:
from pathlib import Path
p = Path("/kaggle/working/train_artifacts/domain_guard_lr.npz")
print("domain_guard_lr.npz exists:", p.exists(), "size_mb:", (p.stat().st_size/1e6 if p.exists() else None))


domain_guard_lr.npz exists: True size_mb: 0.002564


In [None]:
# CELL: 12F_FINAL_POLICY_REPORT - Lock tau_domain + report final demo metrics (test + challenge)

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import json

# ---- set your chosen tau_domain from the sweep ----
TAU_DOMAIN = 0.580288  # keep_in_val_target=0.999 row

# prereqs
need = ["OUT","NUM_CLASSES","T","tau_conf","test_logits","test_y","df_out"]
missing = [k for k in need if k not in globals()]
assert not missing, f"Missing prereqs: {missing}. Run calibration + 12B + domain guard cells first."

# Helper: macro-F1 on subset
def confusion_matrix(preds, y_true, k):
    cm = torch.zeros((k, k), dtype=torch.int64)
    for t, p in zip(y_true.tolist(), preds.tolist()):
        cm[t, p] += 1
    return cm

def macro_f1_from_cm(cm):
    k = cm.shape[0]
    f1s = []
    for c in range(k):
        tp = cm[c, c].item()
        fp = cm[:, c].sum().item() - tp
        fn = cm[c, :].sum().item() - tp
        denom = (2*tp + fp + fn)
        f1 = 0.0 if denom == 0 else (2*tp / denom)
        f1s.append(f1)
    return float(sum(f1s) / k)

# -----------------------
# 1) IN-DOMAIN (TEST): apply final policy
# -----------------------
test_probs_cal = torch.softmax(test_logits / T, dim=1)
test_maxprob = test_probs_cal.max(dim=1).values
test_pred = test_probs_cal.argmax(dim=1)

# Domain scores on TEST were computed in the sweep cell as p_in_test;
# if not in globals, we recompute quickly using clf + Z_test if present.
assert "p_in_test" in globals() or ("clf" in globals() and "Z_test" in globals()), \
    "Need p_in_test from sweep cell. Re-run the sweep cell first."

if "p_in_test" not in globals():
    p_in_test = clf.predict_proba(Z_test)[:, 1]

p_in_test = np.asarray(p_in_test)
assert p_in_test.shape[0] == test_y.shape[0], (p_in_test.shape, test_y.shape)

abstain_domain_test = p_in_test < TAU_DOMAIN
abstain_lowconf_test = (test_maxprob.numpy() < float(tau_conf))

abstain_test = abstain_domain_test | abstain_lowconf_test
coverage_test = float((~abstain_test).mean())
abstain_rate_test = float(abstain_test.mean())

# Metrics on accepted only
accepted_idx = np.where(~abstain_test)[0]
test_acc_all = float((test_pred == test_y).to(torch.float32).mean().item())

if accepted_idx.size > 0:
    y_acc = test_y[accepted_idx]
    p_acc = test_pred[accepted_idx]
    acc_acc = float((p_acc == y_acc).to(torch.float32).mean().item())
    cm = confusion_matrix(p_acc, y_acc, NUM_CLASSES)
    f1_acc = macro_f1_from_cm(cm)
else:
    acc_acc, f1_acc = None, None

print("✅ FINAL POLICY (TEST)")
print("tau_conf:", float(tau_conf))
print("tau_domain:", TAU_DOMAIN)
print("test_acc_all (no abstain):", test_acc_all)
print("test_abstain_rate:", abstain_rate_test)
print("test_coverage:", coverage_test)
print("test_acc_on_accepted:", acc_acc)
print("test_macro_f1_on_accepted:", f1_acc)

# -----------------------
# 2) OOD (CHALLENGE): apply final policy using p_in_domain from domain guard cell
# -----------------------
assert "p_in_domain" in df_out.columns, "df_out missing p_in_domain. Run domain guard cell (12D) first."

df_final = df_out.copy()
df_final["abstain_domain"] = df_final["p_in_domain"] < TAU_DOMAIN
df_final["final_action_demo"] = np.where(
    df_final["abstain_domain"] | df_final["abstain_lowconf"],
    "ABSTAIN",
    df_final["pred_name"]
)

tumor_set = {"glioma","meningioma","pituitary"}
tumor_before = int(df_final["pred_name"].isin(tumor_set).sum())
tumor_after = int(df_final["final_action_demo"].isin(tumor_set).sum())
abstain_chal = int((df_final["final_action_demo"] == "ABSTAIN").sum())
coverage_chal = 1.0 - abstain_chal / len(df_final)

print("\n✅ FINAL POLICY (CHALLENGE)")
print("challenge_rows:", len(df_final))
print("tumor_preds_before:", tumor_before)
print("tumor_preds_after:", tumor_after)
print("abstains:", abstain_chal)
print("coverage:", coverage_chal)

# Save artifacts + config
df_final.to_csv(OUT / "challenge_policy_outputs_demo.csv", index=False)

cfg = {
    "temperature_T": float(T),
    "tau_conf": float(tau_conf),
    "tau_domain": float(TAU_DOMAIN),
    "notes": "Final demo policy: ABSTAIN if p_in_domain < tau_domain OR max_prob_cal < tau_conf."
}
with open(OUT / "final_policy_config.json", "w") as f:
    json.dump(cfg, f, indent=2)

pd.DataFrame([{
    "test_acc_all": test_acc_all,
    "test_abstain_rate": abstain_rate_test,
    "test_coverage": coverage_test,
    "test_acc_on_accepted": acc_acc,
    "test_macro_f1_on_accepted": f1_acc,
    "challenge_rows": len(df_final),
    "challenge_tumor_before": tumor_before,
    "challenge_tumor_after": tumor_after,
    "challenge_abstains": abstain_chal,
    "challenge_coverage": coverage_chal,
    "tau_conf": float(tau_conf),
    "tau_domain": float(TAU_DOMAIN),
    "T": float(T),
}]).to_csv(OUT / "final_policy_report.csv", index=False)

print("\nsaved ->", OUT / "challenge_policy_outputs_demo.csv")
print("saved ->", OUT / "final_policy_config.json")
print("saved ->", OUT / "final_policy_report.csv")


In [None]:
# CELL: 13_CHALLENGE_TOP_TUMOR_FALSE_POSITIVES - inspect highest-confidence tumor preds on non-tumor domains

import pandas as pd
from pathlib import Path
from PIL import Image
import matplotlib.pyplot as plt

OUT = Path("/kaggle/working/train_artifacts")
pred_path = OUT / "challenge_predictions.csv"
assert pred_path.exists(), f"Missing: {pred_path} (run the challenge audit cell first)"

dfp = pd.read_csv(pred_path)

# Tumor classes only (everything except notumor)
tumor = dfp[dfp["pred_name"].isin(["glioma", "meningioma", "pituitary"])].copy()
tumor = tumor.sort_values("max_prob", ascending=False)

print("tumor-pred rows:", len(tumor))
display(tumor[["domain","pred_name","max_prob","path"]].head(20))

# Visual grid (top N)
N = 24
show = tumor.head(N).reset_index(drop=True)

cols = 6
rows = (N + cols - 1) // cols
plt.figure(figsize=(3*cols, 3*rows))

for i in range(len(show)):
    p = Path(show.loc[i, "path"])
    title = f'{show.loc[i,"domain"]}\n{show.loc[i,"pred_name"]} {show.loc[i,"max_prob"]:.2f}'
    try:
        img = Image.open(p).convert("RGB")
    except Exception:
        # If any DICOM slipped in, skip rendering here (your audit loader handles DICOM).
        continue
    ax = plt.subplot(rows, cols, i+1)
    ax.imshow(img)
    ax.set_title(title, fontsize=9)
    ax.axis("off")

plt.tight_layout()
plt.show()


In [27]:
# CELL98: SAVE_AND_ZIP_ARTIFACTS - Verify artifacts + bundle into one zip for download

from pathlib import Path
import os, zipfile, json, time

OUT = Path(OUT)  # ensure Path

must_have = [
    "best_model.pth",
    "train_history.json",
    "env_snapshot.json",
    "temperature_scaling.json",
    "calibration_summary_before_after.csv",
    "calibration_metrics.json",
    "final_policy_config.json",
    "domain_guard_lr.npz",
    "final_policy_report.csv",
    "muaz_policy_report.csv",
    "muaz_policy_outputs.csv",
    "challenge_policy_outputs_demo.csv",
]

print("OUT =", OUT)
print("\nChecking required files...\n")

missing = []
present = []
for fn in must_have:
    p = OUT / fn
    if p.exists():
        present.append((fn, p.stat().st_size))
    else:
        missing.append(fn)

for fn, sz in sorted(present, key=lambda x: -x[1]):
    print(f"✅ {fn:35s} {sz/1e6:8.2f} MB")

if missing:
    print("\n❌ Missing:")
    for fn in missing:
        print(" -", fn)
else:
    print("\n✅ All required files present.")

# Always include "everything in OUT" in the zip (even if some names differ)
zip_path = OUT.parent / "mri_demo_artifacts_bundle.zip"
if zip_path.exists():
    zip_path.unlink()

print("\nZipping all files under:", OUT)
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
    for p in OUT.rglob("*"):
        if p.is_file():
            z.write(p, arcname=str(p.relative_to(OUT.parent)))

print("\n✅ Zip created:", zip_path, f"size={zip_path.stat().st_size/1e6:.2f} MB")

# Small manifest for human sanity
manifest = {
    "created_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    "out_dir": str(OUT),
    "zip_path": str(zip_path),
    "files": [{"name": f, "bytes": int(sz)} for f, sz in present],
    "missing": missing,
}
manifest_path = OUT / "BUNDLE_MANIFEST.json"
with open(manifest_path, "w") as f:
    json.dump(manifest, f, indent=2)

print("✅ Wrote:", manifest_path)


OUT = /kaggle/working/train_artifacts

Checking required files...

✅ best_model.pth                         44.79 MB
✅ challenge_policy_outputs_demo.csv       1.13 MB
✅ muaz_policy_outputs.csv                 0.07 MB
✅ domain_guard_lr.npz                     0.00 MB
✅ train_history.json                      0.00 MB
✅ calibration_metrics.json                0.00 MB
✅ calibration_summary_before_after.csv     0.00 MB
✅ muaz_policy_report.csv                  0.00 MB
✅ final_policy_report.csv                 0.00 MB
✅ final_policy_config.json                0.00 MB
✅ env_snapshot.json                       0.00 MB
✅ temperature_scaling.json                0.00 MB

✅ All required files present.

Zipping all files under: /kaggle/working/train_artifacts

✅ Zip created: /kaggle/working/mri_demo_artifacts_bundle.zip size=84.05 MB
✅ Wrote: /kaggle/working/train_artifacts/BUNDLE_MANIFEST.json


## Appendix: Debug helpers (optional)

These cells are optional and safe. They help confirm inputs and GPU availability.


In [None]:
# CELL: 99_DEBUG_INPUTS - List mounted inputs (optional)

!ls -1 /kaggle/input | sed -n '1,200p'


In [None]:
# CELL: 99_DEBUG_GPU - Check GPU (optional)

!nvidia-smi || true


In [None]:
%%bash
ls -lah /kaggle/working | sed -n '1,120p'
ls -lah /kaggle/working/train_artifacts 2>/dev/null | sed -n '1,200p' || true

echo "---- searching for best_model.pth ----"
find /kaggle -maxdepth 6 -name "best_model.pth" -print 2>/dev/null | sed -n '1,50p'

echo "---- searching for any .pth ----"
find /kaggle -maxdepth 6 -name "*.pth" -print 2>/dev/null | sed -n '1,50p'


In [28]:
# CELL: 13_PERSIST_PACKAGE - bundle model + metadata for publishing (safe to run multiple times)

from pathlib import Path
import json, hashlib, shutil, time

SRC = Path("/kaggle/working/train_artifacts")
assert SRC.exists(), f"Missing {SRC}. Did you run training in this session?"

# Required checkpoint
CKPT = SRC / "best_model.pth"
assert CKPT.exists(), f"Missing checkpoint: {CKPT}\nTip: check SRC contents with: !ls -lah /kaggle/working/train_artifacts"

# Create a clean publish folder
PUBLISH = Path("/kaggle/working/publish_mri_model_v1")
if PUBLISH.exists():
    shutil.rmtree(PUBLISH)
PUBLISH.mkdir(parents=True, exist_ok=True)

# Copy everything in train_artifacts (keeps your eval CSVs too)
for p in SRC.iterdir():
    if p.is_file():
        shutil.copy2(p, PUBLISH / p.name)

# Compute sha256 of checkpoint (so you can verify integrity later)
h = hashlib.sha256()
with open(CKPT, "rb") as f:
    for chunk in iter(lambda: f.read(1024 * 1024), b""):
        h.update(chunk)
ckpt_sha256 = h.hexdigest()

# Write lightweight metadata
meta = {
    "created_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    "checkpoint_file": CKPT.name,
    "checkpoint_sha256": ckpt_sha256,
    "artifacts_included": sorted([p.name for p in PUBLISH.iterdir() if p.is_file()]),
    "notes": "Baseline ResNet18 classifier. See train_history.json + challenge_* CSVs for metrics/audits.",
}
(PUBLISH / "MODEL_PACKAGE_METADATA.json").write_text(json.dumps(meta, indent=2))

# Optional: create a zip for easy download (dataset can store folder OR zip)
zip_path = shutil.make_archive(str(PUBLISH), "zip", root_dir=str(PUBLISH))

print("✅ publish folder:", PUBLISH)
print("✅ zip:", zip_path)
print("✅ checkpoint sha256:", ckpt_sha256)
print("Files in publish folder:")
print("\n".join([" - " + p.name for p in sorted(PUBLISH.iterdir()) if p.is_file()]))


✅ publish folder: /kaggle/working/publish_mri_model_v1
✅ zip: /kaggle/working/publish_mri_model_v1.zip
✅ checkpoint sha256: caa46dbe12301cb25a75225d82496952b14841f2166db75041a9eccc7b70c861
Files in publish folder:
 - BUNDLE_MANIFEST.json
 - MODEL_PACKAGE_METADATA.json
 - best_model.pth
 - calib_test_logits.pt
 - calib_val_logits.pt
 - calibration_metrics.json
 - calibration_summary_before.csv
 - calibration_summary_before_after.csv
 - challenge_conf_by_pred.csv
 - challenge_confidence_summary.csv
 - challenge_policy_outputs.csv
 - challenge_policy_outputs_demo.csv
 - challenge_policy_outputs_v2.csv
 - challenge_policy_outputs_v3.csv
 - challenge_pred_distribution.csv
 - challenge_predictions.csv
 - domain_guard_lr.npz
 - domain_guard_tau_sweep.csv
 - env_snapshot.json
 - final_policy_config.json
 - final_policy_report.csv
 - mri_resnet18_baseline_best.pth
 - muaz_policy_outputs.csv
 - muaz_policy_report.csv
 - policy_summary.csv
 - policy_summary_v2.csv
 - policy_summary_v3.csv
 - reli

In [None]:
# CHECKPOINT INVENTORY (run this now)
!echo "=== /kaggle/working ==="
!ls -lah /kaggle/working | sed -n '1,200p' || true

!echo "\n=== /kaggle/working/train_artifacts ==="
!ls -lah /kaggle/working/train_artifacts | sed -n '1,200p' || true

!echo "\n=== find checkpoints (*.pth, *.pt) ==="
!find /kaggle/working -maxdepth 4 -type f \( -name "*.pth" -o -name "*.pt" \) -print 2>/dev/null | sed -n '1,200p' || true

!echo "\n=== sizes ==="
!du -sh /kaggle/working/train_artifacts 2>/dev/null || true


In [None]:
from pathlib import Path
import shutil

src = Path("/kaggle/working/train_artifacts/best_model.pth")
dst = Path("/kaggle/working/train_artifacts/mri_resnet18_baseline_best.pth")

assert src.exists(), f"Missing: {src}"
shutil.copy2(src, dst)
print("✅ copied to:", dst)


In [None]:
# VERIFY: What datasets/domains are in each split (by parsing the path prefix)

import pandas as pd
import re

def dataset_key_from_path(p: str) -> str:
    # common Kaggle pattern you’ve seen: /kaggle/input/datasets/<owner>/<slug>/...
    m = re.search(r"/kaggle/input/datasets/([^/]+/[^/]+)/", str(p))
    if m:
        return m.group(1)  # owner/slug
    # fallback: /kaggle/input/<slug>/...
    m2 = re.search(r"/kaggle/input/([^/]+)/", str(p))
    return m2.group(1) if m2 else "unknown"

def summarize_split(df, name):
    df = df.copy()
    df["dataset_key"] = df["path"].map(dataset_key_from_path)
    print(f"\n=== {name} ===")
    print("rows:", len(df))
    if "label_name" in df.columns:
        print("\nlabel distribution:")
        display(df["label_name"].value_counts(dropna=False))
    print("\ndataset_key distribution:")
    display(df["dataset_key"].value_counts(dropna=False).head(20))
    print("\nexample paths:")
    display(df[["dataset_key","path"]].head(5))

# If df_train/df_val/df_test already exist, use them; otherwise read from split paths
if "df_train" in globals() and "df_val" in globals() and "df_test" in globals():
    summarize_split(df_train, "TRAIN")
    summarize_split(df_val,   "VAL")
    summarize_split(df_test,  "TEST")
else:
    # uses your earlier split_train/split_val/split_test paths
    df_train2 = pd.read_csv(split_train)
    df_val2   = pd.read_csv(split_val)
    df_test2  = pd.read_csv(split_test)
    summarize_split(df_train2, "TRAIN")
    summarize_split(df_val2,   "VAL")
    summarize_split(df_test2,  "TEST")


In [None]:
# DIAG: list all DataLoaders currently defined (name + dataset size + batch size)

import torch

loaders = []
for name, obj in list(globals().items()):
    if isinstance(obj, torch.utils.data.DataLoader):
        ds_len = None
        try:
            ds_len = len(obj.dataset)
        except Exception:
            pass
        loaders.append((name, ds_len, getattr(obj, "batch_size", None)))

loaders = sorted(loaders, key=lambda x: (x[1] is None, x[1] if x[1] is not None else 10**9))
print("Found DataLoaders:")
for name, ds_len, bs in loaders:
    print(f" - {name:25s} dataset_len={ds_len} batch_size={bs}")

# Heuristic: pick external MUAZ loader by size ~1311
cands = [x for x in loaders if x[1] == 1311]
print("\nCandidates with dataset_len == 1311:", cands)


In [None]:
import torch
import torch.nn as nn

ckpt = torch.load(OUT/"best_model.pth", map_location=DEVICE)
model.load_state_dict(ckpt["model_state_dict"])
model.eval()

# penultimate layer features for embeddings
feat_extractor = nn.Sequential(*list(model.children())[:-1]).to(DEVICE)
feat_extractor.eval()

print("✅ loaded checkpoint + built feat_extractor")
