Prove your splits are valid (no overlaps, no missing files, images load, and there aren’t obvious duplicates/leaks).

Bootstrap the notebook and locate the project root, to decide whether to use relative or absolute paths.

In [2]:
# Run the notebook bootstrap before importing anything from xai_lab.
# NOTE(review): presumably bootstrap() makes the xai_lab package importable
# (e.g. by adjusting sys.path) — confirm in _bootstrap.py.
from _bootstrap import bootstrap
bootstrap()

from xai_lab.utils.paths import find_project_root, resolve_path

# Project root as reported by find_project_root().
PROJECT_ROOT = find_project_root()


Import libraries, define the split and artifact paths, and check that the split CSVs exist.

In [3]:
from pathlib import Path
import pandas as pd
from PIL import Image
import json

# ---- Adjust these if your split filenames/paths differ ----
# Anchor both directories at PROJECT_ROOT instead of "../": a "../"-relative
# path only resolves correctly when the kernel's working directory happens to
# be notebooks/.
# NOTE(review): assumes find_project_root() returns the directory that
# contains data/ and artifacts/ — confirm.
_ROOT = Path(PROJECT_ROOT)  # works whether PROJECT_ROOT is a str or a Path

SPLITS_DIR = _ROOT / "data" / "processed" / "ckplus" / "splits"
TRAIN_CSV = SPLITS_DIR / "train.csv"
VAL_CSV   = SPLITS_DIR / "val.csv"
TEST_CSV  = SPLITS_DIR / "test.csv"

# Reports written by the cells below land here; create the folder up front.
ARTIFACTS_DIR = _ROOT / "artifacts" / "reports" / "eda"
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

# Expect "True True True"; a False means that split CSV was not found.
print(TRAIN_CSV.exists(), VAL_CSV.exists(), TEST_CSV.exists())

True True True


Load the splits and validate that each one has the required columns.

In [4]:
def load_split(path: Path) -> pd.DataFrame:
    """Read one split CSV into a DataFrame using pandas' default parsing.

    Args:
        path: Location of the split CSV file.

    Returns:
        The parsed table, returned as read.
    """
    return pd.read_csv(path)

# Load the three splits in train/val/test order.
train_df, val_df, test_df = (
    load_split(csv_path) for csv_path in (TRAIN_CSV, VAL_CSV, TEST_CSV)
)

display(train_df.head())

# Every split must provide at least these columns; the cells below index them.
required_cols = {"path", "label", "label_name"}
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    missing = required_cols.difference(split_df.columns)
    assert not missing, f"{split_name} is missing columns: {missing}"

print("Loaded:", len(train_df), len(val_df), len(test_df))

Unnamed: 0,path,label,label_name,width,height,mode,sha1
0,data/raw/ckplus/happy/S078_004_00000027.png,4,happy,48,48,L,b8cf9e957093b01e482005a1c47c514c72861e28
1,data/raw/ckplus/sadness/S080_005_00000011.png,5,sadness,48,48,L,54c7b1b5cf50c23cda4b4b6fe76fed4ffb639374
2,data/raw/ckplus/happy/S100_006_00000015.png,4,happy,48,48,L,5382e1c188577cb99e0d8c1c06f5551600ddb152
3,data/raw/ckplus/sadness/S138_007_00000009.png,5,sadness,48,48,L,9168e33c06eaf8bd31c87efb42c8e915604bfa3a
4,data/raw/ckplus/anger/S050_004_00000021.png,0,anger,48,48,L,ba8ea832e8c1feaf480996a441bafff3f172923b


Loaded: 686 147 148


Basic integrity: counts and obvious nulls. There should be no NaNs and no duplicate paths.

In [5]:
def basic_checks(name, df):
    """Print a short integrity summary for one split.

    Shows the null count of the two label columns, the number of distinct
    labels and label names, and the number of distinct paths next to the row
    count (a gap between those two reveals duplicated paths).

    Args:
        name: Split name shown in the printed header.
        df: Split table with "path", "label" and "label_name" columns.
    """
    print(f"\n== {name} ==")

    null_counts = df[["label", "label_name"]].isna().sum()
    print(null_counts)

    n_labels = df["label"].nunique()
    print("Unique labels:", n_labels)

    n_label_names = df["label_name"].nunique()
    print("Unique label_names:", n_label_names)

    n_paths = df["path"].nunique()
    n_rows = len(df)
    print("Unique paths:", n_paths, " / total rows:", n_rows)

# Print the same summary for each split, in train/val/test order.
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    basic_checks(split_name, split_df)


== train ==
label         0
label_name    0
dtype: int64
Unique labels: 7
Unique label_names: 7
Unique paths: 686  / total rows: 686

== val ==
label         0
label_name    0
dtype: int64
Unique labels: 7
Unique label_names: 7
Unique paths: 147  / total rows: 147

== test ==
label         0
label_name    0
dtype: int64
Unique labels: 7
Unique label_names: 7
Unique paths: 148  / total rows: 148


Confirm that every file listed in the splits exists on disk.

In [6]:
def find_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of *df* whose image file is not found on disk.

    Each value of the "path" column is passed through resolve_path() and the
    result is checked with ``.exists()``.

    Args:
        df: Split table with a "path" column.

    Returns:
        A copy of the rows whose resolved path does not exist, with an extra
        "resolved_path" column holding that path as a string.
    """
    resolved = df["path"].astype(str).map(resolve_path)
    is_absent = ~resolved.map(lambda candidate: candidate.exists())

    missing_df = df.loc[is_absent].copy()
    missing_df["resolved_path"] = resolved.loc[is_absent].astype(str)
    return missing_df

missing_train, missing_val, missing_test = (
    find_missing(split_df) for split_df in (train_df, val_df, test_df)
)
missing_by_split = {"train": missing_train, "val": missing_val, "test": missing_test}

print("Missing files:", len(missing_train), len(missing_val), len(missing_test))

# A CSV report is written only when at least one file is missing.
if sum(len(frame) for frame in missing_by_split.values()) > 0:
    issues = pd.concat(
        [frame.assign(split=split_name) for split_name, frame in missing_by_split.items()]
    )
    missing_report_path = ARTIFACTS_DIR / "missing_files.csv"
    issues.to_csv(missing_report_path, index=False)
    display(issues.head(20))
    print("Saved:", missing_report_path)

Missing files: 0 0 0


Leak detection (split overlap checks)

In [7]:
# Paths are compared as plain strings, exactly as stored in the split CSVs.
train_paths, val_paths, test_paths = (
    set(frame["path"].astype(str)) for frame in (train_df, val_df, test_df)
)

overlap_train_val = train_paths.intersection(val_paths)
overlap_train_test = train_paths.intersection(test_paths)
overlap_val_test = val_paths.intersection(test_paths)

print("Overlap train∩val :", len(overlap_train_val))
print("Overlap train∩test:", len(overlap_train_test))
print("Overlap val∩test  :", len(overlap_val_test))

# A JSON report is written only when some pair of splits shares a path.
if any((overlap_train_val, overlap_train_test, overlap_val_test)):
    overlap_report = {
        "train_val": sorted(overlap_train_val)[:50],
        "train_test": sorted(overlap_train_test)[:50],
        "val_test": sorted(overlap_val_test)[:50],
        "note": "Lists truncated to 50 each.",
    }
    overlaps_path = ARTIFACTS_DIR / "split_overlaps.json"
    with open(overlaps_path, "w", encoding="utf-8") as f:
        json.dump(overlap_report, f, indent=2)
    print("Saved:", overlaps_path)

Overlap train∩val : 0
Overlap train∩test: 0
Overlap val∩test  : 0


Check that every image can be opened.

In [8]:
from tqdm import tqdm

def open_image_ok(path_str: str) -> tuple[bool, str]:
    """Check that the image at *path_str* opens and fully decodes.

    Args:
        path_str: Image path as stored in a split CSV; it is passed through
            resolve_path() before opening.

    Returns:
        ``(True, "")`` when both checks pass, otherwise
        ``(False, "<ExceptionType>: <message>")``.
    """
    try:
        p = resolve_path(path_str)
        # Pass 1: verify() inspects the file without decoding the pixel data.
        with Image.open(p) as im:
            im.verify()
        # Pass 2: an image object cannot be reused after verify(), so reopen
        # the file and force a full decode. This catches truncated or corrupt
        # pixel data that verify() alone lets through.
        with Image.open(p) as im:
            im.load()
        return True, ""
    except Exception as e:
        # Deliberately broad: any failure is reported to the caller as data
        # rather than raised, so a scan can continue past bad files.
        return False, f"{type(e).__name__}: {e}"

def scan_openability(df: pd.DataFrame, max_items: int | None = None) -> pd.DataFrame:
    """Try to open every image listed in *df* and collect the failures.

    Args:
        df: Split table with a "path" column.
        max_items: If given, only the first ``max_items`` paths are scanned
            (``0`` scans nothing). ``None`` scans every path.

    Returns:
        A DataFrame with columns "path" and "error", one row per image that
        open_image_ok() rejected; it has zero rows when nothing failed.
    """
    items = df["path"].astype(str).tolist()
    # Compare against None explicitly: a plain truthiness test would treat
    # max_items=0 as "no limit" and scan every file.
    if max_items is not None:
        items = items[:max_items]

    errs = []
    for p in tqdm(items):
        ok, err = open_image_ok(p)
        if not ok:
            errs.append({"path": p, "error": err})
    # Fixed columns keep the schema identical whether or not errors were found.
    return pd.DataFrame(errs, columns=["path", "error"])

bad_train, bad_val, bad_test = (
    scan_openability(split_df) for split_df in (train_df, val_df, test_df)
)
bad_by_split = {"train": bad_train, "val": bad_val, "test": bad_test}

print("Unreadable images:", len(bad_train), len(bad_val), len(bad_test))

# A CSV report is written only when at least one image failed to open.
if sum(len(frame) for frame in bad_by_split.values()) > 0:
    bad = pd.concat(
        [frame.assign(split=split_name) for split_name, frame in bad_by_split.items()]
    )
    unreadable_report_path = ARTIFACTS_DIR / "unreadable_images.csv"
    bad.to_csv(unreadable_report_path, index=False)
    display(bad.head(20))
    print("Saved:", unreadable_report_path)

100%|██████████| 686/686 [00:00<00:00, 1314.52it/s]
100%|██████████| 147/147 [00:00<00:00, 1332.86it/s]
100%|██████████| 148/148 [00:00<00:00, 1360.74it/s]

Unreadable images: 0 0 0





Quick size/mode sanity check on a random sample: grayscale vs. color images, and variation in image size.

In [9]:
import random

def sample_image_stats(df: pd.DataFrame, n: int = 50, seed: int = 42):
    """Open a reproducible random sample of images and record mode and size.

    Args:
        df: Split table with a "path" column.
        n: Maximum number of images to sample; capped at the number of rows.
        seed: Seed for the sampler, so repeated calls pick the same images.

    Returns:
        A DataFrame with columns "path", "mode", "width" and "height", one
        row per sampled image (zero rows when *df* is empty).
    """
    paths = df["path"].astype(str).tolist()
    # A private Random instance draws the same sample that seeding the module
    # RNG would, but leaves the global random state untouched for other code.
    rng = random.Random(seed)
    sample = rng.sample(paths, k=min(n, len(paths)))

    rows = []
    for p in sample:
        with Image.open(Path(resolve_path(p))) as im:
            width, height = im.size
            rows.append({"path": p, "mode": im.mode, "width": width, "height": height})
    # Fixed columns so that an empty sample still has the expected schema.
    return pd.DataFrame(rows, columns=["path", "mode", "width", "height"])

# Inspect a 50-image sample of the training split only.
stats_train = sample_image_stats(train_df, n=50)

mode_counts = stats_train["mode"].value_counts()
display(mode_counts)

size_summary = stats_train[["width", "height"]].describe()
display(size_summary)

mode
L    50
Name: count, dtype: int64

Unnamed: 0,width,height
count,50.0,50.0
mean,48.0,48.0
std,0.0,0.0
min,48.0,48.0
25%,48.0,48.0
50%,48.0,48.0
75%,48.0,48.0
max,48.0,48.0


Write a summary report of all the checks run so far in this notebook.

In [10]:
# Row counts per split; the total is derived from these.
split_sizes = {"train": len(train_df), "val": len(val_df), "test": len(test_df)}

report = {
    "counts": {**split_sizes, "total": sum(split_sizes.values())},
    "missing_files": {
        "train": len(missing_train),
        "val": len(missing_val),
        "test": len(missing_test),
    },
    "overlaps": {
        "train_val": len(overlap_train_val),
        "train_test": len(overlap_train_test),
        "val_test": len(overlap_val_test),
    },
    "unreadable_images": {
        "train": len(bad_train),
        "val": len(bad_val),
        "test": len(bad_test),
    },
}

# Persist the summary next to the other EDA artifacts.
report_path = ARTIFACTS_DIR / "notebook00_integrity_report.json"
with report_path.open("w", encoding="utf-8") as f:
    json.dump(report, f, indent=2)

report


{'counts': {'train': 686, 'val': 147, 'test': 148, 'total': 981},
 'missing_files': {'train': 0, 'val': 0, 'test': 0},
 'overlaps': {'train_val': 0, 'train_test': 0, 'val_test': 0},
 'unreadable_images': {'train': 0, 'val': 0, 'test': 0}}