# 03 — Data Preparation

**Objective**  
Create a deterministic train/validation/test split and persist split manifests for reproducibility.

**Inputs**  
- Image dataset: `inputs/cherry_leaves_dataset/`  
  - Subfolders: `healthy/`, `powdery_mildew/`

**Outputs**  
- CSV manifests for each split under `inputs/manifests/v1/`:  
  - `train.csv`, `val.csv`, `test.csv`  
  Each file contains a `filepath` column (absolute path to the image) and a `label` column (the class folder name).

**Notes**  
Splits are stratified by class with a fixed random seed to ensure reproducibility.

In [5]:
from pathlib import Path
import sys

def find_project_root(start: Path, max_levels: int = 4) -> Path:
    """Locate the project root by searching upwards for a 'src' entry.

    Starting at ``start`` and moving to the parent directory on each step,
    at most ``max_levels`` directories are inspected (``start`` itself counts
    as the first one).

    Args:
        start: Directory where the search begins.
        max_levels: Maximum number of directories to inspect. The default of
            4 matches the previously hard-coded search depth.

    Returns:
        The first inspected directory that contains an entry named ``src``,
        or ``start`` unchanged when none of them does.
    """
    candidate = start
    for _ in range(max_levels):
        if (candidate / "src").exists():
            return candidate
        candidate = candidate.parent
    # Nothing found within the allowed depth: fall back to the starting point
    # rather than raising, so callers always receive a usable path.
    return start

# Resolve the project root from the notebook's working directory and put it at
# the front of sys.path (once) so the project-local `src` package is importable.
PROJECT_ROOT = find_project_root(Path.cwd())
if not any(entry == str(PROJECT_ROOT) for entry in sys.path):
    sys.path.insert(0, str(PROJECT_ROOT))

from src.paths import DATA_DIR, MANIFESTS_DIR

# Echo the resolved locations so a wrong working directory is spotted early.
for caption, location in (
    ("PROJECT_ROOT:", PROJECT_ROOT),
    ("DATA_DIR:", DATA_DIR),
    ("MANIFESTS_DIR:", MANIFESTS_DIR),
):
    print(caption, location)

PROJECT_ROOT: c:\Users\ksstr\Documents\Coding\milestone-project-5
DATA_DIR: C:\Users\ksstr\Documents\Coding\milestone-project-5\inputs\cherry_leaves_dataset
MANIFESTS_DIR: C:\Users\ksstr\Documents\Coding\milestone-project-5\inputs\manifests\v1


### Create deterministic stratified train/val/test splits
Build reproducible splits (70/15/15) and persist CSV manifests with absolute filepaths and labels.

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Configuration
# Make sure the manifest output directory exists before anything is written to it.
MANIFESTS_DIR.mkdir(parents=True, exist_ok=True)

# File suffixes accepted as images; each file's suffix is checked against this set.
ALLOWED = {".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG"}
# Class sub-folders of DATA_DIR; the folder name is also used as the label value.
CLASSES = ("healthy", "powdery_mildew")
# Random seed passed to both train_test_split calls below.
SEED = 42

# Collect files and labels
# Directory entries are visited in sorted order: `Path.iterdir()` yields
# children in arbitrary, filesystem-dependent order, and the seeded split
# below is only reproducible if the rows always arrive in the same order.
rows = []
for label in CLASSES:
    cls_dir = DATA_DIR / label
    for p in sorted(cls_dir.iterdir()):
        # Compare the suffix case-insensitively so mixed-case extensions
        # (e.g. ".Jpg") are not silently skipped.
        if p.is_file() and p.suffix.lower() in ALLOWED:
            # Store the resolved absolute path together with the class name.
            rows.append({"filepath": str(p.resolve()), "label": label})

df = pd.DataFrame(rows)
# Stop early with a clear message instead of failing later inside the split.
assert not df.empty, "No images found to split."
print("Total images:", len(df))
print(df["label"].value_counts(), "\n")

# Options shared by both cuts: same seed, shuffling enabled.
_split_opts = {"random_state": SEED, "shuffle": True}

# First cut: hold out 30% of the rows (stratified by label); 70% become train.
df_train, df_temp = train_test_split(
    df, test_size=0.30, stratify=df["label"], **_split_opts
)

# Second cut: halve the held-out rows into validation and test
# (stratified again), i.e. 15% / 15% of the full dataset.
df_val, df_test = train_test_split(
    df_temp, test_size=0.50, stratify=df_temp["label"], **_split_opts
)

# Persist manifests
# One CSV per split, written without the DataFrame index column.
out_train, out_val, out_test = (
    MANIFESTS_DIR / filename for filename in ("train.csv", "val.csv", "test.csv")
)

for frame, target in ((df_train, out_train), (df_val, out_val), (df_test, out_test)):
    frame.to_csv(target, index=False)

# Report the resolved locations only after all three files have been written.
for target in (out_train, out_val, out_test):
    print("Saved:", target.resolve())

Total images: 4208
label
healthy           2104
powdery_mildew    2104
Name: count, dtype: int64 

Saved: C:\Users\ksstr\Documents\Coding\milestone-project-5\inputs\manifests\v1\train.csv
Saved: C:\Users\ksstr\Documents\Coding\milestone-project-5\inputs\manifests\v1\val.csv
Saved: C:\Users\ksstr\Documents\Coding\milestone-project-5\inputs\manifests\v1\test.csv


### Verify split sizes and class balance
Quick validation to confirm that the train/val/test splits follow the 70/15/15 ratio and maintain the same class distribution across all subsets.

In [7]:
# Print the row count and the rounded class proportions of every split.
splits = {"train": df_train, "val": df_val, "test": df_test}
for name, df_split in splits.items():
    n_rows = len(df_split)
    class_share = df_split["label"].value_counts(normalize=True).round(3)
    print(f"{name:>5} size:", n_rows)
    print(class_share, "\n")

train size: 2945
label
powdery_mildew    0.5
healthy           0.5
Name: proportion, dtype: float64 

  val size: 631
label
healthy           0.501
powdery_mildew    0.499
Name: proportion, dtype: float64 

 test size: 632
label
powdery_mildew    0.5
healthy           0.5
Name: proportion, dtype: float64 



In [8]:
from src.data_management import load_manifest, batch_iterator, ImageSpec

# Smoke-test the loading pipeline: read the train manifest and pull a single
# batch to inspect array shapes, the pixel value range and the first labels.
# The manifest is addressed through MANIFESTS_DIR rather than a path relative
# to the current working directory, so the cell does not depend on where the
# notebook was launched from.
train_df = load_manifest(MANIFESTS_DIR / "train.csv")
spec = ImageSpec(width=100, height=100)

# Reuse the notebook-wide SEED instead of repeating the literal value.
it = batch_iterator(train_df, batch_size=16, spec=spec, shuffle=True, seed=SEED)
X, y = next(it)
# Last expression of the cell: displayed by the notebook as the cell output.
X.shape, y.shape, float(X.min()), float(X.max()), y[:5]

((16, 100, 100, 3),
 (16,),
 0.0,
 0.9882352948188782,
 array([1, 1, 0, 1, 0], dtype=int64))