# 03 — Data Preparation

**Objective**  
Create a deterministic train/validation/test split and persist split manifests for reproducibility.

**Inputs**  
- Image dataset: `inputs/cherry_leaves_dataset/`  
  - Subfolders: `healthy/`, `powdery_mildew/`

**Outputs**  
- CSV manifests for each split under `inputs/manifests/v1/`:  
  - `train.csv`, `val.csv`, `test.csv`  
  Each file contains two columns: `filepath` (absolute path to the image) and `label` (name of the class subfolder).

**Notes**  
Splits are stratified by class with a fixed random seed to ensure reproducibility.

In [1]:
from pathlib import Path
import os

# The notebook may be started from the project root or from a folder one level
# below it. Whichever directory holds "inputs" is taken as the root; otherwise
# fall back to the parent of the current directory.
nb_cwd = Path.cwd()
if (nb_cwd / "inputs").exists():
    project_root = nb_cwd
else:
    project_root = nb_cwd.parent
os.chdir(project_root)

# Relative locations, resolved against the working directory set above.
CLASSES = ("healthy", "powdery_mildew")
DATA_DIR = Path("inputs") / "cherry_leaves_dataset"
MANIFESTS_DIR = Path("inputs", "manifests", "v1")
MANIFESTS_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong working directory is spotted early.
print("CWD:", Path.cwd())
print("DATA_DIR:", DATA_DIR.resolve())
print("MANIFESTS_DIR:", MANIFESTS_DIR.resolve())

# One line per expected class folder: True when the folder is present.
for class_name in CLASSES:
    folder_found = DATA_DIR.joinpath(class_name).exists()
    print(f"{class_name:>16} ->", folder_found)

CWD: c:\Users\ksstr\Documents\Coding\milestone-project-5
DATA_DIR: C:\Users\ksstr\Documents\Coding\milestone-project-5\inputs\cherry_leaves_dataset
MANIFESTS_DIR: C:\Users\ksstr\Documents\Coding\milestone-project-5\inputs\manifests\v1
         healthy -> True
  powdery_mildew -> True


### Create deterministic stratified train/val/test splits
Build reproducible splits (70/15/15) and persist CSV manifests with absolute filepaths and labels.

In [None]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split

# Configuration
DATA_DIR = Path("inputs/cherry_leaves_dataset")
MANIFESTS_DIR = Path("inputs") / "manifests" / "v1"
MANIFESTS_DIR.mkdir(parents=True, exist_ok=True)

# Accepted image extensions. The suffix is lowercased before the membership
# test below, so mixed-case names such as "leaf.Jpg" are accepted as well.
ALLOWED = {".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG"}
CLASSES = ("healthy", "powdery_mildew")
SEED = 42  # shared by both train_test_split calls

# Collect files and labels.
# Path.iterdir() yields entries in arbitrary (filesystem-dependent) order and
# train_test_split permutes rows by position, so the listing is sorted by file
# name: the resulting split then depends only on the dataset contents and SEED.
rows = []
for label in CLASSES:
    cls_dir = DATA_DIR / label
    for p in sorted(cls_dir.iterdir(), key=lambda entry: entry.name):
        if p.is_file() and p.suffix.lower() in ALLOWED:
            rows.append({"filepath": str(p.resolve()), "label": label})

df = pd.DataFrame(rows)
# Explicit raise instead of assert: asserts are removed when Python runs with -O.
if df.empty:
    raise ValueError("No images found to split.")
print("Total images:", len(df))
print(df["label"].value_counts(), "\n")

# Stratified train/temp split (70/30)
df_train, df_temp = train_test_split(
    df,
    test_size=0.30,
    random_state=SEED,
    shuffle=True,
    stratify=df["label"],
)

# Stratified val/test split from temp (15/15 overall):
# half of the 30% hold-out goes to validation, the other half to test.
df_val, df_test = train_test_split(
    df_temp,
    test_size=0.50,
    random_state=SEED,
    shuffle=True,
    stratify=df_temp["label"],
)

# Persist manifests (one CSV per split, columns: filepath, label)
out_train = MANIFESTS_DIR / "train.csv"
out_val = MANIFESTS_DIR / "val.csv"
out_test = MANIFESTS_DIR / "test.csv"

# index=False keeps the shuffled DataFrame index out of the CSV files.
df_train.to_csv(out_train, index=False)
df_val.to_csv(out_val, index=False)
df_test.to_csv(out_test, index=False)

print("Saved:", out_train.resolve())
print("Saved:", out_val.resolve())
print("Saved:", out_test.resolve())

### Verify split sizes and class balance
Quick validation to confirm that the train/val/test splits follow the 70/15/15 ratio and maintain the same class distribution across all subsets.

In [None]:
# For every split, print its row count and then the share of each class
# (fractions rounded to three decimals).
split_frames = {"train": df_train, "val": df_val, "test": df_test}
for split_name, frame in split_frames.items():
    print(f"{split_name:>5} size:", len(frame))
    class_share = frame["label"].value_counts(normalize=True).round(3)
    print(class_share, "\n")