Notebook header & imports

In [57]:
# 01_data_labeling.ipynb
# Goal:
# - Scan the PlantVillage dataset
# - Build a unified label_mapping.json
# - Build dataset_index.json for all PV images (with splits)
# - Prepare structure to later add field images

from pathlib import Path
import json
from collections import defaultdict
import random

# Optional: scikit-learn's train_test_split is used for the seeded, shuffled per-class splits (recommended)
try:
    from sklearn.model_selection import train_test_split
    SKLEARN_AVAILABLE = True
except ImportError:
    SKLEARN_AVAILABLE = False
    print("sklearn not found, will use simple splitting.")


Paths & basic config

In [58]:
# Seed Python's global RNG once, up front, so reruns are reproducible
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# PlantVillage root, relative to the notebook's working directory.
# Expected layout (one sub-folder per class):
# data/Plant_leave_diseases_dataset_without_augmentation/
#    ├── Apple___Apple_scab/
#    ├── Apple___Black_rot/
#    └── ...
PV_ROOT = Path("data") / "Plant_leave_diseases_dataset_without_augmentation"

# Where the generated JSON metadata is written; created if missing
OUTPUT_DIR = Path("./metadata")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


Scan folders and build initial class list

In [59]:
# Every immediate sub-directory of PV_ROOT is treated as one class folder.
# The list is sorted so the folder order is stable across runs.
class_folders = [entry for entry in PV_ROOT.iterdir() if entry.is_dir()]
class_folders.sort()
print(f"Found {len(class_folders)} class folders.")

# Preview the first few folder names
for preview in class_folders[:5]:
    print("-", preview.name)


Found 39 class folders.
- Apple___Apple_scab
- Apple___Black_rot
- Apple___Cedar_apple_rust
- Apple___healthy
- Background_without_leaves


Build label_mapping structure

In [60]:
# Build the initial label mapping:
# - id: numeric class id
# - canonical_label: a cleaner name derived from folder name
# - pv_folders: list of folders that map to this label (for now 1:1)
# - pv_count: number of PV images in this class
# - field_count: 0 (we'll update later when adding field data)

def make_canonical_label(folder_name: str) -> str:
    """
    Derive a lowercase, underscore-separated label from a dataset folder name.

    A name of the form '<crop>___<disease>' (exactly one triple underscore)
    becomes '<crop>_<disease>'; any other name is used unchanged as the
    starting point. Parentheses are then removed, spaces and hyphens are
    turned into underscores, and the result is lowercased. Other characters
    (e.g. commas) are left untouched.

    Examples:
        'Apple___Apple_scab' -> 'apple_apple_scab'
        'Cherry_(including_sour)___Powdery_mildew' -> 'cherry_including_sour_powdery_mildew'
        'Background_without_leaves' -> 'background_without_leaves'
    """
    pieces = folder_name.split("___")
    # Only the exact crop/disease shape is re-joined; anything else passes through as-is
    joined = "_".join(pieces) if len(pieces) == 2 else folder_name

    # Single-pass character cleanup: drop parentheses, map space/hyphen to underscore
    cleanup = str.maketrans({"(": None, ")": None, " ": "_", "-": "_"})
    return joined.translate(cleanup).lower()

# Images are selected by lower-cased file suffix instead of glob patterns:
# Path.glob("*.jpg") matches case-sensitively on POSIX systems, so files such as
# "image.JPG" would be silently skipped there and pv_count would be too low.
IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png"}

classes = []
id_by_folder = {}

# Class ids follow the (sorted) order of class_folders
for idx, folder in enumerate(class_folders):
    folder_name = folder.name
    canonical_label = make_canonical_label(folder_name)

    # Count the image files directly inside this class folder (non-recursive)
    img_paths = [
        p for p in folder.iterdir()
        if p.is_file() and p.suffix.lower() in IMAGE_SUFFIXES
    ]

    cls_entry = {
        "id": idx,
        "canonical_label": canonical_label,
        "pv_folders": [folder_name],
        "pv_count": len(img_paths),
        "field_count": 0
    }

    classes.append(cls_entry)
    id_by_folder[folder_name] = idx

# Two folders collapsing onto the same canonical label would make the mapping ambiguous
_labels = [entry["canonical_label"] for entry in classes]
assert len(set(_labels)) == len(_labels), "canonical labels must be unique"

print(f"Total classes: {len(classes)}")
classes[:3]


Total classes: 39


[{'id': 0,
  'canonical_label': 'apple_apple_scab',
  'pv_folders': ['Apple___Apple_scab'],
  'pv_count': 630,
  'field_count': 0},
 {'id': 1,
  'canonical_label': 'apple_black_rot',
  'pv_folders': ['Apple___Black_rot'],
  'pv_count': 621,
  'field_count': 0},
 {'id': 2,
  'canonical_label': 'apple_cedar_apple_rust',
  'pv_folders': ['Apple___Cedar_apple_rust'],
  'pv_count': 275,
  'field_count': 0}]

Save label_mapping.json

In [61]:
# Top-level document: the per-class entries plus a small metadata header
label_mapping = dict(
    classes=classes,
    meta=dict(
        source="PlantVillage",
        description="Unified label mapping for PlantVillage + future field datasets",
        version=1,
    ),
)

# Serialize with 2-space indentation into the metadata directory
label_mapping_path = OUTPUT_DIR / "label_mapping.json"
with label_mapping_path.open("w") as out_file:
    out_file.write(json.dumps(label_mapping, indent=2))

print("Saved label_mapping.json to:", label_mapping_path)


Saved label_mapping.json to: metadata\label_mapping.json


Build dataset_index for PV (paths + split)

In [62]:
# dataset_index is a list of records like:
# {
#   "path": "data/Plant_leave_diseases_dataset_without_augmentation/Apple___Apple_scab/image_001.png",
#   "class_id": 0,
#   "domain": "pv",
#   "split": "train"
# }

# Images are selected by lower-cased file suffix instead of glob patterns:
# Path.glob("*.jpg") matches case-sensitively on POSIX systems, so files such as
# "image.JPG" would be silently skipped there.
IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png"}

train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, "split ratios must sum to 1"


def _split_train_val_test(paths, seed=RANDOM_SEED):
    """
    Split one class's image paths into (train, val, test) lists.

    The paths are sorted first so the result depends only on the file names and
    the seed. With scikit-learn available and at least 3 images, two seeded
    `train_test_split` calls are used (train vs. hold-out, then val vs. test).
    Otherwise the paths are shuffled with a seeded `random.Random` and sliced by
    ratio, so the fallback is randomized as well instead of taking the first
    files in name order.
    """
    paths = sorted(paths)

    if SKLEARN_AVAILABLE and len(paths) >= 3:
        train, holdout = train_test_split(
            paths,
            train_size=train_ratio,
            random_state=seed,
            shuffle=True
        )
        if len(holdout) < 2:
            # A single held-out image cannot be split again (train_test_split
            # would raise); keep it as test, matching the ratio-slicing fallback.
            return train, [], holdout
        val, test = train_test_split(
            holdout,
            train_size=val_ratio / (val_ratio + test_ratio),
            random_state=seed,
            shuffle=True
        )
        return train, val, test

    # Fallback: seeded shuffle (local RNG, global random state untouched), then slice
    shuffled = list(paths)
    random.Random(seed).shuffle(shuffled)
    n_train = int(len(shuffled) * train_ratio)
    n_val = int(len(shuffled) * val_ratio)
    return (
        shuffled[:n_train],
        shuffled[n_train:n_train + n_val],
        shuffled[n_train + n_val:],
    )


def _make_records(paths, class_id, split_name):
    """Build one dataset_index record per path; paths are stored with forward slashes."""
    return [
        {
            "path": p.as_posix(),  # portable across OSes; use p.relative_to(PV_ROOT.parent).as_posix() for paths relative to the data root
            "class_id": class_id,
            "domain": "pv",
            "split": split_name
        }
        for p in paths
    ]


# Gather all image paths per class
per_class_images = defaultdict(list)

for folder in class_folders:
    class_id = id_by_folder[folder.name]
    per_class_images[class_id].extend(
        p for p in folder.iterdir()
        if p.is_file() and p.suffix.lower() in IMAGE_SUFFIXES
    )

# Build dataset_index with train/val/test splits per class
dataset_index = []

for class_id, img_list in per_class_images.items():
    img_train, img_val, img_test = _split_train_val_test(img_list)

    dataset_index.extend(_make_records(img_train, class_id, "train"))
    dataset_index.extend(_make_records(img_val, class_id, "val"))
    dataset_index.extend(_make_records(img_test, class_id, "test"))

# Every discovered image must land in exactly one split
assert len(dataset_index) == sum(len(v) for v in per_class_images.values())

len(dataset_index)


55448

Save dataset_index.json

In [63]:
# Serialize the full index with 2-space indentation into the metadata directory
dataset_index_path = OUTPUT_DIR / "dataset_index.json"
with dataset_index_path.open("w") as out_file:
    out_file.write(json.dumps(dataset_index, indent=2))

print("Saved dataset_index.json to:", dataset_index_path)


Saved dataset_index.json to: metadata\dataset_index.json


Helper functions for adding field images later