
# ðŸ“¦ `setup_datasets.ipynb` â€” clean runner
This notebook **only calls helper functions** from your `util_dataset_helper_functions.py` so the notebook stays clean.
It expects the following helpers (in your module) at minimum:

- `read_datasets_json() -> (path, dict)`
- `list_datasets(cfg: dict) -> list[str]`
- `train_dataset_select(cfg: dict, name: str) -> dict`
- `util_download_datasets(entry: dict) -> pathlib.Path`
- (Optional) `prepare_dataset(entry: dict, raw_root: Path, prep_root: Path) -> dict`
- (Or) for Food-101: `is_food101_root(path: Path) -> bool` and `prepare_food101_for_yolo_cls(...)`

> Tip: If the import fails, adjust the **IMPORT PATHS** block below to match where your utils live (e.g. `./utilities/`).


In [7]:

# --- IMPORT PATHS (edit if needed) ---
import sys
from pathlib import Path

CWD = Path.cwd()
CANDIDATE_DIRS = [
    CWD,
    CWD / "utilities",
    CWD / "src",
]

for p in CANDIDATE_DIRS:
    sys.path.insert(0, str(p))

# --- Import your utilities module ---
try:
    import util_dataset_helper_functions as uds
except Exception as e_root:
    try:
        from utilities import util_dataset_helper_functions as uds
    except Exception as e_sub:
        raise ImportError(
            "Could not import 'util_dataset_helper_functions'. "
            "Place it in project root or in ./utilities/ and re-run.\n"
            f"Errors:\n - root import: {e_root}\n - utilities import: {e_sub}"
        )

# Sanity-print available attributes (helps verify versions)
print("Loaded util module:", uds.__file__)
print("Available helpers:", [n for n in dir(uds) if not n.startswith('_')][:25], "...")


Loaded util module: /home/kristoffel/utilities/util_dataset_helper_functions.py
Available helpers: ['Any', 'DATASETS_DIR', 'DATASETS_JSON_CANDIDATES', 'Dict', 'HOME', 'Iterable', 'List', 'Optional', 'PREP_DIR', 'Path', 'Set', 'Tuple', 'annotations', 'detect_yolo_detection_layout', 'ensure_dir', 'is_food101_root', 'json', 'kagglehub', 'kagglehub_download', 'list_datasets', 'mirror_to_target', 'os', 'prepare_dataset', 'prepare_food101_for_yolo_cls', 'prepare_yolo_detection_passthrough'] ...


In [8]:

import os
from pathlib import Path

# Dataset you want to prepare (must exist in datasets.json)
SELECTED_DATASET = "Food-101"     # <-- change here if needed
YOLO_READY_SUFFIX = "-yolo"       # output suffix
DATASETS_DIR = Path(os.environ.get("IKT524_DATASETS_DIR", Path.home() / "datasets")).expanduser()

print("DATASETS_DIR:", DATASETS_DIR)
print("SELECTED_DATASET:", SELECTED_DATASET)


DATASETS_DIR: /home/kristoffel/datasets
SELECTED_DATASET: Food-101


In [9]:

# Expect read_datasets_json and list_datasets in your utils
assert hasattr(uds, "read_datasets_json"), "utils missing: read_datasets_json()"
assert hasattr(uds, "list_datasets"), "utils missing: list_datasets(cfg)"
assert hasattr(uds, "train_dataset_select"), "utils missing: train_dataset_select(cfg, name)"

cfg_path, cfg = uds.read_datasets_json()
print("Loaded datasets.json from:", cfg_path)
print("Available datasets:", uds.list_datasets(cfg))

entry = uds.train_dataset_select(cfg, SELECTED_DATASET)
print("Selected entry:\n", entry)


Loaded datasets.json from: /home/kristoffel/datasets.json
Available datasets: ['Nutrition5K', 'Food101', 'Food11', 'FooDD', 'iFood 2019 FGVC6', 'ISIA Food-500', 'Large-scale Food Recognition', 'Food-Ingredient-Dataset-51', 'UECFoodPix & UECFoodPixComplete']


KeyError: "Dataset named 'Food-101' not found in datasets.json"

In [6]:

assert hasattr(uds, "util_download_datasets"), "utils missing: util_download_datasets(entry)"
raw_root = uds.util_download_datasets(entry)
print("Raw dataset root:", raw_root)

prep_root = DATASETS_DIR / f"{SELECTED_DATASET}{YOLO_READY_SUFFIX}"
print("Prepared output folder will be:", prep_root)


NameError: name 'entry' is not defined

In [None]:

from pathlib import Path

summary = None

# Preferred generic API if your utils provide it
if hasattr(uds, "prepare_dataset"):
    summary = uds.prepare_dataset(entry, Path(raw_root), Path(prep_root))

else:
    # Food-101 specific fallback (kept minimal)
    fmt = entry.get("format", "").lower()
    if fmt == "food101" or (hasattr(uds, "is_food101_root") and uds.is_food101_root(Path(raw_root))):
        assert hasattr(uds, "prepare_food101_for_yolo_cls"), \
            "utils missing: prepare_food101_for_yolo_cls(food101_root, out_root)"
        summary = uds.prepare_food101_for_yolo_cls(Path(raw_root), Path(prep_root))
    else:
        raise NotImplementedError(
            "No generic prepare_dataset() provided and dataset format is not Food-101.\n"
            "Add a handler to utils (e.g., prepare_<format>...) and expose it via prepare_dataset()."
        )

print("\\n=== SUMMARY ===")
if isinstance(summary, dict):
    for k, v in summary.items():
        if isinstance(v, list):
            print(f"{k}: {len(v)} items")
        else:
            print(f"{k}: {v}")
else:
    print(summary)



## âœ… Next steps
- Train (Ultralytics classification example):
  ```bash
  yolo task=classify mode=train model=yolo11n-cls.pt data="<path to prepared folder or YAML>"
  ```

- Switch `SELECTED_DATASET` at the top to prepare a different dataset defined in `datasets.json`.

- If you add new dataset formats, expose a single entry point `prepare_dataset(entry, raw_root, prep_root)` in your utils.  
  This notebook will pick it up automatically and stay clean.
