
# Nutrition5k √ó YOLO11 ‚Äî Classification Training (v3, Excel + aggregation support)

This notebook is a **drop‚Äëin** trainer for Nutrition5k that:
1) **Auto‚Äëdiscovers** CSV/Parquet/Feather/Pickle/**Excel** tables anywhere under your dataset root.  
2) **Loads pickles safely** even if they were created with NumPy‚â•2 (`numpy._core`).  
3) Uses dish‚Äëlevel totals from **`dishes.xlsx`** when present; otherwise, **aggregates** ingredient‚Äëlevel macros from `dish_ingredients.xlsx`.  
4) **Materializes JPEGs** from `dish_images.pkl` if images are stored as bytes.  
5) **Splits by dish**, trains **`yolo11n‚Äëcls.pt`**, and evaluates on **val** and **test**.


In [None]:

# --- 1) Configuration ----------------------------------------------------------
DATASET_ROOT = "/home/kristoffel/datasets/dataset-01-Nutrition5k"  # ‚Üê edit if needed
MODEL_DIR    = "/home/kristoffel/models"

# Labeling: 'primary_macro' (carb/protein/fat) or 'calorie_bin' (5 quantiles)
LABEL_STRATEGY = "primary_macro"   # ‚Üê edit me

# Split ratios (per-dish)
VAL_RATIO  = 0.10
TEST_RATIO = 0.10
SEED = 42

# Training hyperparameters
EPOCHS = 30
IMGSZ  = 224
BATCH  = 64
RUN_NAME_BASE = "nutrition5k_yolo11n_cls"

# Where to put materialized JPEGs if decoding from a table is needed
RGB_DIR = f"{DATASET_ROOT}/images_rgb"

import os, random
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
random.seed(SEED)

for p in [DATASET_ROOT, MODEL_DIR]:
    assert os.path.isdir(p), f"Missing directory: {p}"
print("Paths OK.")


In [None]:

# --- 2) Environment (optional installs) ---------------------------------------
# If you need dependencies for Excel/Parquet, uncomment:
# !pip install -U ultralytics pyarrow openpyxl pandas numpy pillow

import torch
from ultralytics import YOLO
import ultralytics
print("Ultralytics:", ultralytics.__version__)
print("PyTorch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))


In [None]:

# --- 3) Discovery: find nutrition & image tables, handle Excel & pickles ------
from pathlib import Path
import os, pickle, pandas as pd

# Custom unpickler: map 'numpy._core' -> 'numpy.core' (incl. nested paths)
class _RemapUnpickler(pickle.Unpickler):
    def find_class(self, module, name):
        if module == "numpy._core" or module.startswith("numpy._core."):
            module = module.replace("numpy._core", "numpy.core", 1)
        return super().find_class(module, name)

def safe_pickle_columns(p: Path):
    try:
        with open(p, "rb") as f:
            obj = _RemapUnpickler(f).load()
        if hasattr(obj, "columns"):
            return list(obj.columns)
        return []
    except Exception:
        return []

def sniff_table_columns(p: Path, max_rows=5):
    low = p.name.lower()
    if low.endswith(".parquet"):
        try:
            import pyarrow.parquet as pq
            return [str(n) for n in pq.read_schema(p).names]
        except Exception:
            return []
    if low.endswith(".feather"):
        try:
            import pyarrow.feather as ft
            with ft.FeatherReader(str(p)) as reader:
                return [str(reader.get_column_name(i)) for i in range(reader.num_columns)]
        except Exception:
            return []
    if low.endswith(".csv"):
        try:
            return list(pd.read_csv(p, nrows=max_rows).columns)
        except Exception:
            return []
    if low.endswith((".pkl",".pickle")):
        return safe_pickle_columns(p)
    if low.endswith((".xlsx",".xls")):
        try:
            return list(pd.read_excel(p, nrows=1).columns)
        except Exception:
            return []
    return []

def list_candidate_files(root: str):
    exts = ("*.csv","*.parquet","*.feather","*.pkl","*.pickle","*.xlsx","*.xls")
    files = []
    for ext in exts:
        files.extend(Path(root).rglob(ext))
    return sorted(set(files))

def norm_cols(cols):
    out = {}
    for c in cols:
        k = str(c).lower().strip()
        for ch in [" ", "(", ")", "-", "_", "/"]:
            k = k.replace(ch, "")
        out[k] = c
    return out

# Keys to detect nutrition or image tables
MACRO_KEYS = {
    # calories / energy
    "calories","kcal","energykcal","energy","totalcalories","caloriestotal","calorie","totalkcal",
    # protein
    "protein","proteing","proteingram","proteingrams","totalprotein","proteintotal",
    # fat
    "fat","fatg","fatgram","fatgrams","totalfat","fattotal","lipid","lipids",
    # carbs
    "carbohydrate","carbohydrates","carbs","carb","carbsg","carbgram","carbgrams",
    "totalcarb","carbohydratetotal","carbstotal"
}
IMG_KEYS = {"rgb","rgbbytes","image","imagebytes","rgbimage","rgbimagebytes","depth","depthbytes","depthimage"}

def discover_tables_robust(root: str):
    nutr, img, ingr = None, None, None
    cands = list_candidate_files(root)
    print(f"Found {len(cands)} candidate tables under {root}. Scanning schemas...")
    for p in cands:
        cols = sniff_table_columns(p)
        if not cols:
            continue
        nmap = norm_cols(cols)
        keys = set(nmap.keys())
        # ingredient file hint
        if ingr is None and any(k.startswith("ingr") for k in keys):
            ingr = str(p)
        # dish-level nutrition present?
        if nutr is None and any(k in keys for k in MACRO_KEYS):
            nutr = str(p)
        # image table present?
        if img is None and any(k in keys for k in IMG_KEYS):
            img = str(p)
        if nutr and img and ingr:
            break
    return nutr, img, ingr

# JPEG root discovery
JPEG_ROOT = None
for cand in [f"{DATASET_ROOT}/images", f"{DATASET_ROOT}/images_rgb", DATASET_ROOT]:
    if os.path.isdir(cand) and any(Path(cand).rglob("*.jpg")):
        JPEG_ROOT = cand
        break

nutr_file, img_table, ingr_table = discover_tables_robust(DATASET_ROOT)

# Fallbacks
if nutr_file is None and img_table is not None:
    nutr_file = img_table
    print("[patch] Using image table as nutrition table:", nutr_file)

# Manual filename fallbacks commonly seen in mirrors
for guess in ["dishes.xlsx","dishes.csv","dish_images.pkl","dish_ingredients.xlsx"]:
    g = Path(DATASET_ROOT) / guess
    if nutr_file is None and g.exists():
        nutr_file = str(g); print("[manual nutr] Using:", g)
    if img_table is None and g.exists() and g.suffix in [".pkl",".pickle"]:
        img_table  = str(g); print("[manual imgs] Using:", g)
    if ingr_table is None and "ingredient" in guess and g.exists():
        ingr_table = str(g); print("[manual ingr] Using:", g)

print("nutrition file:", nutr_file)
print("image table   :", img_table)
print("ingredients   :", ingr_table)
print("jpeg root     :", JPEG_ROOT)

assert nutr_file or img_table, "No usable table (nutrition or images) found under dataset root."


In [None]:
%pip install -U openpyxl

In [5]:
# --- 4) Load nutrition (dish-level) with fallbacks, then derive labels --------
import pandas as pd
import numpy as np
import pickle

# Reuse unpickler
class _RemapUnpickler(pickle.Unpickler):
    def find_class(self, module, name):
        if module == "numpy._core" or module.startswith("numpy._core."):
            module = module.replace("numpy._core", "numpy.core", 1)
        return super().find_class(module, name)

def read_pickle_compat(path: str) -> pd.DataFrame:
    with open(path, "rb") as f:
        obj = _RemapUnpickler(f).load()
    if isinstance(obj, pd.DataFrame):
        return obj
    try:
        return pd.DataFrame(obj)
    except Exception as e:
        raise TypeError(f"Unsupported pickle payload type: {type(obj)} from {path}") from e

def read_any_table(path: str, usecols=None) -> pd.DataFrame:
    low = path.lower()
    if low.endswith(".csv"):
        return pd.read_csv(path, usecols=usecols)
    if low.endswith(".parquet"):
        try:
            import pyarrow.parquet as pq
            if usecols is None:
                return pd.read_parquet(path)
            table = pq.read_table(path, columns=usecols)
            return table.to_pandas()
        except Exception:
            return pd.read_parquet(path, columns=usecols)
    if low.endswith(".feather"):
        try:
            import pyarrow.feather as ft
            tbl = ft.read_table(path)
            if usecols:
                tbl = tbl.select(usecols)
            return tbl.to_pandas()
        except Exception:
            return pd.read_feather(path, columns=usecols)
    if low.endswith((".pkl",".pickle")):
        df = read_pickle_compat(path)
        if usecols is not None:
            keep = [c for c in usecols if c in df.columns]
            df = df[keep]
        return df
    if low.endswith((".xlsx",".xls")):
        try:
            return pd.read_excel(path, usecols=usecols)
        except Exception:
            # try engine fallback
            return pd.read_excel(path, usecols=usecols, engine="openpyxl")
    raise ValueError(f"Unsupported table format: {path}")

def _norm(s: str) -> str:
    s = str(s).lower().strip()
    for ch in [" ", "(", ")", "-", "_", "/"]:
        s = s.replace(ch, "")
    return s

def pick_from(df_cols, *alts):
    nmap = {_norm(c): c for c in df_cols}
    for a in alts:
        if _norm(a) in nmap:
            return nmap[_norm(a)]
    return None

# Load a small sample first to detect columns
probe = read_any_table(nutr_file)
cols = list(probe.columns)
print("Detected columns in nutr_file (first 25):", cols[:25])

# ‚îÄ‚îÄ Try dish id and totals (FIXED: add cal_col; unify carb_col) ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
dish_col = pick_from(cols, "dish_id","plate_id","sample_id","id","dish","plateid","dishid")
cal_col  = pick_from(cols, "calories","kcal","energy_kcal","energykcal","energy",
                     "total_calories","calories_total","total_kcal","calorie","totalcalories")  # ‚Üê added back
pro_col  = pick_from(cols, "protein","protein_g","proteing","total_protein","protein_total","totalprotein")
fat_col  = pick_from(cols, "fat","fat_g","fatg","total_fat","fat_total","totalfat")
carb_col = pick_from(cols, "carb","carbohydrate","carbohydrates","carbs","carb_g",
                     "carbohydrates_g","carbohydrateg","total_carb","carbs_total","totalcarb")  # ‚Üê single line

print("Dish/Nutrition columns ->",
      "dish:", dish_col, "| kcal:", cal_col, "| protein:", pro_col, "| fat:", fat_col, "| carbs:", carb_col)

# If no totals present, but we have an ingredient table, aggregate it
if (cal_col is None or pro_col is None or fat_col is None or carb_col is None) and 'ingr_table' in globals() and ingr_table:
    print("[aggregate] Dish totals not found in nutr_file; attempting aggregation from ingredient table:", ingr_table)
    df_ing = read_any_table(ingr_table)
    icolumns = list(df_ing.columns)
    print("Ingredient columns (first 25):", icolumns[:25])

    di_col = pick_from(icolumns, "dish_id","plate_id","sample_id","id","dish","plateid","dishid")
    ical   = pick_from(icolumns, "calories","kcal","energy_kcal","energykcal","energy",
                       "ingr_calories","ingrcalories")
    ipro   = pick_from(icolumns, "protein","protein_g","proteing","ingr_protein","ingrprotein")
    ifat   = pick_from(icolumns, "fat","fat_g","fatg","totalfat","ingr_fat","ingrfat")
    icrb   = pick_from(icolumns, "carb","carbohydrate","carbohydrates","carbs","carb_g",
                       "ingr_carb","ingr_carbohydrate","ingrcarb")
    assert di_col is not None, "Ingredient table lacks a dish id column."

    # Coerce numeric and aggregate (robust: skip missing)
    for c in [ical, ipro, ifat, icrb]:
        if c is not None:
            df_ing[c] = pd.to_numeric(df_ing[c], errors="coerce")

    agg_spec = {}
    if ical: agg_spec[ical] = "sum"
    if ipro: agg_spec[ipro] = "sum"
    if ifat: agg_spec[ifat] = "sum"
    if icrb: agg_spec[icrb] = "sum"
    if not agg_spec:
        raise RuntimeError("No numeric ingredient columns to aggregate.")

    df_tot = df_ing.groupby(di_col, as_index=False).agg(agg_spec)

    rename_map = {}
    if ical: rename_map[ical] = "total_calories"
    if ipro: rename_map[ipro] = "total_protein"
    if ifat: rename_map[ifat] = "total_fat"
    if icrb: rename_map[icrb] = "total_carb"
    df_tot = df_tot.rename(columns=rename_map)

    if di_col != "dish_id":
        df_tot = df_tot.rename(columns={di_col: "dish_id"})

    # Update pointers only for fields we actually built
    dish_col = "dish_id"
    if ical: cal_col  = "total_calories"
    if ipro: pro_col  = "total_protein"
    if ifat: fat_col  = "total_fat"
    if icrb: carb_col = "total_carb"

    df_nutr = df_tot.copy()
else:
    # Use nutr_file as-is
    df_nutr = probe.copy()

# Ensure required columns exist
if dish_col is None:
    raise RuntimeError("Could not find a 'dish id' column. Please inspect the printed columns.")
need_any = [cal_col, pro_col, fat_col, carb_col]
if not any(need_any):
    raise RuntimeError("Neither calories nor macro columns were found. Please verify your dataset files (expecting dishes.xlsx or dish_ingredients.xlsx).")

# Build labels
df_nutr = df_nutr.dropna(subset=[dish_col]).copy()

if LABEL_STRATEGY == "primary_macro" and all(c is not None for c in [pro_col, fat_col, carb_col]):
    def _tofloat(x):
        try: return float(x)
        except Exception: return np.nan
    def primary_macro(r):
        p = _tofloat(r[pro_col]); f = _tofloat(r[fat_col]); c = _tofloat(r[carb_col])
        p = 0.0 if np.isnan(p) else p
        f = 0.0 if np.isnan(f) else f
        c = 0.0 if np.isnan(c) else c
        return max([(p,"protein"), (f,"fat"), (c,"carb")])[1]
    df_nutr["label"] = df_nutr.apply(primary_macro, axis=1)
    class_names = ["carb","protein","fat"]

elif LABEL_STRATEGY == "calorie_bin" and cal_col is not None:
    df_nutr = df_nutr.dropna(subset=[cal_col])
    df_nutr["label"] = pd.qcut(pd.to_numeric(df_nutr[cal_col], errors="coerce"),
                               q=5, duplicates="drop").astype(str)
    class_names = sorted(df_nutr["label"].dropna().unique().tolist())

else:
    # Fallback to calorie bins if macros missing but calories exist
    if cal_col is not None:
        df_nutr = df_nutr.dropna(subset=[cal_col])
        df_nutr["label"] = pd.qcut(pd.to_numeric(df_nutr[cal_col], errors="coerce"),
                                   q=5, duplicates="drop").astype(str)
        LABEL_STRATEGY = "calorie_bin"
        class_names = sorted(df_nutr["label"].dropna().unique().tolist())
        print("Fallback: using LABEL_STRATEGY='calorie_bin'")
    else:
        raise RuntimeError("No usable columns for labeling.")

# Final label table
df_lbl = (df_nutr[[dish_col, "label"]]
          .dropna()
          .drop_duplicates()
          .rename(columns={dish_col: "dish_id"})
          .reset_index(drop=True))

print("Label distribution (top 10):")
print(df_lbl["label"].value_counts().head(10))

# Export core vars
globals().update({
    "dish_col": dish_col, "cal_col": cal_col, "pro_col": pro_col,
    "fat_col": fat_col, "carb_col": carb_col, "df_lbl": df_lbl,
    "LABEL_STRATEGY": LABEL_STRATEGY
})


Detected columns in nutr_file (first 25): ['dish_id', 'ingr_id', 'ingr_name', 'grams', 'calories', 'fat', 'carb', 'protein']
Dish/Nutrition columns -> dish: dish_id | kcal: calories | protein: protein | fat: fat | carbs: carb
Label distribution (top 10):
carb       4380
fat        2818
protein    2379
Name: label, dtype: int64


In [6]:

# --- 4) Load nutrition (dish-level) with fallbacks, then derive labels --------
import pandas as pd
import numpy as np
import pickle

# Reuse unpickler
class _RemapUnpickler(pickle.Unpickler):
    def find_class(self, module, name):
        if module == "numpy._core" or module.startswith("numpy._core."):
            module = module.replace("numpy._core", "numpy.core", 1)
        return super().find_class(module, name)

def read_pickle_compat(path: str) -> pd.DataFrame:
    with open(path, "rb") as f:
        obj = _RemapUnpickler(f).load()
    if isinstance(obj, pd.DataFrame):
        return obj
    try:
        return pd.DataFrame(obj)
    except Exception as e:
        raise TypeError(f"Unsupported pickle payload type: {type(obj)} from {path}") from e

def read_any_table(path: str, usecols=None) -> pd.DataFrame:
    low = path.lower()
    if low.endswith(".csv"):
        return pd.read_csv(path, usecols=usecols)
    if low.endswith(".parquet"):
        try:
            import pyarrow.parquet as pq
            if usecols is None:
                return pd.read_parquet(path)
            table = pq.read_table(path, columns=usecols)
            return table.to_pandas()
        except Exception:
            return pd.read_parquet(path, columns=usecols)
    if low.endswith(".feather"):
        try:
            import pyarrow.feather as ft
            tbl = ft.read_table(path)
            if usecols:
                tbl = tbl.select(usecols)
            return tbl.to_pandas()
        except Exception:
            return pd.read_feather(path, columns=usecols)
    if low.endswith((".pkl",".pickle")):
        df = read_pickle_compat(path)
        if usecols is not None:
            keep = [c for c in usecols if c in df.columns]
            df = df[keep]
        return df
    if low.endswith((".xlsx",".xls")):
        try:
            return pd.read_excel(path, usecols=usecols)
        except Exception:
            # try engine fallback
            return pd.read_excel(path, usecols=usecols, engine="openpyxl")
    raise ValueError(f"Unsupported table format: {path}")

def _norm(s: str) -> str:
    s = str(s).lower().strip()
    for ch in [" ", "(", ")", "-", "_", "/"]:
        s = s.replace(ch, "")
    return s

def pick_from(df_cols, *alts):
    nmap = {_norm(c): c for c in df_cols}
    for a in alts:
        if _norm(a) in nmap:
            return nmap[_norm(a)]
    return None

# Load a small sample first to detect columns
probe = read_any_table(nutr_file)
cols = list(probe.columns)
print("Detected columns in nutr_file (first 25):", cols[:25])

# Try dish id and totals
dish_col = pick_from(cols, "dish_id","plate_id","sample_id","id","dish","plateid","dishid")
carb_col = pick_from(cols, "carb","carbohydrate","carbohydrates","carbs","carb_g","carbohydrates_g","carbohydrateg")
pro_col  = pick_from(cols, "protein","protein_g","proteing","total_protein","protein_total","totalprotein")
fat_col  = pick_from(cols, "fat","fat_g","fatg","total_fat","fat_total","totalfat")
carb_col = pick_from(cols, "carbohydrate","carbohydrates","carbs","carb_g","carbohydrates_g","carbohydrateg",
                     "total_carb","carbs_total","totalcarb")

print("Dish/Nutrition columns ->",
      "dish:", dish_col, "| kcal:", cal_col, "| protein:", pro_col, "| fat:", fat_col, "| carbs:", carb_col)

# If no totals present, but we have an ingredient table, aggregate it
if (cal_col is None or pro_col is None or fat_col is None or carb_col is None) and 'ingr_table' in globals() and ingr_table:
    print("[aggregate] Dish totals not found in nutr_file; attempting aggregation from ingredient table:", ingr_table)
    df_ing = read_any_table(ingr_table)
    icolumns = list(df_ing.columns)
    print("Ingredient columns (first 25):", icolumns[:25])
    di_col = pick_from(icolumns, "dish_id","plate_id","sample_id","id","dish","plateid","dishid")
    ical  = pick_from(icolumns, "ingr_calories","calories","kcal","energy_kcal","energykcal","ingrcalories")
    ipro  = pick_from(icolumns, "ingr_protein","protein","protein_g","proteing","ingrprotein")
    ifat  = pick_from(icolumns, "ingr_fat","fat","fat_g","fatg","ingrfat")
    icrb  = pick_from(icolumns, "carb","ingr_carb","ingr_carbohydrate","carbohydrate","carbohydrates","carbs","carb_g","ingrcarb")
    assert di_col is not None, "Ingredient table lacks a dish id column."
    # Coerce numeric and aggregate
    for c in [ical, ipro, ifat, icrb]:
        if c is not None:
            df_ing[c] = pd.to_numeric(df_ing[c], errors="coerce")
    agg_spec = {}
    if ical: agg_spec[ical] = "sum"
    if ipro: agg_spec[ipro] = "sum"
    if ifat: agg_spec[ifat] = "sum"
    if icrb: agg_spec[icrb] = "sum"
    if not agg_spec:
        raise RuntimeError("No numeric ingredient columns to aggregate.")

    df_tot = df_ing.groupby(di_col, as_index=False).agg(agg_spec)

    rename_map = {}
    if ical: rename_map[ical] = "total_calories"
    if ipro: rename_map[ipro] = "total_protein"
    if ifat: rename_map[ifat] = "total_fat"
    if icrb: rename_map[icrb] = "total_carb"

    df_tot = df_tot.rename(columns=rename_map)
    if di_col != "dish_id":
        df_tot = df_tot.rename(columns={di_col: "dish_id"})
    dish_col = "dish_id"; cal_col = "total_calories"; pro_col = "total_protein"; fat_col = "total_fat"; carb_col = "total_carb"
    df_nutr = df_tot.copy()
else:
    # Use nutr_file as-is
    df_nutr = probe.copy()

# Ensure required columns exist
if dish_col is None:
    raise RuntimeError("Could not find a 'dish id' column. Please inspect the printed columns.")
need_any = [cal_col, pro_col, fat_col, carb_col]
if not any(need_any):
    raise RuntimeError("Neither calories nor macro columns were found. Please verify your dataset files (expecting dishes.xlsx or dish_ingredients.xlsx).")

# Build labels
df_nutr = df_nutr.dropna(subset=[dish_col]).copy()

if LABEL_STRATEGY == "primary_macro" and all(c is not None for c in [pro_col, fat_col, carb_col]):
    def _tofloat(x):
        try: return float(x)
        except Exception: return np.nan
    def primary_macro(r):
        p = _tofloat(r[pro_col]); f = _tofloat(r[fat_col]); c = _tofloat(r[carb_col])
        p = 0.0 if np.isnan(p) else p
        f = 0.0 if np.isnan(f) else f
        c = 0.0 if np.isnan(c) else c
        return max([(p,"protein"), (f,"fat"), (c,"carb")])[1]
    df_nutr["label"] = df_nutr.apply(primary_macro, axis=1)
    class_names = ["carb","protein","fat"]

elif LABEL_STRATEGY == "calorie_bin" and cal_col is not None:
    df_nutr = df_nutr.dropna(subset=[cal_col])
    df_nutr["label"] = pd.qcut(pd.to_numeric(df_nutr[cal_col], errors="coerce"),
                               q=5, duplicates="drop").astype(str)
    class_names = sorted(df_nutr["label"].dropna().unique().tolist())

else:
    # Fallback to calorie bins if macros missing but calories exist
    if cal_col is not None:
        df_nutr = df_nutr.dropna(subset=[cal_col])
        df_nutr["label"] = pd.qcut(pd.to_numeric(df_nutr[cal_col], errors="coerce"),
                                   q=5, duplicates="drop").astype(str)
        LABEL_STRATEGY = "calorie_bin"
        class_names = sorted(df_nutr["label"].dropna().unique().tolist())
        print("Fallback: using LABEL_STRATEGY='calorie_bin'")
    else:
        raise RuntimeError("No usable columns for labeling.")

# Final label table
df_lbl = (df_nutr[[dish_col, "label"]]
          .dropna()
          .drop_duplicates()
          .rename(columns={dish_col: "dish_id"})
          .reset_index(drop=True))

print("Label distribution (top 10):")
print(df_lbl["label"].value_counts().head(10))

# Export core vars
globals().update({
    "dish_col": dish_col, "cal_col": cal_col, "pro_col": pro_col,
    "fat_col": fat_col, "carb_col": carb_col, "df_lbl": df_lbl,
    "LABEL_STRATEGY": LABEL_STRATEGY
})


Detected columns in nutr_file (first 25): ['dish_id', 'ingr_id', 'ingr_name', 'grams', 'calories', 'fat', 'carb', 'protein']
Dish/Nutrition columns -> dish: dish_id | kcal: calories | protein: protein | fat: fat | carbs: None
[aggregate] Dish totals not found in nutr_file; attempting aggregation from ingredient table: /home/kristoffel/datasets/dataset-01-Nutrition5k/dish_ingredients.xlsx
Ingredient columns (first 25): ['dish_id', 'ingr_id', 'ingr_name', 'grams', 'calories', 'fat', 'carb', 'protein']
Label distribution (top 10):
carb       2907
protein    1244
fat         855
Name: label, dtype: int64


In [7]:

# --- 5) Materialize JPEGs if needed -------------------------------------------
from io import BytesIO
from PIL import Image
import base64, ast

# Prefer ready-made JPEGs
JPEG_ROOT = globals().get("JPEG_ROOT", None)
if JPEG_ROOT is None:
    # If we have an image table, infer columns and decode
    img_table = globals().get("img_table", None)
    assert img_table, "No JPEGs found and no image table located. Provide an images table or mirror with JPEGs."
    # Detect columns
    df_img = read_any_table(img_table)
    cnorm = {str(c).lower().strip().replace(" ", "").replace("(", "").replace(")", "").replace("-", "").replace("_",""): c
             for c in df_img.columns}
    dish_img_col = None
    for a in ["dish_id","plate_id","sample_id","id","dish","plateid","dishid"]:
        k = a.lower().replace(" ", "").replace("(", "").replace(")", "").replace("-", "").replace("_","")
        if k in cnorm:
            dish_img_col = cnorm[k]; break
    assert dish_img_col is not None, "Could not find a dish id column in images table."

    rgb_col = None
    for a in ["rgb","rgb_bytes","image","image_bytes","rgbimage","rgbimagebytes","rgb_image"]:
        k = a.lower().replace(" ", "").replace("(", "").replace(")", "").replace("-", "").replace("_","")
        if k in cnorm:
            rgb_col = cnorm[k]; break
    assert rgb_col is not None, "Could not find an RGB image bytes column."

    out_root = Path(RGB_DIR); out_root.mkdir(parents=True, exist_ok=True)

    def decode_to_bytes(x):
        if isinstance(x, (bytes, bytearray)):
            return bytes(x)
        if isinstance(x, memoryview):
            return x.tobytes()
        if isinstance(x, str):
            s = x.strip()
            if (s.startswith("b'") or s.startswith('b"')) and s.endswith(("'",'"')):
                try: return ast.literal_eval(s)
                except Exception: pass
            try:
                return base64.b64decode(s, validate=True)
            except Exception:
                pass
        try:
            return bytes(x)
        except Exception:
            raise ValueError("Unsupported RGB encoding type")

    count, skipped = 0, 0
    for ridx, row in df_img.iterrows():
        dish_id = str(row[dish_img_col])
        try:
            rgb_bytes = decode_to_bytes(row[rgb_col])
            img = Image.open(BytesIO(rgb_bytes)).convert("RGB")
        except Exception:
            skipped += 1
            continue
        dish_dir = out_root / dish_id
        dish_dir.mkdir(exist_ok=True, parents=True)
        out_path = dish_dir / f"img_{len(list(dish_dir.glob('img_*.jpg'))):05d}.jpg"
        img.save(out_path, format="JPEG", quality=90)
        count += 1
        if count % 500 == 0:
            print("saved", count, "images...")
    print(f"Saved {count} images to {out_root} ({skipped} skipped).")
    JPEG_ROOT = str(out_root)
else:
    print("Found JPEGs under:", JPEG_ROOT)


saved 500 images...
saved 1000 images...
saved 1500 images...
saved 2000 images...
saved 2500 images...
saved 3000 images...
Saved 3490 images to /home/kristoffel/datasets/dataset-01-Nutrition5k/images_rgb (0 skipped).


In [9]:

# --- 6) Build train/val/test splits by dish -----------------------------------
import os, shutil
from pathlib import Path
from collections import defaultdict
import random

# Map dish_id -> list of image paths
dish_to_paths = defaultdict(list)
for p in Path(JPEG_ROOT).rglob("*.jpg"):
    dish_id = p.parent.name
    dish_to_paths[dish_id].append(str(p))

label_map = dict(df_lbl.values)  # dish_id -> label
pairs = [(d, label_map.get(d, None), imgs) for d, imgs in dish_to_paths.items()]
pairs = [(d,l,imgs) for (d,l,imgs) in pairs if l is not None and len(imgs)>0]

print("Dishes with labels & images:", len(pairs))
by_label = defaultdict(list)
for dish, label, imgs in pairs:
    by_label[label].append((dish, imgs))

TRAIN_DIR = f"{DATASET_ROOT}/train"
VAL_DIR   = f"{DATASET_ROOT}/val"
TEST_DIR  = f"{DATASET_ROOT}/test"

# Recreate split directories
for d in (TRAIN_DIR, VAL_DIR, TEST_DIR):
    if os.path.isdir(d):
        shutil.rmtree(d)
    os.makedirs(d, exist_ok=True)
    for cls in sorted(set(df_lbl['label'])):
        os.makedirs(os.path.join(d, cls), exist_ok=True)

def safe_link(src, dst):
    try:
        os.symlink(src, dst)
    except Exception:
        shutil.copy2(src, dst)

splits = {"train": [], "val": [], "test": []}
for label, items in by_label.items():
    random.shuffle(items)
    n = len(items)
    n_val  = max(1, int(n * VAL_RATIO))
    n_test = max(1, int(n * TEST_RATIO))
    splits["val"].extend([(label, d, imgs) for d,imgs in items[:n_val]])
    splits["test"].extend([(label, d, imgs) for d,imgs in items[n_val:n_val+n_test]])
    splits["train"].extend([(label, d, imgs) for d,imgs in items[n_val+n_test:]])

def materialize(split_name):
    cnt = 0
    for label, dish, imgs in splits[split_name]:
        for src in imgs:
            dst = os.path.join(DATASET_ROOT, split_name, label, f"{dish}__{Path(src).name}")
            if not os.path.exists(dst):
                safe_link(src, dst)
                cnt += 1
    print(f"{split_name}: wrote {cnt} image links/files.")

materialize("train"); materialize("val"); materialize("test")

# Clear stale caches for Ultralytics
for split in ["train","val","test"]:
    cache = Path(DATASET_ROOT)/f"{split}.cache"
    if cache.exists():
        cache.unlink()
        print("removed cache:", cache)

print("‚úÖ Splits ready.")


Dishes with labels & images: 3490
train: wrote 2794 image links/files.
val: wrote 348 image links/files.
test: wrote 348 image links/files.
‚úÖ Splits ready.


In [10]:

# --- 7) Train YOLO11n-cls -----------------------------------------------------
from ultralytics import YOLO
import torch

model = YOLO("yolo11n-cls.pt")
results = model.train(
    data=DATASET_ROOT,     # directory with train/val/test
    epochs=EPOCHS,
    imgsz=IMGSZ,
    batch=BATCH,
    lr0=1e-3,
    patience=10,
    project=MODEL_DIR,
    name=RUN_NAME_BASE + ("_macro" if LABEL_STRATEGY=="primary_macro" else "_calbins"),
    plots=True,
    device=0 if torch.cuda.is_available() else "cpu",
)
print("Training run saved to:", results.save_dir)


New https://pypi.org/project/ultralytics/8.3.235 available üòÉ Update with 'pip install -U ultralytics'
Ultralytics 8.3.203 üöÄ Python-3.10.12 torch-2.2.0a0+81ea7a4 CUDA:0 (Tesla V100-SXM3-32GB, 32494MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=64, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=/home/kristoffel/datasets/dataset-01-Nutrition5k, degrees=0.0, deterministic=True, device=0, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=30, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=224, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.001, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolo11n-cls.pt, momentum=0.937, mosaic=1.0, mul

In [11]:

# --- 8) Evaluate on val and test ----------------------------------------------
from ultralytics import YOLO
import glob, os, torch

RUN_PREFIX = RUN_NAME_BASE
cands = glob.glob(os.path.join(MODEL_DIR, RUN_PREFIX + "*", "weights", "best.pt"))
assert cands, f"No best.pt found under {MODEL_DIR}/{RUN_PREFIX}*/weights/"
best_path = max(cands, key=os.path.getmtime)
print("Using best:", best_path)

model = YOLO(best_path)

metrics_val = model.val(
    data=DATASET_ROOT,
    split="val",
    imgsz=IMGSZ,
    project=MODEL_DIR,
    name=RUN_PREFIX + "_val",
    device=0 if torch.cuda.is_available() else "cpu",
)

metrics_test = model.val(
    data=DATASET_ROOT,
    split="test",
    imgsz=IMGSZ,
    project=MODEL_DIR,
    name=RUN_PREFIX + "_test",
    device=0 if torch.cuda.is_available() else "cpu",
)
print("Done.")


Using best: /home/kristoffel/models/nutrition5k_yolo11n_cls_macro/weights/best.pt
Ultralytics 8.3.203 üöÄ Python-3.10.12 torch-2.2.0a0+81ea7a4 CUDA:0 (Tesla V100-SXM3-32GB, 32494MiB)
YOLO11n-cls summary (fused): 47 layers, 1,529,867 parameters, 0 gradients, 3.2 GFLOPs
[34m[1mtrain:[0m /home/kristoffel/datasets/dataset-01-Nutrition5k/train... found 2794 images in 3 classes ‚úÖ 
[34m[1mval:[0m /home/kristoffel/datasets/dataset-01-Nutrition5k/val... found 348 images in 3 classes ‚úÖ 
[34m[1mtest:[0m /home/kristoffel/datasets/dataset-01-Nutrition5k/test... found 348 images in 3 classes ‚úÖ 
[34m[1mval: [0mFast image access ‚úÖ (ping: 0.0¬±0.0 ms, read: 1136.5¬±320.5 MB/s, size: 39.5 KB)
[K[34m[1mval: [0mScanning /home/kristoffel/datasets/dataset-01-Nutrition5k/val... 348 images, 0 corrupt: 100% ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 348/348 818.2Kit/s 0.0s
[K               classes   top1_acc   top5_acc: 100% ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 22/22 33.0it/s 0.7s0.4s
     


## Notes
- If Excel support is missing, run `pip install -U openpyxl` and re-run.  
- Expect files like **`dishes.xlsx`** (dish-level totals) and **`dish_ingredients.xlsx`** (ingredient-level).  
  The notebook prefers `dishes.xlsx`, and **falls back** to aggregating from `dish_ingredients.xlsx`.  
- If your mirror only has `dish_images.pkl`, please re-download to include the metadata sheets (see the Kaggle page).  
- For stronger baselines, try `IMGSZ=256/320`, `BATCH=128`, or `yolo11m-cls.pt`.
