In [1]:
import random
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from sklearn.model_selection import KFold
from pathlib import Path
import shutil

Data downloaded directly from the website comes in a layout that is meant to be read with `datasets.Food101(split=..., transform=...)`. The cells below instead copy a subset of the raw images into per-split class folders.

In [2]:
# Fetch the complete Food-101 dataset into ../Data. Only the download side
# effect matters here, so the returned dataset object is thrown away.
_ = datasets.Food101(download=True, root=Path.cwd().parent / "Data")

# Directory holding one sub-folder of images per class.
images_path = Path.cwd().parent / "Data" / "food-101" / "images"

# ImageFolder is created solely so that the discovered class names can be
# displayed. The ToTensor conversion is the step that dataloader.py performs
# for the data prepared in the cells below.
all_data = datasets.ImageFolder(images_path, transform=transforms.ToTensor())

all_data.classes

['apple_pie',
 'baby_back_ribs',
 'baklava',
 'beef_carpaccio',
 'beef_tartare',
 'beet_salad',
 'beignets',
 'bibimbap',
 'bread_pudding',
 'breakfast_burrito',
 'bruschetta',
 'caesar_salad',
 'cannoli',
 'caprese_salad',
 'carrot_cake',
 'ceviche',
 'cheese_plate',
 'cheesecake',
 'chicken_curry',
 'chicken_quesadilla',
 'chicken_wings',
 'chocolate_cake',
 'chocolate_mousse',
 'churros',
 'clam_chowder',
 'club_sandwich',
 'crab_cakes',
 'creme_brulee',
 'croque_madame',
 'cup_cakes',
 'deviled_eggs',
 'donuts',
 'dumplings',
 'edamame',
 'eggs_benedict',
 'escargots',
 'falafel',
 'filet_mignon',
 'fish_and_chips',
 'foie_gras',
 'french_fries',
 'french_onion_soup',
 'french_toast',
 'fried_calamari',
 'fried_rice',
 'frozen_yogurt',
 'garlic_bread',
 'gnocchi',
 'greek_salad',
 'grilled_cheese_sandwich',
 'grilled_salmon',
 'guacamole',
 'gyoza',
 'hamburger',
 'hot_and_sour_soup',
 'hot_dog',
 'huevos_rancheros',
 'hummus',
 'ice_cream',
 'lasagna',
 'lobster_bisque',
 'lobster

In [3]:
# Fraction of every class's images that is sampled into the prepared subset.
AMOUNT_TO_GET = 0.1

# Number of folds used by the K-Fold preparation cell.
N_FOLDS = 5

# Classes that are copied into the prepared subset (only the chosen ones).
TARGET_CLASSES = [
    "apple_pie",
    "baby_back_ribs",
    "baklava",
    "beef_carpaccio",
    "beef_tartare",
    "beet_salad",
    "beignets",
    "bibimbap",
    "bread_pudding",
    "breakfast_burrito",
    "bruschetta",
    "caesar_salad",
    "cannoli",
    "caprese_salad",
    "carrot_cake",
    "ceviche",
    "cheese_plate",
    "cheesecake",
    "chicken_curry",
    "chicken_quesadilla",
    "chicken_wings",
    "chocolate_cake",
    "chocolate_mousse",
    "churros",
    "clam_chowder",
    "club_sandwich",
    "crab_cakes",
    "creme_brulee",
    "croque_madame",
    "cup_cakes",
    "deviled_eggs",
    "donuts",
    "dumplings",
    "edamame",
    "eggs_benedict",
    "escargots",
    "falafel",
    "filet_mignon",
    "fish_and_chips",
    "foie_gras",
    "french_fries",
    "french_onion_soup",
    "french_toast",
    "fried_calamari",
    "fried_rice",
    "frozen_yogurt",
    "garlic_bread",
    "gnocchi",
    "greek_salad",
    "grilled_cheese_sandwich",
    "grilled_salmon",
    "guacamole",
    "gyoza",
    "hamburger",
    "hot_and_sour_soup",
    "hot_dog",
    "huevos_rancheros",
    "hummus",
    "ice_cream",
    "lasagna",
    "lobster_bisque",
    "lobster_roll_sandwich",
    "macaroni_and_cheese",
    "macarons",
    "miso_soup",
    "mussels",
    "nachos",
    "omelette",
    "onion_rings",
    "oysters",
    "pad_thai",
    "paella",
    "pancakes",
    "panna_cotta",
    "peking_duck",
    "pho",
    "pizza",
    "pork_chop",
    "poutine",
    "prime_rib",
    "pulled_pork_sandwich",
    "ramen",
    "ravioli",
    "red_velvet_cake",
    "risotto",
    "samosa",
    "sashimi",
    "scallops",
    "seaweed_salad",
    "shrimp_and_grits",
    "spaghetti_bolognese",
    "spaghetti_carbonara",
    "spring_rolls",
    "steak",
    "strawberry_shortcake",
    "sushi",
    "tacos",
    "takoyaki",
    "tiramisu",
    "tuna_tartare",
    "waffles",
]

# Share of the sampled images assigned to each split. The K-Fold cell reads
# only the "test" entry; the plain split cell reads "train" and "val" and
# gives the remainder to test.
SPLITS = dict(train=0.7, val=0.15, test=0.15)

In [4]:
# Build the output directory for the prepared subset. Its name encodes the
# sampled percentage and the split ratios, e.g. "food-101_10%_tr70_va15_te15".
data_path = Path.cwd().parent / "Data"

# round() instead of int(): a product such as 0.29 * 100 evaluates to
# 28.999999999999996, which int() would truncate to 28 and mislabel the folder.
split_str = "_".join(f"{k[:2]}{round(v * 100)}" for k, v in SPLITS.items())
target_dir = data_path / f"food-101_{round(AMOUNT_TO_GET * 100)}%_{split_str}"
# Same Path under the original second name, kept so existing references work.
target_dir_name = target_dir

target_dir.mkdir(parents=True, exist_ok=True)

# save classes to txt inside folder (one class name per line)
with open(target_dir / "classes.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{name}\n" for name in TARGET_CLASSES)

# DATA PREPARATION - run only one of the two options below

### * Extracting a subset of data from Food-101 (single train/val/test split)

In [27]:
# Set random seed for reproducibility
random.seed(42)

# For every selected class: sample a fraction of its images, cut the sample
# into train/val/test and copy the files to <target_dir>/<split>/<class>/.
for class_name in TARGET_CLASSES:
    source_class_dir = images_path / class_name
    # glob() yields files in filesystem order, which is not guaranteed to be
    # the same on every machine. Sorting first is what makes the seeded
    # sample below select the same images everywhere.
    all_images = sorted(source_class_dir.glob("*.jpg"))
    num_to_sample = int(len(all_images) * AMOUNT_TO_GET)

    # random.sample returns its picks in random order, so consecutive slices
    # of the result are themselves random, non-overlapping subsets.
    sampled_images = random.sample(all_images, num_to_sample)

    # Calculate split indices
    train_end = int(SPLITS["train"] * num_to_sample)
    val_end = train_end + int(SPLITS["val"] * num_to_sample)

    split_map = {
        "train": sampled_images[:train_end],
        "val": sampled_images[train_end:val_end],
        # test receives everything that is left, including rounding remainders
        "test": sampled_images[val_end:],
    }

    # Copy to corresponding split directories
    for split_name, split_images in split_map.items():
        split_class_dir = target_dir / split_name / class_name
        split_class_dir.mkdir(parents=True, exist_ok=True)

        for img_path in split_images:
            shutil.copy(img_path, split_class_dir / img_path.name)

print(f"Done! Data saved in splits under: {target_dir}")


Done! Data saved in splits under: /home/kamil-solski/Documents/Python/Projekty_py/Food101/Data/food-101_20%_tr70_va15_te15


### * Extracting a subset of data from Food-101 (with K-Fold cross-validation)

In [5]:
# Ensure reproducibility
random.seed(42)

# For every selected class: sample a fraction of its images, set aside one
# global test set, then write N_FOLDS train/val partitions of the rest to
# <target_dir>/fold<k>/<split>/<class>/.
for class_name in TARGET_CLASSES:
    source_class_dir = images_path / class_name
    # glob() yields files in filesystem order, which is not guaranteed to be
    # the same on every machine. Sorting first is what makes the seeded
    # sample below select the same images everywhere.
    all_images = sorted(source_class_dir.glob("*.jpg"))
    num_to_sample = int(len(all_images) * AMOUNT_TO_GET)
    sampled_images = random.sample(all_images, num_to_sample)

    # Global test split: the first slice of the (randomly ordered) sample
    test_count = int(SPLITS["test"] * len(sampled_images))
    test_images = sampled_images[:test_count]
    remaining_images = sampled_images[test_count:]

    # Save the global test set; these images are excluded from every fold
    test_class_dir = target_dir / "test" / class_name
    test_class_dir.mkdir(parents=True, exist_ok=True)
    for img in test_images:
        shutil.copy(img, test_class_dir / img.name)

    # K-Fold cross-validation on remaining images (train + val only)
    kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
    for fold_idx, (train_idx, val_idx) in enumerate(kf.split(remaining_images)):
        fold_dir = target_dir / f"fold{fold_idx}"
        fold_images = {
            "train": [remaining_images[i] for i in train_idx],
            "val": [remaining_images[i] for i in val_idx],
        }

        for split_name, split_images in fold_images.items():
            split_class_dir = fold_dir / split_name / class_name
            split_class_dir.mkdir(parents=True, exist_ok=True)

            for img in split_images:
                shutil.copy(img, split_class_dir / img.name)

print(f"Done! {N_FOLDS}-fold cross-validation saved to: {target_dir}")

Done! 5-fold cross-validation saved to: /home/kamil-solski/Documents/Python/Projekty_py/Food101_MLOps-Lvl_1/Data/food-101_10%_tr70_va15_te15
