In [None]:
pip install icrawler

Collecting icrawler
  Downloading icrawler-0.6.10-py3-none-any.whl.metadata (6.2 kB)
Collecting bs4 (from icrawler)
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading icrawler-0.6.10-py3-none-any.whl (36 kB)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4, icrawler
Successfully installed bs4-0.0.2 icrawler-0.6.10


In [None]:
from google.colab import drive
import os
import re
import shutil
import random
from PIL import Image
from icrawler.builtin import GoogleImageCrawler, BingImageCrawler

In [None]:
# Mount Google Drive at /content/drive; the dataset output path used by the
# later cells lives under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

# Class label -> list of image-search queries used to collect pictures for
# that class. The dict keys double as the class folder names on disk.
ingredients = {
    'egg': ['egg', 'raw egg', 'boiled egg'],
    'banana': ['banana', 'sliced banana', 'banana bunch'],
    'tomato': ['tomato', 'sliced tomato', 'tomato on table'],
    'carrot': ['carrot', 'raw carrot', 'carrots on cutting board'],
    'onion': ['onion', 'onion on cutting board', 'raw onion'],
    'milk': ['milk', 'glass of milk', 'milk bottle'],
    'flour': ['flour', 'flour in bowl', 'white flour'],
    'lemon': ['lemon', 'lemon on cutting board', 'yellow lemon'],
    'bread': ['bread', 'loaf of bread', 'whole wheat bread'],
    'rice': ['rice', 'white rice', 'bowl of rice'],
    'chicken-meat': ['raw chicken', 'raw chicken fillet', 'raw chicken breast'],
    'potato': ['potato', 'sliced potato', 'potatoes in basket'],
    'cheese': ['cheese', 'cheese block', 'sliced cheese'],
    'pineapple': ['pineapple', 'whole pineapple', 'pineapple on table'],
    'chocolate': ['chocolate', 'chocolate bar', 'dark chocolate']
}

# Directory (relative to the working directory) that receives the crawled
# images, laid out as <class>/<sanitized query>/.
raw_output_dir = 'raw-food-dataset'
# Directory on the mounted Drive that receives the merged and split dataset.
final_output_dir = '/content/drive/MyDrive/organized-food-dataset-fifteen'


# Crawling for a class stops once this many images have been counted for it.
TARGET_IMAGES_PER_CLASS = 300
# Upper bound on the number of images requested from a crawler per query.
MAX_PER_QUERY = 100

def sanitize_folder_name(text):
    """Turn a search query into a filesystem-friendly folder name.

    The text is lower-cased, then every character that is not an ASCII
    letter, digit, underscore or hyphen is replaced with an underscore.
    """
    lowered = text.lower()
    safe_name = re.sub(r'[^a-zA-Z0-9_-]', '_', lowered)
    return safe_name

def download_images():
    """Crawl images for every class in ``ingredients`` into ``raw_output_dir``.

    For each class the queries are tried in order. Each query gets its own
    sub-folder ``<raw_output_dir>/<class>/<sanitized query>/``. Google is
    crawled first; if the class is still short of
    ``TARGET_IMAGES_PER_CLASS`` afterwards, Bing is crawled for the same query
    into the same sub-folder. At most ``MAX_PER_QUERY`` images are requested
    per crawl, and crawling for a class stops as soon as the target is reached.
    """
    os.makedirs(raw_output_dir, exist_ok=True)

    for ingredient, queries in ingredients.items():
        ingredient_dir = os.path.join(raw_output_dir, ingredient)
        os.makedirs(ingredient_dir, exist_ok=True)

        # Start from what is already on disk so that re-running the cell does
        # not crawl again for classes that are already complete.
        total_downloaded = count_images(ingredient_dir)
        for query in queries:
            if total_downloaded >= TARGET_IMAGES_PER_CLASS:
                break

            subfolder = os.path.join(ingredient_dir, sanitize_folder_name(query))
            os.makedirs(subfolder, exist_ok=True)

            remaining = TARGET_IMAGES_PER_CLASS - total_downloaded
            google_crawler = GoogleImageCrawler(storage={'root_dir': subfolder})
            google_crawler.crawl(keyword=query, max_num=min(MAX_PER_QUERY, remaining))

            total_downloaded = count_images(ingredient_dir)
            if total_downloaded >= TARGET_IMAGES_PER_CLASS:
                break

            # Bing writes into the same folder as Google. With the default
            # file_idx_offset=0 it would generate the same 000001.jpg, ...
            # names and skip them as "already existing" (overwrite=False),
            # so the fallback would add almost nothing. 'auto' continues the
            # numbering after the highest index already present.
            remaining = TARGET_IMAGES_PER_CLASS - total_downloaded
            bing_crawler = BingImageCrawler(storage={'root_dir': subfolder})
            bing_crawler.crawl(
                keyword=query,
                max_num=min(MAX_PER_QUERY, remaining),
                file_idx_offset='auto',
            )

            total_downloaded = count_images(ingredient_dir)


def count_images(folder_path):
    """Return how many .jpg/.jpeg/.png files exist anywhere under ``folder_path``.

    The extension check is case-insensitive and the directory tree is walked
    recursively.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')
    return sum(
        1
        for _, _, filenames in os.walk(folder_path)
        for filename in filenames
        if filename.lower().endswith(image_suffixes)
    )

def clean_corrupted_images(folder_path):
    """Delete every file under ``folder_path`` that Pillow cannot verify.

    The tree is walked recursively. Each file is opened with ``Image.open``
    and checked with ``verify()``; any file for which that raises is removed
    from disk.

    Args:
        folder_path: Root directory to scan.

    Returns:
        int: Number of files that were removed.
    """
    removed_count = 0
    for root, _, files in os.walk(folder_path):
        for file in files:
            filepath = os.path.join(root, file)
            try:
                # The context manager closes the file handle even when
                # verify() raises, so no descriptors are leaked and the file
                # is no longer open by the time it is removed.
                with Image.open(filepath) as img:
                    img.verify()
            except Exception:
                # Deliberately broad: any failure to open or verify means the
                # file is unusable as an image for this dataset.
                os.remove(filepath)
                removed_count += 1
    return removed_count

def merge_class_folders():
    """Flatten the per-query folders of each class into one folder per class.

    ``final_output_dir`` is wiped and recreated, then every file found in
    ``<raw_output_dir>/<class>/<query>/`` is copied to
    ``<final_output_dir>/all/<class>/``. When a file name is already taken in
    the destination, ``_1``, ``_2``, ... is appended before the extension
    until a free name is found.
    """
    # Always start from an empty output directory.
    if os.path.exists(final_output_dir):
        shutil.rmtree(final_output_dir)
    os.makedirs(final_output_dir)

    for label in os.listdir(raw_output_dir):
        label_dir = os.path.join(raw_output_dir, label)
        if os.path.isdir(label_dir):
            merged_dir = os.path.join(final_output_dir, 'all', label)
            os.makedirs(merged_dir, exist_ok=True)

            for entry in os.listdir(label_dir):
                query_dir = os.path.join(label_dir, entry)
                if not os.path.isdir(query_dir):
                    continue

                for filename in os.listdir(query_dir):
                    source_path = os.path.join(query_dir, filename)
                    target_path = os.path.join(merged_dir, filename)

                    # Different query folders reuse the same file names, so
                    # probe for a free numbered variant before copying.
                    stem, suffix = os.path.splitext(target_path)
                    attempt = 0
                    while os.path.exists(target_path):
                        attempt += 1
                        target_path = f"{stem}_{attempt}{suffix}"

                    shutil.copy2(source_path, target_path)

def split_dataset(split_ratios=(0.8, 0.1, 0.1)):
    """Split ``<final_output_dir>/all/<class>/`` into train/val/test folders.

    For every class folder the files are shuffled with a fixed seed and
    copied to ``<final_output_dir>/<split>/<class>/``. Once every class has
    been processed, the ``all`` directory is deleted.

    Args:
        split_ratios: ``(train, val, test)`` fractions. Only the first two
            are used to compute cut points; the test split receives whatever
            remains, so no file is dropped by integer truncation.
    """
    random.seed(42)
    split_names = ('train', 'val', 'test')

    # Pre-create a folder for every configured class so that classes that
    # ended up with no images still exist (empty) in each split.
    for split in split_names:
        for class_name in ingredients.keys():
            os.makedirs(os.path.join(final_output_dir, split, class_name), exist_ok=True)

    all_dir = os.path.join(final_output_dir, 'all')
    # os.listdir returns entries in arbitrary order; sorting both the classes
    # and the files makes the seeded shuffle reproducible across runs and
    # file systems.
    for class_name in sorted(os.listdir(all_dir)):
        class_folder = os.path.join(all_dir, class_name)
        if not os.path.isdir(class_folder):
            continue

        images = sorted(os.listdir(class_folder))
        random.shuffle(images)

        total = len(images)
        train_end = int(total * split_ratios[0])
        val_end = train_end + int(total * split_ratios[1])

        splits = {
            'train': images[:train_end],
            'val': images[train_end:val_end],
            'test': images[val_end:]
        }

        for split, files in splits.items():
            split_class_dir = os.path.join(final_output_dir, split, class_name)
            # The class folder may not be one of the configured ingredients
            # (e.g. a leftover raw folder), so make sure the target exists.
            os.makedirs(split_class_dir, exist_ok=True)
            for file in files:
                src = os.path.join(class_folder, file)
                dst = os.path.join(split_class_dir, file)
                shutil.copy2(src, dst)

    shutil.rmtree(all_dir)

# Run the full pipeline, in order:
# 1. crawl images for every class into raw_output_dir
download_images()
# 2. delete raw files that Pillow cannot verify
clean_corrupted_images(raw_output_dir)
# 3. wipe final_output_dir and copy all raw images into <final>/all/<class>/
merge_class_folders()
# 4. verify the copied files once more at the destination
clean_corrupted_images(final_output_dir)
# 5. distribute into train/val/test and remove the temporary 'all' folder
split_dataset()

In [None]:
import shutil

# Copy the raw crawl directory tree into the Drive folder, merging into the
# destination if it already exists (dirs_exist_ok=True).
# NOTE(review): the destination is the same path as final_output_dir, so the
# raw per-class folders end up next to whatever the pipeline cell wrote
# there - confirm this is intended. The recorded output of this cell shows a
# different destination path, so that output looks stale.
shutil.copytree('raw-food-dataset', '/content/drive/MyDrive/organized-food-dataset-fifteen', dirs_exist_ok=True)


'/content/drive/MyDrive/cleaned-food-dataset-expanded-fifteen'