In [2]:
!pip install icrawler

Collecting icrawler
  Downloading icrawler-0.6.10-py3-none-any.whl.metadata (6.2 kB)
Collecting bs4 (from icrawler)
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting lxml (from icrawler)
  Downloading lxml-6.0.2-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl.metadata (3.6 kB)
Downloading icrawler-0.6.10-py3-none-any.whl (36 kB)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Downloading lxml-6.0.2-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl (5.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m10.4 MB/s[0m  [33m0:00:00[0meta [36m0:00:01[0m
[?25hInstalling collected packages: lxml, bs4, icrawler
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [icrawler]
[1A[2KSuccessfully installed bs4-0.0.2 icrawler-0.6.10 lxml-6.0.2
[0m

In [13]:
from pathlib import Path
import os
from PIL import Image
from icrawler.builtin import BingImageCrawler

# Pillow format names (the values compared against `Image.format`) that the
# helpers below treat as acceptable; any file with another format is
# considered unsupported.
TF_SUPPORTED_FORMATS = {
    "BMP",
    "GIF",
    "JPEG",
    "PNG",
}

def _count_supported_images(folder: Path) -> int:
    """
    Count the files under `folder` (searched recursively) that Pillow can open,
    that pass `Image.verify()`, and whose format is in TF_SUPPORTED_FORMATS.

    Files that raise any exception while being opened or verified are simply
    not counted. Image dimensions are NOT checked here.
    """

    def _is_supported(candidate: Path) -> bool:
        # Any failure to open/verify means "not a usable image".
        try:
            with Image.open(candidate) as handle:
                handle.verify()
                detected = handle.format
            return detected in TF_SUPPORTED_FORMATS
        except Exception:
            return False

    return sum(
        1
        for entry in folder.rglob("*")
        if entry.is_file() and _is_supported(entry)
    )

def _clean_folder_tf_supported_and_min_size(folder: Path, min_w=224, min_h=224) -> tuple[int, int, int]:
    """
    Walk `folder` recursively and delete every file that is not a usable image.

    A file is deleted when:
      - Pillow cannot open or verify it (not an image, corrupted, unreadable)
      - its format is not in TF_SUPPORTED_FORMATS (JPEG/PNG/GIF/BMP)
      - its width is below `min_w` or its height is below `min_h`

    Returns a tuple `(checked, removed, kept)` of file counts.
    """
    checked = 0
    removed = 0
    kept = 0

    for dirpath, _dirnames, filenames in os.walk(folder):
        for filename in filenames:
            candidate = Path(dirpath) / filename
            checked += 1
            try:
                with Image.open(candidate) as probe:
                    probe.verify()
                    detected = probe.format

                reject = detected not in TF_SUPPORTED_FORMATS
                if not reject:
                    # The image object must not be used after verify(), so the
                    # file is opened a second time to read its dimensions.
                    with Image.open(candidate) as reopened:
                        width, height = reopened.size
                    reject = width < min_w or height < min_h

                if reject:
                    candidate.unlink(missing_ok=True)
                    removed += 1
                else:
                    kept += 1
            except Exception:
                # Not an image, corrupted, or unreadable: drop it.
                candidate.unlink(missing_ok=True)
                removed += 1

    return checked, removed, kept

def download_images_for_class_tf_ready(
    class_name: str,
    n_images: int,
    target_root: str,
    min_size=(224, 224),
    bing_filters=None,
    buffer_factor: float = 2.0,
    max_rounds: int = 3,
):
    """
    Downloads images for `class_name` into target_root/class_name
    then cleans the folder to keep only TF-supported formats and min resolution.

    Args:
      class_name: search keyword; also the name of the sub-folder that is created.
      n_images: number of valid images wanted in the folder.
      target_root: parent directory of the class folder.
      min_size: (min_width, min_height); smaller images are deleted after download.
      bing_filters: filters passed to the Bing crawler; defaults to large photos.
      buffer_factor: over-request multiplier applied to the number still missing.
      max_rounds: maximum number of crawl + clean rounds.

    Returns:
      The class folder path as a string.

    Notes:
      - Because search results include webp/avif/svg/corrupt files, we download extra
        (buffer_factor) and then delete the bad ones.
      - We optionally retry a few rounds until we have >= n_images valid images.
      - Each round continues from where the previous one stopped in the search
        results (`offset`) and numbers new files after the existing ones
        (`file_idx_offset="auto"`). Without this, every round would ask for the
        same results under the same file names, so retries could not add images.
      - If the folder already holds images when the function is called, the
        starting offset is only a heuristic, so some duplicates of those images
        may still be downloaded.
      - Files already in the folder are counted without a size check; the
        min-size cleanup only runs after a crawl round.
    """
    save_dir = Path(target_root) / class_name
    save_dir.mkdir(parents=True, exist_ok=True)

    min_w, min_h = min_size
    filters = bing_filters or {"type": "photo", "size": "large"}  # size is a hint, not a guarantee

    current_valid = _count_supported_images(save_dir)

    # Position in the search results where the next round starts. Images that
    # are already present presumably came from the first results of an earlier
    # run, so skip that many (heuristic; it is 0 for an empty folder).
    search_offset = current_valid

    for round_idx in range(1, max_rounds + 1):
        if current_valid >= n_images:
            break

        need = n_images - current_valid
        to_download = max(int(need * buffer_factor), need)

        crawler = BingImageCrawler(storage={"root_dir": str(save_dir)})
        crawler.crawl(
            keyword=class_name,
            filters=filters,
            offset=search_offset,
            max_num=to_download,
            file_idx_offset="auto",
        )
        search_offset += to_download

        checked, removed, kept = _clean_folder_tf_supported_and_min_size(save_dir, min_w, min_h)
        current_valid = _count_supported_images(save_dir)

        print(
            f"[{class_name}] round {round_idx}/{max_rounds} | "
            f"downloaded≈{to_download} | checked={checked}, removed={removed}, kept={kept} | "
            f"valid_now={current_valid}/{n_images}"
        )

    # The folder has not changed since the last count, so reuse it.
    final_valid = current_valid
    if final_valid < n_images:
        print(f"Warning: only {final_valid} TF-ready images found for '{class_name}'. "
              f"Try increasing max_rounds/buffer_factor or tweak search term/filters.")
    else:
        print(f"Done: {final_valid} TF-ready images for '{class_name}' in {save_dir}")

    return str(save_dir)


In [14]:
# Both classes are downloaded with the same settings; only the keyword differs.
_valid_split_kwargs = dict(
    n_images=100,
    target_root="../data/pizza_steak/valid",
    min_size=(224, 224),
)

download_images_for_class_tf_ready(class_name="pizza", **_valid_split_kwargs)

# Last expression of the cell: its return value (the steak folder) is displayed.
download_images_for_class_tf_ready(class_name="steak", **_valid_split_kwargs)


[pizza] round 1/3 | downloaded≈200 | checked=164, removed=2, kept=162 | valid_now=162/100
Done: 162 TF-ready images for 'pizza' in ../data/pizza_steak/valid/pizza
[steak] round 1/3 | downloaded≈200 | checked=179, removed=0, kept=179 | valid_now=179/100
Done: 179 TF-ready images for 'steak' in ../data/pizza_steak/valid/steak


'../data/pizza_steak/valid/steak'