🔧 1. Import & Setup

In [1]:
# Required libraries
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
import urllib.request


🕷️ 2. Define the Crawler

In [None]:
def crawl_images(keyword, save_dir, count=200, delay=1.0):
    """
    Download images from Google Image Search using Selenium.

    Launches a headless Chrome session, searches for ``keyword``, scrolls the
    results page until it stops growing, then clicks the thumbnails one by one
    and saves the first enlarged image whose ``src`` is an http(s) URL.
    Files are written as ``<save_dir>/<keyword>_<index>.jpg`` (the ``.jpg``
    extension is used regardless of the actual image format).

    The CSS selectors used below (``mye4qd``, ``img.rg_i``, ``img.n3VNCb``)
    are hard-coded; if the page does not contain matching elements, nothing
    is downloaded and the function returns quietly.

    Parameters:
    - keyword: Search term (also used as the file-name prefix)
    - save_dir: Directory to save images (created if it does not exist)
    - count: Maximum number of images to download
    - delay: Seconds to wait after each scroll and after each thumbnail click
    """
    # Imported locally so this function does not depend on edits to the
    # module-level import cell.
    from selenium.common.exceptions import WebDriverException

    # Upper bound on consecutive scroll rounds in which the page height does
    # not change; prevents an endless loop when the "show more" button keeps
    # accepting clicks but no new results are loaded.
    max_stalled_rounds = 3

    os.makedirs(save_dir, exist_ok=True)

    # Headless browser setup
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    # Launch Chrome driver
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=options
    )
    try:
        driver.get("https://www.google.com/imghp?hl=en")

        # Search the keyword
        search_box = driver.find_element(By.NAME, "q")
        search_box.send_keys(keyword)
        search_box.send_keys(Keys.RETURN)

        # Scroll to load images
        stalled_rounds = 0
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(delay)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                stalled_rounds += 1
                if stalled_rounds > max_stalled_rounds:
                    break
                try:
                    # Page stopped growing: try the "show more results" button.
                    driver.find_element(By.CLASS_NAME, "mye4qd").click()
                except WebDriverException:
                    # Button missing or not clickable: nothing more to load.
                    break
            else:
                stalled_rounds = 0
            last_height = new_height

        # Find image elements
        thumbnails = driver.find_elements(By.CSS_SELECTOR, "img.rg_i")

        # Download images
        downloaded = 0
        for thumbnail in thumbnails:
            if downloaded >= count:
                break
            try:
                thumbnail.click()
                time.sleep(delay)
                for full_img in driver.find_elements(By.CSS_SELECTOR, "img.n3VNCb"):
                    src = full_img.get_attribute("src")
                    # Only fetch real http(s) URLs; skip inline data: URIs.
                    if not src or not src.startswith("http"):
                        continue
                    filename = os.path.join(save_dir, f"{keyword}_{downloaded}.jpg")
                    try:
                        urllib.request.urlretrieve(src, filename)
                    except Exception:
                        # Best effort: any download failure just moves on to
                        # the next candidate. Unlike a bare ``except``, this
                        # still lets KeyboardInterrupt/SystemExit propagate.
                        continue
                    downloaded += 1
                    break
            except WebDriverException:
                # Stale, hidden or intercepted thumbnail: skip it.
                continue
    finally:
        # Always shut the browser down, even if the crawl fails part-way,
        # so no headless Chrome process is left running.
        driver.quit()

🚀 3. Start Crawling by Category

In [None]:
# List of categories (modify as needed).
# Each entry is a search term of the form "<group> <item name>".
categories = [
    "fruit apple", "fruit banana", "fruit kiwi", "fruit pineapple",
    "vegetable carrot", "vegetable cabbage", "vegetable capsicum", "vegetable corn",
    "vegetable cauliflower", "vegetable beetroot", "vegetable bell pepper", "vegetable chili pepper"
]

# Start crawling for each category.
# The folder is named after the item, i.e. everything after the leading group
# word, so multi-word items such as "bell pepper" and "chili pepper" get their
# own folders ("bell_pepper", "chili_pepper") instead of both collapsing into
# "pepper". Single-word entries fall back to the entry itself.
for category in categories:
    folder_name = "_".join(category.split()[1:]) or category
    crawl_images(keyword=category, save_dir=f"./images/{folder_name}", count=200)