In [1]:
import os
import time
import urllib.request
from urllib.parse import quote_plus

from PIL import Image, UnidentifiedImageError
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager


In [2]:
def crawl_images(keyword, save_dir, count=500, delay=1.0):
    """Download up to ``count`` images from a Google Images search.

    Opens a Chrome window through Selenium, loads the image-search results
    page for ``keyword``, scrolls to the bottom 20 times, then downloads the
    ``src`` (or ``data-src``) URL of every ``<img>`` element on the page.
    Files are written to ``save_dir`` as ``<keyword>_<index>.jpg``; the
    ``.jpg`` extension is used regardless of the actual image format.

    Args:
        keyword: Search phrase. It is URL-encoded for the query string, and
            spaces are replaced with underscores for the file names.
        save_dir: Output directory; created if it does not exist.
        count: Maximum number of images to save.
        delay: Seconds to sleep after each scroll step.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Chrome configuration
    options = Options()
    # options.add_argument("--headless")  # uncomment to run without a visible window
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.set_window_size(1920, 1080)

        # Go straight to the image-search results page. quote_plus encodes
        # spaces as '+' (as before) and also escapes '&', '#', non-ASCII, etc.
        search_url = f"https://www.google.com/search?q={quote_plus(keyword)}&tbm=isch"
        driver.get(search_url)
        time.sleep(2)

        # Scroll repeatedly so that more results get loaded.
        for _ in range(20):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(delay)
            try:
                # Presumably the "show more results" button -- TODO confirm the
                # class name is still current. Clicking it is best-effort.
                driver.find_element(By.CLASS_NAME, "mye4qd").click()
            except WebDriverException:
                # Button missing or not clickable: keep scrolling. Unlike a
                # bare ``except``, this does not swallow KeyboardInterrupt.
                pass

        # Collect every <img> element on the page.
        try:
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "img")))
            time.sleep(2)
            images = driver.find_elements(By.CSS_SELECTOR, "img")
            print(f"🔍 Found {len(images)} image elements for '{keyword}'")
        except Exception as e:
            print(f"❌ Failed to locate image elements: {e}")
            images = []

        # Download each image directly from its src (no click-through).
        downloaded = 0
        seen_src = set()
        file_stem = keyword.replace(' ', '_')

        for img in images:
            if downloaded >= count:
                break
            try:
                src = img.get_attribute("src") or img.get_attribute("data-src")
                # Only real http(s) URLs; skips inline data: URIs and repeats.
                if not src or not src.startswith(("http://", "https://")) or src in seen_src:
                    continue
                seen_src.add(src)
                filename = os.path.join(save_dir, f"{file_stem}_{downloaded}.jpg")
                urllib.request.urlretrieve(src, filename)
                print(f"✅ Downloaded {downloaded + 1}/{count}")
                downloaded += 1
            except Exception as e:
                # A failed download does not advance the index, so the next
                # successful download reuses this file name.
                print(f"⚠️ Error downloading: {e}")
                continue

        print(f"🎉 Done: Collected {downloaded} images for '{keyword}'")
    finally:
        # Always shut the browser down, even if navigation or scraping raised.
        driver.quit()


In [3]:
# Class label (also used as the output sub-folder name) -> search phrase.
activity_map = dict(
    brushing_teeth="person brushing teeth",
    drinking="person drinking water",
    eating="person eating food",
    typing="person typing on laptop",
    sleeping="person sleeping in bed",
    reading="person reading book",
    washing_face="person washing face",
    walking="person walking outside",
)

# Crawl every activity into its own folder under ./images.
for label in activity_map:
    keyword = activity_map[label]
    folder = f"./images/{label}"
    print(f"\n🖼️ Start Crawling: '{keyword}' → 📁 {folder}")
    crawl_images(keyword, folder, count=500, delay=1.2)



🖼️ Start Crawling: 'person brushing teeth' → 📁 ./images/brushing_teeth
🔍 Found 1991 image elements for 'person brushing teeth'
✅ Downloaded 1/500
✅ Downloaded 2/500
✅ Downloaded 3/500
✅ Downloaded 4/500
✅ Downloaded 5/500
✅ Downloaded 6/500
✅ Downloaded 7/500
✅ Downloaded 8/500
✅ Downloaded 9/500
✅ Downloaded 10/500
✅ Downloaded 11/500
✅ Downloaded 12/500
✅ Downloaded 13/500
✅ Downloaded 14/500
✅ Downloaded 15/500
✅ Downloaded 16/500
✅ Downloaded 17/500
✅ Downloaded 18/500
✅ Downloaded 19/500
✅ Downloaded 20/500
✅ Downloaded 21/500
✅ Downloaded 22/500
✅ Downloaded 23/500
✅ Downloaded 24/500
✅ Downloaded 25/500
✅ Downloaded 26/500
✅ Downloaded 27/500
✅ Downloaded 28/500
✅ Downloaded 29/500
✅ Downloaded 30/500
✅ Downloaded 31/500
✅ Downloaded 32/500
✅ Downloaded 33/500
✅ Downloaded 34/500
✅ Downloaded 35/500
✅ Downloaded 36/500
✅ Downloaded 37/500
✅ Downloaded 38/500
✅ Downloaded 39/500
✅ Downloaded 40/500
✅ Downloaded 41/500
✅ Downloaded 42/500
✅ Downloaded 43/500
✅ Downloaded 44/500
✅

✅ Downloaded 381/500
✅ Downloaded 382/500
✅ Downloaded 383/500
✅ Downloaded 384/500
✅ Downloaded 385/500
✅ Downloaded 386/500
✅ Downloaded 387/500
✅ Downloaded 388/500
✅ Downloaded 389/500
✅ Downloaded 390/500
✅ Downloaded 391/500
✅ Downloaded 392/500
✅ Downloaded 393/500
✅ Downloaded 394/500
✅ Downloaded 395/500
✅ Downloaded 396/500
✅ Downloaded 397/500
✅ Downloaded 398/500
✅ Downloaded 399/500
✅ Downloaded 400/500
✅ Downloaded 401/500
✅ Downloaded 402/500
✅ Downloaded 403/500
✅ Downloaded 404/500
✅ Downloaded 405/500
✅ Downloaded 406/500
✅ Downloaded 407/500
✅ Downloaded 408/500
✅ Downloaded 409/500
✅ Downloaded 410/500
✅ Downloaded 411/500
✅ Downloaded 412/500
✅ Downloaded 413/500
✅ Downloaded 414/500
✅ Downloaded 415/500
✅ Downloaded 416/500
✅ Downloaded 417/500
✅ Downloaded 418/500
✅ Downloaded 419/500
✅ Downloaded 420/500
✅ Downloaded 421/500
✅ Downloaded 422/500
✅ Downloaded 423/500
✅ Downloaded 424/500
✅ Downloaded 425/500
✅ Downloaded 426/500
✅ Downloaded 427/500
✅ Downloaded 

✅ Downloaded 266/500
✅ Downloaded 267/500
✅ Downloaded 268/500
✅ Downloaded 269/500
✅ Downloaded 270/500
✅ Downloaded 271/500
✅ Downloaded 272/500
✅ Downloaded 273/500
✅ Downloaded 274/500
✅ Downloaded 275/500
✅ Downloaded 276/500
✅ Downloaded 277/500
✅ Downloaded 278/500
✅ Downloaded 279/500
✅ Downloaded 280/500
✅ Downloaded 281/500
✅ Downloaded 282/500
✅ Downloaded 283/500
✅ Downloaded 284/500
✅ Downloaded 285/500
✅ Downloaded 286/500
✅ Downloaded 287/500
✅ Downloaded 288/500
✅ Downloaded 289/500
✅ Downloaded 290/500
✅ Downloaded 291/500
⚠️ Error downloading: HTTP Error 404: Not Found
✅ Downloaded 292/500
✅ Downloaded 293/500
✅ Downloaded 294/500
✅ Downloaded 295/500
✅ Downloaded 296/500
✅ Downloaded 297/500
✅ Downloaded 298/500
✅ Downloaded 299/500
✅ Downloaded 300/500
✅ Downloaded 301/500
✅ Downloaded 302/500
✅ Downloaded 303/500
✅ Downloaded 304/500
✅ Downloaded 305/500
✅ Downloaded 306/500
✅ Downloaded 307/500
✅ Downloaded 308/500
✅ Downloaded 309/500
✅ Downloaded 310/500
✅ Downl

✅ Downloaded 151/500
✅ Downloaded 152/500
✅ Downloaded 153/500
✅ Downloaded 154/500
✅ Downloaded 155/500
✅ Downloaded 156/500
✅ Downloaded 157/500
✅ Downloaded 158/500
✅ Downloaded 159/500
✅ Downloaded 160/500
✅ Downloaded 161/500
✅ Downloaded 162/500
✅ Downloaded 163/500
✅ Downloaded 164/500
✅ Downloaded 165/500
✅ Downloaded 166/500
✅ Downloaded 167/500
✅ Downloaded 168/500
✅ Downloaded 169/500
✅ Downloaded 170/500
✅ Downloaded 171/500
✅ Downloaded 172/500
✅ Downloaded 173/500
✅ Downloaded 174/500
✅ Downloaded 175/500
✅ Downloaded 176/500
✅ Downloaded 177/500
✅ Downloaded 178/500
✅ Downloaded 179/500
✅ Downloaded 180/500
✅ Downloaded 181/500
✅ Downloaded 182/500
✅ Downloaded 183/500
✅ Downloaded 184/500
✅ Downloaded 185/500
✅ Downloaded 186/500
✅ Downloaded 187/500
✅ Downloaded 188/500
✅ Downloaded 189/500
✅ Downloaded 190/500
✅ Downloaded 191/500
✅ Downloaded 192/500
✅ Downloaded 193/500
✅ Downloaded 194/500
✅ Downloaded 195/500
✅ Downloaded 196/500
✅ Downloaded 197/500
✅ Downloaded 

✅ Downloaded 28/500
✅ Downloaded 29/500
✅ Downloaded 30/500
✅ Downloaded 31/500
✅ Downloaded 32/500
✅ Downloaded 33/500
✅ Downloaded 34/500
✅ Downloaded 35/500
✅ Downloaded 36/500
✅ Downloaded 37/500
✅ Downloaded 38/500
⚠️ Error downloading: HTTP Error 404: Not Found
✅ Downloaded 39/500
✅ Downloaded 40/500
✅ Downloaded 41/500
✅ Downloaded 42/500
✅ Downloaded 43/500
✅ Downloaded 44/500
✅ Downloaded 45/500
✅ Downloaded 46/500
✅ Downloaded 47/500
✅ Downloaded 48/500
✅ Downloaded 49/500
✅ Downloaded 50/500
✅ Downloaded 51/500
✅ Downloaded 52/500
✅ Downloaded 53/500
✅ Downloaded 54/500
✅ Downloaded 55/500
✅ Downloaded 56/500
✅ Downloaded 57/500
✅ Downloaded 58/500
✅ Downloaded 59/500
✅ Downloaded 60/500
✅ Downloaded 61/500
✅ Downloaded 62/500
✅ Downloaded 63/500
✅ Downloaded 64/500
✅ Downloaded 65/500
✅ Downloaded 66/500
✅ Downloaded 67/500
✅ Downloaded 68/500
✅ Downloaded 69/500
✅ Downloaded 70/500
✅ Downloaded 71/500
✅ Downloaded 72/500
✅ Downloaded 73/500
✅ Downloaded 74/500
✅ Downloaded

✅ Downloaded 415/500
✅ Downloaded 416/500
✅ Downloaded 417/500
✅ Downloaded 418/500
✅ Downloaded 419/500
✅ Downloaded 420/500
✅ Downloaded 421/500
✅ Downloaded 422/500
✅ Downloaded 423/500
✅ Downloaded 424/500
✅ Downloaded 425/500
✅ Downloaded 426/500
✅ Downloaded 427/500
✅ Downloaded 428/500
✅ Downloaded 429/500
✅ Downloaded 430/500
✅ Downloaded 431/500
✅ Downloaded 432/500
✅ Downloaded 433/500
✅ Downloaded 434/500
✅ Downloaded 435/500
✅ Downloaded 436/500
✅ Downloaded 437/500
✅ Downloaded 438/500
✅ Downloaded 439/500
✅ Downloaded 440/500
✅ Downloaded 441/500
✅ Downloaded 442/500
✅ Downloaded 443/500
✅ Downloaded 444/500
✅ Downloaded 445/500
✅ Downloaded 446/500
✅ Downloaded 447/500
✅ Downloaded 448/500
✅ Downloaded 449/500
✅ Downloaded 450/500
✅ Downloaded 451/500
✅ Downloaded 452/500
✅ Downloaded 453/500
✅ Downloaded 454/500
✅ Downloaded 455/500
✅ Downloaded 456/500
✅ Downloaded 457/500
✅ Downloaded 458/500
✅ Downloaded 459/500
✅ Downloaded 460/500
✅ Downloaded 461/500
✅ Downloaded 

✅ Downloaded 300/500
✅ Downloaded 301/500
✅ Downloaded 302/500
✅ Downloaded 303/500
✅ Downloaded 304/500
✅ Downloaded 305/500
✅ Downloaded 306/500
✅ Downloaded 307/500
✅ Downloaded 308/500
✅ Downloaded 309/500
✅ Downloaded 310/500
✅ Downloaded 311/500
✅ Downloaded 312/500
✅ Downloaded 313/500
✅ Downloaded 314/500
✅ Downloaded 315/500
✅ Downloaded 316/500
✅ Downloaded 317/500
✅ Downloaded 318/500
✅ Downloaded 319/500
✅ Downloaded 320/500
✅ Downloaded 321/500
✅ Downloaded 322/500
✅ Downloaded 323/500
✅ Downloaded 324/500
✅ Downloaded 325/500
✅ Downloaded 326/500
✅ Downloaded 327/500
✅ Downloaded 328/500
✅ Downloaded 329/500
✅ Downloaded 330/500
✅ Downloaded 331/500
✅ Downloaded 332/500
✅ Downloaded 333/500
✅ Downloaded 334/500
✅ Downloaded 335/500
✅ Downloaded 336/500
✅ Downloaded 337/500
✅ Downloaded 338/500
✅ Downloaded 339/500
✅ Downloaded 340/500
✅ Downloaded 341/500
✅ Downloaded 342/500
✅ Downloaded 343/500
✅ Downloaded 344/500
✅ Downloaded 345/500
✅ Downloaded 346/500
✅ Downloaded 

✅ Downloaded 185/500
✅ Downloaded 186/500
✅ Downloaded 187/500
✅ Downloaded 188/500
✅ Downloaded 189/500
✅ Downloaded 190/500
✅ Downloaded 191/500
✅ Downloaded 192/500
✅ Downloaded 193/500
✅ Downloaded 194/500
✅ Downloaded 195/500
✅ Downloaded 196/500
✅ Downloaded 197/500
✅ Downloaded 198/500
✅ Downloaded 199/500
✅ Downloaded 200/500
✅ Downloaded 201/500
✅ Downloaded 202/500
✅ Downloaded 203/500
✅ Downloaded 204/500
✅ Downloaded 205/500
✅ Downloaded 206/500
✅ Downloaded 207/500
✅ Downloaded 208/500
✅ Downloaded 209/500
✅ Downloaded 210/500
✅ Downloaded 211/500
✅ Downloaded 212/500
✅ Downloaded 213/500
✅ Downloaded 214/500
✅ Downloaded 215/500
⚠️ Error downloading: HTTP Error 404: Not Found
✅ Downloaded 216/500
✅ Downloaded 217/500
✅ Downloaded 218/500
✅ Downloaded 219/500
✅ Downloaded 220/500
✅ Downloaded 221/500
✅ Downloaded 222/500
✅ Downloaded 223/500
✅ Downloaded 224/500
✅ Downloaded 225/500
✅ Downloaded 226/500
✅ Downloaded 227/500
✅ Downloaded 228/500
✅ Downloaded 229/500
✅ Downl

✅ Downloaded 67/500
✅ Downloaded 68/500
✅ Downloaded 69/500
✅ Downloaded 70/500
✅ Downloaded 71/500
✅ Downloaded 72/500
✅ Downloaded 73/500
✅ Downloaded 74/500
✅ Downloaded 75/500
✅ Downloaded 76/500
✅ Downloaded 77/500
✅ Downloaded 78/500
✅ Downloaded 79/500
✅ Downloaded 80/500
✅ Downloaded 81/500
✅ Downloaded 82/500
✅ Downloaded 83/500
✅ Downloaded 84/500
✅ Downloaded 85/500
✅ Downloaded 86/500
✅ Downloaded 87/500
✅ Downloaded 88/500
✅ Downloaded 89/500
✅ Downloaded 90/500
✅ Downloaded 91/500
✅ Downloaded 92/500
✅ Downloaded 93/500
✅ Downloaded 94/500
✅ Downloaded 95/500
✅ Downloaded 96/500
✅ Downloaded 97/500
✅ Downloaded 98/500
✅ Downloaded 99/500
✅ Downloaded 100/500
✅ Downloaded 101/500
✅ Downloaded 102/500
✅ Downloaded 103/500
✅ Downloaded 104/500
✅ Downloaded 105/500
✅ Downloaded 106/500
✅ Downloaded 107/500
✅ Downloaded 108/500
✅ Downloaded 109/500
✅ Downloaded 110/500
✅ Downloaded 111/500
✅ Downloaded 112/500
✅ Downloaded 113/500
✅ Downloaded 114/500
✅ Downloaded 115/500
✅ Do

✅ Downloaded 459/500
✅ Downloaded 460/500
✅ Downloaded 461/500
✅ Downloaded 462/500
✅ Downloaded 463/500
✅ Downloaded 464/500
✅ Downloaded 465/500
✅ Downloaded 466/500
✅ Downloaded 467/500
✅ Downloaded 468/500
✅ Downloaded 469/500
✅ Downloaded 470/500
✅ Downloaded 471/500
✅ Downloaded 472/500
✅ Downloaded 473/500
✅ Downloaded 474/500
✅ Downloaded 475/500
✅ Downloaded 476/500
✅ Downloaded 477/500
✅ Downloaded 478/500
✅ Downloaded 479/500
✅ Downloaded 480/500
✅ Downloaded 481/500
✅ Downloaded 482/500
✅ Downloaded 483/500
✅ Downloaded 484/500
✅ Downloaded 485/500
✅ Downloaded 486/500
✅ Downloaded 487/500
✅ Downloaded 488/500
✅ Downloaded 489/500
✅ Downloaded 490/500
✅ Downloaded 491/500
✅ Downloaded 492/500
✅ Downloaded 493/500
✅ Downloaded 494/500
✅ Downloaded 495/500
✅ Downloaded 496/500
✅ Downloaded 497/500
✅ Downloaded 498/500
✅ Downloaded 499/500
✅ Downloaded 500/500
🎉 Done: Collected 500 images for 'person washing face'

🖼️ Start Crawling: 'person walking outside' → 📁 ./images/walki

In [6]:
def clean_corrupt_images(root_dir="./images"):
    """Delete image files under ``root_dir`` that Pillow cannot verify.

    Expects a ``root_dir/<class>/<file>`` layout. Every regular file directly
    inside each class folder is opened with ``Image.open`` and checked with
    ``Image.verify()``; files that fail are removed from disk. Non-directory
    entries in ``root_dir`` and sub-directories inside a class folder are
    left untouched.

    Args:
        root_dir: Directory whose immediate sub-directories hold the images.

    Returns:
        None. The number of removed files is printed.

    Raises:
        OSError: If ``root_dir`` cannot be listed or a file cannot be removed.
    """
    broken = []
    for cls in os.listdir(root_dir):
        cls_path = os.path.join(root_dir, cls)
        if not os.path.isdir(cls_path):
            continue
        for fname in os.listdir(cls_path):
            fpath = os.path.join(cls_path, fname)
            # Skip nested directories: Image.open would fail on them with an
            # OSError and os.remove cannot delete a directory.
            if not os.path.isfile(fpath):
                continue
            try:
                with Image.open(fpath) as img:
                    img.verify()
            except (UnidentifiedImageError, OSError, SyntaxError):
                # verify() may raise SyntaxError for broken files (e.g. PNG),
                # in addition to the OSError family.
                broken.append(fpath)
                os.remove(fpath)
    print(f"🧹 Removed {len(broken)} broken images")

# Post-crawl clean-up: drop every downloaded file that fails verification.
clean_corrupt_images(root_dir="./images")

🧹 Removed 8 broken images
