In [1]:
import os
import time
import urllib.request
import hashlib
from PIL import Image, UnidentifiedImageError

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# Collect the hashes of images that already exist on disk
def get_existing_image_hashes(directory):
    """Return the SHA-256 hex digests of the regular files in *directory*.

    Only the immediate entries of *directory* are examined; subdirectories
    are skipped. Files that cannot be opened or read are skipped as well, so
    the scan is best-effort.

    Args:
        directory: Path of the folder to scan.

    Returns:
        A set of hex digest strings, one per distinct file content.

    Raises:
        OSError: If *directory* itself cannot be listed (for example, it
            does not exist).
    """
    existing_hashes = set()
    for fname in os.listdir(directory):
        fpath = os.path.join(directory, fname)
        if not os.path.isfile(fpath):
            continue
        digest = hashlib.sha256()
        try:
            with open(fpath, "rb") as f:
                # Hash in 1 MiB chunks so large files are never fully in memory.
                for chunk in iter(lambda: f.read(1 << 20), b""):
                    digest.update(chunk)
        except OSError:
            # Unreadable file: ignore it rather than abort the whole scan.
            continue
        existing_hashes.add(digest.hexdigest())
    return existing_hashes

# Image crawling function
def crawl_images(keyword, save_dir, count=500, delay=1.0):
    """Download up to *count* new Google Images results for *keyword*.

    Opens Chrome through Selenium, loads the Google image search page for
    *keyword*, scrolls to the bottom 30 times so more results load, and then
    downloads the ``src`` (or ``data-src``) of every ``<img>`` element found.
    A download is kept only when its SHA-256 digest is not already present
    in *save_dir*; kept files are named
    ``<keyword with spaces as underscores>_<index>.jpg``.

    Args:
        keyword: Search phrase sent to Google Images.
        save_dir: Folder that receives the images; created if missing.
        count: Maximum number of new images to save.
        delay: Seconds to sleep after each scroll.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        Exception: Errors from starting or driving the browser propagate
            (for example, if no ``<img>`` element appears within the
            10-second wait). Per-image download errors are printed and
            skipped. The browser is closed in either case.
    """
    # Local import so this function does not depend on edits to the file's
    # import block.
    from urllib.parse import quote_plus

    os.makedirs(save_dir, exist_ok=True)

    existing_hashes = get_existing_image_hashes(save_dir)

    options = Options()
    # options.add_argument("--headless")  # uncomment to run without a window
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.set_window_size(1920, 1080)

        # quote_plus encodes spaces as '+' (as before) and also escapes
        # characters such as '&', '#' or non-ASCII text in the keyword.
        search_url = f"https://www.google.com/search?q={quote_plus(keyword)}&tbm=isch"
        driver.get(search_url)
        time.sleep(2)

        for _ in range(30):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(delay)
            try:
                # NOTE(review): "mye4qd" is presumably the "show more results"
                # button; confirm against the current page markup.
                driver.find_element(By.CLASS_NAME, "mye4qd").click()
            except Exception:
                # Button absent or not clickable on this pass: keep scrolling.
                pass

        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "img")))
        time.sleep(2)
        images = driver.find_elements(By.CSS_SELECTOR, "img")
        print(f"🔍 Found {len(images)} image elements for '{keyword}'")

        downloaded = 0
        seen_src = set()
        file_prefix = keyword.replace(' ', '_')
        # Start numbering after the files already present; the loop below
        # advances past any index that is still taken.
        next_index = len(os.listdir(save_dir))

        for img in images:
            if downloaded >= count:
                break
            temp_path = os.path.join(save_dir, f"temp_{downloaded}.jpg")
            try:
                src = img.get_attribute("src") or img.get_attribute("data-src")
                if not src or not src.startswith(("http://", "https://")):
                    continue
                if src in seen_src:
                    continue
                seen_src.add(src)
                urllib.request.urlretrieve(src, temp_path)

                with open(temp_path, "rb") as f:
                    hash_val = hashlib.sha256(f.read()).hexdigest()

                if hash_val in existing_hashes:
                    continue

                # Never overwrite an existing image: skip indices already in
                # use (possible when earlier files were deleted).
                filename = os.path.join(save_dir, f"{file_prefix}_{next_index}.jpg")
                while os.path.exists(filename):
                    next_index += 1
                    filename = os.path.join(save_dir, f"{file_prefix}_{next_index}.jpg")

                os.rename(temp_path, filename)
                next_index += 1
                existing_hashes.add(hash_val)
                downloaded += 1
                print(f"✅ Downloaded new {downloaded}/{count}")
            except Exception as e:
                print(f"⚠️ Error: {e}")
            finally:
                # Remove the temp file if it is still there (duplicate or
                # failed download) so it does not pollute the dataset folder.
                try:
                    os.remove(temp_path)
                except OSError:
                    pass

        print(f"🎉 Done: Added {downloaded} new images for '{keyword}'")
    finally:
        # Always close the browser, even when the crawl fails midway.
        driver.quit()

# 📍 Crawl targets: dataset label (used as the folder name) -> search phrase
activity_map = {
    "brushing_teeth": "person brushing teeth",
    "drinking": "person drinking water",
    "eating": "person eating food",
    "typing": "person typing on laptop",
    "sleeping": "person sleeping in bed",
    "reading": "person reading book",
    "washing_face": "person washing face",
    "walking": "person walking outside",
}

# 🔁 Run the crawl once per label, in the order listed above
for label in activity_map:
    keyword = activity_map[label]
    folder = "./images/" + label
    print(f"\n🖼️ Start Crawling: '{keyword}' → 📁 {folder}")
    crawl_images(keyword, folder, count=500, delay=1.2)


🖼️ Start Crawling: 'person brushing teeth' → 📁 ./images/brushing_teeth
🔍 Found 1440 image elements for 'person brushing teeth'
✅ Downloaded new 1/500
✅ Downloaded new 2/500
✅ Downloaded new 3/500
✅ Downloaded new 4/500
✅ Downloaded new 5/500
✅ Downloaded new 6/500
✅ Downloaded new 7/500
✅ Downloaded new 8/500
✅ Downloaded new 9/500
✅ Downloaded new 10/500
✅ Downloaded new 11/500
✅ Downloaded new 12/500
✅ Downloaded new 13/500
✅ Downloaded new 14/500
✅ Downloaded new 15/500
✅ Downloaded new 16/500
✅ Downloaded new 17/500
✅ Downloaded new 18/500
✅ Downloaded new 19/500
✅ Downloaded new 20/500
✅ Downloaded new 21/500
✅ Downloaded new 22/500
✅ Downloaded new 23/500
✅ Downloaded new 24/500
✅ Downloaded new 25/500
✅ Downloaded new 26/500
✅ Downloaded new 27/500
✅ Downloaded new 28/500
✅ Downloaded new 29/500
✅ Downloaded new 30/500
✅ Downloaded new 31/500
✅ Downloaded new 32/500
✅ Downloaded new 33/500
✅ Downloaded new 34/500
✅ Downloaded new 35/500
✅ Downloaded new 36/500
✅ Downloaded new 

✅ Downloaded new 324/500
✅ Downloaded new 325/500
✅ Downloaded new 326/500
⚠️ Error: HTTP Error 404: Not Found
✅ Downloaded new 327/500
✅ Downloaded new 328/500
✅ Downloaded new 329/500
✅ Downloaded new 330/500
✅ Downloaded new 331/500
✅ Downloaded new 332/500
✅ Downloaded new 333/500
✅ Downloaded new 334/500
✅ Downloaded new 335/500
✅ Downloaded new 336/500
✅ Downloaded new 337/500
✅ Downloaded new 338/500
✅ Downloaded new 339/500
✅ Downloaded new 340/500
✅ Downloaded new 341/500
✅ Downloaded new 342/500
✅ Downloaded new 343/500
✅ Downloaded new 344/500
✅ Downloaded new 345/500
✅ Downloaded new 346/500
✅ Downloaded new 347/500
✅ Downloaded new 348/500
✅ Downloaded new 349/500
✅ Downloaded new 350/500
✅ Downloaded new 351/500
✅ Downloaded new 352/500
✅ Downloaded new 353/500
✅ Downloaded new 354/500
✅ Downloaded new 355/500
✅ Downloaded new 356/500
✅ Downloaded new 357/500
✅ Downloaded new 358/500
✅ Downloaded new 359/500
✅ Downloaded new 360/500
✅ Downloaded new 361/500
✅ Downloaded n

✅ Downloaded new 146/500
✅ Downloaded new 147/500
✅ Downloaded new 148/500
✅ Downloaded new 149/500
✅ Downloaded new 150/500
✅ Downloaded new 151/500
✅ Downloaded new 152/500
✅ Downloaded new 153/500
✅ Downloaded new 154/500
✅ Downloaded new 155/500
✅ Downloaded new 156/500
✅ Downloaded new 157/500
✅ Downloaded new 158/500
✅ Downloaded new 159/500
✅ Downloaded new 160/500
✅ Downloaded new 161/500
✅ Downloaded new 162/500
✅ Downloaded new 163/500
✅ Downloaded new 164/500
✅ Downloaded new 165/500
✅ Downloaded new 166/500
✅ Downloaded new 167/500
✅ Downloaded new 168/500
✅ Downloaded new 169/500
✅ Downloaded new 170/500
✅ Downloaded new 171/500
✅ Downloaded new 172/500
✅ Downloaded new 173/500
✅ Downloaded new 174/500
⚠️ Error: HTTP Error 404: Not Found
✅ Downloaded new 175/500
✅ Downloaded new 176/500
✅ Downloaded new 177/500
✅ Downloaded new 178/500
✅ Downloaded new 179/500
✅ Downloaded new 180/500
✅ Downloaded new 181/500
✅ Downloaded new 182/500
✅ Downloaded new 183/500
✅ Downloaded n

✅ Downloaded new 471/500
✅ Downloaded new 472/500
✅ Downloaded new 473/500
✅ Downloaded new 474/500
✅ Downloaded new 475/500
✅ Downloaded new 476/500
✅ Downloaded new 477/500
✅ Downloaded new 478/500
✅ Downloaded new 479/500
✅ Downloaded new 480/500
✅ Downloaded new 481/500
✅ Downloaded new 482/500
✅ Downloaded new 483/500
✅ Downloaded new 484/500
✅ Downloaded new 485/500
✅ Downloaded new 486/500
✅ Downloaded new 487/500
✅ Downloaded new 488/500
✅ Downloaded new 489/500
✅ Downloaded new 490/500
✅ Downloaded new 491/500
✅ Downloaded new 492/500
✅ Downloaded new 493/500
✅ Downloaded new 494/500
✅ Downloaded new 495/500
✅ Downloaded new 496/500
✅ Downloaded new 497/500
✅ Downloaded new 498/500
✅ Downloaded new 499/500
✅ Downloaded new 500/500
🎉 Done: Added 500 new images for 'person drinking water'

🖼️ Start Crawling: 'person eating food' → 📁 ./images/eating
🔍 Found 2424 image elements for 'person eating food'
✅ Downloaded new 1/500
✅ Downloaded new 2/500
✅ Downloaded new 3/500
✅ Download

✅ Downloaded new 297/500
✅ Downloaded new 298/500
✅ Downloaded new 299/500
✅ Downloaded new 300/500
✅ Downloaded new 301/500
✅ Downloaded new 302/500
✅ Downloaded new 303/500
✅ Downloaded new 304/500
✅ Downloaded new 305/500
✅ Downloaded new 306/500
✅ Downloaded new 307/500
✅ Downloaded new 308/500
✅ Downloaded new 309/500
✅ Downloaded new 310/500
✅ Downloaded new 311/500
✅ Downloaded new 312/500
✅ Downloaded new 313/500
✅ Downloaded new 314/500
✅ Downloaded new 315/500
✅ Downloaded new 316/500
✅ Downloaded new 317/500
✅ Downloaded new 318/500
✅ Downloaded new 319/500
✅ Downloaded new 320/500
✅ Downloaded new 321/500
✅ Downloaded new 322/500
✅ Downloaded new 323/500
✅ Downloaded new 324/500
✅ Downloaded new 325/500
✅ Downloaded new 326/500
✅ Downloaded new 327/500
✅ Downloaded new 328/500
✅ Downloaded new 329/500
✅ Downloaded new 330/500
✅ Downloaded new 331/500
✅ Downloaded new 332/500
⚠️ Error: HTTP Error 404: Not Found
✅ Downloaded new 333/500
✅ Downloaded new 334/500
✅ Downloaded n

✅ Downloaded new 118/500
✅ Downloaded new 119/500
✅ Downloaded new 120/500
✅ Downloaded new 121/500
✅ Downloaded new 122/500
✅ Downloaded new 123/500
✅ Downloaded new 124/500
✅ Downloaded new 125/500
✅ Downloaded new 126/500
✅ Downloaded new 127/500
✅ Downloaded new 128/500
✅ Downloaded new 129/500
✅ Downloaded new 130/500
✅ Downloaded new 131/500
✅ Downloaded new 132/500
✅ Downloaded new 133/500
✅ Downloaded new 134/500
✅ Downloaded new 135/500
✅ Downloaded new 136/500
✅ Downloaded new 137/500
✅ Downloaded new 138/500
✅ Downloaded new 139/500
✅ Downloaded new 140/500
✅ Downloaded new 141/500
✅ Downloaded new 142/500
✅ Downloaded new 143/500
✅ Downloaded new 144/500
✅ Downloaded new 145/500
✅ Downloaded new 146/500
✅ Downloaded new 147/500
✅ Downloaded new 148/500
✅ Downloaded new 149/500
✅ Downloaded new 150/500
✅ Downloaded new 151/500
✅ Downloaded new 152/500
✅ Downloaded new 153/500
✅ Downloaded new 154/500
✅ Downloaded new 155/500
✅ Downloaded new 156/500
✅ Downloaded new 157/500


✅ Downloaded new 439/500
✅ Downloaded new 440/500
✅ Downloaded new 441/500
✅ Downloaded new 442/500
✅ Downloaded new 443/500
✅ Downloaded new 444/500
✅ Downloaded new 445/500
✅ Downloaded new 446/500
✅ Downloaded new 447/500
✅ Downloaded new 448/500
✅ Downloaded new 449/500
✅ Downloaded new 450/500
✅ Downloaded new 451/500
✅ Downloaded new 452/500
✅ Downloaded new 453/500
✅ Downloaded new 454/500
✅ Downloaded new 455/500
✅ Downloaded new 456/500
✅ Downloaded new 457/500
✅ Downloaded new 458/500
✅ Downloaded new 459/500
✅ Downloaded new 460/500
✅ Downloaded new 461/500
✅ Downloaded new 462/500
✅ Downloaded new 463/500
✅ Downloaded new 464/500
✅ Downloaded new 465/500
✅ Downloaded new 466/500
✅ Downloaded new 467/500
✅ Downloaded new 468/500
✅ Downloaded new 469/500
✅ Downloaded new 470/500
✅ Downloaded new 471/500
✅ Downloaded new 472/500
✅ Downloaded new 473/500
✅ Downloaded new 474/500
✅ Downloaded new 475/500
✅ Downloaded new 476/500
✅ Downloaded new 477/500
✅ Downloaded new 478/500


✅ Downloaded new 264/500
✅ Downloaded new 265/500
✅ Downloaded new 266/500
✅ Downloaded new 267/500
✅ Downloaded new 268/500
✅ Downloaded new 269/500
✅ Downloaded new 270/500
✅ Downloaded new 271/500
✅ Downloaded new 272/500
✅ Downloaded new 273/500
✅ Downloaded new 274/500
✅ Downloaded new 275/500
✅ Downloaded new 276/500
✅ Downloaded new 277/500
✅ Downloaded new 278/500
✅ Downloaded new 279/500
✅ Downloaded new 280/500
✅ Downloaded new 281/500
✅ Downloaded new 282/500
✅ Downloaded new 283/500
✅ Downloaded new 284/500
✅ Downloaded new 285/500
✅ Downloaded new 286/500
✅ Downloaded new 287/500
✅ Downloaded new 288/500
✅ Downloaded new 289/500
✅ Downloaded new 290/500
✅ Downloaded new 291/500
✅ Downloaded new 292/500
✅ Downloaded new 293/500
✅ Downloaded new 294/500
✅ Downloaded new 295/500
✅ Downloaded new 296/500
✅ Downloaded new 297/500
✅ Downloaded new 298/500
✅ Downloaded new 299/500
✅ Downloaded new 300/500
✅ Downloaded new 301/500
✅ Downloaded new 302/500
✅ Downloaded new 303/500


✅ Downloaded new 87/500
✅ Downloaded new 88/500
✅ Downloaded new 89/500
✅ Downloaded new 90/500
✅ Downloaded new 91/500
✅ Downloaded new 92/500
✅ Downloaded new 93/500
✅ Downloaded new 94/500
✅ Downloaded new 95/500
✅ Downloaded new 96/500
✅ Downloaded new 97/500
✅ Downloaded new 98/500
✅ Downloaded new 99/500
✅ Downloaded new 100/500
✅ Downloaded new 101/500
✅ Downloaded new 102/500
✅ Downloaded new 103/500
✅ Downloaded new 104/500
✅ Downloaded new 105/500
✅ Downloaded new 106/500
✅ Downloaded new 107/500
✅ Downloaded new 108/500
✅ Downloaded new 109/500
✅ Downloaded new 110/500
✅ Downloaded new 111/500
✅ Downloaded new 112/500
✅ Downloaded new 113/500
✅ Downloaded new 114/500
✅ Downloaded new 115/500
✅ Downloaded new 116/500
✅ Downloaded new 117/500
✅ Downloaded new 118/500
✅ Downloaded new 119/500
✅ Downloaded new 120/500
✅ Downloaded new 121/500
✅ Downloaded new 122/500
✅ Downloaded new 123/500
✅ Downloaded new 124/500
✅ Downloaded new 125/500
✅ Downloaded new 126/500
✅ Downloaded 

✅ Downloaded new 413/500
✅ Downloaded new 414/500
✅ Downloaded new 415/500
✅ Downloaded new 416/500
✅ Downloaded new 417/500
✅ Downloaded new 418/500
✅ Downloaded new 419/500
✅ Downloaded new 420/500
✅ Downloaded new 421/500
✅ Downloaded new 422/500
✅ Downloaded new 423/500
✅ Downloaded new 424/500
✅ Downloaded new 425/500
✅ Downloaded new 426/500
✅ Downloaded new 427/500
✅ Downloaded new 428/500
✅ Downloaded new 429/500
✅ Downloaded new 430/500
✅ Downloaded new 431/500
✅ Downloaded new 432/500
✅ Downloaded new 433/500
✅ Downloaded new 434/500
✅ Downloaded new 435/500
✅ Downloaded new 436/500
✅ Downloaded new 437/500
✅ Downloaded new 438/500
✅ Downloaded new 439/500
✅ Downloaded new 440/500
✅ Downloaded new 441/500
✅ Downloaded new 442/500
✅ Downloaded new 443/500
✅ Downloaded new 444/500
✅ Downloaded new 445/500
✅ Downloaded new 446/500
✅ Downloaded new 447/500
✅ Downloaded new 448/500
✅ Downloaded new 449/500
✅ Downloaded new 450/500
✅ Downloaded new 451/500
✅ Downloaded new 452/500


✅ Downloaded new 235/500
✅ Downloaded new 236/500
✅ Downloaded new 237/500
✅ Downloaded new 238/500
✅ Downloaded new 239/500
✅ Downloaded new 240/500
✅ Downloaded new 241/500
✅ Downloaded new 242/500
✅ Downloaded new 243/500
✅ Downloaded new 244/500
✅ Downloaded new 245/500
✅ Downloaded new 246/500
✅ Downloaded new 247/500
✅ Downloaded new 248/500
✅ Downloaded new 249/500
✅ Downloaded new 250/500
✅ Downloaded new 251/500
✅ Downloaded new 252/500
✅ Downloaded new 253/500
✅ Downloaded new 254/500
✅ Downloaded new 255/500
✅ Downloaded new 256/500
✅ Downloaded new 257/500
✅ Downloaded new 258/500
✅ Downloaded new 259/500
✅ Downloaded new 260/500
✅ Downloaded new 261/500
✅ Downloaded new 262/500
✅ Downloaded new 263/500
✅ Downloaded new 264/500
✅ Downloaded new 265/500
✅ Downloaded new 266/500
✅ Downloaded new 267/500
✅ Downloaded new 268/500
✅ Downloaded new 269/500
✅ Downloaded new 270/500
✅ Downloaded new 271/500
✅ Downloaded new 272/500
✅ Downloaded new 273/500
✅ Downloaded new 274/500


✅ Downloaded new 59/500
✅ Downloaded new 60/500
✅ Downloaded new 61/500
✅ Downloaded new 62/500
✅ Downloaded new 63/500
✅ Downloaded new 64/500
✅ Downloaded new 65/500
✅ Downloaded new 66/500
✅ Downloaded new 67/500
✅ Downloaded new 68/500
✅ Downloaded new 69/500
✅ Downloaded new 70/500
✅ Downloaded new 71/500
✅ Downloaded new 72/500
✅ Downloaded new 73/500
✅ Downloaded new 74/500
✅ Downloaded new 75/500
✅ Downloaded new 76/500
✅ Downloaded new 77/500
✅ Downloaded new 78/500
✅ Downloaded new 79/500
✅ Downloaded new 80/500
✅ Downloaded new 81/500
✅ Downloaded new 82/500
✅ Downloaded new 83/500
✅ Downloaded new 84/500
✅ Downloaded new 85/500
✅ Downloaded new 86/500
✅ Downloaded new 87/500
✅ Downloaded new 88/500
✅ Downloaded new 89/500
✅ Downloaded new 90/500
✅ Downloaded new 91/500
✅ Downloaded new 92/500
✅ Downloaded new 93/500
✅ Downloaded new 94/500
✅ Downloaded new 95/500
✅ Downloaded new 96/500
✅ Downloaded new 97/500
✅ Downloaded new 98/500
✅ Downloaded new 99/500
✅ Downloaded new