In [1]:
import hashlib
import io
import os
import time
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
from PIL import Image
from selenium import webdriver

In [2]:
def fetch_image_urls(query:str, max_links_to_fetch:int, wd:'webdriver.Remote', sleep_between_interactions:float=1, load_more_wait:float=30):
    """Collect image URLs from a Google Images search driven through Selenium.

    The function loads the search page for ``query``, clicks each thumbnail
    (``img.Q4LuWd``) and harvests the ``src`` of the images that appear
    afterwards (``img.n3VNCb``). When a pass over the thumbnails does not
    yield enough links it waits, clicks the ``.mye4qd`` element if present,
    and scrolls again.

    Args:
        query: Search term; it is URL-encoded before being placed in the URL.
        max_links_to_fetch: Upper bound on the number of URLs returned.
        wd: An already started Selenium driver. The annotation is a string so
            it is never evaluated at definition time.
        sleep_between_interactions: Seconds to pause after each scroll/click.
        load_more_wait: Seconds to wait before asking for more results
            (previously a hard-coded 30).

    Returns:
        A set with at most ``max_links_to_fetch`` URLs. It may contain fewer
        when the result page stops producing new thumbnails.
    """
    def scroll_to_end(wd):
        wd.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(sleep_between_interactions)

    # Build the Google query. The term is URL-encoded so that characters such
    # as '&', '#' or '+' cannot corrupt the query string.
    search_url = 'https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img'

    # Load the page
    wd.get(search_url.format(q=quote_plus(query)))

    image_urls = set()
    results_start = 0
    # Consecutive passes that produced no new thumbnails. Without this guard
    # the loop would spin forever once the results are exhausted.
    stalled_passes = 0
    max_stalled_passes = 2
    while len(image_urls) < max_links_to_fetch:
        scroll_to_end(wd)

        # Get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector('img.Q4LuWd')
        number_results = len(thumbnail_results)

        print(f'Found: {number_results} search results. Extracting links from {results_start}:{number_results}')

        # Only thumbnails that were not processed in an earlier pass
        new_thumbnails = thumbnail_results[results_start:number_results]
        for img in new_thumbnails:
            # Try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                # Best effort: a thumbnail that cannot be clicked is skipped
                continue

            # Extract image urls (read `src` once per element instead of three times)
            for actual_image in wd.find_elements_by_css_selector('img.n3VNCb'):
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)
                    # Never collect more than the caller asked for
                    if len(image_urls) >= max_links_to_fetch:
                        break

            if len(image_urls) >= max_links_to_fetch:
                print(f'Found: {len(image_urls)} image links, done!')
                break

        else:
            print('Found:', len(image_urls), 'image links, looking for more ...')

            stalled_passes = 0 if new_thumbnails else stalled_passes + 1
            if stalled_passes >= max_stalled_passes:
                print(f'No new search results after {stalled_passes} attempts, stopping with {len(image_urls)} image links.')
                break

            time.sleep(load_more_wait)

            # The plural lookup returns an empty list instead of raising
            # NoSuchElementException when the button is missing.
            if wd.find_elements_by_css_selector('.mye4qd'):
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # Move the result startpoint further down
        results_start = number_results

    return image_urls

In [3]:
def persist_image(folder_path:str, url:str, timeout:float=10):
    """Download one image and store it as a JPEG inside ``folder_path``.

    The file name is the first 10 hex characters of the SHA-1 digest of the
    downloaded bytes plus ``.jpg``, so identical content maps to the same file.
    Failures are reported on stdout instead of being raised, so one bad URL
    does not abort a batch.

    Args:
        folder_path: Existing directory the image is written to.
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block forever.

    Returns:
        None.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error pages as download failures instead of trying to
        # decode their body as an image.
        response.raise_for_status()
        image_content = response.content

    except Exception as e:
        print(f'ERROR - Could not download {url} - {e}')
        # Nothing to save. Falling through would hit an unbound
        # `image_content` and print a second, misleading error.
        return

    try:
        image_file = io.BytesIO(image_content)
        # Convert to RGB because JPEG cannot store alpha/palette modes
        image = Image.open(image_file).convert('RGB')
        file_path = os.path.join(folder_path, hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')

        with open(file_path, 'wb') as f:
            image.save(f, 'JPEG', quality=85)

        print(f'SUCCESS - saved {url} - as {file_path}')

    except Exception as e:
        print(f'ERROR - Could not save {url} - {e}')

In [4]:
def search_and_download(search_term:str, driver_path:str, target_path:str, number_images:int):
    """Search for images of ``search_term`` and save them below ``target_path``.

    A sub-folder named after the lower-cased search term (spaces replaced by
    underscores) is created when it does not exist yet. A Chrome driver started
    from ``driver_path`` is used to gather up to ``number_images`` links via
    ``fetch_image_urls``; the browser is closed before the downloads start and
    every link is then handed to ``persist_image``.
    """
    # 'Owen Grady' -> 'owen_grady'
    folder_name = search_term.lower().replace(' ', '_')
    output_dir = os.path.join(target_path, folder_name)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # The context manager shuts the browser down once the links are collected
    with webdriver.Chrome(executable_path=driver_path) as browser:
        image_links = fetch_image_urls(
            search_term,
            number_images,
            wd=browser,
            sleep_between_interactions=0.5,
        )

    for link in image_links:
        persist_image(output_dir, link)

In [13]:
# What to look for, how many images to collect, and where to store them
search_term = 'owen grady'
number_images = 50
target_path = './dataset'

# Chrome driver executable handed to Selenium
driver_path = 'chromedriver'

search_and_download(
    search_term=search_term,
    driver_path=driver_path,
    target_path=target_path,
    number_images=number_images,
)

Found: 100 search results. Extracting links from 0:100
Found: 50 image links, done!
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQiHMJvo_wtDDfzukWsyUEkmXZf-Maj5liZuQ&usqp=CAU - as ./dataset\owen_grady\b35c146280.jpg
SUCCESS - saved https://pbs.twimg.com/profile_images/611589683986849792/lN6tQUSW_400x400.jpg - as ./dataset\owen_grady\3ff828bfaa.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQyNTPfTtm1ZO1frllJIDv9A_xbPqQPY-7tIg&usqp=CAU - as ./dataset\owen_grady\6bfb1c0b62.jpg
SUCCESS - saved https://i.pinimg.com/originals/6d/89/8c/6d898c67aa4b914263f0de418fa99f78.png - as ./dataset\owen_grady\84ac82bfbf.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSLeSX3yJgIodGmZJjcgCb1B16WdJNHWBpWpg&usqp=CAU - as ./dataset\owen_grady\5b0b38cbd6.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSa6-AiiEj8o0uhEfwibwIryLZf1DDJX8Tx7A&usqp=CAU - as ./dataset\owen_grady\d41542d5a7.jpg
SUCCESS - saved http