In [4]:
import requests
import hashlib
import time
import io
import os

from bs4 import BeautifulSoup
from PIL import Image
from selenium import webdriver

In [5]:
def fetch_image_urls(query:str, max_links_to_fetch:int, wd:'webdriver.Remote', sleep_between_interactions:float=1,
                     load_more_wait:float=30, max_stalled_rounds:int=3):
    """Collect full-size image URLs from a Google Images search.

    Loads the image search page for ``query`` in ``wd``, clicks each thumbnail
    and records the ``src`` of the image elements shown afterwards.

    Args:
        query: Search term; it is URL-encoded before being put in the query string.
        max_links_to_fetch: Stop as soon as at least this many distinct URLs are
            collected. The result may contain slightly more, because one click
            can reveal several images.
        wd: A Selenium WebDriver instance (already started by the caller).
        sleep_between_interactions: Seconds to pause after scrolling and after
            each thumbnail click.
        load_more_wait: Seconds to wait before trying the "load more" button
            once all currently visible thumbnails have been processed.
        max_stalled_rounds: Give up after this many consecutive rounds in which
            scrolling produced no new thumbnails, instead of looping forever.

    Returns:
        A set of image URL strings (possibly fewer than requested if the
        results ran out).
    """
    # Imported locally so this cell does not depend on an extra top-level import.
    from urllib.parse import quote_plus

    # String value of selenium's By.CSS_SELECTOR. find_elements(by, value) is
    # available in Selenium 3 and 4, unlike the find_elements_by_* helpers,
    # which are no longer available in newer Selenium 4 releases.
    by_css = 'css selector'

    def scroll_to_end(wd):
        wd.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(sleep_between_interactions)

    # Build the google query
    search_url = 'https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img'

    # Load the page (encode the term so spaces, '&', '#', ... cannot break the URL)
    wd.get(search_url.format(q=quote_plus(query)))

    image_urls = set()
    results_start = 0
    stalled_rounds = 0
    while len(image_urls) < max_links_to_fetch:
        scroll_to_end(wd)

        # Get all image thumbnail results
        # NOTE(review): the CSS class names are specific to Google's markup and
        # presumably change over time -- confirm they still match.
        thumbnail_results = wd.find_elements(by_css, 'img.Q4LuWd')
        number_results = len(thumbnail_results)

        # Detect the "no more results" situation so the loop always terminates.
        if number_results <= results_start:
            stalled_rounds += 1
            if stalled_rounds >= max_stalled_rounds:
                print(f'Found: {len(image_urls)} image links, no more results available, stopping.')
                break
        else:
            stalled_rounds = 0

        print(f'Found: {number_results} search results. Extracting links from {results_start}:{number_results}')

        for img in thumbnail_results[results_start:number_results]:
            # Try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                # Best effort: a thumbnail that cannot be clicked is skipped.
                continue

            # Extract image urls
            for actual_image in wd.find_elements(by_css, 'img.n3VNCb'):
                src = actual_image.get_attribute('src')
                # Only real http(s) links; inline data: URIs are skipped.
                if src and src.startswith('http'):
                    image_urls.add(src)

            if len(image_urls) >= max_links_to_fetch:
                print(f'Found: {len(image_urls)} image links, done!')
                break

        else:
            # Every visible thumbnail was processed without reaching the target.
            print('Found:', len(image_urls), 'image links, looking for more ...')
            time.sleep(load_more_wait)

            # find_elements returns an empty list (rather than raising) when the
            # button is absent, so a missing button no longer discards the
            # URLs collected so far.
            if wd.find_elements(by_css, '.mye4qd'):
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # Move the result startpoint further down
        results_start = number_results

    return image_urls

In [6]:
def persist_image(folder_path:str, url:str):
    """Download the image at ``url`` and store it as a JPEG in ``folder_path``.

    The file name is the first 10 hex characters of the SHA-1 of the downloaded
    bytes plus ``.jpg``, so identical downloads map to the same file. Errors are
    reported with ``print`` and never raised to the caller.

    Args:
        folder_path: Existing directory the image is written to.
        url: Address of the image to download.
    """
    try:
        # A timeout keeps one unresponsive host from blocking the whole run.
        response = requests.get(url, timeout=30)
        # Treat HTTP error pages as download failures instead of image data.
        response.raise_for_status()
        image_content = response.content

    except Exception as e:
        print(f'ERROR - Could not download {url} - {e}')
        # Nothing to save; without this return the save step would fail on the
        # unbound image_content and print a misleading second error.
        return

    try:
        image_file = io.BytesIO(image_content)
        # Convert to RGB so images with alpha/palette modes can be written as JPEG.
        image = Image.open(image_file).convert('RGB')
        file_path = os.path.join(folder_path, hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')

        with open(file_path, 'wb') as f:
            image.save(f, 'JPEG', quality=85)

        print(f'SUCCESS - saved {url} - as {file_path}')

    except Exception as e:
        print(f'ERROR - Could not save {url} - {e}')

In [7]:
def search_and_download(search_term:str, driver_path:str, target_path:str, number_images:int,
                        sleep_between_interactions:float=0.5):
    """Search for ``search_term`` and save the found images below ``target_path``.

    Images go into a sub-folder named after the lower-cased search term with
    spaces replaced by underscores; the folder is created if needed.

    Args:
        search_term: Text to search for.
        driver_path: Path to the chromedriver executable.
        target_path: Base directory for the per-term image folder.
        number_images: Number of image links to collect.
        sleep_between_interactions: Pause in seconds passed on to
            ``fetch_image_urls`` (defaults to the previously hard-coded 0.5).
    """
    target_folder = os.path.join(target_path, '_'.join(search_term.lower().split(' ')))

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(target_folder, exist_ok=True)

    # NOTE(review): `executable_path` is the Selenium 3 style of locating the
    # driver; newer Selenium 4 releases expect a Service object -- confirm the
    # installed version before upgrading.
    with webdriver.Chrome(executable_path=driver_path) as wd:
        res = fetch_image_urls(search_term, number_images, wd=wd,
                               sleep_between_interactions=sleep_between_interactions)

    # The browser is already closed here; downloads only need the collected URLs.
    for elem in res:
        persist_image(target_folder, elem)

In [8]:
# Run configuration: what to search for, how many images, and where to put them.
search_term = 'goat'
number_images = 5
target_path = './images'

# Name/path of the chromedriver executable handed to Selenium.
driver_path = 'chromedriver'

search_and_download(
    search_term=search_term,
    driver_path=driver_path,
    target_path=target_path,
    number_images=number_images,
)

Found: 100 search results. Extracting links from 0:100
Found: 5 image links, done!
SUCCESS - saved https://a0.muscache.com/pictures/lombard/MtTemplate-989763-media_library/original/45c28eeb-0380-4dff-8731-f560bbd09a4b.jpeg - as ./images\goat\f03dceda7f.jpg
SUCCESS - saved https://static.wikia.nocookie.net/jurassicworld-evolution/images/2/28/648350_screenshots_20190711103904_1.jpg/revision/latest/top-crop/width/360/height/450?cb=20200526042521 - as ./images\goat\046257d561.jpg
SUCCESS - saved https://images2.minutemediacdn.com/image/upload/c_crop,h_843,w_1500,x_0,y_10/v1555172614/shape/mentalfloss/iStock-177369626_1.jpg?itok=YfyNMOBR - as ./images\goat\a7217c52b2.jpg
SUCCESS - saved https://upload.wikimedia.org/wikipedia/commons/thumb/b/b2/Hausziege_04.jpg/1200px-Hausziege_04.jpg - as ./images\goat\e17fdad6c6.jpg
SUCCESS - saved https://i.guim.co.uk/img/media/583b1b167c0b51719de576054d524326e577b21f/0_206_4488_2693/master/4488.jpg?width=1200&height=1200&quality=85&auto=format&fit=crop&s