In [1]:
from imutils import paths
import pandas as pd
import argparse
import requests
import cv2
import os
import io
import time
from PIL import Image
import hashlib
from selenium import webdriver
from selenium.webdriver.chrome.options import Options 

In [2]:
def fetch_image_urls(query:str, max_links_to_fetch:int, wd:"webdriver.Chrome", sleep_between_interactions:float=1):
    """Collect full-size image URLs from a Google Images search.

    The search page is opened in ``wd``; every thumbnail is clicked and the
    ``src`` of each ``img.n3VNCb`` element shown afterwards is recorded when
    it contains ``http``.

    Args:
        query: Search phrase. It is URL-encoded before being placed in the
            search URL.
        max_links_to_fetch: Stop as soon as at least this many distinct URLs
            have been collected.
        wd: Selenium driver exposing the Selenium 3-style
            ``find_elements_by_css_selector`` API used below.
        sleep_between_interactions: Seconds to pause after each scroll/click.

    Returns:
        A set of URL strings. It may hold fewer than ``max_links_to_fetch``
        entries when the page stops yielding new thumbnails, and may hold a
        few more because one click can expose several images.
    """
    from urllib.parse import quote_plus

    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page (encode the query so characters such as '&' or '/' stay inside q=)
    wd.get(search_url.format(q=quote_plus(query)))

    image_urls = set()
    results_start = 0
    while len(image_urls) < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        if number_results <= results_start:
            # Nothing new appeared since the last pass; stop instead of looping forever.
            print(f"Found: {len(image_urls)} image links, no more results available.")
            break

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls
            for actual_image in wd.find_elements_by_css_selector('img.n3VNCb'):
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # Every visible thumbnail was processed without reaching the target.
            print("Found:", len(image_urls), "image links, looking for more ...")
            # find_elements returns an empty list (instead of raising) when the button is absent
            if wd.find_elements_by_css_selector(".mye4qd"):
                wd.execute_script("document.querySelector('.mye4qd').click();")
                time.sleep(sleep_between_interactions)

        # move the result startpoint further down
        results_start = number_results

    return image_urls

def persist_image(folder_path:str,url:str,timeout:float=30):
    """Download one image and store it as a JPEG inside ``folder_path``.

    The file name is the first 10 hex characters of the SHA-1 of the
    downloaded bytes plus ``.jpg``, so identical content maps to the same file.
    Failures are reported with ``print`` and never raised to the caller.

    Args:
        folder_path: Existing directory to write the image into.
        url: Address of the image to download.
        timeout: Seconds to wait for the HTTP request before giving up.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error statuses as download failures rather than as image bytes.
        response.raise_for_status()
        image_content = response.content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        # Nothing was downloaded, so there is nothing to save.
        return

    try:
        image = Image.open(io.BytesIO(image_content)).convert('RGB')
        file_path = os.path.join(folder_path,hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
        print(f"SUCCESS - saved {url} - as {file_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")
        
def search_and_download(search_term:str,driver_path:str,target_path='./images',number_images=5):
    """Search for ``search_term`` and download the images that are found.

    Images go into ``<target_path>/<search term, lower-cased, spaces replaced
    by underscores>``; the folder is created when missing. A fresh Chrome
    driver is started from ``driver_path`` for the search and closed before
    the downloads begin.

    Args:
        search_term: Phrase to search for.
        driver_path: Path to the chromedriver executable.
        target_path: Parent directory for the per-term image folder.
        number_images: Number of image links to request from the search.
    """
    target_folder = os.path.join(target_path,'_'.join(search_term.lower().split(' ')))

    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(target_folder, exist_ok=True)

    with webdriver.Chrome(executable_path=driver_path) as wd:
        res = fetch_image_urls(search_term, number_images, wd=wd, sleep_between_interactions=0.5)

    # Tolerate a None result so an aborted search downloads nothing instead of raising TypeError.
    for elem in res or ():
        persist_image(target_folder,elem)

In [3]:
# Build list of search terms: one "<make> <model> <year>" string per distinct
# vehicle from model year 2000 onwards.
search_columns = ['make', 'model', 'year']
vehicles = (
    # Only these three columns are used, so skip parsing the rest of the CSV.
    pd.read_csv('vehicles.csv', usecols=search_columns)[search_columns]
    .drop_duplicates()
    .loc[lambda df: df['year'] > 1999]
    # assign() adds the column to a new frame, which avoids the
    # SettingWithCopyWarning raised by item assignment on a filtered frame.
    .assign(query=lambda df: df['make'] + " " + df['model'] + " " + df['year'].astype(str))
)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Spot-check the rows (and generated search terms) for model year 2020
vehicles.loc[vehicles['year'].eq(2020)]

Unnamed: 0,make,model,year,query
34340,Toyota,Corolla,2020,Toyota Corolla 2020
34341,Toyota,Corolla Hybrid,2020,Toyota Corolla Hybrid 2020
34343,Toyota,Corolla XSE,2020,Toyota Corolla XSE 2020
34346,Toyota,Corolla XLE,2020,Toyota Corolla XLE 2020
34348,Kia,Soul,2020,Kia Soul 2020
34349,Kia,Soul Eco dynamics,2020,Kia Soul Eco dynamics 2020
34355,Kia,Sportage FWD,2020,Kia Sportage FWD 2020
34357,Kia,Telluride FWD,2020,Kia Telluride FWD 2020
34359,Kia,Sportage AWD,2020,Kia Sportage AWD 2020
34361,Kia,Telluride AWD,2020,Kia Telluride AWD 2020


In [None]:
# Configure a headless Chrome session
chrome_options = Options()
chrome_options.add_argument("--headless")  # one flag is enough; setting .headless as well only duplicates it
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('start-maximized')
chrome_options.add_argument('disable-infobars')
chrome_options.add_argument("--disable-extensions")

# chrome_options.binary_location = '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary'
# Start webdriver
DRIVER_PATH = "chromedriver.exe"
# `options=` replaces the deprecated `chrome_options=` keyword.
# NOTE(review): search_and_download starts its own driver from driver_path, so
# this `wd` (and the options above) is not what the scraping loop uses — confirm
# whether it is still needed, and call wd.quit() when done with it.
wd = webdriver.Chrome(executable_path=DRIVER_PATH, options=chrome_options)

# TODO: handle errors with try
def find_index(path="../scraped_images"):
    """Count the directories found anywhere under ``path``.

    Walks the tree with ``os.walk``, prints the total number of files and
    folders seen, and returns the folder count. A ``path`` that does not
    exist yields no entries, so the result is 0.

    Args:
        path: Root directory to inspect. Defaults to the scrape output folder.

    Returns:
        Number of directories below ``path`` (all nesting levels).
    """
    files = folders = 0
    # the walked directory path itself is not needed, hence `_`
    for _, dirnames, filenames in os.walk(path):
        files += len(filenames)
        folders += len(dirnames)
    print("{:,} files, {:,} folders".format(files, folders))

    return folders

# Scrape every search term, resuming after the terms that already have a folder
# on disk (the folder count is used as the position to restart from).
MAX_STALLED_RETRIES = 3  # consecutive failures without a new folder before giving up
queries = vehicles['query']
folders = find_index()
stalled_retries = 0
while folders < len(queries):
    try:
        for term in queries.iloc[folders:]:
            search_and_download(search_term=term,driver_path=DRIVER_PATH,target_path='../scraped_images',number_images=30)
        break  # every remaining term was processed; do not start over
    except Exception as exc:  # not a bare except: Ctrl-C must still stop the run
        print(f"No images found, trying again... ({exc!r})")
        resume_at = find_index()
        # A failure that created no new folder would retry the same term forever.
        stalled_retries = 0 if resume_at > folders else stalled_retries + 1
        if stalled_retries >= MAX_STALLED_RETRIES:
            raise
        folders = resume_at