In this notebook I scrape images from Google image search by driving Google Chrome with Selenium. All credit goes to this post, which contains all the info required to pull this off: [https://medium.com/p/a96feda8af2d](https://medium.com/p/a96feda8af2d)

In [30]:
from PIL import Image
import os
from selenium import webdriver
import time
import io
import requests
import hashlib

In [31]:
# Keyword to search for; it is passed to search_and_download() as the query and
# is also used to build the output folder name under flowers/.
search_term = 'aster'

In [69]:
# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to point at your own copy; the original machine-specific path is
# kept as the fallback so existing setups keep working.
DRIVER_PATH = os.environ.get('CHROMEDRIVER_PATH', '/Users/jonathanflorez/Desktop/Scraping/chromedriver')

# Open a Selenium-controlled Chrome window bound to the notebook-level name `wd`.
wd = webdriver.Chrome(executable_path=DRIVER_PATH)

In [33]:
# Load the Google home page in the notebook-level driver `wd`.
# NOTE(review): presumably a smoke test that the driver works — confirm.
wd.get('https://google.com')

In [34]:
def fetch_image_urls(query:str, max_links_to_fetch:int, wd:"webdriver.Chrome", sleep_between_interactions:float=1):
    """Collect image URLs from a Google image search driven through Selenium.

    The search page is loaded in ``wd``, every thumbnail (``img.Q4LuWd``) is
    clicked, and the ``src`` of each ``img.n3VNCb`` element shown afterwards
    is recorded when it contains ``'http'``.

    Args:
        query: Search text; it is URL-encoded before being put in the URL.
        max_links_to_fetch: Stop once at least this many distinct URLs were
            collected.
        wd: Selenium driver instance used for all page interactions.
        sleep_between_interactions: Seconds to wait after each scroll/click.

    Returns:
        A set of URL strings. It may hold fewer than ``max_links_to_fetch``
        entries when the page runs out of results, and slightly more when a
        single click reveals several images.
    """
    # Local import keeps the block self-contained without touching the import cell.
    from urllib.parse import quote_plus

    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query (tbm=isch selects image search)
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page; quote_plus keeps spaces, '&', '#', ... from corrupting the URL
    wd.get(search_url.format(q=quote_plus(query)))

    image_urls = set()
    results_start = 0
    while len(image_urls) < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        # No thumbnails beyond the ones already processed: stop instead of
        # looping forever, and hand back what was collected so far.
        if number_results == results_start:
            print(f"Found: {len(image_urls)} image links, no more results available.")
            break

        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                # best effort: some thumbnails are not clickable, skip them
                continue

            # extract image urls
            for actual_image in wd.find_elements_by_css_selector('img.n3VNCb'):
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # Every visible thumbnail was processed and more links are needed.
            print("Found:", len(image_urls), "image links, looking for more ...")
            # find_elements (plural) returns an empty list rather than raising
            # when the "load more" button is absent.
            if wd.find_elements_by_css_selector(".mye4qd"):
                wd.execute_script("document.querySelector('.mye4qd').click();")
                time.sleep(sleep_between_interactions)

        # move the result startpoint further down
        results_start = number_results

    return image_urls

In [35]:
def persist_image(folder_path:str,url:str):
    """Download one image and store it as a JPEG inside ``folder_path``.

    The file name is the first 10 hex characters of the SHA-1 of the
    downloaded bytes plus ``.jpg``, so identical downloads overwrite each
    other instead of piling up.

    Args:
        folder_path: Existing directory the JPEG is written to.
        url: Address of the image to download.

    Returns:
        None. The outcome is reported by printing a SUCCESS or ERROR line;
        download and save failures are caught, not raised.
    """
    try:
        # A timeout keeps one unresponsive host from stalling the whole run.
        response = requests.get(url, timeout=30)
        # Report HTTP error pages as download failures instead of trying to
        # decode them as an image.
        response.raise_for_status()
        image_content = response.content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        # Nothing was downloaded, so there is nothing to save.
        return

    try:
        image_file = io.BytesIO(image_content)
        image = Image.open(image_file).convert('RGB')
        file_path = os.path.join(folder_path,hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG")
        print(f"SUCCESS - saved {url} - as {file_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")

In [38]:
def search_and_download(search_term:str,driver_path:str,target_path='./flowers',number_images=162):
    """Search for ``search_term`` and download the images that were found.

    Args:
        search_term: Query text; lower-cased with spaces replaced by
            underscores, it also names the sub-folder under ``target_path``.
        driver_path: Path to the chromedriver executable.
        target_path: Parent directory for the per-term folder.
        number_images: Number of image links to request from
            ``fetch_image_urls``.

    Returns:
        None. Files are written by ``persist_image``.
    """
    target_folder = os.path.join(target_path,'_'.join(search_term.lower().split(' ')))

    # exist_ok avoids the check-then-create race and is a no-op on re-runs.
    os.makedirs(target_folder, exist_ok=True)

    # The context manager closes the browser even if the search raises.
    with webdriver.Chrome(executable_path=driver_path) as wd:
        res = fetch_image_urls(search_term, number_images, wd=wd, sleep_between_interactions=0.5)

    # Tolerate a None result from fetch_image_urls so an unsuccessful search
    # does not end in "TypeError: 'NoneType' object is not iterable".
    for elem in res or ():
        persist_image(target_folder,elem)

In [39]:
# Folder for this search term's images.
# NOTE(review): search_and_download() lower-cases the term and replaces spaces
# with underscores, so the two folders only coincide for terms like 'aster'.
img_path = 'flowers/' + search_term + '/'

# makedirs(exist_ok=True) also creates the parent flowers/ directory, which the
# previous os.listdir('flowers/.') check required to exist already.
os.makedirs(img_path, exist_ok=True)

search_and_download(search_term=search_term,driver_path=DRIVER_PATH)

Found: 100 search results. Extracting links from 0:100
Found: 162 image links, done!
SUCCESS - saved https://trulyexperiences.com/blog/wp-content/uploads/2021/01/asters-scaled-e1610706505960.jpg - as ./flowers/aster/b790c15aeb.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSbXZggLK3fRo2QLR7-xm77EQfspMXvi17nMQ&usqp=CAU - as ./flowers/aster/7368e3dc0c.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSlwZmd4bX-yg3lB-r4HXtComG157zK3IIdIQ&usqp=CAU - as ./flowers/aster/b8a28085c8.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSBDwWUdkAlLiIdrZrmyLxUXfxgSLh9KVqIDw&usqp=CAU - as ./flowers/aster/a5f8e02283.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTbvmhEYKTM5YQ26tbz4Ni2qVoD_7GySawr5A&usqp=CAU - as ./flowers/aster/378fa90683.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRWjQNOFoqDC_MxpsDW7ambS1UBFS8DPya_hA&usqp=CAU - as ./flowers/aster/71ef1dcbfb.jpg
SUCCESS -

SUCCESS - saved https://www.prairiemoon.com/mm5/graphics/00000001/symphyotrichum-novae-angliae-new-england-aster_main_467x705.jpg - as ./flowers/aster/9e1747363b.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTOaGWt15NHHefqtJ5bS_whSpqG-tOlIE_4-Q&usqp=CAU - as ./flowers/aster/42c2264e07.jpg
SUCCESS - saved https://order.eurobulb.nl/3254-large_default/dahlia-white-aster-10167.jpg - as ./flowers/aster/179ce079aa.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTKn3dPK-PWpWzMyfoOWpPZ-5dlc54cXuzFjg&usqp=CAU - as ./flowers/aster/5e196daddf.jpg
SUCCESS - saved https://www.everwilde.com/media/1000/Aster-novae-angliae-01.gif - as ./flowers/aster/1d94d0ddc1.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRivDWYza7oUslSSE2JAVASsMr_N3stNJ3FsA&usqp=CAU - as ./flowers/aster/74ebb39598.jpg
SUCCESS - saved https://www.anniesannuals.com/signs/a/images/aster_chilensis_purple_haze_02.jpg - as ./flowers/aster/52aa4ee122.jpg
SUCC



SUCCESS - saved https://cdn.shopify.com/s/files/1/1698/1675/products/Flower_Aster_Crego.jpg?v=1557505206 - as ./flowers/aster/ffc77e0784.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSuRaW4Y2nBedwOiEbc5oFiqpQ-_Im1D4SREA&usqp=CAU - as ./flowers/aster/c15604e61d.jpg
SUCCESS - saved https://www.growjoy.com/store/pc/catalog/believer_aster_plant_1409_detail.jpg - as ./flowers/aster/778df6c51d.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQk4EEH16osfzTlAhh3uSIk0KRocgLvlJAZlQ&usqp=CAU - as ./flowers/aster/884573b019.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT7Yh9SVkSeefeSDEJeLSdlS5D8ndCiJYJ7IA&usqp=CAU - as ./flowers/aster/62f321f880.jpg
SUCCESS - saved https://www.gardenia.net/storage/app/public/uploads/images/detail/MkO7Tw6C5isTWodSVHBseYFfS4cAmq7lTsecmtNO.jpeg - as ./flowers/aster/69eee382ed.jpg
SUCCESS - saved https://www.outsidepride.com/images/products/detail/nova/astergremlinviolet.jpg - as ./flower

SUCCESS - saved https://www.prairienursery.com/media/catalog/product/cache/a1a7806eca65eb3c6d8bc204c15c6853/3/1/31320-03-a_319.jpg - as ./flowers/aster/c8757e11c3.jpg
SUCCESS - saved https://study.com/cimages/multimages/16/aster-4001328_640.jpg - as ./flowers/aster/e89de75525.jpg
SUCCESS - saved https://www.everwilde.com/media/1000/Aster-ericoides.gif - as ./flowers/aster/621a14758c.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQLzD-XXZYY181k0pXHXEDbUrSYtMkvYQFB2w&usqp=CAU - as ./flowers/aster/1c1f112733.jpg
SUCCESS - saved https://www.bluestoneperennials.com/img/ASWB/650/ASWB_0_Aster_Woods_Blue.1491332233.jpg - as ./flowers/aster/10542b9b5f.jpg
ERROR - Could not save https://www.gardenstylesanantonio.com/wp-content/uploads/2020/05/fall-aster-symphotrichum-oblongifolium-sabg-ofg-jr-1920x1080-img_2499-600x600.jpg - cannot identify image file <_io.BytesIO object at 0x7fee9181d590>
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSkSmy6WtCn

In [28]:
run flickrGetUrl.py aster 500

Fetching url for image number 1
Fetching url for image number 2
Fetching url for image number 3
Fetching url for image number 4
Fetching url for image number 5
Fetching url for image number 6
Fetching url for image number 7
Fetching url for image number 8
Fetching url for image number 9
Fetching url for image number 10
Fetching url for image number 11
Fetching url for image number 12
Fetching url for image number 13
Fetching url for image number 14
Fetching url for image number 15
Fetching url for image number 16
Fetching url for image number 17
Fetching url for image number 18
Fetching url for image number 19
Fetching url for image number 20
Fetching url for image number 21
Fetching url for image number 22
Fetching url for image number 23
Fetching url for image number 24
Fetching url for image number 25
Fetching url for image number 26
Fetching url for image number 27
Fetching url for image number 28
Fetching url for image number 29
Fetching url for image number 30
Fetching url for im

Fetching url for image number 251
Fetching url for image number 252
Fetching url for image number 253
Fetching url for image number 254
Fetching url for image number 255
Fetching url for image number 256
Fetching url for image number 257
Fetching url for image number 258
Fetching url for image number 259
Fetching url for image number 260
Fetching url for image number 261
Fetching url for image number 262
Fetching url for image number 263
Fetching url for image number 264
Fetching url for image number 265
Fetching url for image number 266
Fetching url for image number 267
Fetching url for image number 268
Fetching url for image number 269
Fetching url for image number 270
Fetching url for image number 271
Fetching url for image number 272
Fetching url for image number 273
Fetching url for image number 274
Fetching url for image number 275
Fetching url for image number 276
Fetching url for image number 277
Fetching url for image number 278
Fetching url for image number 279
Fetching url f

Done fetching urls, fetched 500 urls out of 500
Writing out the urls in the current directory
Done!!!


In [29]:
run get_images.py image_urls.csv aster

Starting download 1 of  324
Done downloading 1 of 324
Starting download 2 of  324
Done downloading 2 of 324
Starting download 3 of  324
Done downloading 3 of 324
Starting download 4 of  324
Done downloading 4 of 324
Starting download 5 of  324
Done downloading 5 of 324
Starting download 6 of  324
Done downloading 6 of 324
Starting download 7 of  324
Done downloading 7 of 324
Starting download 8 of  324
Done downloading 8 of 324
Starting download 9 of  324
Done downloading 9 of 324
Starting download 10 of  324
Done downloading 10 of 324
Starting download 11 of  324
Done downloading 11 of 324
Starting download 12 of  324
Done downloading 12 of 324
Starting download 13 of  324
Done downloading 13 of 324
Starting download 14 of  324
Done downloading 14 of 324
Starting download 15 of  324
Done downloading 15 of 324
Starting download 16 of  324
Done downloading 16 of 324
Starting download 17 of  324
Done downloading 17 of 324
Starting download 18 of  324
Done downloading 18 of 324
Starting d

Done downloading 146 of 324
Starting download 147 of  324
Done downloading 147 of 324
Starting download 148 of  324
Done downloading 148 of 324
Starting download 149 of  324
Done downloading 149 of 324
Starting download 150 of  324
Done downloading 150 of 324
Starting download 151 of  324
Done downloading 151 of 324
Starting download 152 of  324
Done downloading 152 of 324
Starting download 153 of  324
Done downloading 153 of 324
Starting download 154 of  324
Done downloading 154 of 324
Starting download 155 of  324
Done downloading 155 of 324
Starting download 156 of  324
Done downloading 156 of 324
Starting download 157 of  324
Done downloading 157 of 324
Starting download 158 of  324
Done downloading 158 of 324
Starting download 159 of  324
Done downloading 159 of 324
Starting download 160 of  324
Done downloading 160 of 324
Starting download 161 of  324
Done downloading 161 of 324
Starting download 162 of  324
Done downloading 162 of 324
Starting download 163 of  324
Done downloadi

Done downloading 288 of 324
Starting download 289 of  324
Done downloading 289 of 324
Starting download 290 of  324
Done downloading 290 of 324
Starting download 291 of  324
Done downloading 291 of 324
Starting download 292 of  324
Done downloading 292 of 324
Starting download 293 of  324
Done downloading 293 of 324
Starting download 294 of  324
Done downloading 294 of 324
Starting download 295 of  324
Done downloading 295 of 324
Starting download 296 of  324
Done downloading 296 of 324
Starting download 297 of  324
Done downloading 297 of 324
Starting download 298 of  324
Done downloading 298 of 324
Starting download 299 of  324
Done downloading 299 of 324
Starting download 300 of  324
Done downloading 300 of 324
Starting download 301 of  324
Done downloading 301 of 324
Starting download 302 of  324
Done downloading 302 of 324
Starting download 303 of  324
Done downloading 303 of 324
Starting download 304 of  324
Done downloading 304 of 324
Starting download 305 of  324
Done downloadi

In [71]:
! yandex-images-download Chrome --keywords "vodka, bears, balalaika" --limit 10 

Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home
Traceback (most recent call last):
  File "/Users/jonathanflorez/anaconda3/lib/python3.7/site-packages/selenium/webdriver/common/service.py", line 76, in start
    stdin=PIPE)
  File "/Users/jonathanflorez/anaconda3/lib/python3.7/subprocess.py", line 775, in __init__
    restore_signals, start_new_session)
  File "/Users/jonathanflorez/anaconda3/lib/python3.7/subprocess.py", line 1522, in _execute_child
    raise child_exception_type(errno_num, err_msg, err_filename)
FileNotFoundError: [Errno 2] No such file or directory: 'chromedriver': 'chromedriver'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/jonathanflorez/anaconda3/lib/python3.7/site-packages/yandex_images_download/yandex_images_download.py", line 76, in main
    scrap(args)
  File "/Users/jonathanflorez/anaconda3/lib/