# Download Dataset

In this notebook, I use Selenium to search for and download images from Google Images. I iterate over several search terms (species, class, and gender) and save some metadata for each downloaded image.

References:
- https://www.geeksforgeeks.org/download-google-image-using-python-and-selenium/
- https://dev.to/ericchapman/python-get-and-save-google-images-with-selenium-42i1
- https://stackoverflow.com/a/52545788/2869043

In [None]:
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options 
import requests
import time

In [None]:
# function for scrolling to the bottom of results
def scroll_to_bottom(driver, wait=3):
    """Scroll the current page down until its height stops growing.

    After every scroll the function also tries to click the
    "Show more results" button. The page height is sampled *after* that
    click attempt, so content loaded by the button counts as progress and
    the loop does not stop one step too early.

    Parameters
    ----------
    driver : selenium webdriver
        Driver whose current page should be scrolled.
    wait : int or float, default 3
        Seconds to sleep after each scroll and after each successful click
        so that new results have time to load.

    Returns
    -------
    None
    """
    height_script = 'return document.body.scrollHeight'
    last_height = driver.execute_script(height_script)

    while True:
        driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')

        # wait for results to load
        time.sleep(wait)

        # click on "Show more results" (if exists)
        # The generic find_element(by, value) form is used because the
        # find_element_by_* helpers were removed in Selenium 4.3; the string
        # 'css selector' is the value of By.CSS_SELECTOR.
        try:
            driver.find_element('css selector', '.YstHxe input').click()
        except Exception:
            # Best effort: the button is missing or not clickable on most
            # iterations. `Exception` (rather than a bare except) lets
            # KeyboardInterrupt / SystemExit propagate.
            pass
        else:
            # wait for the extra results triggered by the click
            time.sleep(wait)

        # measure only after the click attempt so freshly loaded results count
        new_height = driver.execute_script(height_script)

        # check for bottom of page
        if new_height == last_height:
            break

        last_height = new_height

In [None]:
# define variables of interest
# every (species, class, gender) combination becomes one image search query
all_species = ('orc', 'elf', 'dwarf', 'halfling', 'human', 'dragonborn', 'gnome', 'tiefling', 'goblin')
all_classes = ('barbarian', 'bard', 'cleric', 'druid', 'fighter', 'monk', 'paladin', 'ranger', 'rogue', 'sorcerer', 'warlock', 'wizard')
all_genders = ('male', 'female', 'nonbinary')

# maximum number of search results processed per query
n_per_category = 100

# define directory to store images
img_dir = '../data/raw'

# create the directory up front; otherwise every image write fails with
# FileNotFoundError on a fresh checkout
os.makedirs(img_dir, exist_ok=True)

# create webdriver instance (headless Chrome)
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("no-sandbox")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--headless")
driver_path = '/usr/bin/chromedriver'
driver = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options)

# maximize the screen
# NOTE(review): the browser runs headless, so this may have no visible
# effect -- confirm whether an explicit window size is needed instead
driver.maximize_window()

In [None]:
# list to store metadata (one dict per image that was actually written to disk)
metadata = []

# seconds to wait for a single image download before giving up on it
request_timeout = 30

# NOTE: the generic find_element(s)(by, value) API is used below because the
# find_element(s)_by_* helpers were removed in Selenium 4.3; the locator
# strings 'xpath' and 'class name' are the values of By.XPATH / By.CLASS_NAME.
try:
    # iterate through search criteria
    for species_ in all_species:
        for class_ in all_classes:
            for gender_ in all_genders:

                query = f'{species_} {class_} {gender_}'
                out_file = f'{img_dir}/{species_}_{class_}_{gender_}_%09d.jpg'

                # open google images
                driver.get('https://images.google.com/')

                # find the search box and query
                box = driver.find_element('xpath', '//*[@id="sbtc"]/div/div[2]/input')
                box.send_keys(query)
                box.send_keys(Keys.ENTER)

                # scroll to the bottom of the results
                scroll_to_bottom(driver)

                # find image elements
                elements = driver.find_elements('class name', 'rg_i')
                counter = 0

                for thumbnail in elements:

                    try:
                        # click the thumbnail to open the preview panel
                        ActionChains(driver).move_to_element(thumbnail).click().perform()

                        # give the preview a moment to load its image source
                        time.sleep(1)
                        panels = driver.find_elements('class name', 'v4dQwb')

                        # Google image web site logic: the first result is read
                        # from the first panel, later results from the second
                        if counter == 0:
                            big_img = panels[0].find_element('class name', 'n3VNCb')
                        else:
                            big_img = panels[1].find_element('class name', 'n3VNCb')

                        url = big_img.get_attribute("src")

                        # inline "data:" sources or a missing src cannot be
                        # fetched with requests; fail with a short message
                        # instead of an error that echoes the whole payload
                        if not url or not url.startswith(('http://', 'https://')):
                            raise ValueError('no downloadable http(s) image source for this result')

                        # write image to file
                        out_file_ = out_file % counter
                        response = requests.get(url, timeout=request_timeout)

                        if response.status_code == 200:
                            with open(out_file_, "wb") as file:
                                file.write(response.content)

                            # record metadata only for files that exist on disk
                            metadata.append({
                                'url': url,
                                'fname': out_file_,
                                'species': species_,
                                'class': class_,
                                'gender': gender_,
                                'counter': counter
                            })
                        else:
                            print(f'skipped {url}: HTTP {response.status_code}')

                    except Exception as err:
                        # best effort: report the problem and move on to the
                        # next result (named `err` so the loop variable is not
                        # shadowed)
                        print(err)

                    # counts processed results, including failed ones
                    counter += 1

                    if counter == n_per_category:
                        break

                # save metadata after every query so progress survives a crash
                pd.DataFrame(metadata).to_csv('../data/raw_image_metadata.csv')

finally:
    # quit() ends the whole browser session and the chromedriver process
    # (close() only closes the window); running it in `finally` releases the
    # browser even when the scrape is interrupted or fails
    driver.quit()