# This notebook <a href="https://colab.research.google.com/drive/1-QBxwf1ybV-dKqZ-STZhVHVRS6lzoSbS#scrollTo=-xgZtneQRp6r" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a> scrapes public domain images from the [rawpixel website](https://www.rawpixel.com/category/53/public-domain)

# Mount Google Drive to save your data


In [None]:
# Mount Google Drive at /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Install Selenium and chrome webdriver

In [None]:
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
# --------------------------------------
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

# Make output directory

In [None]:
import os

if os.path.isdir("/content/drive/MyDrive/rawpixel-images"):
    %cd "/content/drive/MyDrive/rawpixel-images"
else:
    %cd "/content/drive/MyDrive/"
    !mkdir rawpixel-images
    %cd rawpixel-images

# Upload your cookies file (it must be named `cookies.pkl`) from your local filesystem

In [25]:
from google.colab import files

# Open the Colab upload widget; the result maps each file name to its content
uploaded = files.upload()

# Confirm what was received, one line per uploaded file
for fn, content in uploaded.items():
    report = 'User uploaded file "{name}" with length {length} bytes'
    print(report.format(name=fn, length=len(content)))

Saving example.txt to example (3).txt
User uploaded file "example.txt" with length 12 bytes


# Import webdriver and other libraries

In [6]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from tqdm import tqdm
import time
import pickle

In [None]:
# Public domain album on the rawpixel website whose images will be scraped
URL = 'https://www.rawpixel.com/board/574376/les-roses-pierre-joseph-redoute-free-cc0-roses-illustrations?sort=curated&mode=shop&page=1'
# Google Drive folder that receives the downloaded images
output_path = "/content/drive/MyDrive/rawpixel-images"

In [None]:
def webdriver_instantiation(download_path):
    """Start a headless Chrome session that saves downloads into ``download_path``.

    Args:
        download_path: Directory passed to Chrome as the download target.

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    options = Options()
    options.add_argument("--headless")
    # NOTE(review): these flags may be needed when Chrome runs as root inside a
    # container -- confirm before enabling.
    # options.add_argument('--no-sandbox')
    # options.add_argument('--disable-dev-shm-usage')

    # No driver path is passed: Selenium 3 defaults to the "chromedriver" executable
    # on PATH (the value previously passed explicitly), and recent Selenium 4 releases
    # no longer accept a positional path argument.
    browser_driver = webdriver.Chrome(options=options)

    # Allow file downloads and send them to the requested folder.
    params = {'behavior': 'allow', 'downloadPath': download_path}
    browser_driver.execute_cdp_cmd('Page.setDownloadBehavior', params)

    return browser_driver


def download_image(browser_driver, link):
    """Open an image page and click its download button.

    Args:
        browser_driver: Selenium driver used to load the page.
        link: URL of the image page.

    If the download button does not appear within 10 seconds the page is
    skipped: the timeout is reported and the function returns without clicking.
    """
    browser_driver.get(link)
    try:
        button = WebDriverWait(browser_driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'button.btn.download'))
        )
    except TimeoutException:
        print("TimeOut error!!!")
        # No button was found, so there is nothing to click; without this return the
        # code below would fail on the unassigned `button` variable.
        return

    button.click()
    # Pause after the click before the caller moves on to the next page
    time.sleep(5)

def add_cookies(browser_driver, cookies_file):
    """Load pickled cookies into the browser session, then reload the page.

    Args:
        browser_driver: Selenium driver that receives the cookies.
        cookies_file: Path to a pickle file holding an iterable of cookies, each
            in the form accepted by ``browser_driver.add_cookie``.
    """
    # NOTE(security): pickle.load can run arbitrary code from the file it reads;
    # only load cookie files you created yourself.
    # The with-block closes the file handle, which the bare open() call never did.
    with open(cookies_file, "rb") as cookie_stream:
        cookies = pickle.load(cookie_stream)

    for cookie in cookies:
        browser_driver.add_cookie(cookie)

    time.sleep(3)
    # Reload so the page is requested again with the cookies in place
    browser_driver.refresh()

def scroll_down(browser_driver):
    """Scroll to the bottom of the page until its height stops changing.

    Each round runs a script that scrolls to the bottom and reports
    ``document.body.scrollHeight``, waits 3 seconds, and measures again. The
    loop ends as soon as two consecutive measurements are equal.
    """
    scroll_and_measure = """
        window.scrollTo(0, document.body.scrollHeight);
        var page_depth = document.body.scrollHeight;
        return page_depth;
    """

    previous_depth = browser_driver.execute_script(scroll_and_measure)
    while True:
        time.sleep(3)
        latest_depth = browser_driver.execute_script(scroll_and_measure)
        if latest_depth == previous_depth:
            # Height did not change between two measurements: stop scrolling
            break
        previous_depth = latest_depth


if __name__ == '__main__':
    url = URL

    # The browser is left running after this cell; call driver.quit() when done.
    driver = webdriver_instantiation(output_path)
    # Open the album before adding cookies: Selenium sets cookies for the domain
    # of the page that is currently loaded.
    driver.get(url)

    add_cookies(driver, "cookies.pkl")
    # Scroll to the bottom of the album before collecting the image links
    scroll_down(driver)

    # find_elements(By.CSS_SELECTOR, ...) works on Selenium 3 and 4; the older
    # find_elements_by_css_selector helper was removed in Selenium 4.
    elements = driver.find_elements(
        By.CSS_SELECTOR, 'div.container-full.page-content figure a.img-link'
    )
    links = [element.get_attribute('href') for element in elements]

    print(f"There is {len(links)} image to download")

    for link in tqdm(links):
        download_image(driver, link)

# Check the download directory

In [None]:
import os

def find(path):
    """Count the files under ``path``, including files in sub-folders.

    Args:
        path: Directory to inspect.

    Returns:
        Total number of files found; 0 when ``path`` does not exist or holds no
        files. For a folder without sub-folders this is simply its file count.
    """
    # Sum over every directory visited. The earlier version kept only the last
    # directory's listing and failed when the walk produced nothing.
    return sum(len(filenames) for _root, _dirs, filenames in os.walk(path))

# NOTE(review): this folder differs from `output_path`, where the scraper saves its
# downloads -- confirm which directory should be counted.
images_dir = "/content/drive/MyDrive/StyleGAN/rawpixel-selenium-scraper/images"
# The path is kept outside the f-string: nesting double quotes inside a
# double-quoted f-string is a syntax error before Python 3.12.
print(f"The number of images is {find(images_dir)}")