<a href="https://colab.research.google.com/github/joedockrill/image-scraper/blob/master/ImageScraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Image Scraper**

In [43]:
#@title Code setup, RUN THIS CELL.
#@markdown This notebook can scrape from Google and DuckDuckGo, but the results from DuckDuckGo  
#@markdown are much better. The thumbnails are much larger and the results include the original 
#@markdown (even bigger) url (although I don't currently use it).
#@markdown 
#@markdown I've really only left the Google scraping here in case the DuckDuckGo code breaks in the 
#@markdown future so there's something else here which works.
#@markdown 
#@markdown I'd also love to add params for DuckDuckGo to constrain searches by layout, 
#@markdown colour etc. like you can in the GUI, but I can't currently see how to do this via their 
#@markdown i.js interface, which is completely undocumented. Check back later.
#@markdown 
#@markdown If you're new to colab and you want to see the code, click on the ... menu in the top
#@markdown right of this cell and click "Form" then "Hide Form"
#@markdown 
#@markdown Workflow:
#@markdown - Write some search calls in the "Download your images here" cell
#@markdown - Run the image cleaner to delete rubbish
#@markdown - Zip it all up
#@markdown - Download it or copy it to Google Drive
#@markdown 
#@markdown Feel free to copy/share/modify as you see fit.
#@markdown 
#@markdown Hugs & kisses, Joe Dockrill. 
#@markdown 
#@markdown credits: https://github.com/deepanprabhu/duckduckgo-images-api for the base DuckDuckGo code
#@markdown 
import os
import requests
import re
import json
import time
from bs4 import BeautifulSoup
from PIL import Image
import ipywidgets as widgets
from ipywidgets import interactive
from IPython.display import display
import shutil

BASE_FOLDER = "images"

def google_scrape_urls(keywords, max_results):
  """Scrape thumbnail URLs from a Google Images results page.

  Only one results page is requested, so fewer than max_results links may
  come back.

  Args:
    keywords: search phrase, sent as the "q" query parameter.
    max_results: upper bound on the number of <img> tags collected; passed
      straight through as BeautifulSoup's `limit`.

  Returns:
    A list holding the "data-src" attribute of each matching <img> tag.
  """
  SEARCH_URL = "https://www.google.com/search"

  HEADERS = {
      'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
      'Accept-Encoding': 'none',
      'Accept-Language': 'en-US,en;q=0.8',
      'Connection': 'keep-alive',
  }

  # Let requests build the query string so that spaces, "&", "#", "+" etc. in
  # the keywords are URL-encoded instead of corrupting the URL.
  PARAMS = {
      'site': '',
      'tbm': 'isch',
      'source': 'hp',
      'biw': '1873',
      'bih': '990',
      'q': keywords,
  }

  # A timeout stops a stalled connection from hanging the cell forever.
  resp = requests.get(SEARCH_URL, headers=HEADERS, params=PARAMS, timeout=30)

  soup = BeautifulSoup(resp.text, "html.parser")
  images = soup.find_all("img", {"data-src": True}, limit=max_results)

  return [img["data-src"] for img in images]
  
def duckduckgo_scrape_urls(keywords, max_results, max_retries=10):
    """Collect thumbnail URLs for an image search on DuckDuckGo.

    A search token ("vqd") is first scraped from the response to a POST to
    duckduckgo.com. The i.js endpoint is then paged through, following each
    page's "next" link, until enough links are collected or no page is left.

    Args:
        keywords: search phrase.
        max_results: maximum number of links to return; None means keep going
            until the last page.
        max_retries: how many times one page request is retried (with a
            5 second pause) when it does not yield valid JSON.

    Returns:
        A list of thumbnail URLs. It is shorter than max_results when the
        search runs out of pages or a page still fails after max_retries.

    Raises:
        AssertionError: if no search token is found in the first response.
    """
    BASE_URL = 'https://duckduckgo.com/'
    links = []

    # Nothing was asked for, so don't hit the network at all.
    if max_results is not None and max_results <= 0:
        return links

    resp = requests.post(BASE_URL, data={'q': keywords})
    match = re.search(r'vqd=([\d-]+)\&', resp.text, re.M|re.I)
    # Raised explicitly (rather than with `assert`) so the check survives
    # python -O while callers still see the same exception type.
    if match is None:
        raise AssertionError("Failed to obtain search token")

    HEADERS = {
        'authority': 'duckduckgo.com',
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'sec-fetch-dest': 'empty',
        'x-requested-with': 'XMLHttpRequest',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'referer': 'https://duckduckgo.com/',
        'accept-language': 'en-US,en;q=0.9',
    }

    PARAMS = (
        ('l', 'us-en'),
        ('o', 'json'),
        ('q', keywords),
        ('vqd', match.group(1)),
        ('f', ',,,'),
        ('p', '1'),
        ('v7exp', 'a'),
    )

    requestUrl = BASE_URL + "i.js"

    while True:
        # Fetch one page. A response that is not JSON is treated as throttling
        # and retried, but only max_retries times so a permanent block cannot
        # spin forever.
        failures = 0
        while True:
            try:
                resp = requests.get(requestUrl, headers=HEADERS, params=PARAMS)
                data = json.loads(resp.text)
                break
            except ValueError:
                failures += 1
                if failures > max_retries:
                    print("Giving up after", max_retries, "retries, returning", len(links), "links")
                    return links
                print("Hit request throttle, sleeping and retrying")
                time.sleep(5) #seems a lot but ok...

        #result["thumbnail"] is normally big enough for most purposes
        #result["width"], result["height"] are for the full size img in result["image"]
        #result["image"] url to full size img on orig site (so may be less reliable)
        #result["url"], result["title"].encode('utf-8') from the page the img came from

        # .get() so a page without a "results" key ends the page quietly
        # instead of raising KeyError.
        for result in data.get("results", []):
            links.append(result["thumbnail"])
            if max_results is not None and len(links) >= max_results:
                return links

        if "next" not in data:
            #no next page, all done
            return links

        requestUrl = BASE_URL + data["next"]

def download_urls(label, links):
  """Download every link into BASE_FOLDER/<label>, discarding invalid images.

  Files are named <label>001.jpg, <label>002.jpg, ... after the position of
  the link in `links`. Each file is checked with PIL after it is written and
  deleted again if the check fails. A progress bar is displayed while the
  downloads run.

  Args:
    label: sub-folder name and filename prefix.
    links: sequence of image URLs.
  """
  if not links:
    print("Nothing to download!"); return

  print("Downloading", len(links), "images for", label)

  folder = os.path.join(BASE_FOLDER, label)
  os.makedirs(folder, exist_ok=True)

  # max is the number of links (not len - 1) so the bar only fills up once
  # the last link has been processed.
  bar = widgets.IntProgress(value=0, min=0, max=len(links))
  display(bar)

  for i, link in enumerate(links):
      filename = os.path.join(folder, label + str(i+1).zfill(3) + ".jpg")

      # One unreachable or stalled host must not abort the whole batch.
      try:
        resp = requests.get(link, timeout=30)
      except requests.RequestException as err:
        print(link, "could not be downloaded:", err)
        bar.value += 1
        continue

      with open(filename, "wb") as file:
          file.write(resp.content)

      # The context manager closes the image even when verify() raises, so
      # the file can always be removed afterwards.
      try:
        with Image.open(filename) as img:
          img.verify()
      except Exception:
        print(filename, "is invalid")
        os.remove(filename)

      bar.value += 1

  bar.bar_style = "success"

def google_search(label, keywords, max_results=100):
  """Scrape Google Images for `keywords` and download the hits under `label`.

  Args:
    label: passed to download_urls as the folder / filename prefix.
    keywords: search phrase handed to google_scrape_urls.
    max_results: cap on the number of links scraped (default 100).
  """
  download_urls(label, google_scrape_urls(keywords, max_results))

def duckduckgo_search(label, keywords, max_results=100):
  """Scrape DuckDuckGo for `keywords` and download the hits under `label`.

  Args:
    label: passed to download_urls as the folder / filename prefix.
    keywords: search phrase handed to duckduckgo_scrape_urls.
    max_results: cap on the number of links scraped (default 100).
  """
  download_urls(label, duckduckgo_scrape_urls(keywords, max_results))


**Run this cell to delete all image files (to create another dataset or reset)**

In [48]:
# -f keeps the reset quiet when images/ is already empty or does not exist yet
!rm -rf images/*

**Download your images here**

In [None]:
# Name of the zip file created further down; change this to something more meaningful.
DATASET_NAME = "images"

# One call per label. The label becomes the sub-folder and filename prefix for the downloads.
duckduckgo_search(label="label", keywords="query keywords", max_results=100)

# Example:
# DATASET_NAME = "Clowns"
# duckduckgo_search("Nice", "nice clowns", max_results=150)
# duckduckgo_search("Scary", "scary clowns", max_results=150)

# google_search() takes the same arguments; use it if you prefer or if the DuckDuckGo code breaks.


In [None]:
#@title Quick & Dirty Dataset Cleaner 
#@markdown Run this cell for a quick image cleaner. Clicking Delete removes the file immediately, but you'll need to re-run the cell or switch folders to refresh the view.

#@markdown This is SLOW at loading more than a handful of images. I assume it would be a decent bit faster if it was running locally. 

def click_handler(btn):
  """Delete the image file attached to a "Delete" button, then disable it.

  Args:
    btn: the clicked button; btn.tag holds the path of the file to remove.
  """
  try:
    os.remove(btn.tag)
  except FileNotFoundError:
    # The file is already gone (for example it was deleted from an older copy
    # of the grid). That is the state the user asked for, so don't raise
    # inside the widget callback.
    pass
  btn.disabled = True

def render_image_cleaner(folder):
  """Display every .jpg in BASE_FOLDER/<folder> in a grid with Delete buttons.

  Each image is shown 150px wide above a "Delete" button whose `tag`
  attribute carries the file path for click_handler. The grid is four
  columns wide.

  Args:
    folder: name of a sub-folder of BASE_FOLDER.
  """
  items = []
  path = os.path.join(BASE_FOLDER, folder)

  # Sorted so the grid order is stable; os.listdir returns entries in
  # arbitrary order.
  for filename in sorted(os.listdir(path)):
      if not filename.endswith(".jpg"):
          continue

      filepath = os.path.join(path, filename)
      # Read through a context manager so the handle is closed straight away
      # instead of leaking one open file per image.
      with open(filepath, "rb") as file:
          fstream = file.read()

      img = widgets.Image(value=fstream, format='jpg')
      img.layout.width = "150px"
      btn = widgets.Button(description="Delete")
      btn.tag = filepath
      btn.on_click(click_handler)
      box = widgets.VBox(children=(img, btn))
      box.layout.margin = "5px"
      items.append(box)

  grid = widgets.GridBox(items, layout=widgets.Layout(grid_template_columns="repeat(4, 25%)"))
  grid.layout.margin = "15px"
  display(grid)

# Sub-folders of BASE_FOLDER (one per label). os.walk yields nothing when
# BASE_FOLDER does not exist, so next() gets a default instead of raising
# StopIteration.
folders = sorted(next(os.walk(BASE_FOLDER), (BASE_FOLDER, [], []))[1])

if folders:
  w = interactive(render_image_cleaner, folder=folders)
  display(w)
  display(w.children[0]) # dropdown top & bottom
else:
  # With no folders there is nothing for the dropdown to select.
  print("No image folders found in", BASE_FOLDER, "- download some images first.")

**Run this cell to create a zip file**

In [None]:
# Remove any previous archive first so images deleted since the last run are
# not kept in the zip. -f: no error on the first run when there is no archive.
# The names are quoted so a DATASET_NAME containing spaces stays one argument.
!rm -f "{DATASET_NAME}.zip"
!zip -r "{DATASET_NAME}.zip" images

**Run one of these cells to get your zip file**

In [None]:
# Send the dataset zip to your local machine
from google.colab import files

files.download("{}.zip".format(DATASET_NAME))

In [None]:
# Copy the dataset zip into a Google Drive folder ("Datasets" unless changed below)
import shutil
from google.colab import drive

DRIVE_DEST_FOLDER = "Datasets"

drive.mount('/content/drive')

# Create the destination folder on the mounted drive if it is not there yet
folder = os.path.join("/content/drive/My Drive", DRIVE_DEST_FOLDER)
if not os.path.exists(folder):
  os.makedirs(folder)

shutil.copyfile(
    DATASET_NAME + ".zip",
    os.path.join(folder, DATASET_NAME + ".zip"),
)