In [None]:
# default_exp core

# core

> Image scraping library for creating deep learning datasets. Uses DuckDuckGo, as the API for searching has more options and better results than the alternatives.

In [None]:
#hide
from nbdev import *

In [None]:
#export

# scraping
from pathlib import Path
from typing import Union
from enum import Enum
import re
import requests
import json
import time
from bs4 import BeautifulSoup

# other
from PIL import Image as PImage
import ipywidgets as widgets
from IPython.display import display
import pandas as pd

## Search filtering

The scrape/search functions can use the following enums as filters for searches. By default the results **should be** square photos. This is what's requested from DDG. Sometimes results may not be quite what you've requested (eg: you may get a bit of clipart or something more or less square but not exactly). No checks are done on what comes back.

In [None]:
#exports
class ImgSize(Enum):
  '''Size filter for an image search.

  `Thumbs` is the default: `duckduckgo_scrape_urls` sends no size filter for it
  and collects each result's `thumbnail` URL instead of its `image` URL.
  The filter string is built from the member *name*.
  '''
  Thumbs    = ""
  Small     = "Small"
  Medium    = "Medium"
  Large     = "Large"
  Wallpaper = "Wallpaper"

class ImgType(Enum):
  '''Image type filter for an image search.

  `All` means no type filter is sent by `duckduckgo_scrape_urls`; any other
  member is sent as `type:<member name>`.
  '''
  All         = ""
  Photo       = "photo"
  Clipart     = "clipart"
  Gif         = "gif"
  Transparent = "transparent"

class ImgLayout(Enum):
  '''Aspect-ratio filter for an image search.

  `All` means no layout filter is sent by `duckduckgo_scrape_urls`; any other
  member is sent as `layout:<member name>`.
  '''
  All    = ""
  Square = "Square"
  Tall   = "Tall"
  Wide   = "Wide"
  
class ImgColor(Enum):
  '''Colour filter for an image search.

  `All` means no colour filter is sent by `duckduckgo_scrape_urls`; any other
  member is sent as `color:<member name>`.
  '''
  All        = ""
  Color      = "color"
  Monochrome = "Monochrome"
  Red        = "Red"
  Orange     = "Orange"
  Yellow     = "Yellow"
  Green      = "Green"
  Blue       = "Blue"
  Purple     = "Purple"
  Pink       = "Pink"
  Brown      = "Brown"
  Black      = "Black"
  Gray       = "Gray"
  Teal       = "Teal"
  White      = "White"

## Scraping URLs

In [None]:
#export
def duckduckgo_scrape_urls(keywords: str, max_results: int,
                           img_size: ImgSize=ImgSize.Thumbs,
                           img_type: ImgType=ImgType.Photo,
                           img_layout: ImgLayout=ImgLayout.Square,
                           img_color: ImgColor=ImgColor.All) -> list:
  '''scrape urls from duckduckgo image search

  keywords:    the search query
  max_results: stop once this many urls have been collected; `None` keeps going
               until the response has no "next" page; zero or less returns `[]`
  img_size:    `ImgSize.Thumbs` collects each result's "thumbnail" url, any other
               size collects its "image" url and adds a size filter
  img_type, img_layout, img_color: extra filters, skipped when set to `All`

  Returns a list of url strings.
  Raises AssertionError if no `vqd` search token is found in the first response.
  '''
  # Nothing was asked for, so don't touch the network at all.
  if max_results is not None and max_results <= 0: return []

  BASE_URL = 'https://duckduckgo.com/'
  links = []

  # The i.js request below needs the `vqd` token taken from this first response.
  resp = requests.post(BASE_URL, data={'q': keywords})
  match = re.search(r'vqd=([\d-]+)\&', resp.text, re.M|re.I)
  # Explicit raise (not `assert`) so the check still runs under `python -O`.
  if match is None: raise AssertionError("Failed to obtain search token")

  HEADERS = {
      'authority': 'duckduckgo.com',
      'accept': 'application/json, text/javascript, */*; q=0.01',
      'sec-fetch-dest': 'empty',
      'x-requested-with': 'XMLHttpRequest',
      'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
      'sec-fetch-site': 'same-origin',
      'sec-fetch-mode': 'cors',
      'referer': 'https://duckduckgo.com/',
      'accept-language': 'en-US,en;q=0.9',
  }

  # Four comma-separated slots (size, type, layout, color); a slot is left
  # empty when its filter is not applied. Built from the enum member *names*.
  filters = ",".join([
    "" if img_size == ImgSize.Thumbs else "size:" + img_size.name,
    "" if img_type == ImgType.All else "type:" + img_type.name,
    "" if img_layout == ImgLayout.All else "layout:" + img_layout.name,
    "" if img_color == ImgColor.All else "color:" + img_color.name,
  ])

  PARAMS = (
      ('l', 'us-en'),
      ('o', 'json'),
      ('q', keywords),
      ('vqd', match.group(1)),
      ('f', filters),
      ('p', '1'),
      ('v7exp', 'a'),
  )

  requestUrl = BASE_URL + "i.js"

  while True:
    # A body that isn't valid JSON is treated as throttling: wait and retry.
    # NOTE(review): this retry is unbounded, so a permanently non-JSON response
    # keeps the function looping forever.
    while True:
      try:
        resp = requests.get(requestUrl, headers=HEADERS, params=PARAMS)
        data = json.loads(resp.text)
        break
      except ValueError:
        print("Hit request throttle, sleeping and retrying")
        time.sleep(5)

    #result["thumbnail"] is normally big enough for most purposes
    #result["width"], result["height"] are for the full size img in result["image"]
    #result["image"] url to full size img on orig site (so may be less reliable)
    #result["url"], result["title"].encode('utf-8') from the page the img came from

    for result in data["results"]:
      links.append(result["thumbnail"] if img_size == ImgSize.Thumbs else result["image"])
      if max_results is not None and len(links) >= max_results: return links

    if "next" not in data:
      #no next page, all done
      return links

    requestUrl = BASE_URL + data["next"]

Returns a list of image URLs for this search. At the time of writing, this function will return up to 477 urls for a single search.

In [None]:
from IPython.display import Image as IPImage

def display_img(url):
    '''show the image found at `url` in the cell output'''
    img = IPImage(url=url)
    display(img)

In [None]:
# Default filters (square photos, thumbnail urls); stop after 3 urls.
links = duckduckgo_scrape_urls("happy clowns", max_results=3)
links

['https://tse1.mm.bing.net/th?id=OIP.LR-2HW7P9ENbMGJ7cZTVGwHaHL&pid=Api',
 'https://tse4.mm.bing.net/th?id=OIP.jgAbDJb9lY-p0Q83Q2xsCgHaI0&pid=Api',
 'https://tse4.mm.bing.net/th?id=OIP.4g2txn6PXyuTbEXcJPI2qQHaIE&pid=Api']

In [None]:
# Preview the first url returned by the search above.
display_img(links[0])

Note that what gets returned by default is actually the image preview you see in the search results, not a thumbnail, and is quite a decent size, but comes from DDG. If you specify a size other than thumbs, then the URL returned is the original source URL, and is therefore more likely to fail a download attempt. The default (as shown above) should generally be sufficient for your needs.

Since the parameters you use are likely to be the same across every image search within your dataset, if you plan on overriding the defaults, you can pass your parameters in using a dictionary like this:

In [None]:
# Keyword arguments for duckduckgo_scrape_urls, kept in one dict so the same
# settings can be reused for every search.
params = {
    "max_results": 3,
    "img_size":    ImgSize.Medium, 
    "img_type":    ImgType.Photo,
    "img_layout":  ImgLayout.All,
    "img_color":   ImgColor.Purple
}

# A size other than Thumbs means the urls point at the original sites.
links = duckduckgo_scrape_urls("puppies", **params)
links

['http://4.bp.blogspot.com/-GKGVUan6I3w/UOQtWCzichI/AAAAAAAANs0/mxox-FdrnRA/s1600/019.jpg',
 'https://i.pinimg.com/736x/fa/fd/83/fafd8381375e3724bb2b2842ad175792--alessandra-ambrosio-dip-dyed.jpg',
 'https://i.pinimg.com/originals/7e/a1/5b/7ea15b145096fd73aa95b4cf1ea2d35c.gif']

In [None]:
# Preview the first result of the filtered "puppies" search.
display_img(links[0])
# why? just why??

## Downloading images

In [None]:
#export
def rmtree(path: Union[str, Path]):
    '''delete everything inside `path`, leaving the emptied folder itself in place

    Symlinks found inside `path` are removed as links; their targets are not touched.
    Raises AssertionError if `path` is not an existing directory.
    '''
    path = Path(path)
    # Explicit raise (not `assert`) so the guard still runs under `python -O`.
    if not path.is_dir(): raise AssertionError(str(path) + " is not a directory")

    # Deepest entries first, so every folder is already empty when we reach it.
    entries = sorted(path.glob('**/*'), key=lambda p: len(p.parts), reverse=True)
    for p in entries:
        # Only real folders get rmdir(); files and symlinks (including links to
        # folders and broken links) are unlinked.
        if p.is_dir() and not p.is_symlink(): p.rmdir()
        else:                                 p.unlink()

You can use `rmtree()` to scrub your downloaded images, either to create a new dataset or if you just want to "reset" and start over.

In [None]:
# The examples below download into an "images" folder under the current working directory.
root = Path.cwd()/"images"

In [None]:
#export
def download_urls(path: Union[str, Path], links: list) -> list:
  '''downloads urls into the given folder

  path:  target folder, created (with parents) if missing
  links: urls to fetch

  Each download is written as the next free `NNN.jpg` in `path`; a file that
  already exists is never overwritten. A download that fails PIL's `verify()`
  is deleted again.

  Returns a list of Path objects for the files that passed verification
  (an empty list when `links` is empty).
  '''
  if(len(links) == 0):
    # Return a list, as annotated, so callers can always extend()/iterate the result.
    print("Nothing to download!"); return []

  path = Path(path)
  path.mkdir(parents=True, exist_ok=True)

  print("Downloading", len(links), "results into", path)
  # One tick per link, so the bar is full exactly when the last link is done.
  bar = widgets.IntProgress(value=0, min=0, max=len(links))
  display(bar)

  mk_fp = lambda n: path/(str(n).zfill(3) + ".jpg")

  i = 1
  results = []

  for link in links:
      # Skip every number already on disc, not just the first run of them:
      # earlier failures can leave gaps followed by existing files.
      while mk_fp(i).exists(): i += 1
      fp = mk_fp(i)

      try:
        resp = requests.get(link)
        fp.write_bytes(resp.content)

        try:
          # The context manager closes the file even when verify() raises, so
          # the unlink() below isn't blocked by an open handle.
          with PImage.open(fp) as img:
            img.verify()
          results.append(fp)
        except Exception:
          print(fp, "is invalid")
          fp.unlink()
      except Exception:
        # `Exception`, not a bare except, so Ctrl-C still interrupts a long run.
        print("Exception occured while retrieving", link)

      i += 1
      bar.value += 1

  bar.bar_style = "success"
  return results

Downloads a list of URLs into the given folder. Files will be saved as 001.jpg, 002.jpg etc but images already present will not be overwritten, so you can run multiple searches for the same label (eg: different genres of orchid all under one 'orchid' label) and file numbering will carry on from the last one on disc.

Downloaded files will be checked for validity so you should never end up with corrupt images or truncated downloads.

Returns a list of Path objects for successfully downloaded images.

In [None]:
# Download the urls from the last search into images/purple.
download_urls(root/"purple", links)

Downloading 3 results into C:\Users\Joe\Documents\GitHub\jmd_imagescraper\images\purple


IntProgress(value=0, max=2)

[Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/purple/001.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/purple/002.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/purple/003.jpg')]

In [None]:
#export
def duckduckgo_search(path: Union[str, Path], label: str, keywords: str, max_results: int=100,
                           img_size: ImgSize=ImgSize.Thumbs, 
                           img_type: ImgType=ImgType.Photo,
                           img_layout: ImgLayout=ImgLayout.Square,
                           img_color: ImgColor=ImgColor.All) -> list:
  '''search duckduckgo for `keywords` and download the hits into `path/label`

  The filter arguments are handed straight to `duckduckgo_scrape_urls`;
  the return value is whatever `download_urls` returns for the found urls.
  '''
  print("Duckduckgo search:", keywords)

  found = duckduckgo_scrape_urls(keywords, max_results,
                                 img_size=img_size, img_type=img_type,
                                 img_layout=img_layout, img_color=img_color)
  target_dir = Path(path)/label
  return download_urls(target_dir, found)

Run a search and download the images. Returns a list of Path objects for the image files on disc.

In [None]:
# Search and download in one step; files go to images/Nice.
duckduckgo_search(root, "Nice", "nice clowns", max_results=3)

Duckduckgo search: nice clowns
Downloading 3 results into C:\Users\Joe\Documents\GitHub\jmd_imagescraper\images\Nice


IntProgress(value=0, max=2)

[Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Nice/001.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Nice/002.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Nice/003.jpg')]

If you want a list of all the images downloaded across multiple searches you can do it like this:

In [None]:
# These values are the same as the function defaults except max_results.
params = {
    "max_results": 3,
    "img_size":    ImgSize.Thumbs, 
    "img_type":    ImgType.Photo,
    "img_layout":  ImgLayout.Square,
    "img_color":   ImgColor.All
}

# Each search returns the paths it downloaded; collect them into one list.
imgs = []
imgs.extend(duckduckgo_search(root, "Nice", "nice clowns", **params))
imgs.extend(duckduckgo_search(root, "Scary", "scary clowns", **params))
imgs

Duckduckgo search: nice clowns
Downloading 3 results into C:\Users\Joe\Documents\GitHub\jmd_imagescraper\images\Nice


IntProgress(value=0, max=2)

Duckduckgo search: scary clowns
Downloading 3 results into C:\Users\Joe\Documents\GitHub\jmd_imagescraper\images\Scary


IntProgress(value=0, max=2)

[Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Nice/004.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Nice/005.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Nice/006.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Scary/001.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Scary/002.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Scary/003.jpg')]

## Creating a CSV dataset

If you want to create a very large dataset with a lot of images but don't want to store and distribute a very large file, you can create a CSV file containing URL/label pairs. Your users can then download the image files themselves.

In [None]:
#export                           
def save_urls_to_csv(path: Union[str, Path], label: str, keywords: str, max_results: int=100,
                       img_size: ImgSize=ImgSize.Thumbs, 
                       img_type: ImgType=ImgType.Photo,
                       img_layout: ImgLayout=ImgLayout.Square,
                       img_color: ImgColor=ImgColor.All) -> None:
  '''run a search and concat the urls to a csv

  path:  csv file to create or append to (columns "URL" and "Label")
  label: value written to the "Label" column for every url of this search
  The remaining arguments are passed to `duckduckgo_scrape_urls`.

  A new file (and any missing parent folder) is created on first use; later
  calls keep the existing rows and add the new ones after them.
  '''
  path = Path(path)

  # Search before touching the file, so a failed search leaves nothing behind.
  urls = duckduckgo_scrape_urls(keywords, max_results, img_size, img_type, img_layout, img_color)

  # Explicit columns keep the header even when the search found nothing.
  new_rows = pd.DataFrame([{"URL": url, "Label": label} for url in urls],
                          columns=["URL", "Label"])

  if path.exists():
    if len(urls) == 0: return  # nothing to append, leave the file as it is
    df = pd.concat([pd.read_csv(path), new_rows], ignore_index=True)
  else:
    path.parent.mkdir(parents=True, exist_ok=True)
    df = new_rows

  df.to_csv(path, index=False)

In [None]:
# Both searches write to the same file; each adds its urls under its own label.
csv = root/"clowns.csv"
save_urls_to_csv(csv, "Nice", "nice clowns", max_results=5)
save_urls_to_csv(csv, "Scary", "scary clowns", max_results=5)

In [None]:
# Read the file back to check the URL/Label pairs that were saved.
df = pd.read_csv(csv)
df

Unnamed: 0,URL,Label
0,https://tse4.mm.bing.net/th?id=OIP.uFX0ybAs0Hi...,Nice
1,https://tse4.mm.bing.net/th?id=OIP.s3Ie8ax_Fa6...,Nice
2,https://tse1.mm.bing.net/th?id=OIP.lwC5ho3Ta-T...,Nice
3,https://tse4.mm.bing.net/th?id=OIP.glEf94S1eD0...,Nice
4,https://tse1.mm.bing.net/th?id=OIP.9lCTTlLeQV9...,Nice
5,https://tse3.mm.bing.net/th?id=OIP.zMsnePdSfSb...,Scary
6,https://tse3.mm.bing.net/th?id=OIP.yhDrJ18seBC...,Scary
7,https://tse1.mm.bing.net/th?id=OIP.y5tm55MMKcW...,Scary
8,https://tse3.mm.bing.net/th?id=OIP.MWOP-aLPv8D...,Scary
9,https://tse4.mm.bing.net/th?id=OIP.LOPx2ViR4-C...,Scary


In [None]:
#export
def download_images_from_csv(path: Union[str, Path], csv: Union[str, Path], url_col: str="URL", label_col: str="Label"):
    '''download every url listed in a csv into one sub-folder of `path` per label

    path:      root folder; images for a label go into `path/<label>`
    csv:       csv file with one url column and one label column
    url_col:   name of the column holding the urls
    label_col: name of the column holding the labels

    Rows with a missing label or a missing url are skipped.
    '''
    path = Path(path); csv = Path(csv)

    df = pd.read_csv(csv)

    # Group on the caller-supplied column name rather than a hard-coded `Label`.
    # sort=False keeps labels in the order they first appear in the file.
    for label, rows in df.groupby(label_col, sort=False):
        urls = rows[url_col].dropna().to_list()
        # str() so numeric labels still work as folder names
        download_urls(path/str(label), urls)

This will (you've guessed it) download the image files from the CSV file we've just created. You can also supply column names if you want to use it on a CSV file created elsewhere with different names.

In [None]:
# Download every url in clowns.csv into images/<label>.
download_images_from_csv(root, csv)

Downloading 5 results into C:\Users\Joe\Documents\GitHub\jmd_imagescraper\images\Nice


IntProgress(value=0, max=4)

Downloading 5 results into C:\Users\Joe\Documents\GitHub\jmd_imagescraper\images\Scary


IntProgress(value=0, max=4)

In [None]:
# nbdev helper (from the `from nbdev import *` cell); the output below shows it converting the notebooks.
notebook2script()

Converted 00_core.ipynb.
Converted index.ipynb.
