In [None]:
# default_exp core

# jmd_imagescraper.core

> Core image scraping functions for creating deep learning datasets

In [None]:
#hide
from nbdev import *

In [None]:
#export

# scraping
from pathlib import Path
from typing import Union
from enum import Enum
import re
import requests
import json
import time
from bs4 import BeautifulSoup
import uuid

# other
from PIL import Image as PImage
from IPython.display import display
import pandas as pd
from fastprogress.fastprogress import progress_bar

## Search filtering

The scrape/search functions can use the following enums as filters for searches. Filtering is normally pretty good, so by default the results **should be** square photos as this is what's requested from DDG. Sometimes results may not be quite what you've requested (eg: you may get a bit of clipart or something more or less square but not exactly). No checks are actually performed on what comes back.

In [None]:
#export
class ImgSize(Enum):
    '''Image size choices accepted by the search functions; `Cached` has an empty value.'''
    Cached    = ""
    Small     = "Small"
    Medium    = "Medium"
    Large     = "Large"
    Wallpaper = "Wallpaper"

Using `Cached` as the image size (the default) returns the image cached by DuckDuckGo/Bing. This is a very decent size for deep learning purposes and is much more reliable to download from (no 404s, no hot-linking bans etc). Using any other size will return the original images from the source websites.

In [None]:
#export
class ImgLayout(Enum):
    '''Image layout (aspect) choices accepted by the search functions; `All` has an empty value.'''
    All    = ""
    Square = "Square"
    Tall   = "Tall"
    Wide   = "Wide"

This defaults to `Square` everywhere because that's what your DL models want.

In [None]:
#export
class ImgType(Enum):
    '''Image type choices accepted by the search functions; `All` has an empty value.'''
    All         = ""
    Photo       = "photo"
    Clipart     = "clipart"
    Gif         = "gif"
    Transparent = "transparent"

Defaults to `Photo` everywhere

In [None]:
#export
class ImgColor(Enum):
    '''Image color choices accepted by the search functions; `All` has an empty value.'''
    All        = ""
    Color      = "color"
    Monochrome = "Monochrome"
    Red        = "Red"
    Orange     = "Orange"
    Yellow     = "Yellow"
    Green      = "Green"
    Blue       = "Blue"
    Purple     = "Purple"
    Pink       = "Pink"
    Brown      = "Brown"
    Black      = "Black"
    Gray       = "Gray"
    Teal       = "Teal"
    White      = "White"

Probably unlikely to be of much use to you but it's part of the API so I include it. You never know...

## Scraping URLs

In [None]:
#export
def duckduckgo_scrape_urls(keywords: str, max_results: int, 
                           img_size: ImgSize=ImgSize.Cached, 
                           img_type: ImgType=ImgType.Photo,
                           img_layout: ImgLayout=ImgLayout.Square,
                           img_color: ImgColor=ImgColor.All,
                           timeout: Union[float, tuple]=None) -> list:
  '''Scrapes URLs from DuckDuckGo image search. Returns list of URLs.

  keywords:    search text, sent as the `q` parameter.
  max_results: stop as soon as this many URLs have been collected. `None`
               keeps paging until a response has no `next` entry. Zero or
               a negative number returns `[]` without any request.
  img_size:    `ImgSize.Cached` collects each result's `thumbnail` URL; any
               other size collects the `image` URL and adds a size filter.
  img_type, img_layout, img_color: added to the filter string unless `All`.
  timeout:     passed to every `requests` call made here, i.e. the token
               request as well as each results page.

  Raises AssertionError when no `vqd` token is found in the first response.
  A timeout on the token request propagates to the caller. A timed-out
  results page is retried straight away, and a body that is not valid JSON
  is retried after a 5 second sleep; neither retry is capped.
  '''
  # Nothing was asked for: avoid returning the first result regardless.
  if max_results is not None and max_results <= 0: return []

  BASE_URL = 'https://duckduckgo.com/'
  links = []

  # The first response carries the `vqd` token that every results request must send.
  # `timeout` is applied here too so this call cannot hang when a timeout was requested.
  resp = requests.post(BASE_URL, data={'q': keywords}, timeout=timeout)
  match = re.search(r'vqd=([\d-]+)\&', resp.text, re.M|re.I)
  assert match is not None, "Failed to obtain search token"

  HEADERS = {
      'authority': 'duckduckgo.com',
      'accept': 'application/json, text/javascript, */*; q=0.01',
      'sec-fetch-dest': 'empty',
      'x-requested-with': 'XMLHttpRequest',
      'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
      'sec-fetch-site': 'same-origin',
      'sec-fetch-mode': 'cors',
      'referer': 'https://duckduckgo.com/',
      'accept-language': 'en-US,en;q=0.9',
  }

  # Four comma-separated slots (size, type, layout, color); a slot stays empty when its filter is off.
  filters = ""
  if(img_size != ImgSize.Cached): filters +=  "size:" + img_size.name
  filters += ","
  if(img_type != ImgType.All): filters +=  "type:" + img_type.name
  filters += ","
  if(img_layout != ImgLayout.All): filters +=  "layout:" + img_layout.name
  filters += ","
  if(img_color != ImgColor.All): filters +=  "color:" + img_color.name

  PARAMS = (
      ('l', 'us-en'),
      ('o', 'json'),
      ('q', keywords),
      ('vqd', match.group(1)),
      ('f', filters),
      ('p', '1'),
      ('v7exp', 'a'),
  )

  requestUrl = BASE_URL + "i.js"

  while True:
    # Keep requesting the current page until a JSON body is decoded.
    while True:
      try:
        resp = requests.get(requestUrl, headers=HEADERS, params=PARAMS, timeout=timeout)
        data = json.loads(resp.text)
        break
      except requests.exceptions.Timeout:
        # Retried immediately on the next pass of the inner loop.
        print("Timeout while trying to scrape URLs.")
      except ValueError:
        # A body that is not valid JSON is treated as throttling: back off, then retry.
        print("Hit request throttle, sleeping and retrying")
        time.sleep(5)

    #result["thumbnail"] is normally big enough for most purposes
    #result["width"], result["height"] are for the full size img in result["image"]
    #result["image"] url to full size img on orig site (so may be less reliable) 
    #result["url"], result["title"].encode('utf-8') from the page the img came from

    for result in data["results"]:
      links.append(result["thumbnail"] if img_size == ImgSize.Cached else result["image"])
      if max_results is not None and len(links) >= max_results: return links

    if "next" not in data:
      #no next page, all done
      return links

    requestUrl = BASE_URL + data["next"]

At the time of writing, this function will return up to 477 urls for a single search.

In [None]:
#hide
from IPython.display import Image as IPImage

def display_img(url):
    '''Render the image found at `url` in the cell output.'''
    remote_image = IPImage(url=url)
    display(remote_image)

In [None]:
# Scrape three image URLs for a sample query, leaving every filter at its default.
links = duckduckgo_scrape_urls("happy clowns", max_results=3)
links

['https://tse1.mm.bing.net/th?id=OIP.LR-2HW7P9ENbMGJ7cZTVGwHaHL&pid=Api',
 'https://tse4.mm.bing.net/th?id=OIP.jgAbDJb9lY-p0Q83Q2xsCgHaI0&pid=Api',
 'https://tse4.mm.bing.net/th?id=OIP.4g2txn6PXyuTbEXcJPI2qQHaIE&pid=Api']

In [None]:
# Show the first scraped URL as an image in the cell output.
display_img(links[0])

This is the kind of size you can expect by default. As you can see it should normally be sufficient for your needs.

Since the parameters you use are likely to be the same across every image search within your dataset, if you plan on overriding the defaults, you can pass your parameters in using a dictionary like this:

In [None]:
# Keyword arguments gathered in one dict so the same settings can be unpacked into each search call.
params = {
    "max_results": 3,
    "img_size":    ImgSize.Medium, 
    "img_type":    ImgType.Photo,
    "img_layout":  ImgLayout.All,
    "img_color":   ImgColor.Purple
}

# A non-Cached size means the returned links are the results' `image` URLs rather than thumbnails.
links = duckduckgo_scrape_urls("puppies", **params)
links

['https://cdn3.volusion.com/9nxdj.fchy5/v/vspfiles/photos/WR-13710-2T.jpg?1528880561',
 'http://4.bp.blogspot.com/-GKGVUan6I3w/UOQtWCzichI/AAAAAAAANs0/mxox-FdrnRA/s1600/019.jpg',
 'http://www.hahastop.com/thumbsb/The_All_Purple_Dog_b.jpg']

In [None]:
# Show the second URL returned by the purple-filtered search above.
display_img(links[1])
# why? just why??

## Downloading images

In [None]:
#export
def rmtree(path: Union[str, Path]):
    '''Delete the directory `path` together with everything found beneath it.'''
    root = Path(path)
    assert root.is_dir()

    # The recursive listing is walked back to front; presumably this puts
    # nested entries ahead of the directories that contain them.
    entries = list(root.glob('**/*'))
    for entry in entries[::-1]:
        if entry.is_file():
            entry.unlink()
        elif entry.is_dir():
            entry.rmdir()

    root.rmdir()

You can use `rmtree()` to scrub your downloaded images, either to create a new dataset or if you just want to "reset" and start over while experimenting.

In [None]:
#export

def download_urls(path: Union[str, Path], links: list, uuid_names: bool=True, timeout: Union[float, tuple]=None) -> list:
  '''Downloads urls to the given path. Returns a list of Path objects for files downloaded to disk.

  path:       target directory; created (parents included) when missing.
  links:      URLs to fetch. An empty list prints a message and returns `[]`
              without creating the directory.
  uuid_names: when True, `_` plus the first 8 characters of a uuid4 is put
              after the number in every filename.
  timeout:    passed to `requests.get` for every download.

  Files are named with a 3-digit zero-padded counter and a `.jpg` suffix.
  The counter starts at the first number (counting from 1) for which no
  `NNN*.jpg` exists in `path`, and it advances for every link, including
  links that fail. Each written file is checked with PIL's `verify()`; a
  file that fails the check is deleted and left out of the result.
  Timeouts and other errors are reported with print() and the link is skipped.
  '''
  if(len(links) == 0):
    # Return an empty list rather than None so callers can always iterate/extend the result.
    print("Nothing to download!"); return []

  path = Path(path)
  path.mkdir(parents=True, exist_ok=True)

  print("Downloading results into", path)
  pbar = progress_bar(links)
  pbar.comment = 'Images downloaded'

  i = 1
  mk_uniq = lambda : '_' + str(uuid.uuid4())[:8] if uuid_names else ''
  mk_fp = lambda x: path/(str(x).zfill(3) + mk_uniq() + ".jpg")
  is_file = lambda x: len(list(path.glob(str(x).zfill(3) + '*.jpg'))) > 0

  while is_file(i): i += 1 # don't overwrite previous searches

  results = []

  for link in pbar:
      try:
        resp = requests.get(link, timeout=timeout)
        fp = mk_fp(i)
        fp.write_bytes(resp.content)

        try:
          # The context manager closes the image even when verify() raises,
          # so no open handle is left on the file that is about to be deleted.
          with PImage.open(fp) as img:
            img.verify()
          results.append(fp)
        except Exception as e:
          # print(e)
          print(fp, "is invalid")
          fp.unlink()
      except requests.exceptions.Timeout as e:
        print("Timeout while trying to retrieve", link)
      except Exception as e:
        # print(e)
        print("Exception occured while retrieving", link)

      i += 1

  return results

Files will be saved as 001.jpg, 002.jpg, etc., but images already present will not be overwritten, so you can run multiple searches for the same label (e.g. different genres of orchid all under one 'orchid' label) and file numbering will carry on from the last one on disk.

If the `uuid_names` parameter is `True`, enough of a uuid is appended to the name of the file (like `001_4cda4d95.jpg`) to ensure filenames are unique across directories. This is for compatibility with tools like `fastai.vision.widgets.ImageClassifierCleaner`, which can move images between folders and hence cause name clashes. This is the default everywhere.

Downloaded files will be checked for validity so you should never end up with corrupt images or truncated downloads. (Let me know if anything duff gets through)

In [None]:
# Save the links scraped above into an "images/purple" folder under the current working directory.
root = Path.cwd()/"images"
download_urls(root/"purple", links)

Downloading results into C:\Users\Joe\Documents\GitHub\jmd_imagescraper\images\purple


[Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/purple/001_0e3cc95b.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/purple/002_6e8b3e7a.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/purple/003_4fddf126.jpg')]

In [None]:
#export
def duckduckgo_search(path: Union[str, Path], label: str, keywords: str, max_results: int=100,
                           img_size: ImgSize=ImgSize.Cached, 
                           img_type: ImgType=ImgType.Photo,
                           img_layout: ImgLayout=ImgLayout.Square,
                           img_color: ImgColor=ImgColor.All, 
                           timeout: Union[float, tuple]=None,
                           scrape_timeout: Union[float, tuple]=None,
                           uuid_names: bool=True) -> list:
  '''Scrape image URLs for `keywords` and download them into `path/label`.

  `scrape_timeout` applies to the URL scraping step and `timeout` to the
  image downloads. Returns whatever `download_urls` returns for the
  scraped links.
  '''
  print("Duckduckgo search:", keywords)

  # Step 1: collect the URLs for this search.
  found = duckduckgo_scrape_urls(
    keywords, max_results,
    img_size, img_type, img_layout, img_color,
    timeout=scrape_timeout,
  )

  # Step 2: fetch them into the label's own sub-folder.
  label_dir = Path(path)/label
  return download_urls(label_dir, found, uuid_names=uuid_names, timeout=timeout)

In [None]:
# Scrape and download in one call: up to three images go into the "Nice" folder under `root`.
duckduckgo_search(root, "Nice", "nice clowns", max_results=3)

Duckduckgo search: nice clowns
Downloading results into C:\Users\Joe\Documents\GitHub\jmd_imagescraper\images\Nice


[Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Nice/001_6b99919b.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Nice/002_8c1451d7.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Nice/003_d8a0fef5.jpg')]

If you want a list of all the images downloaded across multiple searches you can do it like this:

In [None]:
# Settings shared by every search in this cell.
params = dict(
    max_results=3,
    img_size=ImgSize.Cached,
    img_type=ImgType.Photo,
    img_layout=ImgLayout.Square,
    img_color=ImgColor.All,
    uuid_names=True,
)

# One (label, keywords) pair per search; the paths returned by each call are gathered in `imgs`.
searches = [("Nice", "nice clowns"), ("Scary", "scary clowns"), ("Mime", "mimes")]

imgs = []
for label, keywords in searches:
    imgs.extend(duckduckgo_search(root, label, keywords, **params))
imgs

Duckduckgo search: nice clowns
Downloading results into C:\Users\Joe\Documents\GitHub\jmd_imagescraper\images\Nice


Duckduckgo search: scary clowns
Downloading results into C:\Users\Joe\Documents\GitHub\jmd_imagescraper\images\Scary


Duckduckgo search: mimes
Downloading results into C:\Users\Joe\Documents\GitHub\jmd_imagescraper\images\Mime


[Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Nice/007_6af54a70.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Nice/008_c304d5ca.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Nice/009_efb040f8.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Scary/001_b63c3858.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Scary/002_40398473.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Scary/003_e801795a.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Mime/001_f66174ed.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Mime/002_ee152455.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Mime/003_32d7762d.jpg')]

## Creating a CSV dataset

If you want to create a very large dataset with a lot of images but don't want to store and distribute a very large file, you can create a CSV file containing URL/label pairs. Your users can then download the image files themselves.

In [None]:
#export                           
def save_urls_to_csv(path: Union[str, Path], label: str, keywords: str, max_results: int=100,
                       img_size: ImgSize=ImgSize.Cached, 
                       img_type: ImgType=ImgType.Photo,
                       img_layout: ImgLayout=ImgLayout.Square,
                       img_color: ImgColor=ImgColor.All,
                       timeout: Union[float, tuple]=None) -> None:
  '''Scrape URLs for `keywords` and add them, tagged with `label`, to the CSV file at `path`.

  A CSV holding only the `URL` and `Label` headers is written first when
  `path` does not exist yet. The file is then read back, the new rows are
  concatenated to it and the result is written out again without the index.
  '''
  csv_path = Path(path)

  # Seed a header-only file so the read below always has something to load.
  if not csv_path.exists():
    pd.DataFrame(columns=["URL", "Label"]).to_csv(csv_path, index=False)

  found = duckduckgo_scrape_urls(keywords, max_results, img_size, img_type, img_layout, img_color, timeout=timeout)
  new_rows = [{"URL": url, "Label": label} for url in found]

  existing = pd.read_csv(csv_path)
  combined = pd.concat([existing, pd.DataFrame(new_rows)])
  combined.to_csv(csv_path, index=False)

In [None]:
# Collect the URLs from two searches, each tagged with its own label, in a single CSV file.
csv = root/"clowns.csv"
save_urls_to_csv(csv, "Nice", "nice clowns", max_results=5)
save_urls_to_csv(csv, "Scary", "scary clowns", max_results=5)

In [None]:
# Read the CSV back to inspect the URL/Label rows that were written.
df = pd.read_csv(csv)
df

Unnamed: 0,URL,Label
0,https://tse4.mm.bing.net/th?id=OIP.uFX0ybAs0Hi...,Nice
1,https://tse4.mm.bing.net/th?id=OIP.s3Ie8ax_Fa6...,Nice
2,https://tse1.mm.bing.net/th?id=OIP.lwC5ho3Ta-T...,Nice
3,https://tse4.mm.bing.net/th?id=OIP.glEf94S1eD0...,Nice
4,https://tse3.mm.bing.net/th?id=OIP.n3504PAjzbN...,Nice
5,https://tse3.mm.bing.net/th?id=OIP.zMsnePdSfSb...,Scary
6,https://tse3.mm.bing.net/th?id=OIP.yhDrJ18seBC...,Scary
7,https://tse1.mm.bing.net/th?id=OIP.y5tm55MMKcW...,Scary
8,https://tse3.mm.bing.net/th?id=OIP.MWOP-aLPv8D...,Scary
9,https://tse3.mm.bing.net/th?id=OIP.AZyYLBgzuTA...,Scary


In [None]:
#export
def download_images_from_csv(path: Union[str, Path], csv: Union[str, Path], url_col: str="URL", label_col: str="Label", uuid_names: bool=True, timeout: Union[float, tuple]=None):
    '''Download the URLs from a CSV file to the given path. Returns a list of Path objects for files downloaded to disk.

    path:       root directory; each label's images go into `path/<label>`.
    csv:        CSV file holding one URL/label pair per row.
    url_col:    name of the column holding the URLs.
    label_col:  name of the column holding the labels.
    uuid_names: forwarded to `download_urls`.
    timeout:    forwarded to `download_urls`.

    Rows whose label is missing are skipped. A CSV with no rows returns `[]`.
    '''
    path = Path(path); csv = Path(csv)

    df = pd.read_csv(csv)
    # Use the caller's column name (not a fixed `Label` attribute) so CSVs with other headers work.
    labels = df[label_col].dropna().unique()
    imgs = []

    for label in labels:
        urls = df.loc[df[label_col] == label, url_col].to_list()
        # str() lets non-string labels (e.g. numeric class ids) be used as folder names.
        imgs.extend(download_urls(path/str(label), urls, uuid_names=uuid_names, timeout=timeout))

    return imgs

This will (you've guessed it) download the image files from the CSV file we've just created. You can also supply column names if you want to use it on a CSV file created elsewhere with different names.

In [None]:
# Download every URL listed in the CSV; each label gets its own sub-folder under `root`.
download_images_from_csv(root, csv)

Downloading results into C:\Users\Joe\Documents\GitHub\jmd_imagescraper\images\Nice


Downloading results into C:\Users\Joe\Documents\GitHub\jmd_imagescraper\images\Scary


[Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Nice/010_9cbdb8a7.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Nice/011_2f35c643.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Nice/012_5af5d807.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Nice/013_30a96f50.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Nice/014_b5eef117.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Scary/004_7813f590.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Scary/005_23d91904.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Scary/006_50884c99.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Scary/007_88334447.jpg'),
 Path('C:/Users/Joe/Documents/GitHub/jmd_imagescraper/images/Scary/008_b3da1cce.jpg')]

In [None]:
#hide
from nbdev.export import *
# nbdev export step: the recorded cell output lists the notebooks it converted.
notebook2script()

Converted 00_core.ipynb.
Converted 01_imagecleaner.ipynb.
Converted index.ipynb.
