In [None]:
import os
import time
import urllib
import urllib.request
from pathlib import Path
from random import randint

import pandas as pd

# CSV listing every downloaded image together with its class
BUTTERFLIES_ORIG_FILEPATH: Path = Path('./butterflies_original.csv')

# Local folder that receives the downloaded image files
IMG_FOLDER: str = 'butterfly_medium_images'

# Per-class download limit: -1 means everything available; 300 is a good choice for speed
IMAGES_PER_CLASS: int = -1

# Search and Download butterfly images
Search Flickr for photos of two different butterfly species and download them. If you are following along and want the same images and filenames that I used (so that you can reuse my bounding boxes), jump straight to the **Download From CSV** section at the bottom, which downloads them without running a new Flickr search.

Otherwise, you need to obtain your own Flickr API key and secret from
https://www.flickr.com/services/api/misc.api_keys.html
and place them (as `API_KEY` and `API_SECRET`) in a file called config.py (see config_sample.py for an example).

You will also need to install the FlickrAPI:

`!pip install flickrapi`

In [None]:
# Copyright 2014-2017 Bert Carremans
# Author: Bert Carremans <bertcarremans.be>
# Original code: https://github.com/bertcarremans/Vlindervinder/blob/master/flickr/download_flickr_photos.ipynb
# Modified: Dan Lester <dan@ideonate.com>
# License: BSD 3 clause

from flickrapi import FlickrAPI, shorturl
import config

def download_flickr_photos(keywords, size='medium', max_nb_img=-1):
    """
    Search Flickr for each keyword and download the matching images.

    Parameters
    ----------
    keywords : string, list of strings
        Keyword to search for, or a list of keywords.
    size : one of 'thumbnail', 'square', 'medium', 'original', default: 'medium'
        Size of the image to download. Only these four options are offered
        here. More options are explained at
        http://librdf.org/flickcurl/api/flickcurl-searching-search-extras.html
    max_nb_img : int, default: -1
        Maximum number of search results to process per keyword. Results that
        are skipped or fail to download still count towards this limit.
        If given a value of -1, all results are processed.

    Returns
    -------
    Images found for each keyword are saved in IMG_FOLDER under new filenames:
    a letter identifying the keyword ('A' for the first, 'B' for the second, ...)
    followed by the position of the photo in the search results and '.jpg'.
    A Pandas-ready dict is returned containing 'filename' as found in IMG_FOLDER,
    'class' indicating the search term that found the image (spaces replaced by
    underscores), 'original_url' the direct URL on Flickr for the image, and
    'flickr_page' containing the URL of the human-friendly Flickr page showing
    the image. All four lists always have the same length; only successfully
    downloaded images are listed.

    Raises
    ------
    AttributeError
        If keywords, size or max_nb_img do not satisfy the constraints above.

    Notes
    -----
    This function uses the Python package flickrapi and its walk method.
    FlickrAPI.walk has same parameters as FlickrAPI.search
    http://www.flickr.com/services/api/flickr.photos.search.html

    To use the Flickr API a set of API keys needs to be created on
    https://www.flickr.com/services/api/misc.api_keys.html
    """
    # --- Argument validation (AttributeError is kept for backward compatibility) ---
    if not isinstance(keywords, (str, list)):
        raise AttributeError('keywords must be a string or a list of strings')
    keywords_list = [keywords] if isinstance(keywords, str) else keywords
    if not all(isinstance(keyword, str) for keyword in keywords_list):
        raise AttributeError('keywords must be a string or a list of strings')

    # Maps the public size name to the Flickr "extras" field carrying that URL
    size_extras = {
        'thumbnail': 'url_t',
        'square': 'url_q',
        'medium': 'url_c',
        'original': 'url_o',
    }
    if not isinstance(size, str) or size not in size_extras:
        raise AttributeError('size must be "thumbnail", "square", "medium" or "original"')
    size_url = size_extras[size]

    # The type check comes before the comparison so that e.g. a string raises
    # the documented AttributeError instead of a TypeError from `'10' > 0`.
    if not (max_nb_img == -1 or (isinstance(max_nb_img, int) and max_nb_img > 0)):
        raise AttributeError('max_nb_img must be an integer greater than zero or equal to -1')

    flickr = FlickrAPI(config.API_KEY, config.API_SECRET)
    os.makedirs(IMG_FOLDER, exist_ok=True)

    classes = []
    filenames = []
    urls = []
    flickr_pages = []

    for ki, keyword in enumerate(keywords_list):
        cls_name = keyword.replace(" ", "_")
        prefix = chr(65 + ki)  # 'A' for the first keyword, 'B' for the second, ...
        downloaded = 0

        photos = flickr.walk(
                     text=keyword,
                     extras=size_url,
                     license='1,2,4,5',
                     per_page=50)

        # count is the 1-based position in the search results; it is used for
        # both the limit and the filename, so numbering may contain gaps.
        for count, photo in enumerate(photos, start=1):
            # Check the limit first so we do not sleep just to stop afterwards
            if max_nb_img != -1 and count > max_nb_img:
                print('Reached maximum number of images to download')
                break

            url = photo.get(size_url)
            if not url:
                # The search result carries no URL for the requested size
                print(f'Skipping {cls_name} image #{count}: no "{size}" URL available')
                continue

            # Random 1-3 second pause before each download
            # (presumably to go easy on Flickr's servers)
            time.sleep(randint(1, 3))

            filename = prefix + str(count) + ".jpg"
            try:
                # Build the page URL *before* recording anything, so a failure
                # here cannot leave the four result lists with unequal lengths.
                flickr_page = "https://flic.kr/p/%s" % (shorturl.encode(photo.get('id')),)

                print(f'Downloading {cls_name} image #{count} as {filename} from url {url}')
                urllib.request.urlretrieve(url, os.path.join(IMG_FOLDER, filename))
            except Exception as e:
                # Best effort: report the failure and carry on with the next photo
                print(e, f'Download failure {url}')
                continue

            classes.append(cls_name)
            filenames.append(filename)
            urls.append(url)
            flickr_pages.append(flickr_page)
            downloaded += 1

        print("Total images downloaded:", str(downloaded))

    return {'class': classes, 'filename': filenames, 'original_url': urls, 'flickr_page': flickr_pages}

Find some Meadow Brown Butterfly images then some Gatekeeper Butterfly images, saving them all to IMG_FOLDER

In [None]:
# One Flickr search phrase per species; the class label is derived from each phrase
butterflies = [
    'meadow brown butterfly',
    'gatekeeper butterfly',
]

# Search and download; d collects the per-image metadata lists
d = download_flickr_photos(
    keywords=butterflies,
    size='medium',
    max_nb_img=IMAGES_PER_CLASS,
)

Create a Pandas DataFrame containing all the information (most important is class and filename so we know which species is shown in each file)

In [None]:
# Tabulate the download metadata: one row per image, one column per dict key
df = pd.DataFrame(data=d)

# Save the table without the index column
df.to_csv(BUTTERFLIES_ORIG_FILEPATH, index=False)

**Stop here.** The next section should be used instead of the above code if you just want to download the same images as I used.

# Download From CSV

There is no need to run this section if you've already run the above.

Only use the code below to download the same Flickr images that I've used so you can use the same bounding boxes and other CSV data that I created.

In [None]:
# Load the image list (filename, class, URLs) from the CSV
df = pd.read_csv(filepath_or_buffer=BUTTERFLIES_ORIG_FILEPATH)

In [None]:
# Make sure the local image folder exists (no error if it is already there)
results_folder = IMG_FOLDER + '/'
os.makedirs(results_folder, exist_ok=True)

# Fetch every image listed in the CSV and store it under its recorded filename.
# A failed download is reported and skipped, so that a single dead link does
# not abort the remaining downloads.
failed_downloads = []
for filename, url in zip(df['filename'], df['original_url']):
    pathname = results_folder + filename
    print(f'Downloading {url} into {pathname}')
    try:
        urllib.request.urlretrieve(url, pathname)
    except Exception as e:
        # Best effort: keep going and list the failures at the end
        print(e, f'Download failure {url}')
        failed_downloads.append(filename)

print(f'Downloaded {len(df) - len(failed_downloads)} of {len(df)} images')
if failed_downloads:
    print('Failed downloads:', ', '.join(failed_downloads))
