In [13]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# For http interfaces and APIs
import requests
from flickrapi import FlickrAPI

# System I/O controls and clock packages
import pickle
import os
import sys
import time

### FlickrAPI Pic Collector Implementation
1. Takes a dictionary of search terms and downloads images
2. Creates a directory of each search term where images are stored (data/{search-term-directory}/{image-file}). 
3. Creates a dictionary of information regarding images gathered (not as important as it could be*).
<br> *Because I am using manual selection to remove images that don't apply, it will be required to build 
<br> the labels.csv file after the manual selection has occurred. As a result, I will build the labels.csv 
<br> file after the image collection process has been completed in the DataLoader.ipynb file.

Due to the errors that crop up from time to time while running this task (most often connection reset by peer),
<br> I have found that this is more of a manually managed process than fully automated.
<br>I kick off a search with the full dictionary of target plants and then end up removing species from the dictionary
<br>once I have enough images collected prior to restarting the process to collect the remaining species images.
<br>
<br>
In the future, I plan to create a script that will take the search inputs, number of images and directory name as arguments for easier handling.

### Important Note: FlickrAPI Throughput
FlickrAPI usage needs to stay below 3600 queries per hour. To be conservative, I've put in a 1s delay between image downloads. 

In [2]:
# Load the Flickr API key and secret from local text files.
# `with` closes each file as soon as it has been read, and .strip() removes
# surrounding whitespace so a trailing newline in the file does not end up
# inside the credential string.
with open('Key.txt', encoding='utf-8') as f_key:
    KEY = f_key.read().strip()
with open('Secret.txt', encoding='utf-8') as f_secret:
    SECRET = f_secret.read().strip()

In [3]:
# Only the 'url_q' image format is requested; the other size formats were removed.
# Entries are listed in order of preference.
SIZES = ["url_q"]

In [4]:
# Initialize the API comms with key and secret, search for public photos based on dictionary search terms.

def get_photos(image_tag):
    """Search Flickr for public photos matching ``image_tag``.

    Parameters
    ----------
    image_tag : str
        Free-text search term passed to the Flickr search as ``text``.

    Returns
    -------
    The object returned by ``FlickrAPI.walk``; callers iterate over it to get
    one photo element at a time. Each photo is asked to carry the URL fields
    named in ``SIZES``.
    """
    flickr = FlickrAPI(KEY, SECRET)
    return flickr.walk(
        text=image_tag,
        # Flickr expects `extras` as one comma-delimited string, not a list.
        # Joining keeps the request valid when SIZES holds more than one entry.
        extras=','.join(SIZES),
        privacy_filter=1,  # search only for public photos
        per_page=50,
        sort='relevance',  # most relevant matches come first
    )

In [5]:
# This method gets a single url and handles the case where multiple image sizes are searched for.
# (in the case an image doesn't exist in one size, try another from the list). 
def get_url(photo):
    """Return the first non-empty URL of ``photo`` among the ``SIZES`` fields.

    The fields are tried in the order they appear in ``SIZES``. If none of
    them yields a truthy value, ``None`` is returned.
    """
    for size_key in SIZES:
        candidate = photo.get(size_key)
        if candidate:
            return candidate
    return None

In [6]:
# This method will create a list of valid urls per plant with the max number being the value of images_per_plant
# inputs image_tag which is the current search term, max which is the images_per_plant value.
# returns a list of urls.

def get_urls(image_tag, max):
    """Collect up to ``max`` image URLs for one search term.

    Parameters
    ----------
    image_tag : str
        Search term handed to ``get_photos``.
    max : int
        Upper bound on the number of URLs to return.

    Returns
    -------
    list
        URLs in the order the search produced them. Photos for which
        ``get_url`` finds no URL are skipped and do not count towards ``max``.
    """
    urls = []
    if max <= 0:
        return urls  # nothing requested, so do not start a search at all

    for photo in get_photos(image_tag):
        url = get_url(photo)  # preferred-size url, or None
        if not url:
            continue  # no url in the desired sizes; try the next photo
        urls.append(url)
        # Stop as soon as the quota is met, so that no further photo is pulled
        # from the search results once we already have enough.
        if len(urls) >= max:
            break

    return urls

In [7]:
# This method will create directories (if they don't already exist) based on the search terms included in the input dictionary to the download method.

def create_folder(path):
    """Create the directory ``path``, including missing parents, if needed.

    Does nothing when the directory already exists. ``exist_ok=True`` replaces
    the separate "check, then create" steps, so the call cannot fail merely
    because the directory appeared between the check and the creation.
    """
    os.makedirs(path, exist_ok=True)

In [8]:
# This method downloads the images and stores them in the correct directories.
# It will ignore the file if it already exists.
# It will return a dictionary of filename, class and species though this isn't used currently.

def download_images(urls, path, plant):
    """Download every url in ``urls`` into the directory ``path``.

    Parameters
    ----------
    urls : iterable of str
        Image URLs; the text after the last "/" is used as the file name.
    path : str
        Target directory. It is created if it does not exist.
    plant : str
        Search term the images belong to. Its class label is looked up in the
        module-level ``plants`` dictionary.

    Returns
    -------
    dict
        ``{'filename': [...], 'class': [...], 'species': [...]}`` with one
        entry per image that was newly saved by this call. Files that already
        existed, and URLs whose download failed, are not listed.
    """
    create_folder(path)  # makes sure path exists

    # Keeps track of file names, class and species of the saved images
    images_dict = {'filename': [], 'class': [], 'species': []}

    for url in urls:
        image_name = url.split("/")[-1]
        image_path = os.path.join(path, image_name)
        if os.path.isfile(image_path):
            # Already downloaded: no request is made, so no delay is needed.
            continue

        try:
            # A timeout keeps one stalled connection from hanging the run, and
            # raise_for_status() stops an HTTP error page being saved as an image.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as err:
            # One failed image (e.g. connection reset by peer) should not
            # abort the remaining downloads; report it and move on.
            print('Skipping', url, '-', err)
        else:
            with open(image_path, 'wb') as outfile:
                outfile.write(response.content)
            images_dict['filename'].append(image_name)
            images_dict['class'].append(plants[plant])
            images_dict['species'].append(plant)
        time.sleep(1)  # throttle: 1s pause after every attempted download
    return images_dict

In [9]:
# This method kicks it all off. Sets number of images to gather, input is the dictionary of search terms.
# Returns dictionary of results
images_per_plant = 3000  # maximum number of image urls gathered per search term

def download(data_dict):
    """Search for and download images for every search term in ``data_dict``.

    Parameters
    ----------
    data_dict : dict
        Its keys are the search terms. Each term is also used as the name of
        the sub-directory of ``data/`` that receives the images.

    Returns
    -------
    dict
        ``{'filename': [...], 'class': [...], 'species': [...]}`` holding the
        combined results of ``download_images`` for all search terms. The
        lists are empty when ``data_dict`` is empty.
    """
    # Accumulate across all terms; previously each term overwrote the result
    # of the one before it, and an empty input left the result undefined.
    images_dict = {'filename': [], 'class': [], 'species': []}

    for key in data_dict:
        print('Getting urls for', key)
        urls = get_urls(key, images_per_plant)
        print('Downloading images for', key)
        path = os.path.join('data', key)
        downloaded = download_images(urls, path, key)
        for column in images_dict:
            images_dict[column].extend(downloaded[column])
    return images_dict

### Prepare inputs for search terms, labels for classes

In [15]:
# Create a list of search terms, scientific name, common name(s) and class.
# Heracleum mantegazzianum, giant hogweed class 0
# Echium vulgare, blueweed class 1
# Ulex europaeus, gorse class 2

# Search term -> class label; one entry per line so terms are easy to add or remove.
plants = {
    'Heracleum mantegazzianum': 0,
    'giant hogweed': 0,
}

#  'Echium vulgare':1,'blueweed': 1, 'Ulex europaeus':2, 'gorse': 2
# Need a list of class 3 plants, will start with these though.

In [16]:
# Kick off the image search and download for every search term in `plants`.
# The dictionary returned by download() is stored in `images_dict`.
images_dict = download(plants)

Getting urls for Heracleum mantegazzianum
Downloading images for Heracleum mantegazzianum
Getting urls for giant hogweed
Downloading images for giant hogweed


### Null Class Images
The plants listed below are the species that will make up my 4th class (class_3), the negative class.
Due to issues with implementing this, I am no longer using this list of species in my project but may add them again later.

For now, I will use the images I have collected once I have implemented a probabilities method of determining if a photo is one of the 3 'positive' classes or not.

<table>
    <tr>
        <th>Scientific Name</th>
        <th>Common Name</th>
    </tr>
    <tr>
        <td>allium cernuum</td>
        <td>nodding onion</td>
    </tr>
    <tr>
        <td>amsinckia menziesii var. intermedia</td>
        <td>common fiddleneck</td>
    </tr>
    <tr>
        <td>lysimachia thyrsiflora</td>
        <td>tufted loosestrife</td>
    </tr>
    <tr>
        <td>mahonia aquifolium</td>
        <td>oregon-grape</td>
    </tr>
    <tr>
        <td>maianthemum racemosum ssp. amplexicaule</td>
        <td>false solomon's-seal</td>
    </tr>
    <tr>
        <td>monotropa uniflora</td>
        <td>indian-pipe</td>
    </tr>
    <tr>
        <td>oplopanax horridus</td>
        <td>devils club plant</td>
    </tr>
    <tr>
        <td>pedicularis contorta var. contorta</td>
        <td>coil-beaked lousewort</td>
    </tr>
    <tr>
        <td>phyllodoce empetriformis</td>
        <td>pink mountain-heather</td>
    </tr>
    <tr>
        <td>potentilla drummondii</td>
        <td>drummond's cinquefoil</td>
    </tr>
    <tr>
        <td>ranunculus acris</td>
        <td>meadow buttercup</td>
    </tr>
</table>   
        

In [None]:
# Search term -> class label for all four classes.
# The null class (class 3) is planned as 300 images for each of the 22 search
# terms below: the scientific and common names of 11 plant species native to BC.

class_dict = {
    # class 0
    'Heracleum mantegazzianum': 0,
    'giant hogweed': 0,
    # class 1
    'Echium vulgare': 1,
    'blueweed': 1,
    # class 2
    'Ulex europaeus': 2,
    'gorse': 2,
    # class 3 (null class)
    'allium cernuum': 3,
    'nodding onion': 3,
    'amsinckia menziesii var. intermedia': 3,
    'common fiddleneck': 3,
    'lysimachia thyrsiflora': 3,
    'tufted loosestrife': 3,
    'mahonia aquifolium': 3,
    'oregon-grape': 3,
    'maianthemum racemosum ssp. amplexicaule': 3,
    "false solomon's-seal": 3,
    'monotropa uniflora': 3,
    'indian-pipe': 3,
    'oplopanax horridus': 3,
    'devils club plant': 3,
    'pedicularis contorta var. contorta': 3,
    'coil-beaked lousewort': 3,
    'phyllodoce empetriformis': 3,
    'pink mountain-heather': 3,
    'potentilla drummondii': 3,
    "drummond's cinquefoil": 3,
    'ranunculus acris': 3,
    'meadow buttercup': 3,
}

### Sorting Data

Because of the Internal Server 500 errors when getting the URLs for the positive classes images, I will need to create my label dataframe and save to csv from the local file system.

In [None]:
# Show the directory names that the following cell iterates over.
# NOTE(review): `dir_names` is not defined in any cell visible in this notebook -
# confirm it is created before this cell runs, otherwise a fresh kernel fails here.
print(dir_names)

In [None]:
# Build the per-directory file listings from the local file system.
#   image_names: one list of file names per directory, in dir_names order
#   image_dict:  directory name -> the file names found in that directory
# Each directory gets its own list; previously every key pointed at the same
# shared, growing list of all listings.
# NOTE(review): `dir_names` is not defined in any cell visible in this notebook -
# confirm it is created before this cell runs.
image_names = []
image_dict = {}
for dir_name in dir_names:
    files_in_dir = os.listdir(f'data/BC-images-clean/{dir_name}')
    image_names.append(files_in_dir)
    image_dict[dir_name] = files_in_dir

In [None]:
# Write the images dictionary to a pickle file. The `with` block closes the
# file once the dump is done, and the dump now targets the handle that was
# actually opened (the old code dumped to an undefined name).
# NOTE(review): this dumps `images_dict` (the result of download()); the cell
# just above builds `image_dict` from the local file system - confirm which of
# the two is meant to be saved.
with open('data/images-pickle_file2', 'wb') as image_dict_pkl:
    pickle.dump(images_dict, image_dict_pkl)