In [1]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

# For http interfaces and APIs
import requests
from flickrapi import FlickrAPI

# For performing regex operations
import re
# For adding delays so that we don't spam requests
import time

from progress.bar import Bar
import os
import sys
import time
import glob


### FlickrAPI Implementation
1. Load key and secret for API authentication.
2. Search Flickr for each plant name and download the matching thumbnail images.

### FlickrAPI Throughput
The Flickr API needs to stay below 3600 queries per hour. To be conservative, I've put in a 2 s delay between image downloads. 
It doesn't really matter where the delay goes; the effect will be the same.

In [2]:
# Load the Flickr API key and secret from local text files.
# Context managers close the handles promptly, and strip() drops the trailing
# newline most editors append, which would otherwise become part of the credential.
with open('Key.txt', encoding='utf-8') as f_key:
    KEY = f_key.read().strip()
with open('Secret.txt', encoding='utf-8') as f_secret:
    SECRET = f_secret.read().strip()

In [3]:
# SIZES = ['url_s', 'url_t', 'url_q', 'url_sq' ]  # in order of preference. I only want thumbnails where possible

In [4]:
def get_photos(image_tag):
    """Start a Flickr search for ``image_tag`` and return the result of ``FlickrAPI.walk``.

    Parameters
    ----------
    image_tag : str
        Text to search for (per the original notes, matched against image
        titles and tags).

    Returns
    -------
    The object returned by ``FlickrAPI.walk``; ``get_urls`` iterates over it
    to obtain one photo at a time.
    """
    search_options = dict(
        text=image_tag,       # search term
        extras='url_q',       # ask Flickr to include the 'url_q' size URL with each photo
        privacy_filter=1,     # public photos only
        per_page=50,
        sort='relevance',     # most relevant matches first
    )
    client = FlickrAPI(KEY, SECRET)
    return client.walk(**search_options)

In [5]:
def get_url(photo):
    """Return the 'url_q' URL stored on ``photo``, or ``None`` if there is none.

    Parameters
    ----------
    photo : mapping-like
        Any object exposing ``.get(key)``; the photos yielded by
        ``get_photos`` are used this way by ``get_urls``.

    Returns
    -------
    str or None
        The value under ``'url_q'`` when it is present and non-empty,
        otherwise ``None``.
    """
    # Only one size ('url_q') is ever requested, so a single lookup is enough;
    # the previous loop over range(len('url_q')) repeated the same lookup five times.
    url = photo.get('url_q')
    return url if url else None

In [6]:
def get_urls(image_tag, max):
    """Collect up to ``max`` image URLs for the search term ``image_tag``.

    Photos are taken in the order ``get_photos`` yields them; a photo for
    which ``get_url`` returns nothing is skipped and does not count towards
    the limit.

    Parameters
    ----------
    image_tag : str
        Search term forwarded to ``get_photos``.
    max : int
        Upper bound on the number of URLs returned.

    Returns
    -------
    list
        The collected URLs (possibly fewer than ``max``).
    """
    collected = []
    for photo in get_photos(image_tag):
        # Stop as soon as the quota is filled.
        if not len(collected) < max:
            break
        link = get_url(photo)
        if link:
            collected.append(link)
    return collected

In [7]:
def create_folder(path):
    """Create directory ``path`` (including missing parents) if it does not exist.

    Parameters
    ----------
    path : str
        Directory to create.

    Raises
    ------
    FileExistsError
        If ``path`` exists but is not a directory.
    """
    # exist_ok=True makes the call idempotent without a separate isdir() check,
    # which could race with another process creating the same directory.
    os.makedirs(path, exist_ok=True)

In [8]:
def download_images(urls, path, plant):
    """Download every URL in ``urls`` into ``path`` and record what happened.

    Files that already exist in ``path`` are skipped and not recorded. Each
    attempted download adds one entry to every column of the returned
    dictionary, so the columns always have equal length.

    Parameters
    ----------
    urls : iterable of str
        Image URLs; the part after the last ``/`` is used as the file name.
    path : str
        Destination directory (created if missing).
    plant : str
        Search term the images belong to; stored in the ``'species'`` column.

    Returns
    -------
    dict
        ``{'filename': [...], 'class': [...], 'species': [...],
        'write_success': [...]}`` with one entry per attempted download.

    Raises
    ------
    KeyError
        If ``plant`` is not a key of the global ``null_class_dict``.
    """
    create_folder(path)  # makes sure path exists

    # Bookkeeping for file names, species, class and write success.
    images_dict = {'filename': [], 'class': [], 'species': [], 'write_success': []}

    for url in urls:
        image_name = url.split("/")[-1]
        image_path = os.path.join(path, image_name)
        if os.path.isfile(image_path):  # ignore if already downloaded
            continue

        try:
            response = requests.get(url, timeout=30)
            # Do not save an HTTP error page as if it were an image.
            response.raise_for_status()
            with open(image_path, 'wb') as outfile:
                outfile.write(response.content)
            written = True
        except (requests.RequestException, OSError) as err:
            print('Failed to download', url, '-', err)
            written = False
            # Remove any partial file so a later run retries this image.
            if os.path.isfile(image_path):
                os.remove(image_path)

        images_dict['filename'].append(image_name)
        # NOTE(review): the label is always read from the global null_class_dict,
        # so calling this for a plant outside that dict raises KeyError.
        images_dict['class'].append(null_class_dict[plant])
        images_dict['species'].append(plant)
        images_dict['write_success'].append(written)

        # Throttle only after a request was actually made.
        time.sleep(2)
    return images_dict

In [9]:
images_per_plant = 350

def download(data_dict):
    """Download images for every search term in ``data_dict``.

    For each key, up to ``images_per_plant`` URLs are fetched with
    ``get_urls`` and saved under ``data/<key>`` by ``download_images``.

    Parameters
    ----------
    data_dict : dict
        Mapping whose keys are the search terms; only the keys are used here.

    Returns
    -------
    dict
        The per-plant dictionaries returned by ``download_images`` merged
        column by column, so records for every plant are kept (previously
        only the last plant's records survived). Empty dict if ``data_dict``
        is empty.
    """
    images_dict = {}
    for key in data_dict:

        print('Getting urls for', key)
        urls = get_urls(key, images_per_plant)
        print('Downloading images for', key)
        path = os.path.join('data', key)
        plant_images = download_images(urls, path, key)
        # Accumulate instead of overwriting the previous plant's records.
        for column, values in plant_images.items():
            images_dict.setdefault(column, []).extend(values)
    return images_dict

### Prepare inputs for search terms, labels for classes

In [None]:
# Search terms (scientific and common names) for the positive classes, with their class label:
#   class 0 - Heracleum mantegazzianum, giant hogweed
#   class 1 - Echium vulgare, blueweed
#   class 2 - Ulex europaeus, gorse
# Entries currently switched off: 'heracleum mantegazzianum': 0, 'giant hogweed': 0
plants = {
    'blueweed': 1,
}

# A fuller list of class 3 plants is still needed; these will do as a starting point.

<table>
    <tr>
        <th>Scientific Name</th>
        <th>Common Name</th>
    </tr>
    <tr>
        <td>allium cernuum</td>
        <td>nodding onion</td>
    </tr>
    <tr>
        <td>amsinckia menziesii var. intermedia</td>
        <td>common fiddleneck</td>
    </tr>
    <tr>
        <td>lysimachia thyrsiflora</td>
        <td>tufted loosestrife</td>
    </tr>
    <tr>
        <td>mahonia aquifolium</td>
        <td>oregon-grape</td>
    </tr>
    <tr>
        <td>maianthemum racemosum ssp. amplexicaule</td>
        <td>false solomon's-seal</td>
    </tr>
    <tr>
        <td>monotropa uniflora</td>
        <td>indian-pipe</td>
    </tr>
    <tr>
        <td>oplopanax horridus</td>
        <td>devils club plant</td>
    </tr>
    <tr>
        <td>pedicularis contorta var. contorta</td>
        <td>coil-beaked lousewort</td>
    </tr>
    <tr>
        <td>phyllodoce empetriformis</td>
        <td>pink mountain-heather</td>
    </tr>
    <tr>
        <td>potentilla drummondii</td>
        <td>drummond's cinquefoil</td>
    </tr>
    <tr>
        <td>ranunculus acris</td>
        <td>meadow buttercup</td>
    </tr>
</table>   
        

In [10]:
# The null class (label 3) will be made up of roughly 300 images for each of
# 22 search terms: the scientific and common names of plant species native to BC.
#
# Full list of search terms (only the ones passed to dict.fromkeys below are active):
#   'allium cernuum', 'nodding onion',
#   'amsinckia menziesii var. intermedia', 'common fiddleneck',
#   'lysimachia thyrsiflora', 'tufted loosestrife',
#   'mahonia aquifolium', 'oregon-grape',
#   'maianthemum racemosum ssp. amplexicaule', 'false solomon\'s-seal',
#   'monotropa uniflora', 'indian-pipe',
#   'oplopanax horridus', 'devils club plant',
#   'pedicularis contorta var. contorta', 'coil-beaked lousewort',
#   'phyllodoce empetriformis', 'pink mountain-heather',
#   'potentilla drummondii', 'drummond\'s cinquefoil',
#   'ranunculus acris', 'meadow buttercup'
null_class_dict = dict.fromkeys(
    [
        'tufted loosestrife',
        'maianthemum racemosum ssp. amplexicaule',
        'pedicularis contorta var. contorta',
        'potentilla drummondii',
        'drummond\'s cinquefoil',
    ],
    3,  # every active search term gets the null-class label
)

In [11]:
# Download images for the null-class search terms and keep the bookkeeping
# dictionary that download() returns.
images_dict = download(null_class_dict)

Getting urls for tufted loosestrife
Downloading images for tufted loosestrife
Getting urls for maianthemum racemosum ssp. amplexicaule
Downloading images for maianthemum racemosum ssp. amplexicaule
Getting urls for pedicularis contorta var. contorta
Downloading images for pedicularis contorta var. contorta
Getting urls for potentilla drummondii
Downloading images for potentilla drummondii
Getting urls for drummond's cinquefoil
Downloading images for drummond's cinquefoil


In [None]:
# Write the images dictionary to a pickle file. The context manager closes the
# file, which guarantees the pickle is fully flushed to disk.
with open('data/images-pickle_file2', 'wb') as images_dict_pkl:
    pickle.dump(images_dict, images_dict_pkl)

### Sorting Data

Because of the HTTP 500 (Internal Server Error) responses when getting the URLs for the positive-class images, I will need to build my label dataframe from the local file system and save it to csv.

In [None]:
dir_names = ['blueweed', 'echium_vulgare', 'giant_hogweed', 'gorse', 'heracleum_mantegazzianum', 'ulex_europaeus']
image_names = []  # one list of .jpg paths per directory, in dir_names order
image_dict = {}   # directory name -> list of .jpg paths found in that directory
# NOTE(review): the data root is an absolute, machine-specific path, while the
# download cells write to the relative 'data' folder; confirm they are the same place.
for dir_name in dir_names:
    dir_images = glob.glob(f'/home/ksawczuk/python-repo/InvasiveId/data/{dir_name}/*.jpg')
    image_names.append(dir_images)
    # Map each directory to its own files. Previously every key pointed at the
    # same cumulative image_names list, so all entries were identical.
    image_dict[dir_name] = dir_images