In [246]:
# web scraping
from selenium import webdriver
from selenium.webdriver.common.by import By
import requests

# images
import PIL
import PIL.Image  # `import PIL` alone does not load the Image submodule
import cv2  # used by clean_image_paths to validate the downloaded files
from io import BytesIO

# file and os management
import os, re, time, shutil
from imutils import paths

# generic
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 

# notebook display
from IPython.display import display, clear_output

In [251]:

# One row per class: the five toxic Toxicodendron species plus a catch-all
# "Nontoxic" class, which has no scientific name, category id or search page.
toxic_columns = ["class_id", "slang", "scientific_name",
                 "herbarium22_category_id", "path", "search_url"]
toxic_rows = [
    (0, "Western Poison Oak", "Toxicodendron diversilobum", 14625,
     "../input/toxic-plant-classification/tpc-imgs/000/",
     "https://www.inaturalist.org/taxa/51080-Toxicodendron-diversilobum/browse_photos"),
    (1, "Eastern Poison Oak", "Toxicodendron pubescens", 14626,
     "../input/toxic-plant-classification/tpc-imgs/001/",
     "https://www.inaturalist.org/taxa/52083-Toxicodendron-pubescens/browse_photos"),
    (2, "Eastern Poison Ivy", "Toxicodendron radicans", 14627,
     "../input/toxic-plant-classification/tpc-imgs/002/",
     "https://www.inaturalist.org/taxa/58732-Toxicodendron-radicans/browse_photos"),
    (3, "Western Poison Ivy", "Toxicodendron rydbergii", 14628,
     "../input/toxic-plant-classification/tpc-imgs/003/",
     "https://www.inaturalist.org/taxa/58729-Toxicodendron-rydbergii/browse_photos"),
    (4, "Poison Sumac", "Toxicodendron vernix", 14629,
     "../input/toxic-plant-classification/tpc-imgs/004/",
     "https://www.inaturalist.org/taxa/54767-Toxicodendron-vernix/browse_photos"),
    (5, "Nontoxic", "NA", -1,
     "../input/toxic-plant-classification/tpc-imgs/005/",
     "NA"),
]
toxic_meta = pd.DataFrame(toxic_rows, columns=toxic_columns)

# Persist the metadata table next to the notebook.
toxic_meta.to_csv("tpc_meta.csv")

# Lookup tables: class id -> common name / scientific name.
class_to_slang = {
    cid: common for cid, common in zip(toxic_meta["class_id"], toxic_meta["slang"])
}
class_to_sciname = {
    cid: latin for cid, latin in zip(toxic_meta["class_id"], toxic_meta["scientific_name"])
}
toxic_meta

Unnamed: 0,class_id,slang,scientific_name,herbarium22_category_id,path,search_url
0,0,Western Poison Oak,Toxicodendron diversilobum,14625,../input/toxic-plant-classification/tpc-imgs/000/,https://www.inaturalist.org/taxa/51080-Toxicod...
1,1,Eastern Poison Oak,Toxicodendron pubescens,14626,../input/toxic-plant-classification/tpc-imgs/001/,https://www.inaturalist.org/taxa/52083-Toxicod...
2,2,Eastern Poison Ivy,Toxicodendron radicans,14627,../input/toxic-plant-classification/tpc-imgs/002/,https://www.inaturalist.org/taxa/58732-Toxicod...
3,3,Western Poison Ivy,Toxicodendron rydbergii,14628,../input/toxic-plant-classification/tpc-imgs/003/,https://www.inaturalist.org/taxa/58729-Toxicod...
4,4,Poison Sumac,Toxicodendron vernix,14629,../input/toxic-plant-classification/tpc-imgs/004/,https://www.inaturalist.org/taxa/54767-Toxicod...
5,5,Nontoxic,,-1,../input/toxic-plant-classification/tpc-imgs/005/,


In [263]:
# Nontoxic images: one row per nontoxic species.
# Every column must list exactly five values; pd.DataFrame raises ValueError
# when the column lists have different lengths.
nontoxic_meta = pd.DataFrame({
    "class_id" : [0, 1, 2, 3, 4],
    "slang": ["Virginia creeper", "Boxelder", "Jack-in-the-pulpit", "American hog-peanut", "Fragrant Sumac"],
    "scientific_name": ["Parthenocissus quinquefolia", "Acer negundo L.", "Arisaema triphyllum", "Amphicarpaea bracteata", "Rhus aromatica"],
    "herbarium22_category_id": [10340, 83, 1055, 610, 12479],
    "search_url": ["https://www.inaturalist.org/taxa/50278-Parthenocissus-quinquefolia/browse_photos",
                   "https://www.inaturalist.org/taxa/47726-Acer-negundo/browse_photos",
                   "https://www.inaturalist.org/taxa/50310-Arisaema-triphyllum/browse_photos",
                   # TODO(FIX): not enough photos on this page (it might be a subspecies page).
                   # Kept so that class 3 has a search URL and the columns stay aligned;
                   # replace with the species-level taxon page once it is identified.
                   "https://www.inaturalist.org/taxa/447285-Amphicarpaea-bracteata-edgeworthii/browse_photos",
                   "https://www.inaturalist.org/taxa/58738-Rhus-aromatica/browse_photos"]
})

# Scraping iNaturalist 
## Scrape the search page's HTML content for image URLs, then download each image from its URL

In [275]:
def _extract_image_urls(source_code):
    """Split the page HTML on the loaded-CoverImage class marker and return the first quoted value of each chunk, in page order."""
    found = []
    for chunk in source_code.split("CoverImage low undefined loaded"):
        # Non-greedy: stop at the first closing &quot so that a chunk holding
        # several quoted values cannot be swallowed into one bogus "url".
        match = re.search('&quot;(.*?)&quot', chunk)
        if match is not None and match.group(1):
            found.append(match.group(1))
    return found


def scrape_for_urls(search_url, max_urls=5, chromedriver_path=None, max_stalled_scrolls=3):
    """
    Use selenium to open the page, scroll until enough images are loaded, and collect the
    image urls hidden in the html elements.
    By default, the search page is set so Grouping=None, Plant Phenology=Any, Order By=Faves,
    Photo Licensing=Any, and Quality Grade=Research.

    Parameters
    ----------
    search_url : str
        Address of the iNaturalist "browse_photos" page to scrape.
    max_urls : int
        Maximum number of urls to return. Values <= 0 return an empty list without
        starting a browser.
    chromedriver_path : str or None
        Location of the chromedriver executable. When None, the CHROMEDRIVER_PATH
        environment variable is used, falling back to the path this notebook was
        originally written with.
    max_stalled_scrolls : int
        Give up after this many consecutive scrolls that reveal no new url, so a page
        holding fewer than ``max_urls`` images cannot keep the loop running forever.

    Returns
    -------
    list of str
        At most ``max_urls`` unique urls, in the order they first appeared on the page.
    """
    if max_urls <= 0:
        return []

    if chromedriver_path is None:
        chromedriver_path = os.environ.get(
            "CHROMEDRIVER_PATH",
            "C:/Users/hanse/Documents/chromedriver_win32/chromedriver.exe",
        )

    # NOTE(review): newer Selenium releases may expect a Service object instead of a
    # positional driver path -- confirm against the installed Selenium version.
    driver = webdriver.Chrome(chromedriver_path)
    try:
        driver.get(search_url)
        time.sleep(5)  # let the initial page render before interacting with it

        unique_urls = {}  # dict used as an insertion-ordered set
        stalled_scrolls = 0
        while len(unique_urls) < max_urls and stalled_scrolls < max_stalled_scrolls:
            # scroll to the end of the page, then wait for the newly revealed images
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)
            root = driver.find_element(By.XPATH, "//*")  # first match of //*
            source_code = root.get_attribute('outerHTML')

            count_before = len(unique_urls)
            unique_urls.update(dict.fromkeys(_extract_image_urls(source_code)))
            if len(unique_urls) == count_before:
                stalled_scrolls += 1
            else:
                stalled_scrolls = 0

        return list(unique_urls)[:max_urls]
    finally:
        # always end the browser session, even when scraping raised
        driver.quit()

In [291]:
def get_images_and_save(class_id, urls, toxic, base_dir="../iNaturalist"):
    """
    Retrieve the images behind ``urls`` and save them in a structured format.

    Images are written to ``<base_dir>/toxic_images/<class>/`` or
    ``<base_dir>/nontoxic_images/<class>/`` as ``000.jpg``, ``001.jpg``, ... where
    ``<class>`` is ``class_id`` zero-padded to three digits. An existing class folder
    is removed first so that every call starts from a clean slate.

    Parameters
    ----------
    class_id : int
        Class index used to name the output folder.
    urls : list of str
        Image urls to download.
    toxic : bool
        Selects the ``toxic_images`` (True) or ``nontoxic_images`` (False) subfolder.
    base_dir : str
        Root folder of the downloaded dataset.

    Returns
    -------
    int
        Number of images that were downloaded and saved successfully.
    """
    subset = "toxic_images" if toxic else "nontoxic_images"
    out_dir = f"{base_dir}/{subset}/{str(class_id).zfill(3)}/"

    if os.path.isdir(out_dir):  # remove a previous download to get a clean slate
        shutil.rmtree(out_dir)
    os.makedirs(out_dir)  # also creates missing parent folders

    total = 0
    for url in urls:
        # Files are numbered by the count of successful saves, so a failed
        # download leaves no gap in the numbering.
        img_path = os.path.join(out_dir, f"{total:03d}.jpg")
        try:
            res = requests.get(url, timeout=60)
            res.raise_for_status()  # report HTTP errors instead of decoding an error page
            img = PIL.Image.open(BytesIO(res.content))
            if img.mode not in ("RGB", "L", "CMYK"):
                # e.g. RGBA or palette images cannot be written as JPEG
                img = img.convert("RGB")
            img.save(img_path)
            # update the counter
            total += 1
            clear_output(wait=True)  # allow print statements to overwrite previous ones
            display(f"[INFO] downloaded: {img_path} | Total {total}")
        except Exception as e:
            # best effort: report the failure and continue with the next url
            display(f"[INFO] error downloading {img_path}...skipping")
            display(e)
    return total


In [277]:
# Collect the image urls of the first five toxic and the first five nontoxic
# species pages. Both url lists are indexed by class id.
max_urls = 50  ##about 10 urls found per scroll
toxic_urls_list = []
nontoxic_urls_list = []

# SCRAPE iNaturalist HTML SOURCE FOR IMAGE URLS 
for row in range(5):
    # toxic species page of this row
    toxic_page = toxic_meta.loc[row, 'search_url']
    toxic_urls_list.append(scrape_for_urls(toxic_page, max_urls))
    # nontoxic species page of the same row
    nontoxic_page = nontoxic_meta.loc[row, 'search_url']
    nontoxic_urls_list.append(scrape_for_urls(nontoxic_page, max_urls))



'[INFO] downloaded: ../iNaturalist/toxic_images/000/086.jpg | Total 87'

UnboundLocalError: local variable 'p' referenced before assignment

In [299]:
# Download the images behind the scraped urls to local disk: for each class id,
# first the toxic images and then the nontoxic ones.
for class_idx in range(5):
    for url_lists, is_toxic in ((toxic_urls_list, True), (nontoxic_urls_list, False)):
        get_images_and_save(class_id=class_idx, urls=url_lists[class_idx], toxic=is_toxic)

'[INFO] downloaded: ../iNaturalist/nontoxic_images/004/066.jpg | Total 67'

# Clean Up

In [300]:
def clean_image_paths(image_directory):
    """
    Tries to load each image under ``image_directory`` using OpenCV and deletes the
    ones that cannot be read.

    An image is deleted when ``cv2.imread`` returns None or raises ``cv2.error``.
    Any other exception is propagated on purpose: with a catch-all handler, a missing
    ``cv2`` import would mark every image as corrupt and wipe the whole folder.

    Parameters
    ----------
    image_directory : str
        Folder that is searched for image files.

    Returns
    -------
    list of str
        Paths of the images that were deleted.
    """
    deleted = []
    # loop over the image paths we just downloaded
    for image_path in paths.list_images(image_directory):
        try:
            # if the image is `None` then we could not properly load it from disk
            unreadable = cv2.imread(image_path) is None
        except cv2.error:
            # if OpenCV cannot load the image then the image is likely corrupt
            print("Except")
            unreadable = True
        if unreadable:
            display("[INFO] deleting {}".format(image_path))
            os.remove(image_path)
            deleted.append(image_path)
    return deleted

In [301]:
# Validate every downloaded class folder: for each category, the toxic folder
# first and then the nontoxic one.
for class_no in range(5):
    cat = f"{class_no:03d}/"
    for image_root in ("../iNaturalist/toxic_images/", "../iNaturalist/nontoxic_images/"):
        clean_image_paths(os.path.join(image_root, cat))


Reset the image filenames so that they are indexed consecutively (000, 001, ...) without gaps.

In [None]:
# p1 = '../toxic_images/'
# p2 = '../nontoxic_images/'
# paths = [p1+"000/", p1+"001/", p1+"002/", p1+"003/", p1+"004/", p2+"000/"]

# for path in paths:
#     for i, filename in enumerate(os.listdir(path)):
#         if i < 10:
#              str_i = "00"+str(i)
#         elif 9 < i < 100:
#             str_i = "0"+str(i)
#         elif i > 99:
#             str_i = str(i)

#         os.rename(path + filename, path + str_i + ".jpg")

# Image Counts by Category

In [307]:
# Final Image Counts: 
# Print the number of files in each category folder, plus a total, for the
# toxic images and then for the nontoxic images.
p1 = "../iNaturalist/toxic_images/"
p2 = "../iNaturalist/nontoxic_images/"

for position, (heading, root) in enumerate([("Toxic Images:", p1), ("Nontoxic Images:", p2)]):
    if position:
        print("")  # blank line between the two reports
    print(heading)
    total = 0
    for c in ['000/', '001/', '002/', '003/', '004/']:
        ims = len(os.listdir(os.path.join(root, c)))
        total += ims
        print(f"Category {c} - Images: {ims}")
    print("total = ", total)


Toxic Images:
Category 000/ - Images: 87
Category 001/ - Images: 72
Category 002/ - Images: 72
Category 003/ - Images: 74
Category 004/ - Images: 89
total =  394

Nontoxic Images:
Category 000/ - Images: 96
Category 001/ - Images: 86
Category 002/ - Images: 82
Category 003/ - Images: 57
Category 004/ - Images: 67
total =  388
