<a href="https://colab.research.google.com/github/kenichinakanishi/houseplant_classifier/blob/master/Part1_Building_a_Database.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web Data Scraping

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

def getHTMLContent(link):
    """Fetch `link` and return its content parsed as a BeautifulSoup tree.

    Parameters
    ----------
    link: str or urllib.request.Request
      Target passed straight to `urlopen`.

    Returns:
    ----------
    BeautifulSoup
      The response parsed with the built-in 'html.parser'.
    """
    # Parse inside the context manager so the HTTP response is closed afterwards
    with urlopen(link) as html:
        soup = BeautifulSoup(html, 'html.parser')
    return soup

Grab data for cats.

In [None]:
# Request the ASPCA cat plant list; a browser-like User-Agent header is sent with the request
req = Request('https://www.aspca.org/pet-care/animal-poison-control/cats-plant-list', headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
# Soupify the webpage (parsed with the lxml parser)
soup = BeautifulSoup(webpage, 'lxml')
# Collect every <span> on the page, skipping the first 7 and the last 4.
# NOTE(review): presumably those skipped spans are page chrome rather than plant entries - the offsets depend on the live page layout, confirm before re-running.
content_list = soup.find_all('span')[7:-4]
# Put it in a dataframe for further processing (one row per remaining span)
df_cats = pd.DataFrame(content_list)

In [None]:
# Clean up the strings.
# Columns 0 and 4: keep the text between the first and second '>' and drop its last three characters
# (presumably the start of a closing tag such as '</a' - TODO confirm against the scraped markup).
df_cats[0] = df_cats[0].apply(lambda x: str(x).split('>')[1][:-3])
df_cats[4] = df_cats[4].apply(lambda x: str(x).split('>')[1][:-3])
# Column 1: keep the text after the first '(' and drop its last four characters
df_cats[1] = df_cats[1].apply(lambda x: str(x).split('(')[1][0:-4])
# Get rid of useless columns and rename the columns
df_cats = df_cats.drop(columns=[2,3,5,6]).rename(columns = {0:'Name',1:'Alternative Names',4:'Scientific Name',7:'Family'})
# Separate toxic and non-toxic plants: start with everything flagged toxic...
df_cats['Toxic to Cats'] = True
# ...then find the first row past index 100 whose name starts with 'A' and treat it as the start of the non-toxic rows.
# NOTE(review): this assumes the page lists toxic plants first, alphabetically, followed by the non-toxic ones, and that the toxic 'A' names all sit at index <= 100 - confirm.
first_nontoxic_cats = [index for index in df_cats[df_cats['Name'].str.startswith('A')].index if index>100][0]
# Every row from that index onwards is marked non-toxic
df_cats.loc[first_nontoxic_cats:,'Toxic to Cats'] = False

Same thing for dogs.

In [None]:
# Request the ASPCA dog plant list; a browser-like User-Agent header is sent with the request
req = Request('https://www.aspca.org/pet-care/animal-poison-control/dogs-plant-list', headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
soup = BeautifulSoup(webpage, 'lxml')                 # soupify the webpage (lxml parser)
# NOTE(review): the [7:-4] offsets presumably skip non-plant spans and depend on the live page layout - confirm before re-running.
content_list = soup.find_all('span')[7:-4]            # Keep every <span> except the first 7 and the last 4
df_dogs = pd.DataFrame(content_list)                  # Put it in a dataframe for processing (one row per span)

In [None]:
# Clean up the strings (same steps as for the cat list).
# Columns 0 and 4: keep the text between the first and second '>' and drop its last three characters
# (presumably the start of a closing tag such as '</a' - TODO confirm against the scraped markup).
df_dogs[0] = df_dogs[0].apply(lambda x: str(x).split('>')[1][:-3])
df_dogs[4] = df_dogs[4].apply(lambda x: str(x).split('>')[1][:-3])
# Column 1: keep the text after the first '(' and drop its last four characters
df_dogs[1] = df_dogs[1].apply(lambda x: str(x).split('(')[1][0:-4])
# Get rid of useless columns and rename the columns
df_dogs = df_dogs.drop(columns=[2,3,5,6]).rename(columns = {0:'Name',1:'Alternative Names',4:'Scientific Name',7:'Family'})
# Separate toxic and non-toxic plants: start with everything flagged toxic...
df_dogs['Toxic to Dogs'] = True
# ...then find the first row past index 100 whose name starts with 'A' and treat it as the start of the non-toxic rows.
# NOTE(review): this assumes the page lists toxic plants first, alphabetically, followed by the non-toxic ones, and that the toxic 'A' names all sit at index <= 100 - confirm.
first_nontoxic_dogs = [index for index in df_dogs[df_dogs['Name'].str.startswith('A')].index if index>100][0]
# Every row from that index onwards is marked non-toxic
df_dogs.loc[first_nontoxic_dogs:,'Toxic to Dogs'] = False

Merge data into one master dataframe. 

In [None]:
# Combine the dog and cat tables; the outer join keeps plants that appear on only one list
merge_keys = ['Name','Alternative Names','Scientific Name','Family']
df_catsdogs = df_dogs.merge(df_cats, how='outer', on=merge_keys).fillna('Unknown')
aspca_df = df_catsdogs.copy()

def _known_or_fallback(primary, fallback):
  """Build a row function returning row[primary], or row[fallback] when row[primary] is 'Unknown'."""
  def pick(row):
    return row[fallback] if (row[primary] == 'Unknown') else row[primary]
  return pick

# Assume same toxicity for dogs and cats if unknown.
# Kept as a row-wise apply on purpose: it is what the original used to rebuild each column.
aspca_df['Toxic to Cats'] = aspca_df.apply(_known_or_fallback('Toxic to Cats', 'Toxic to Dogs'), axis=1)
aspca_df['Toxic to Dogs'] = aspca_df.apply(_known_or_fallback('Toxic to Dogs', 'Toxic to Cats'), axis=1)

In [None]:
# Preview ten randomly sampled rows (no random_state is passed, so the rows differ between runs)
aspca_df.sample(10)

Unnamed: 0,Name,Alternative Names,Scientific Name,Family,Toxic to Dogs,Toxic to Cats
821,Poison Sumac,"Similar: Poison Ivy, Poison Oak",Toxicodendron species,Anacardiaceae,False,False
864,Sand Lily,"mountain lily, star lily",Leucocrinum montanum,Liliaceae,False,False
200,Jade Plant,"Baby Jade, Dwarf rubber plant, Jade tree, Chin...",Crassula argentea,Crassulaceae,True,True
243,Mayweed,"Poison Daisy, Stinking Chamomile",Anthemis cotula,Asteraceae,True,True
750,Mockernut Hickory,Squarenut,Carya tomentosa,Juglandaceae,False,False
333,Sago Palm,"Coontie Palm, Cardboard Palm, cycads and zamias","Cycas revoluta, zamia species",Cycadaceae,True,True
766,Muscari Armeniacum,Grape Hyacinth,Muscari armeniacum,Hyacinthaceae,False,False
195,Inkberry,"English Holly, European Holly, Oregon Holly, A...",Ilex opaca,Aquifoliaceae,True,True
749,Mistletoe Cactus,,Rhipsalis cassutha,Cactaceae,False,False
699,King and Queen Fern,"Hen and Chickens Fern, Spleenwort, Parsley fern",Asplenium bulbiferum,Polypodaceae,False,False


# Data Cleanup

In [None]:
# Keep the first row for each scientific name, then renumber the rows from 0
aspca_df = (
    aspca_df
    .drop_duplicates(subset='Scientific Name')
    .reset_index(drop=True)
    .sort_index()
)

In [None]:
# Fix mistakes in database: discard rows whose scientific name is blank or 'NONE LISTED'
invalid_sciname = aspca_df['Scientific Name'].isin(['','NONE LISTED'])
aspca_df = (
    aspca_df
    .drop(index=aspca_df.index[invalid_sciname])
    .reset_index(drop=True)
    .sort_index()
)

In [None]:
# Ensure proper capitalization for each scientific name.
def normalize_capitalization(x):
  """Capitalize the first whitespace-separated word of `x` and lower-case the rest.

  Runs of whitespace collapse to single spaces. A string with no words raises IndexError.
  """
  words = x.split()
  leading, trailing = words[0], words[1:]
  return ' '.join([leading.capitalize()] + [w.lower() for w in trailing])

# Clean up repeated species that have different names
def species_normalizer(word):
  """Drop a trailing 'sp'/'species'/'spp' marker (with or without a dot) from `word`.

  NOTE(review): when the marker is removed, the remaining words are joined WITHOUT
  a separator (e.g. 'Cycas revoluta zamia species' -> 'Cycasrevolutazamia'). The manual
  fixes further down this notebook target such glued names, so this is left unchanged.
  """
  parts = word.split()
  generic_markers = ['sp','species','spp','sp.','spp.']
  if parts[-1] not in generic_markers:
    return word
  return ''.join(parts[:-1])

# Remove cv from names, as it is an outdated way of referring to cultivars
def cv_remover(word):
  """Replace every ' cv ' (surrounded by spaces) in `word` with a single space."""
  # str.replace already returns the string unchanged when ' cv ' is absent
  return word.replace(' cv ',' ')

# Remove var. from names
def var_remover(word):
  """Replace every ' var. ' (surrounded by spaces) in `word` with a single space."""
  # str.replace already returns the string unchanged when ' var. ' is absent
  return word.replace(' var. ',' ')

# Apply each of the functions, in the order they were defined
for cleaner in (normalize_capitalization, species_normalizer, cv_remover, var_remover):
  aspca_df['Scientific Name'] = aspca_df['Scientific Name'].apply(cleaner)

# Remove special characters (only alphanumeric and whitespace characters are kept)
def _strip_special_characters(name):
  return ''.join(ch for ch in name if ch.isalnum() or ch.isspace())

aspca_df['Scientific Name'] = aspca_df['Scientific Name'].apply(_strip_special_characters)

# Reset dataframe for further processing: sort by name, drop names that are now identical, renumber rows
aspca_df = (
    aspca_df
    .sort_values('Scientific Name')
    .drop_duplicates('Scientific Name')
    .reset_index(drop=True)
    .sort_index()
)

In [None]:
# Preview ten randomly sampled rows after cleaning (no random_state is passed, so the rows differ between runs)
aspca_df.sample(10)

Unnamed: 0,Name,Alternative Names,Scientific Name,Family,Toxic to Dogs,Toxic to Cats
510,Deadly Nightshade,"Nightshade, Black Nightshade, European Bitters...",Solanum,Solanaceae,True,True
410,Trailing Peperomia,,Peperomia prostata,Piperaceae,False,False
327,Leopard Lily,,Lachenalia lilacina,Hyacinthaceae,False,False
297,Silver Pink Vine,,Hoya publcalyx,Asclepiadaceae,False,False
480,Scarlet Sage,Texas Sage,Salvia coccinea,Labiatae,False,False
292,Belmore Sentry Palm,Curly Palm,Howea belmoreana,Palmae,False,False
357,Apple,Includes crabapples,Malus sylvestrus,Rosaceae,True,True
189,Dainty Rabbits-Foot Fern,"Lacy Paw, Lacy Hare's Foot",Davallia fejeensis,Davalliaceae,False,False
132,Bergamot Orange,"Bergamot, Citrus bergamia",Citrus aurantium,Rutaceae,True,True
384,Nicotiana,"Tree Tobacco, Tobacco, Mustard Tree",Nicotiana glauca,Solanaceae,True,True


## Cross-check scientific names against http://www.worldfloraonline.org/

From experimentation with this dataset, a large number of the scientific names in the dataset are either misspelled or out-of-date synonyms for the accepted name of a species. 

Misspellings will cause issues when searching Google Images, because we would be searching for a plant that doesn't exist!

Searching for out of date synonyms can have similar issues, or cause a class to be represented twice with different labels, causing our model to have to guess between two identical plants. 

To fix this, we are going to rely on the [World Flora Online taxonomic backbone](http://www.worldfloraonline.org/downloadData), which is actively curated by taxonomic specialists of particular plant groups, with Taxonomic Expert Networks (TENs) given responsibility for updating the classifications of families. 

This database provides their data in a txt file, which we can read in and work to compare against the database scraped from the ASPCA plant toxicity database.

In [None]:
# Columns of the WFO backbone file that are used below
use_cols = ['scientificName','taxonRank','family','genus','taxonomicStatus','taxonID', 'acceptedNameUsageID']
# Tab-separated text file read from a mounted Google Drive folder, then ordered by taxonomic status
wfo_df = pd.read_csv(
    '/content/drive/My Drive/Houseplant Classifier/classification.txt',
    sep='\t',
    lineterminator='\n',
    usecols=use_cols,
).sort_values('taxonomicStatus')

In [None]:
# Preview ten randomly sampled WFO rows (no random_state is passed, so the rows differ between runs)
wfo_df.sample(10)

Unnamed: 0,taxonID,scientificName,taxonRank,family,genus,taxonomicStatus,acceptedNameUsageID
952136,wfo-0000959029,Eugenia sellowiana,SPECIES,Myrtaceae,Eugenia,Synonym,wfo-0000336633
973864,wfo-0000980782,Hedyotis recurva,SPECIES,Rubiaceae,Hedyotis,Synonym,wfo-0000255227
1223623,wfo-0001233467,Microlepia krameri,SPECIES,Dennstaedtiaceae,Microlepia,Unchecked,
958276,wfo-0000965169,Euphorbia tricolor,SPECIES,Euphorbiaceae,Euphorbia,Accepted,
346754,wfo-0000351334,Carex sinoaristata,SPECIES,Cyperaceae,Carex,Accepted,
923267,wfo-0000930031,Salix phloragna,SPECIES,Salicaceae,Salix,Unchecked,
529536,wfo-0000535004,Anemonastrum brevipedunculatum,SPECIES,Ranunculaceae,Anemonastrum,Synonym,wfo-0000535519
607087,wfo-0000612988,Coccoloba acutissima,SPECIES,Polygonaceae,Coccoloba,Synonym,wfo-0000613349
695250,wfo-0000701497,Geranium trilobum,SPECIES,Geraniaceae,Geranium,Synonym,wfo-0000700548
470827,wfo-0000476062,Pentaglottis suberifolia,SPECIES,Boraginaceae,Pentaglottis,Unchecked,


In [None]:
# The scraped 'Family' column is discarded: we trust the WFO database more
aspca_df = aspca_df.drop(columns='Family')
# Left-join the WFO records onto the scraped scientific names to get trusted info
aspca_df = aspca_df.merge(wfo_df, how='left', left_on='Scientific Name', right_on='scientificName')
# Sorting by taxonomicStatus before de-duplicating keeps accepted names as priority;
# anything still missing afterwards is filled with 'Unknown'
aspca_df = (
    aspca_df
    .sort_values('taxonomicStatus')
    .drop_duplicates('Scientific Name', keep='first')
    .reset_index(drop=True)
    .fillna('Unknown')
)

### Fix unknown data

Many scientific names refer to the same species but are off by a few letters due to errors in the database. Let's use a function from difflib to determine string distances to spot these errors. We can sort the dataframe and compare only to the scientific names that begin with the same letter to save time.  

In [None]:
# Clean up and deal with scientific names that are unknown, due to misspellings or otherwise.
aspca_df = (
    aspca_df
    .sort_values('taxonomicStatus')
    .drop_duplicates('Scientific Name', keep='first')
    .reset_index(drop=True)
)
# Index labels of the rows that found no match in the WFO data
unknown_idx = aspca_df.index[aspca_df['taxonomicStatus'] == 'Unknown']
print(len(unknown_idx))

101


In [None]:
def get_closest_name(unknown_name, name_df = wfo_df, name_col = 'scientificName', threshold=0.9, verbose=False):
  """ Matches an 'unknown_name' against accepted names in a 'name_df'. Will return the name that is closest, provided it is above a 'threshold' of closeness.

  Parameters
  ----------
  unknown_name: str
    Name we want to match against accepted names.
  name_df: DataFrame
    DataFrame containing accepted names. The default is bound to the `wfo_df`
    object that exists when this function is defined.
  name_col: str, name of name_df column
    DataFrame column containing accepted names.
  threshold: float
    How closely does the unknown_name need to match with the accepted name
    (difflib SequenceMatcher ratio, between 0 and 1).
    If strictly above this threshold, the name is added to a dictionary of possible names.
  verbose: bool
    Should the function print the entire dictionary of possible names.

  Returns:
  ----------
  str
    Closest name to 'unknown_name' that was above the given 'threshold',
    or '' when there is no such name (a message is printed in both cases).
  """
  from difflib import SequenceMatcher

  # An empty name has no first letter to filter candidates on
  if not unknown_name:
    print(f'No names close enough to {unknown_name}.')
    return ''

  names = name_df[name_col]
  # Only look through entries with the same first letter to save time.
  # na=False keeps missing names out of the mask instead of breaking the boolean indexing.
  candidates = names[names.str.startswith(unknown_name[0], na=False)].values

  poss_names = {}
  for true_sciname in candidates:
    similar_score = SequenceMatcher(None, unknown_name, true_sciname).ratio()
    if similar_score>threshold:
      poss_names[true_sciname]=similar_score
  if verbose == True:
    print(poss_names)
  # If the dict is empty, nothing was close enough
  if not poss_names:
    print(f'No names close enough to {unknown_name}.')
    return ''
  # Pick the best candidate once and reuse it for both the message and the return value
  best_name, best_score = max(poss_names.items(), key=lambda item: item[1])
  print(f'{unknown_name} is closest to {best_name}, with a score of {best_score:.2f}')
  return best_name

In [None]:
def fix_name(unknown_name, true_name):
  """ Fixes the aspca_df entries according to the accepted wfo_df entry.

  Every row of the global `aspca_df` whose 'Scientific Name' equals `unknown_name`
  gets its 'Scientific Name', 'family', 'genus' and 'taxonomicStatus' overwritten, in place,
  with the values of the first `wfo_df` row whose 'scientificName' equals `true_name`.
  No other column is touched. Nothing happens if `unknown_name` is not present.

  Parameters
  ----------
  unknown_name: str
    Name we want to fix.
  true_name: str
    Accepted name to use.

  Raises
  ------
  IndexError
    If `true_name` has no entry in `wfo_df`.
  """
  # Select the rows we're looking to change by value rather than by position
  rows_to_fix = aspca_df['Scientific Name'] == unknown_name
  # Grab accepted data from wfo database based on a scientific name lookup
  true_data = wfo_df[wfo_df['scientificName'] == true_name]
  if true_data.empty:
    raise IndexError(f'{true_name} was not found in wfo_df, cannot fix {unknown_name}.')
  true_sciname = true_data['scientificName'].values[0]
  true_family = true_data['family'].values[0]
  true_genus = true_data['genus'].values[0]
  true_taxonomicStatus = true_data['taxonomicStatus'].values[0]
  # Change scientific name, family, genus and taxonomic status to accepted versions.
  # Columns are addressed by name so the update does not depend on column order or on a freshly reset index.
  aspca_df.loc[rows_to_fix, 'Scientific Name'] = true_sciname
  aspca_df.loc[rows_to_fix, 'family'] = true_family
  aspca_df.loc[rows_to_fix, 'genus'] = true_genus
  aspca_df.loc[rows_to_fix, 'taxonomicStatus'] = true_taxonomicStatus

In [None]:
# Try to repair every unmatched scientific name with its closest WFO name
unknown_idx = aspca_df[aspca_df.taxonomicStatus == 'Unknown'].index
print(f'{len(unknown_idx)} plants currently cannot be matched.')
from tqdm.notebook import tqdm
for i in tqdm(unknown_idx):
  # unknown_idx holds index labels, so look the name up by label and column name
  unknown_name = aspca_df.loc[i, 'Scientific Name']
  closest_name = get_closest_name(unknown_name)
  # '' means no candidate was close enough; leave those rows for the manual fixes
  if closest_name == '':
    continue
  fix_name(unknown_name,closest_name)

101 plants currently cannot be matched.


HBox(children=(FloatProgress(value=0.0, max=101.0), HTML(value='')))

Nephrolepsis exalta is closest to Nephrolepis exaltata, with a score of 0.92
No names close enough to Nephrolepsis cordifolia plumosa.
No names close enough to Nephrolepsis cordifolia duffii.
No names close enough to Nephrolepis exalta bostoniensis.
Nephrolepis exalta is closest to Nephrolepis exaltata, with a score of 0.95
No names close enough to Neoregalia.
No names close enough to Miltonia roezlii alba.
No names close enough to Maranta insignis.
Malus sylvestrus is closest to Malus sylvestris, with a score of 0.94
No names close enough to Lilium orientalis.
No names close enough to Lampranthus piquet.
Lavendula angustifolia is closest to Lavandula angustifolia, with a score of 0.95
Tolmeia menziesii is closest to Tolmiea menziesii, with a score of 0.94
Kalmia poliifolia is closest to Kalmia polifolia, with a score of 0.97
Kalmia augustifolia is closest to Kalmia angustifolia, with a score of 0.95
Jasminium is closest to Jasminum, with a score of 0.94
Hoya publcalyx is closest to Ho

### Manual Fixes

Unfortunately, some of these unidentified species don't have an entry in the database that is close enough for me to feel comfortable with automatic fixing. Hence, we do some manual fixes for the remaining unknowns. Thankfully, the above code has reduced the number of samples that need manual attention by around 60. 

In [None]:
# Scientific names that don't match anything on record automatically
# (selected before the de-duplication below)
unknown_df = aspca_df[aspca_df.taxonomicStatus == 'Unknown']
# Re-sort, drop duplicated scientific names (first row by taxonomicStatus is kept) and renumber the rows
aspca_df = aspca_df.sort_values('taxonomicStatus').drop_duplicates('Scientific Name', keep='first').reset_index(drop=True)
# Synonyms that don't have a database link to the accepted name
unknown_ids = aspca_df[(aspca_df.acceptedNameUsageID == 'Unknown') & (aspca_df.taxonomicStatus == 'Synonym')]
# Number of rows that still need manual attention (shown as the cell output)
len(unknown_ids) + len(unknown_df)

52

In [None]:
# Manually fix some scientific names that don't match anything on record automatically.
# Each pair is (name currently in aspca_df, name to look up in wfo_df); applied in order.
manual_unknown_fixes = [
    ('Nephrolepsis cordifolia plumosa', 'Nephrolepis cordifolia'),
    ('Nephrolepsis cordifolia duffii', 'Nephrolepis cordifolia'),
    ('Nephrolepis exalta bostoniensis', 'Nephrolepis exaltata'),
    ('Neoregalia', 'Neoregelia'),
    ('Miltonia roezlii alba', 'Miltonia roezlii'),
    ('Maranta insignis', 'Calathea insignis'),
    ('Lilium orientalis', 'Lilium japonicum'),
    ('Lampranthus piquet', 'Lampranthus piquetbergensis'),
    ('Hoya carnosa krinkle kurl', 'Hoya carnosa'),
    ('Hemigraphis exotica', 'Hemigraphis alternata'),
    ('Lilium asiatica', 'Lilium japonicum'),
    ('Nolina tuberculata', 'Beaucarnea recurvata'),
    ('Giant dracaena', 'Cordyline australis'),
    ('Scindapsusphilodendron', 'Philodendron scandens'),
    ('Schefflera or brassia actinoplylla', 'Schefflera actinophylla'),
    ('Phoenix robellinii', 'Phoenix roebelenii'),
    ('Peperomia serpens variegata', 'Peperomia serpens'),
    ('Bertolonia mosaica', 'Fittonia albivenis'),
    ('Begonia semperflorens cultivar', 'Begonia semperflorens'),
    ('Begonia rex peace', 'Begonia rex'),
    ('Asparagus densiflorus sprengeri', 'Asparagus densiflorus'),
    ('Albiflora', 'Tradescantia zebrina'),
    ('Acantha', 'Acanthus'),
    ('Episcia cultivar', 'Episcia'),
    ('Echevaria', 'Echeveria'),
    ('Echeveria puloliver', 'Echeveria harmsii'),
    ('Dypsis lutescens chrysalidocarpus lutescens alternate scientific name', 'Dypsis lutescens'),
    ('Draceana', 'Dracaena'),
    ('Daucus carota sativa', 'Daucus carota'),
    ('Ceratostigma larpentiae', 'Ceratostigma plumbaginoides'),
    ('Cycasrevolutazamia', 'Cycas revoluta'),
    ('Cucurbita maxima turbaniformis', 'Cucurbita maxima'),
    ('Cucurbita maxima hubbard', 'Cucurbita maxima'),
    ('Cucurbita maxima butternut', 'Cucurbita maxima'),
    ('Cucurbita maxima banana', 'Cucurbita maxima'),
    ('Cucurbita maxima buttercup', 'Cucurbita maxima'),
    ('Cucurbia pepo zucchini', 'Cucurbita pepo'),
    ('Cryptanthus bivattus minor', 'Cryptanthus bivittatus'),
    ('Cycasandzamia', 'Cycas'),
]
for listed_name, replacement_name in manual_unknown_fixes:
  fix_name(listed_name, replacement_name)

In [None]:
# Manually match up synonyms that don't have a database link to the accepted name.
# Each pair is (name currently in aspca_df, name to look up in wfo_df); applied in order.
manual_synonym_fixes = [
    ('Chlorophytum bichetii', 'Chlorophytum laxum'),
    ('Rhapis flabelliformis', 'Rhapis excelsa'),
    ('Cleome hassleriana', 'Cleome spinosa'),
    ('Pellionia pulchra', 'Pellionia repens'),
    ('Cissus discolor', 'Cissus javana'),
    ('Miltonia roezlii', 'Miltoniopsis roezlii'),
    ('Sorghum vulgare var. sudanense', 'Sorghum bicolor'),
    ('Camellia japonica var. japonica', 'Camellia japonica'),
    ('Onychium japonicum', 'Onychium japonicum'),
    ('Epidendrum atropurpureum', 'Psychilis atropurpurea'),
    ('Philodendron scandens', 'Philodendron hederaceum'),
    ('Origanum vulgare var. hirtum', 'Origanum vulgare subsp. hirtum'),
    ('Guzmania lingulata var. minor', 'Guzmania lingulata var. concolor'),
    ('Lavandula angustifolia', 'Lavandula angustifolia'),
    ('Begonia semperflorens', 'Begonia cucullata'),
    ('Calathea insignis', 'Calathea crotalifera'),
    ('Citrus ×limonia', 'Citrus limon'),
    ('Coleus amboinicus', 'Plectranthus amboinicus'),
    ('Rhipsalis cassytha', 'Rhipsalis dichotoma'),
    ('Lycopersicon', 'Solanum lycopersicum'),
    ('Lachenalia lilacina', 'Iris domestica'),
    ('Cymopterus watsonii', 'Cymopterus terebinthinus'),
]
for listed_name, replacement_name in manual_synonym_fixes:
  fix_name(listed_name, replacement_name)

Finally, we check that every houseplant now has an accepted scientific name we will use for image lookup. 

In [None]:
# Scientific names that still don't match anything on record
# (selected before the de-duplication below)
unknown_df = aspca_df[aspca_df.taxonomicStatus == 'Unknown']
# Re-sort, drop duplicated scientific names (first row by taxonomicStatus is kept) and renumber the rows
aspca_df = aspca_df.sort_values('taxonomicStatus').drop_duplicates('Scientific Name', keep='first').reset_index(drop=True)
# Synonyms that don't have a database link to the accepted name
unknown_ids = aspca_df[(aspca_df.acceptedNameUsageID == 'Unknown') & (aspca_df.taxonomicStatus == 'Synonym')]
# Number of rows still needing attention (shown as the cell output)
len(unknown_ids) + len(unknown_df)

0

### Fix synonymous scientific names

Scientific names can change over time due to updated research. If a scientific name is a synonym for an accepted one, we'd like to use the accepted one. 

In [None]:
# Index labels of the rows whose scientific name is only a synonym of an accepted name
is_synonym = aspca_df['taxonomicStatus'].values == 'Synonym'
synonym_idx = aspca_df.index[is_synonym]
print(f'{len(synonym_idx)} entries have a more acceptable synonym')

74 entries have a more acceptable synonym


In [None]:
# Work to update the remaining scientific names that are synonyms for their accepted scientific names
aspca_df = aspca_df.sort_values('taxonomicStatus').drop_duplicates('Scientific Name', keep='first').reset_index(drop=True)
synonym_idx = aspca_df[aspca_df['taxonomicStatus'].values == 'Synonym'].index
for i in synonym_idx:
  # Get the row we're looking to change (synonym_idx holds index labels, so look it up by label)
  synonym_data = aspca_df.loc[i]
  synonym_name = synonym_data['Scientific Name']
  # Grab accepted data from wfo database based on ID lookup
  true_data = wfo_df[wfo_df['taxonID'] == synonym_data['acceptedNameUsageID']]
  if true_data.empty:
    # No WFO record carries this accepted-name ID; report it instead of failing on an empty lookup
    print(f'No accepted name on record for {synonym_name}.')
    continue
  # Read the accepted name by column name rather than by column position
  true_sciname = true_data['scientificName'].values[0]
  fix_name(synonym_name,true_sciname)

In [None]:
# Re-count the rows whose scientific name is still only a synonym
is_synonym = aspca_df['taxonomicStatus'].values == 'Synonym'
synonym_idx = aspca_df.index[is_synonym]
print(f'{len(synonym_idx)} entries have a more acceptable synonym')

0 entries have a more acceptable synonym


### Finish off
We apply a few more cleaning steps to reorganize the data now that we have a clean dataset. 
Namely, we drop duplicates, reset the index, fix up the genus of one-word houseplant names, remove columns we no longer need and standardize the names. 

In [None]:
# Sort and drop again, then order the rows by scientific name and renumber them
aspca_df = (
    aspca_df
    .sort_values('taxonomicStatus')
    .drop_duplicates('Scientific Name', keep='first')
    .sort_values('Scientific Name')
    .reset_index(drop=True)
    .sort_index()
)
# Set genus of one-word names to be the name, rather than NaN / 'Unknown'
genus_missing = aspca_df['genus'].fillna('Unknown') == 'Unknown'
aspca_df.loc[genus_missing, 'genus'] = aspca_df.loc[genus_missing, 'Scientific Name']
# Drop columns we no longer need and standardize the remaining column names
aspca_df = (
    aspca_df
    .drop(columns=['taxonID', 'scientificName', 'taxonomicStatus', 'acceptedNameUsageID', 'taxonRank'])
    .rename(columns={'genus':'Genus', 'family':'Family'})
)
# Reorder columns
cols = ['Name', 'Scientific Name', 'Genus', 'Family', 'Alternative Names', 'Toxic to Dogs', 'Toxic to Cats']
aspca_df = aspca_df[cols]

In [None]:
# Save the cleaned table to the current working directory (the index is written as the first column)
aspca_df.to_csv('Plant Toxicity - v6.csv')
# Preview ten randomly sampled rows (no random_state is passed, so the rows differ between runs)
aspca_df.sample(10)

Unnamed: 0,Name,Scientific Name,Genus,Family,Alternative Names,Toxic to Dogs,Toxic to Cats
210,Blooming Sally,Epilobium angustifolium,Epilobium,Onagraceae,"Willow Herb, Great Willow Herb, Fire Weed",False,False
489,Arrow-Head Vine,Syngonium podophyllum,Syngonium,Araceae,"Nephthytis, Green Gold Naphthysis, African Eve...",True,True
194,Epazote,Dysphania ambrosioides,Dysphania,Amaranthaceae,"Wormseed, Jusuit's Tea, Mexican Tea, Paico",True,True
444,Kenya Violet,Saintpaulia confusa,Saintpaulia,Gesneriaceae,Usambra violet,False,False
49,Yellowrocket,Barbarea vulgaris,Barbarea,Brassicaceae,,False,False
153,Fig Leaf Gourd,Cucurbita ficifolia,Cucurbita,Cucurbitaceae,Malabar Gourd,False,False
75,China Aster,Callistephus chinensis,Callistephus,Asteraceae,"Annual Aster, Aster Sinensis",False,False
14,Hollyhock,Alcea rosea,Alcea,Malvaceae,,False,False
173,Leopard Orchid,Dendrobium gracilicaule,Dendrobium,Orchidaceae,Tiger Orchid,False,False
500,Fortunes Palm,Trachycarpus fortunei,Trachycarpus,Arecaceae,Chusan palm,False,False


Some notes: 
*   Names that typically refer to a specific part of a plant such as a melon or carrot might be a bit tricky.
*   Need to consider whether it is better to google search for the common name or scientific name.
*   Classification of species with a lot of specific varieties as well as a generic entry e.g. lily/red lily/tiger lily etc. might also be difficult to manage. 



# Explore the Data

Now we have a lot of useful information on plant toxicity to dogs and cats. 

In [None]:
# Show the first five rows of the final table
aspca_df.head()

Unnamed: 0,Name,Scientific Name,Genus,Family,Alternative Names,Toxic to Dogs,Toxic to Cats
0,Sand Verbena,Abronia fragrans,Abronia,Nyctaginaceae,"Prairie Snowball, Wild Lantana",False,False
1,Prayer Bean,Abrus precatorius,Abrus,Fabaceae,"Rosary Pea, Buddhist Rosary Bead, Indian Bead,...",True,True
2,Chenille Plant,Acalypha hispida,Acalypha,Euphorbiaceae,"Philippine Medusa, Foxtail, Red-hot Cat Tail",False,False
3,Copperleaf,Acalypha wilkesiana,Acalypha,Euphorbiaceae,Lance Copperleaf,False,False
4,Measles Plant,Acanthus,Acanthus,Acanthaceae,"Polka Dot Plant, Flamingo Plant, Baby’s Tears,...",False,False


Interestingly, there are only a few plants which have species-specific toxicity - lilies and walnuts.

In [None]:
# Rows where the dog and cat toxicity flags disagree
aspca_df[aspca_df['Toxic to Dogs'] != aspca_df['Toxic to Cats']]

Unnamed: 0,Name,Scientific Name,Genus,Family,Alternative Names,Toxic to Dogs,Toxic to Cats
258,Day Lilies (many varieties),Hemerocallis,Hemerocallis,Asphodelaceae,,False,True
259,Orange Day Lily,Hemerocallis minor,Hemerocallis,Asphodelaceae,,False,True
291,Black Walnut,Juglans nigra,Juglans,Juglandaceae,,True,False
312,Lily,Lilium,Lilium,Liliaceae,,False,True
313,Stargazer Lily,Lilium japonicum,Lilium,Liliaceae,,False,True
314,Tiger Lily,Lilium lancifolium,Lilium,Liliaceae,,False,True
315,Easter Lily,Lilium longiflorum,Lilium,Liliaceae,,False,True
316,Red Lily,Lilium philadelphicum,Lilium,Liliaceae,,False,True
317,Japanese Show Lily,Lilium speciosum,Lilium,Liliaceae,,False,True


We can also notice that not all members of a family will be toxic/non-toxic.

In [None]:
# Per-family aggregate of the two toxicity flags (pivot_table's default aggregation), sorted by the dog value;
# rows 70-79 of the sorted table are shown, a hand-picked window where the values are fractional
aspca_df[['Family','Toxic to Dogs','Toxic to Cats']].pivot_table(index = 'Family').sort_values(by='Toxic to Dogs')[70:80]

Unnamed: 0_level_0,Toxic to Cats,Toxic to Dogs
Family,Unnamed: 1_level_1,Unnamed: 2_level_1
Hydrangeaceae,0.5,0.5
Oleaceae,0.5,0.5
Euphorbiaceae,0.6,0.6
Fabaceae,0.6,0.6
Berberidaceae,0.666667,0.666667
Apiaceae,0.666667,0.666667
Polygonaceae,0.666667,0.666667
Moraceae,0.666667,0.666667
Araliaceae,0.714286,0.714286
Caryophyllaceae,0.75,0.75


In [None]:
# How many Families have mixed toxicity (per-family dog value strictly between 0 and 1)
len(
    aspca_df[['Family','Toxic to Dogs','Toxic to Cats']]
    .pivot_table(index = 'Family')
    .sort_values(by='Toxic to Dogs')
    .loc[lambda by_family: by_family['Toxic to Dogs'].apply(lambda x: 0<x<1)]
)

33

In [None]:
# How many Families: number of distinct values in the 'Family' column
len(aspca_df['Family'].unique())

109

The same is true at the genus level, although only 7 genera have mixed toxicity. Hence, classification of a plant will need to be more granular to be safe. 


In [None]:
# Per-genus aggregate of the two toxicity flags (pivot_table's default aggregation), sorted by the dog value;
# rows 208-217 of the sorted table are shown, a hand-picked window around the fractional values
aspca_df[['Genus','Toxic to Dogs','Toxic to Cats']].pivot_table(index = 'Genus').sort_values(by='Toxic to Dogs')[208:218]

Unnamed: 0_level_0,Toxic to Cats,Toxic to Dogs
Genus,Unnamed: 1_level_1,Unnamed: 2_level_1
Dionaea,0.0,0.0
Plectranthus,0.25,0.25
Ipomoea,0.5,0.5
Tradescantia,0.5,0.5
Dracaena,0.6,0.6
Schefflera,0.666667,0.666667
Cordyline,0.666667,0.666667
Iris,0.666667,0.666667
Prunus,1.0,1.0
Asclepias,1.0,1.0


In [None]:
# How many genera have mixed toxicity (per-genus dog value strictly between 0 and 1)
len(
    aspca_df[['Genus','Toxic to Dogs','Toxic to Cats']]
    .pivot_table(index = 'Genus')
    .sort_values(by='Toxic to Dogs')
    .loc[lambda by_genus: by_genus['Toxic to Dogs'].apply(lambda x: 0<x<1)]
)

7

In [None]:
# How many genera: number of rows in the per-genus pivot table
len(aspca_df[['Genus','Toxic to Dogs','Toxic to Cats']].pivot_table(index = 'Genus').sort_values(by='Toxic to Dogs'))

350

# Scraping Google Images for URLS

[URL scraping](https://towardsdatascience.com/image-scraping-with-python-a96feda8af2d) was achieved using Selenium. 
Selenium is extremely powerful, and nearly all interactions with a website can be simulated.


In [None]:
# If running in Colabs
!pip install selenium -q
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver -q
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

# Import and setup the Selenium webdriver
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
wd = webdriver.Chrome('chromedriver',chrome_options=chrome_options)

Get:1 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:3 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:4 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Hit:5 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Get:6 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Get:7 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:9 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:10 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [697 B]
Get:11 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release [564 B]
Get:12 https://developer.download.nvidia.com/compute/cuda/repos/

  if sys.path[0] == '':


The Selenium webdriver acts as our virtual browser, and can be controlled through python commands. Here we adapt a script used to grab image files from google images to only look for and download thumbnails, as we are going to be grabbing a lot of images. 

A small catch is that many of google's image thumbnails are stored as base64 encoded images. We'd like to also grab these so we don't miss out on any images with high relevance, as the further along we go in search results, the worse the images become for training purposes.

In [None]:
import requests
import time

def fetch_thumbnail_urls(query:str, max_links_to_fetch:int, wd:webdriver, sleep_between_interactions:int=1, non_commercial=False, shuffle=False, load_more_wait:int=30):
    """ Gathers thumbnail urls from google images based on a query, using a Selenium webdriver (wd).
    sleep_between_interactions can be changed to accommodate slower connections/computers.
    If shuffle is true, the list of urls returned will be shuffled into a random order.

    Each thumbnail `src` is returned at most once, in the order it was first seen.
    The search stops early if two consecutive passes over the page yield no new thumbnail.

    Parameters
    ----------
    query: str
      Query passed to Google Images.
    max_links_to_fetch: int
      Number of URLs to fetch.
    wd: Selenium webdriver
      Selenium webdriver instance to use.
    sleep_between_interactions: int
       Time in seconds to wait after each scroll.
    non_commercial: bool
      Should the function search for images tagged only for non commercial use.
    shuffle: bool
      Should the order of the URLs returned be shuffled.
    load_more_wait: int
      Time in seconds to wait before clicking the "load more" button.

    Returns:
    ----------
    List
      List of URLs (http(s) addresses and base64 `data:` URIs), at most `max_links_to_fetch` long.
    """
    # Local import: `random` is needed for shuffling and is not imported alongside this function
    import random

    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    if non_commercial == True:
      search_url = 'https://www.google.com/search?as_st=y&source=hp&safe=off&tbm=isch&as_epq={q}&gs_l=img&tbs=sur%3Af'
    else:
      search_url = "https://www.google.com/search?as_st=y&source=hp&safe=off&tbm=isch&as_epq={q}&gs_l=img"
    # load the page
    wd.get(search_url.format(q=query))

    image_urls = []
    seen_sources = set()   # every src already collected, so re-scanning the page cannot add duplicates
    stalled_passes = 0     # consecutive passes that found nothing new
    while len(image_urls) < max_links_to_fetch:
        scroll_to_end(wd)
        found_before = len(image_urls)

        # get all image thumbnail results; find_elements(by, value) is available in Selenium 3 and 4
        thumbnail_results = wd.find_elements("css selector", "img.Q4LuWd")

        # The whole list is re-scanned on every pass so thumbnails whose src was
        # filled in late are still picked up
        for img in thumbnail_results:
            src = img.get_attribute('src')
            # keep images available from an address, and the base64 encoded images google uses
            if src and ('http' in src or 'data' in src) and src not in seen_sources:
                seen_sources.add(src)
                image_urls.append(src)

            # break if we reach the specified quota
            if len(image_urls) >= max_links_to_fetch:
                break
        # the for loop finished without reaching the quota: we need more images
        else:
            stalled_passes = stalled_passes + 1 if len(image_urls) == found_before else 0
            if stalled_passes >= 2:
                print(f'Only {len(image_urls)} thumbnails found for {query}.')
                break
            time.sleep(load_more_wait)
            # click the load more images button, if the page has one
            if wd.find_elements("css selector", ".mye4qd"):
                wd.execute_script("document.querySelector('.mye4qd').click();")

    if shuffle==True:
      random.shuffle(image_urls)

    return image_urls

Great! Now we have a way to scrape google images for pictures without having to build that entire database by hand! 

Onto fast.ai v2 to download, organize and build a cnn classifier. 

# Downloading Images with FastAI

Fastai v2 has a built-in download_images function that... downloads images. 
However - we'd like to upgrade it a little bit to hash the images as they come in, and ignore/delete any duplicates so we end up with consistent sets of unique images. 

We also choose to use only the scientific name to search for images - due to common plant names including things like "Elephant Ears", "Fluffy Ruffles", "Pink Pearl" and many other objects that will confuse the search. 

In [None]:
# Run once per session: install the pinned fastai release
!pip install fastai==2.0.18 -q
# Star import: later cells use names such as Path, parallel and partial without importing them themselves.
# NOTE(review): presumably this import is what provides them - confirm before narrowing it.
from fastai.vision.all import *

[K     |████████████████████████████████| 358kB 5.2MB/s 
[K     |████████████████████████████████| 51kB 4.9MB/s 
[?25h



In [None]:
import io
from PIL import Image
import base64
import hashlib
def download_images(dest, url_file=None, urls=None, max_pics=150, n_workers=1, timeout=4):
    """Download images from a list of URLs into `dest`, keeping at most `max_pics` files.

    Duplicate protection uses hash keys: `_download_image_inner` hashes the pixel
    data of every image it fetches and compares it against the module-level
    `hash_keys` dict before keeping the file. That dict is reset here at the
    start of every call, so each call starts with an empty duplicate table.

    Parameters
    ----------
    dest: Path or str
      Download destination folder. Created (including parents) if missing.
    url_file:
      Object whose `.read()` returns newline-separated URLs. Only used when
      `urls` is None.
    urls:
      List of URLs. Takes precedence over `url_file`.
    max_pics: int
      Stop saving once `dest` holds this many files.
    n_workers: int
      Accepted for signature compatibility but ignored: the work is always
      dispatched with `n_workers=1`, because the duplicate check relies on the
      URLs being processed one after another.
    timeout: int
      Timeout forwarded to the per-URL download.

    Returns
    -------
    None. Images are written to `dest` as a side effect.

    Raises
    ------
    ValueError
      If neither `urls` nor `url_file` is given.
    """
    # The worker looks the table up as a module-level global, so it has to be
    # reset at module scope; a plain local assignment would never be seen by it.
    global hash_keys
    hash_keys = dict()
    if urls is None:
        if url_file is None:
            raise ValueError("Provide either `urls` or `url_file`.")
        urls = url_file.read().strip().split("\n")
    dest = Path(dest)
    dest.mkdir(parents=True, exist_ok=True)
    # n_workers must be 1 since we're checking for unique images during the downloading process
    parallel(partial(_download_image_inner, dest, timeout=timeout, max_pics=max_pics), list(enumerate(urls)), n_workers=1)

def _download_image_inner(dest, inp, timeout=4, max_pics=150):
    # Input is an enumerate object
    i,url = inp
    suffix = re.findall(r'\.\w+?(?=(?:\?|$))', url)
    suffix = suffix[0] if len(suffix)>0  else '.jpg'
    # If we have enough pictures, just do nothing until we run out of URLs
    if len(dest.ls()) >= max_pics:
      return
    # Adapt fast.ai v2 download_images function to handle base64 encoded images
    # If the grabbed url was an encoded jpg, decode it and save it inline with the rest, if unique
    try:
      if url[:15] == 'data:image/jpeg':
        encoded_image = url[url.find('/9'):]
        im = Image.open(io.BytesIO(base64.b64decode(encoded_image)))
        filehash = hashlib.md5(im.tobytes()).hexdigest()
        if filehash not in hash_keys: 
          hash_keys[filehash] = i
          im.save(dest/f"{i:08d}{suffix}")
        else:
          pass
    except:
      pass
    # Adapt fast.ai v2 download_images function to handle base64 encoded images
    # If the grabbed url was an encoded png, decode it and save it inline with the rest, if unique
    try:
      if url[:14] == 'data:image/png':
        encoded_image = url[url.find('iVBOR'):]
        im = Image.open(io.BytesIO(base64.standard_b64decode(encoded_image))).convert('RGB')
        filehash = hashlib.md5(im.tobytes()).hexdigest()
        if filehash not in hash_keys: 
          hash_keys[filehash] = i
          im.save(dest/f"{i:08d}{suffix}")
        else:
          pass
    except:
      pass
    # If the grabbed url was a http site, download it and check we haven't already got the same image.
    try: 
      download_url(url, dest/f"{i:08d}{suffix}", overwrite=True, show_progress=True, timeout=timeout)
      im = Image.open(dest/f"{i:08d}{suffix}")
      filehash = hashlib.md5(im.tobytes()).hexdigest()
      if filehash not in hash_keys: 
        hash_keys[filehash] = i
      else:
        (dest/f"{i:08d}{suffix}").unlink()
    except Exception as e: f"Couldn't download {url}."

Test Download

In [None]:
# Open the Chrome session used by the thumbnail scraper
wd = webdriver.Chrome('chromedriver', options=options)
from tqdm.notebook import tqdm
import itertools
# Trial run: only the first four scientific names
scientific_names = aspca_df['Scientific Name'][0:4]
# For each plant, collect thumbnail URLs and download them into a folder named after it
for name in tqdm(scientific_names):
  try:
    path = Path('/content/sample_data')
    folder = name
    dest = path/folder
    dest.mkdir(parents=True, exist_ok=True)
    # Folders that already reached the quota are left alone
    if len(dest.ls()) >= 150:
      print(f'{name} already has sufficient images.')
      continue
    print(f'{name} has {len(dest.ls())} images.')
    url_science = fetch_thumbnail_urls(
        f'{name}',
        max_links_to_fetch=600,
        wd=wd,
        non_commercial=False,
        shuffle=False,
    )
    # The duplicate table is a global read by the download worker; empty it per plant
    hash_keys = dict()
    download_images(dest, urls=url_science, max_pics=150) # The modified fast.ai convenience function
    print(f'Finished downloading images of {name} : {len(dest.ls())} images downloaded.')
  except Exception as e:
    print(f'Error with {name}. {e}')

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

Abronia fragrans has 0 images.


Finished downloading images of Abronia fragrans : 150 images downloaded.
Abrus precatorius has 0 images.


Finished downloading images of Abrus precatorius : 150 images downloaded.
Acalypha hispida has 0 images.


Finished downloading images of Acalypha hispida : 150 images downloaded.
Acalypha wilkesiana has 0 images.


Finished downloading images of Acalypha wilkesiana : 150 images downloaded.



Download full set of data.

In [None]:
# Open the Chrome session used by the thumbnail scraper
wd = webdriver.Chrome('chromedriver', options=options)
from tqdm.notebook import tqdm
import itertools
# Full run: every scientific name in the ASPCA table
scientific_names = aspca_df['Scientific Name']
# For each plant, collect thumbnail URLs and download them into a Google Drive folder named after it
for name in tqdm(scientific_names):
  try:
    path = Path('/content/drive/My Drive/Houseplant Classifier/plant_images_deepest')
    folder = name
    dest = path/folder
    dest.mkdir(parents=True, exist_ok=True)
    # Folders that already reached the quota are left alone
    if len(dest.ls()) >= 150:
      print(f'{name} already has sufficient images.')
      continue
    print(f'{name} has {len(dest.ls())} images.')
    url_science = fetch_thumbnail_urls(
        f'{name}',
        max_links_to_fetch=600,
        wd=wd,
        non_commercial=False,
        shuffle=False,
    )
    # The duplicate table is a global read by the download worker; empty it per plant
    hash_keys = dict()
    download_images(dest, urls=url_science, max_pics=150) # The modified fast.ai convenience function
    print(f'Finished downloading images of {name} : {len(dest.ls())} images downloaded.')
  except Exception as e:
    print(f'Error with {name}. {e}')

### Check a folder for duplicates

In [None]:
path = Path('/content/drive/My Drive/Houseplant Classifier/plant_images_deepest')

def find_duplicates(folder):
  """Report byte-identical files inside `folder`.

  Every entry returned by `folder.ls()` is read in full and hashed with MD5.
  The first file seen with a given hash is treated as the original; each later
  file with the same hash is reported as its duplicate.

  Parameters
  ----------
  folder:
    Directory object providing `.ls()` (e.g. a fastai/fastcore `Path`).

  Returns
  -------
  list of (int, int)
    `(duplicate_index, original_index)` pairs, where both indices are
    positions in the `folder.ls()` listing taken by this call. The list is
    empty when no duplicates exist. A one-line summary is also printed.
  """
  import hashlib
  from tqdm.notebook import tqdm
  # List the directory once so the indices and the printed total agree
  files = folder.ls()
  hash_keys = dict()
  duplicates = []
  for index, filename in tqdm(enumerate(files), total=len(files)):
    with open(filename, 'rb') as f:
      filehash = hashlib.md5(f.read()).hexdigest()
    if filehash not in hash_keys:
      hash_keys[filehash] = index
    else:
      duplicates.append((index, hash_keys[filehash]))
  if duplicates:
    print(f'{len(duplicates)} duplicates out of {len(files)} images in {folder}')
  else:
    print(f'No duplicates found in {folder}')
  # Always a list, so callers can slice or iterate without a None check
  return duplicates

In [None]:
# Spot-check one class folder for byte-identical images.
# NOTE(review): the saved output below names a `plant_images_duplitest` directory,
# which differs from the `path` assigned above - it was presumably produced with
# an earlier value of `path`; re-run to refresh.
folder = path/'Abronia fragrans'
find_duplicates(folder)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


No duplicates found in /content/drive/My Drive/Houseplant Classifier/plant_images_duplitest/Abronia fragrans


In [None]:
# Run the duplicate check on every entry directly under `path`.
# The loop leaves `folder` bound to the last entry visited.
for folder in path.ls():
  find_duplicates(folder)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


No duplicates found in /content/drive/My Drive/Houseplant Classifier/plant_images_duplitest/Abronia fragrans


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


No duplicates found in /content/drive/My Drive/Houseplant Classifier/plant_images_duplitest/Abrus precatorius


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


No duplicates found in /content/drive/My Drive/Houseplant Classifier/plant_images_duplitest/Acalypha hispida


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


No duplicates found in /content/drive/My Drive/Houseplant Classifier/plant_images_duplitest/Acalypha wilkesiana


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


No duplicates found in /content/drive/My Drive/Houseplant Classifier/plant_images_duplitest/Acanthus


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


No duplicates found in /content/drive/My Drive/Houseplant Classifier/plant_images_duplitest/Acer rubrum


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


No duplicates found in /content/drive/My Drive/Houseplant Classifier/plant_images_duplitest/Acer tataricum subsp. ginnala


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


No duplicates found in /content/drive/My Drive/Houseplant Classifier/plant_images_duplitest/Achillea millefolium


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


No duplicates found in /content/drive/My Drive/Houseplant Classifier/plant_images_duplitest/Adenium obesum


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


No duplicates found in /content/drive/My Drive/Houseplant Classifier/plant_images_duplitest/Adonidia merrillii


In [None]:
# Plot duplicates from a folder if needed
# Shows up to 30 (original, duplicate) pairs side by side for the current `folder`.
file_list = folder.ls()
# Compute the pairs for this same folder here, so the cell does not rely on a
# `duplicates` variable left over in the kernel. `or []` covers a None result.
duplicates = find_duplicates(folder) or []
for file_indexes in duplicates[:30]:
    # Each pair is (duplicate_index, original_index) into `file_list`
    original_file = file_list[file_indexes[1]]
    duplicate_file = file_list[file_indexes[0]]
    fig, (ax_original, ax_duplicate) = plt.subplots(1, 2)
    try:
        with Image.open(original_file) as im:
            ax_original.imshow(im)
        ax_original.set_title(str(original_file))
        ax_original.set_xticks([])
        ax_original.set_yticks([])

        with Image.open(duplicate_file) as im:
            ax_duplicate.imshow(im)
        ax_duplicate.set_title(str(duplicate_file) + ' duplicate')
        ax_duplicate.set_xticks([])
        ax_duplicate.set_yticks([])
        plt.show()
    except OSError as e:
        # Unreadable image: skip this pair but say so instead of failing silently
        print(f'Skipping pair {file_indexes}: {e}')
        continue
    finally:
        # Release the figure so many pairs do not pile up in memory
        plt.close(fig)

### Check folders have been downloaded, and possess the correct number of images

In [None]:
def validate_downloads(data_path:str, n_files:int, class_list):
  '''
  Validate that all files (up to n_files) and folders (across all classes) have been downloaded.

  Parameters
  ----------
  data_path: str or path-like
    Directory holding one sub-folder per class. A trailing separator is optional.
  n_files: int
    Minimum number of entries each class folder should contain.
  class_list: iterable of str
    Class (folder) names that are expected to exist.

  Returns
  -------
  pandas.DataFrame
    Single column 'Class' listing the classes to re-download: first the classes
    from `class_list` with no folder (in `class_list` order), then the existing
    folders holding fewer than `n_files` entries (in sorted order).
  '''
  # List the directory once; a set keeps the per-class membership test cheap
  present = sorted(os.listdir(data_path))
  present_lookup = set(present)
  # Have any folders been missed
  data = [plant_class for plant_class in class_list if plant_class not in present_lookup]
  # Have any images been missed
  for folder in present:
    # os.path.join works whether or not data_path ends with a separator
    folder_path = os.path.join(data_path, folder)
    # Stray files next to the class folders are not classes; ignore them
    if not os.path.isdir(folder_path):
      continue
    if len(os.listdir(folder_path)) < n_files:
      data.append(folder)
  # Return dataframe containing classes with problems to re-download
  return pd.DataFrame(data, columns=['Class'])

In [None]:
# Check the cleaned image set: every class should have a folder with at least
# `n_files` images. Note this points at `plant_images_deepest_cleaned`, not the
# `plant_images_deepest` folder the download cells write to.
data_path = '/content/drive/My Drive/Houseplant Classifier/plant_images_deepest_cleaned/'
n_files = 150
class_list = aspca_df['Scientific Name']
# `df` lists the classes that need to be re-downloaded; the trailing `df` displays it
df = validate_downloads(data_path, n_files, class_list); df

Unnamed: 0,Class
0,Abronia fragrans
1,Abrus precatorius
2,Acalypha hispida
3,Acalypha wilkesiana
4,Acanthus
...,...
480,Zamia furfuracea
481,Zamia pumila
482,Zantedeschia aethiopica
483,Zephyranthes drummondii


### See if any extra folders exist in the database

In [None]:
def rev_validate_downloads(data_path:str, n_files:int, class_list):
  '''
  Check for folders that shouldn't be there.

  Parameters
  ----------
  data_path: str or path-like
    Directory holding one sub-folder per class. A trailing separator is optional.
  n_files: int
    Maximum number of entries a class folder is allowed to contain.
  class_list: iterable of str
    Class (folder) names that are expected to exist.

  Returns
  -------
  pandas.DataFrame
    Single column 'Class' listing entries to delete or trim: first the entries
    of `data_path` whose name is not in `class_list`, then the folders holding
    more than `n_files` entries (both in sorted order). A folder that is both
    unexpected and over-full is listed twice.
  '''
  # Compare against the classes the caller passed in, not a notebook global
  expected = set(class_list)
  present = sorted(os.listdir(data_path))
  # Are there any folders that do not belong to a known class
  data = [plant_class for plant_class in present if plant_class not in expected]
  # Do any folders hold more images than they should
  for folder in present:
    # os.path.join works whether or not data_path ends with a separator
    folder_path = os.path.join(data_path, folder)
    if os.path.isdir(folder_path) and len(os.listdir(folder_path)) > n_files:
      data.append(folder)
  # Return dataframe containing classes with problems to delete
  return pd.DataFrame(data, columns=['Class'])

In [None]:
# Look for unexpected or over-full folders in the raw download directory
data_path = '/content/drive/My Drive/Houseplant Classifier/plant_images_deepest/'
class_list = aspca_df['Scientific Name']
# 150 is the per-class image quota used by the download cells; the trailing
# `df_delete` displays the result
df_delete = rev_validate_downloads(data_path, 150, class_list); df_delete

Unnamed: 0,Class


# Verify that all images are readable

In [None]:
# Collect every image that cannot be verified, then delete it.
# This import rebinds `tqdm` to the plain version, replacing the
# `tqdm.notebook` one imported by the download cells.
from tqdm import tqdm
imgs = L() # Create a fast.ai v2 type list
# NOTE(review): `id` is not assigned anywhere in this part of the notebook. Unless
# an earlier cell rebinds it, this is Python's builtin `id` function and the loop
# fails - presumably it should be the list of class folder names; confirm.
for n in tqdm(id):
  path_n = path/n # Define the path to the image folder
  imgs += verify_images(path_n.ls()) # Put images that can't be verified into the img list

print(f'{len(imgs)} images are unreadable.')

for im in imgs:
  im.unlink() # Delete image if it was unreadable