# Download Pipeline

## Imports

In [1]:
from json import loads as parse_json
import json
import pandas as pd
import numpy as np
import requests
import io
import os

# Changing the option to show a dataframe not 'in-line'
pd.set_option('display.expand_frame_repr', False)

In [3]:

dir = os.listdir()
print(dir)

csv_file = [i for i in dir if '.csv' in i][0]
print(csv_file)

['.idea', 'venv', 'image_resizer.py', 'Downloading_Script_old.ipynb', 'Data_Probing_and_Cleaning_Script.ipynb', 'group_images.py', 'download.ipynb', 'links.csv', 'images']
links.csv


## Reading Query Files

In [4]:
# Loading CSVs from google drive
try:
    query = pd.read_csv(csv_file, sep=',', encoding='latin-1', dtype=object).dropna(axis=1, how='all') # uses taxonKey for 'class'
except:
    query = pd.read_csv(csv_file, sep='\t', encoding='latin-1', dtype=object).dropna(axis=1, how='all') # uses taxonKey for 'class'


print("Shape of query: ", query.shape)
# print(query.sort_values('numberOfOccurrences'))

media = pd.read_csv('multimedia.txt', sep='\t', encoding='latin-1', dtype=object, on_bad_lines='skip').dropna(axis=1, how='all') # uses gbifID for 'occurrence'
print("Shape of media: ", media.shape)

occur = pd.read_csv('occurrence.txt', sep='\t', encoding='latin-1', dtype=object, on_bad_lines='skip').dropna(axis=1, how='all') # uses taxonKey for 'class' and gbifIF for 'occurrence'
print("Shape of occur: ", occur.shape)

# unique taxons
occur_count = occur.nunique()
media_count = media.nunique()
print('unique ids in occur: ', occur_count["gbifID"])
print('unique ids in media: ', media_count["gbifID"])

Shape of query:  (0, 0)


FileNotFoundError: [Errno 2] No such file or directory: 'multimedia.txt'

## Managing the Loaded Data 

In [9]:
# combine the dataframes based on matching id
occur_media = pd.merge(occur, media, on="gbifID", how="inner")

print("Shape of occur_media: ", occur_media.shape)
print(occur_media['identifier_y'].head)

Shape of occur_media:  (1468010, 213)
<bound method NDFrame.head of 0          https://inaturalist-open-data.s3.amazonaws.com...
1          https://inaturalist-open-data.s3.amazonaws.com...
2          https://inaturalist-open-data.s3.amazonaws.com...
3          https://inaturalist-open-data.s3.amazonaws.com...
4          https://inaturalist-open-data.s3.amazonaws.com...
                                 ...                        
1468005    https://svampe.databasen.org/uploads/2022-1027...
1468006    https://svampe.databasen.org/uploads/2022-1027...
1468007    https://svampe.databasen.org/uploads/2022-1027...
1468008    https://svampe.databasen.org/uploads/2022-1027...
1468009    http://specify.ugrasu.ru:8080/fileget?coll=Fun...
Name: identifier_y, Length: 1468010, dtype: object>


In [10]:
# sort using the values as numbers, not objects/strings
sort_order = query['numberOfOccurrences'].astype('int32').argsort() 

# sort the queries by occurence number and then reverse it because argsort is always ascending order
taxons = query.iloc[sort_order][query['numberOfOccurrences'].astype('int32') > 0][::-1].dropna(subset=['species']) 

print(taxons[['taxonKey','genus', 'species', 'numberOfOccurrences']])

     taxonKey          genus                   species numberOfOccurrences
463  11571791   Sphaerobolus     Sphaerobolus ingoldii                 452
194   5247390        Cerrena         Cerrena hydnoides                 357
359   6124406       Clathrus        Clathrus crispatus                 254
21   11892250       Astraeus         Astraeus morganii                 201
111   5243168  Chlorophyllum  Chlorophyllum molybdites                 156
..        ...            ...                       ...                 ...
292  11463094   Xerocomellus      Xerocomellus bolinii                   1
294   2520705      Phellinus       Phellinus fastuosus                   1
296   3359489        Russula           Russula vinacea                   1
304   5244501       Clavaria         Clavaria fragilis                   1
258   7351381    Auricularia     Auricularia nigricans                   1

[385 rows x 4 columns]


  taxons = query.iloc[sort_order][query['numberOfOccurrences'].astype('int32') > 0][::-1].dropna(subset=['species'])


## Generating Links

In [14]:
number_of_images = 800
max_species = 40 # number of species without enough images
cur_species_count = 0
# initialize the empty dataframe
links = pd.DataFrame(columns=['key',  'species', 'link'])

# loop through and grab n rows per taxon
for i in range(len(taxons)):
    
    if cur_species_count >= max_species:
        break 

    # subset occurences to get just the first n matching rows
    current_taxon = taxons['taxonKey'].iloc[i]
    subset = occur_media.loc[occur_media['taxonKey'] == current_taxon].dropna(subset=['identifier_y'])

    # initialize a (n, 3) array to fill
    arr = np.ndarray((number_of_images, 3), dtype='object')
    count = 0

    if len(subset) > number_of_images:
        for j in range(len(subset)):

            # get the file extension 
            url = subset.iloc[j]['identifier_y']
            root, ext = os.path.splitext(url)

            # filter out any files that dont have .jpg or .jpeg extensions for uniformity
            if ext == '.jpg' or ext == '.jpeg':
                
                # take the useful values from the current row and insert them into the ndarray
                row = subset.iloc[j][['taxonKey', 'species', 'identifier_y']]
                arr[count] = [row['taxonKey'], row['species'], row['identifier_y']]
                count += 1

            # filled array
            if count >= number_of_images:
                count = 0
                break

        # convert the filled ndarray into a dataframe 
        sub = pd.DataFrame(arr, columns=['key',  'species', 'link'])

        # concatenate the dataframe
        links = pd.concat([links, sub])
        
        cur_species_count += 1
        

# reset and drop the indices        
links = links.reset_index()[['key',  'species', 'link']]

print('Number of Species\t', links.shape[0] / number_of_images)
print('Number of rows:\t', links.shape[0], '\n')

Number of Species	 40.0
Number of rows:	 32000 



In [7]:
# run this to redownload the aggregated links file
links.to_csv('links.csv', index=False)

## Reading from Links

In [2]:
# this is a checkpoint
links = pd.read_csv('links.csv')

In [3]:
nkeys = links['key'].value_counts()
print('Unique keys:\t', len(nkeys))

nspecies = links['species'].value_counts()
print('Unique species:\t', len(nspecies), '\n')

keys_species = links.groupby(['key','species']).size().reset_index().rename(columns={0:'count'})
print(keys_species[['species']].value_counts().sort_values().tail())

print('\nIDs for duplicate species ("Laetiporus sulphureus"): ', links[links['species'] == 'Laetiporus sulphureus']['key'].unique())

links = links[links['key'] != '2542235']

print('\n\nAfter Removal:')
print('\nIDs for duplicate species ("Laetiporus sulphureus"): ', links[links['species'] == 'Laetiporus sulphureus']['key'].unique())

nkeys = links['key'].value_counts()
print('\nUnique keys:\t', len(nkeys))

nspecies = links['species'].value_counts()
print('Unique species:\t', len(nspecies))

Unique keys:	 1
Unique species:	 1 

species         
Pholiota adiposa    1
Name: count, dtype: int64

IDs for duplicate species ("Laetiporus sulphureus"):  []


After Removal:

IDs for duplicate species ("Laetiporus sulphureus"):  []

Unique keys:	 1
Unique species:	 1


## Downloading the Images

In [4]:
dir = os.listdir()
print(dir)
if 'images' not in dir:
    os.mkdir('images/')

['.idea', 'download.ipynb', 'Downloading_Script_old.ipynb', 'group_images.py', 'image_resizer.py', 'image_square.py', 'links.csv', 'venv']


In [5]:
for key in links['key'].unique():
    subset = links[links['key'] == key]
    output_dir = 'images/'
    count = 0
    species_name = subset['species'].unique()[0].replace(' ', '_')

    print(species_name)

    # print(subset)

    for link in links[links['key'] == key]['link']:
        try:
            image = requests.get(link).content
            output_file = os.path.join(output_dir, species_name + '_' + str(count) + '.jpg')
            # print(output_file)
            with open(output_file, 'wb') as writer:
                writer.write(image)

            # FILE.download(output_file)

        except Exception as e: 
            print(e)
        
        finally:
            count += 1


Pholiota_adiposa


In [6]:
from datetime import datetime

now = datetime.now()
current_time = now.strftime("%H:%M:%S")

print('Started at 8:05')
print("Ended at ", current_time)

Started at 2:11
Ended at  20:23:06
