In [4]:
# Imports
import pandas
import subprocess
import numpy
import os

In [2]:
# Prerequisite: the AWS CLI must be installed and on PATH. The original workflow also
# created an AWS account, made a key pair and entered it with `aws configure`.
# NOTE(review): the bucket is public, so `--no-sign-request` may make credentials
# unnecessary -- confirm before relying on it.
# Photos and metadata are in a public s3 bucket
os.makedirs("metadata", exist_ok=True)
for file_name in ["photos.csv.gz", "observations.csv.gz", "taxa.csv.gz"]:
    local_path = f"metadata/{file_name}"
    # The dumps are large (photos.csv.gz alone is ~2.6 GiB), so a file that is already
    # present is not fetched again. Delete the local copy to force a refresh.
    # NOTE(review): assumes an interrupted `aws s3 cp` does not leave a partial file
    # under the final name -- confirm.
    if os.path.isfile(local_path):
        continue
    # check=True raises CalledProcessError on a failed download, so the problem shows
    # up here rather than as a read error in a later cell.
    subprocess.run(
        ["aws", "s3", "cp", f"s3://inaturalist-open-data/{file_name}", local_path],
        check=True,
    )

download: s3://inaturalist-open-data/photos.csv.gz to metadata/photos.csv.gz
Completed 213.2 MiB/2.6 GiB (9.0 MiB/s) with 1 file(s) remaining  

KeyboardInterrupt: 

In [5]:
def get_taxon_id(species_name, taxa_df):
    """Return the ``taxon_id`` of the first row of ``taxa_df`` whose ``name`` equals ``species_name``.

    Parameters
    ----------
    species_name : str
        Exact value to look for in the ``name`` column (no fuzzy matching).
    taxa_df : pandas.DataFrame
        Table with at least ``name`` and ``taxon_id`` columns.

    Returns
    -------
    The ``taxon_id`` value of the first matching row. If several rows share the
    name, only the first one is used.

    Raises
    ------
    IndexError
        If no row has that name. The message names the missing taxon instead of
        surfacing pandas' generic "out-of-bounds" indexing error.
    """
    # Use ["name"] rather than attribute access so the column lookup is explicit.
    matches = taxa_df.loc[taxa_df["name"] == species_name, "taxon_id"]
    if matches.empty:
        raise IndexError(f"no taxon named {species_name!r} in taxa_df")
    return matches.iloc[0]

In [6]:
def get_photos(label_name, taxa_series, photos_df, observations_df, n_photos=1000, train_fraction=.8,
               out_dir="img3", bbox=(-124.409591, 32.534156, -114.131211, 42.009518)):
    """Download up to ``n_photos`` medium-resolution photos for a set of taxa, split into train/val.

    Observations are kept when their ``quality_grade`` is ``'research'``, they fall
    strictly inside ``bbox`` and their ``taxon_id`` appears in ``taxa_series``. The
    photos attached to those observations are taken in merge order (they are NOT
    randomly sampled). The first ``int(n_photos * train_fraction)`` go to
    ``{out_dir}/train/{label_name}/`` and the following ones, up to ``n_photos`` in
    total, go to ``{out_dir}/val/{label_name}/``.

    Parameters
    ----------
    label_name : str
        Name of the class sub-directory the photos are written to.
    taxa_series : pandas.Series
        Series named ``taxon_id`` listing the taxa to include.
    photos_df : pandas.DataFrame
        Needs ``observation_uuid``, ``photo_id`` and ``extension`` columns.
    observations_df : pandas.DataFrame
        Needs ``observation_uuid``, ``taxon_id``, ``quality_grade``, ``latitude``
        and ``longitude`` columns.
    n_photos : int
        Maximum total number of photos (train + val) to download.
    train_fraction : float
        Fraction of ``n_photos`` assigned to the training split.
    out_dir : str
        Root output directory; ``train`` and ``val`` are created beneath it.
    bbox : tuple or None
        ``(min_longitude, min_latitude, max_longitude, max_latitude)``; bounds are
        exclusive. Pass ``None`` to skip the geographic filter.

    Returns
    -------
    None. Each photo is fetched with its own ``aws s3 cp`` subprocess. A failed copy
    does not raise, so one missing photo does not abort the run.
    """
    # restrict to research quality (verified) observations inside the bounding box
    keep = observations_df["quality_grade"] == "research"
    if bbox is not None:
        min_lon, min_lat, max_lon, max_lat = bbox
        keep = (
            keep
            & (observations_df["longitude"] > min_lon)
            & (observations_df["latitude"] > min_lat)
            & (observations_df["longitude"] < max_lon)
            & (observations_df["latitude"] < max_lat)
        )

    # find the uuids for all retained observations of the requested taxa
    uuids = pandas.merge(observations_df[keep], taxa_series, on="taxon_id", how="inner")["observation_uuid"]

    # get photo id + file extension pairs corresponding to those observations
    photo_info = pandas.merge(uuids, photos_df, on="observation_uuid", how="inner")[["photo_id", "extension"]]

    # The validation slice is bounded by n_photos. An open-ended slice would download
    # every remaining matching photo instead of n_photos in total.
    n_total = int(n_photos)
    n_train = int(n_photos * train_fraction)
    splits = {
        "train": photo_info.iloc[:n_train],
        "val": photo_info.iloc[n_train:n_total],
    }

    # set up the directory structure (makedirs also creates out_dir itself)
    for split in splits:
        os.makedirs(f"{out_dir}/{split}", exist_ok=True)

    # use medium photo resolution
    # there is probably a much faster way to do this if you can batch the downloads
    for split, rows in splits.items():
        for photo in rows.itertuples(index=False):
            subprocess.run(["aws", "s3", "cp",
                            f"s3://inaturalist-open-data/photos/{photo.photo_id}/medium.{photo.extension}",
                            f"{out_dir}/{split}/{label_name}/{photo.photo_id}.{photo.extension}"])

In [7]:
# The metadata dumps carry a .csv extension but are actually tab-delimited.
# Tables are read in the order taxa, photos, observations.
taxa_df, photos_df, observations_df = (
    pandas.read_csv(f"metadata/{table}.csv.gz", compression="gzip", sep="\t")
    for table in ("taxa", "photos", "observations")
)

In [8]:
eucalyptus_taxon = get_taxon_id("Eucalyptus", taxa_df)

# we want all observations that are either tagged with the eucalyptus genus taxon_id already
# or are at a lower level in the hierarchy
# The pattern matches the genus id as one whole "/"-separated segment of `ancestry`.
# The right-hand boundary is written `(?:/|$)` and not `[/$]`: inside a character class
# `$` is a literal dollar sign, so `[/$]` never matches an ancestry that ENDS with the
# genus id (i.e. the genus' direct children).
descendant_mask = taxa_df["ancestry"].str.contains(
    rf"(?:^|/){eucalyptus_taxon}(?:/|$)", na=False
)
eucalyptus_taxa = pandas.concat([
    taxa_df.loc[descendant_mask, "taxon_id"],
    taxa_df.loc[taxa_df["taxon_id"] == eucalyptus_taxon, "taxon_id"],
])

In [9]:
# Reference list of tree species.
# Download from https://tools.bgci.org/global_tree_search_trees_1_5.csv
# doi.org/10.13140/RG.2.2.33593.90725
# The "TaxonName" column is renamed to "name" on load.
tree_species_df = (
    pandas.read_csv("metadata/global_tree_search_trees_1_5.csv", encoding="utf-8")
    .rename(columns={"TaxonName": "name"})
)

In [10]:
# Taxon ids of every taxa_df row whose name also appears in the tree species list
tree_species_taxa = taxa_df.merge(tree_species_df, on="name", how="inner")["taxon_id"]

_tree_ids = tree_species_taxa.values
_eucalyptus_ids = eucalyptus_taxa.values

# Tree taxa that are not among the eucalyptus taxa
other_trees_taxa = pandas.Series(
    numpy.setdiff1d(_tree_ids, _eucalyptus_ids),
    name="taxon_id",
)

# Everything else: taxa that are in neither the tree list nor the eucalyptus list
_tree_or_eucalyptus_ids = numpy.union1d(_tree_ids, _eucalyptus_ids)
background_taxa = pandas.Series(
    numpy.setdiff1d(taxa_df.taxon_id.values, _tree_or_eucalyptus_ids),
    name="taxon_id",
)

In [11]:
# The download calls below are disabled. Uncomment a line to fetch that class;
# each call runs one `aws s3 cp` per photo.
#get_photos("eucalyptus",eucalyptus_taxa,photos_df,observations_df,n_photos=2000,train_fraction=.8)
#get_photos("tree",other_trees_taxa,photos_df,observations_df,n_photos=2000,train_fraction=.8)
#get_photos("background",background_taxa,photos_df,observations_df,n_photos=2000,train_fraction=.8)

download: s3://inaturalist-open-data/photos/13896470/medium.jpg to img3/train/tree/13896470.jpg
download: s3://inaturalist-open-data/photos/87945631/medium.jpg to img3/train/tree/87945631.jpg
download: s3://inaturalist-open-data/photos/64281743/medium.jpeg to img3/train/tree/64281743.jpeg
download: s3://inaturalist-open-data/photos/107322270/medium.jpg to img3/train/tree/107322270.jpg
download: s3://inaturalist-open-data/photos/111378470/medium.jpg to img3/train/tree/111378470.jpg
download: s3://inaturalist-open-data/photos/156840729/medium.jpeg to img3/train/tree/156840729.jpeg
download: s3://inaturalist-open-data/photos/9871509/medium.jpg to img3/train/tree/9871509.jpg
download: s3://inaturalist-open-data/photos/123900133/medium.jpg to img3/train/tree/123900133.jpg
download: s3://inaturalist-open-data/photos/35483229/medium.jpg to img3/train/tree/35483229.jpg
download: s3://inaturalist-open-data/photos/120121299/medium.jpg to img3/train/tree/120121299.jpg
download: s3://inaturalist-o