In [None]:
# Upgrade tqdm (quietly); its pandas integration (progress_apply) drives the
# download loop at the end of this notebook.
!pip install -U tqdm --quiet

In [None]:
import os
import requests
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
from dotenv import load_dotenv

from utils.s3_bucket import S3Bucket

In [None]:
# Load settings from the file named "env" into the process environment;
# REGION_NAME is read from os.environ when the bucket is created below.
load_dotenv("env")

In [None]:
# Bucket name plus the key prefixes for the input tables and the output images.
# Object keys use "/" as their delimiter on every platform, so the prefixes are
# joined with "/" explicitly; os.path.join would insert "\" on Windows.
bucket_name = "ava-cv-raw-photo-bucket"
input_prefix = "/".join(["GBIF-csv", "Pl@ntNet"])
output_prefix = "/".join(["10-plants", "Pl@ntNet"])

In [None]:
# Open the bucket; the region is taken from the REGION_NAME environment variable.
bucket = S3Bucket(
    bucket_name=bucket_name,
    region_name=os.environ["REGION_NAME"],
)

# Read the tab-separated occurrence table, parsing eventDate as a date column.
# The key is joined with "/" (not os.path.join) so it is the same on every OS.
# NOTE(review): bucket[key] presumably returns something pd.read_csv can read
# (path, URL or file-like object) — confirm against utils.s3_bucket.
input_key = "/".join([input_prefix, "occurrence.txt"])
occurrences = pd.read_csv(bucket[input_key], sep="\t", parse_dates=["eventDate"])

# Report the columns that contain at least one non-null value.
not_null_columns = occurrences.columns[occurrences.notna().any()]
print("Columns that are not null:")
print(list(not_null_columns))

In [None]:
# Narrow the table to the identifier, taxonomy and date columns used below.
relevant_columns = ["gbifID", "species", "genus", "eventDate"]
occurrences = occurrences.loc[:, relevant_columns]
occurrences

In [None]:
# Lower-cased scientific name -> common name for the ten target plants.
name_mapping = {
    # looked up by genus
    "fragaria": "strawberry",
    "mentha": "mint",
    # looked up by species
    "origanum vulgare": "oregano",
    "solanum lycopersicum": "tomato",
    "capsicum annuum": "pepper",
    "cucurbita pepo": "zucchini",
    "coriandrum sativum": "cilantro",
    "anethum graveolens": "dill",
    "lactuca sativa": "lettuce",
    "ocimum basilicum": "basil",
}


def get_name(row):
    """Return the common plant name for one occurrence row.

    Args:
        row: Mapping-like object with "genus" and "species" entries. The values
            are compared as-is, so they are expected to be lower-cased already.

    Returns:
        The common name from ``name_mapping``. Rows whose genus is "fragaria"
        or "mentha" are looked up by genus, all others by species. If the
        lookup key is not in ``name_mapping`` the string "nan" is returned.
    """
    genus_level = ("fragaria", "mentha")
    lookup_key = row["genus"] if row["genus"] in genus_level else row["species"]
    return name_mapping.get(lookup_key, "nan")


# Lower-case genus/species (as strings), derive the common name with get_name,
# then drop the taxonomy columns and every row that matched no target plant.
occurrences = (
    occurrences
    .assign(
        genus=lambda frame: frame["genus"].astype(str).str.lower(),
        species=lambda frame: frame["species"].astype(str).str.lower(),
    )
    .assign(name=lambda frame: frame.apply(get_name, axis=1))
    .drop(columns=["species", "genus"])
    .loc[lambda frame: frame["name"] != "nan"]  # "nan" is get_name's no-match value
)
occurrences

In [None]:
# Attach the image source URL ("identifier") to every occurrence.
# The key is joined with "/" (not os.path.join) so it is the same on every OS,
# and only the two columns that are needed are parsed from the multimedia table.
input_key = "/".join([input_prefix, "multimedia.txt"])
sources = pd.read_csv(bucket[input_key], sep="\t", usecols=["gbifID", "identifier"])

# Left join: an occurrence without a multimedia row keeps a null identifier,
# and an occurrence with several multimedia rows appears once per row.
occurrences = occurrences.merge(sources, how="left", on="gbifID")
occurrences

In [None]:
# Number of rows per common name after the merge; the smallest of these counts
# is used as the per-name sample size in the next step.
occurrences["name"].value_counts()

In [None]:
# Balance the classes: for every name keep only its newest rows (by eventDate),
# where the number kept per name is the size of the smallest class.
nlargest = int(occurrences["name"].value_counts().min())
print(nlargest)

# groupby(...).nlargest(...) is indexed by (name, original row label); the last
# index level holds the labels of the rows to keep. Reading that level by
# position avoids relying on the auto-generated "level_1" column name, which
# only exists while the frame's index is unnamed.
newest_per_name = occurrences.groupby("name")["eventDate"].nlargest(nlargest)
nlargest_indices = newest_per_name.index.get_level_values(-1)

# Filter in the original row order, then renumber. The last reset_index() turns
# the fresh 0..n-1 numbering into an "index" column, which download_image uses
# in the output file name. Dropping an already-present "index" column first
# lets this cell be re-run without "cannot insert index, already exists".
occurrences = (
    occurrences[occurrences.index.isin(nlargest_indices)]
    .drop(columns="index", errors="ignore")
    .reset_index(drop=True)
    .reset_index()
)
occurrences

In [None]:
def download_image(row, prefix, timeout=30):
    """Download the image of one occurrence row and store it in the bucket.

    The image is fetched from ``row["identifier"]``, converted to RGB and
    assigned to ``bucket[key]`` where ``key`` is
    ``<prefix>/<row["name"]>/<row["gbifID"]>-<row["index"]>.jpg``.

    This is best-effort: any ordinary error (bad URL, HTTP error status,
    timeout, unreadable image, upload failure) is reported with ``print`` and
    swallowed so a long batch keeps going. KeyboardInterrupt and SystemExit are
    not caught and still stop the run.

    Args:
        row: Mapping-like object with "identifier", "name", "gbifID" and
            "index" entries.
        prefix: Key prefix under which the image is stored.
        timeout: Seconds passed to ``requests.get`` so that a stalled server
            cannot block the batch forever.

    Returns:
        None.
    """
    url = row["identifier"]
    try:
        # The context manager releases the streamed connection even on failure.
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail on 4xx/5xx instead of trying to decode an error page as an image.
            response.raise_for_status()
            # convert() reads the full image, so it must run while the
            # connection is still open.
            image = Image.open(response.raw).convert("RGB")
        # Keys are joined with "/" explicitly; os.path.join is OS-dependent.
        output_key = "/".join([prefix, row["name"], f'{row["gbifID"]}-{row["index"]}.jpg'])
        bucket[output_key] = image
    except Exception as exc:
        print(f'Encountered error when attempting to download {url}: {exc!r}')

# Register tqdm's progress_apply on pandas objects; it is used for the download loop.
tqdm.pandas()

In [None]:
%%time
# Run download_image for every row with a progress bar, writing under
# output_prefix. The result is bound to `_` so the cell shows no output value.
_ = occurrences.progress_apply(lambda row: download_image(row, output_prefix), axis=1)