In [2]:
from pathlib import Path
from tqdm import tqdm

import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt

In [15]:
# Target number of observation rows kept in the final sample.
sample_size = 1_000_000

In [10]:
# Directory holding the export files read by this notebook (observations.csv,
# taxa.csv, observers.csv, photos.csv); the final sample CSV is written here too.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
inat_ods_dir = Path("/workspace/opendata/")

In [12]:
# Count the data rows of observations.csv so the number of read_csv chunks can
# be derived before the file is parsed.
# - Binary mode avoids decoding every line just to count it.
# - The first line is the column header (the file is later read by column
#   name), so it is subtracted; max() keeps an empty file at 0 instead of -1.
with open(inat_ods_dir / "observations.csv", "rb") as f:
    observations_count = max(sum(1 for _ in f) - 1, 0)

In [30]:
# Rows parsed per read_csv chunk.
chunk_size = 1_000_000
# Ceiling division: a trailing partial chunk is still a chunk that gets read
# and sampled. The floor of 1 keeps the division below from hitting zero when
# the file holds fewer than `chunk_size` rows.
chunk_count = max(1, -(-observations_count // chunk_size))
# add 10% in each chunk sample for NA rows
chunk_sample_size = int(1.10 * (sample_size / chunk_count))

In [None]:
# Summarise the sampling plan: one line per figure, printed in a single call.
sampling_plan_lines = [
    f"Sample size: {sample_size}",
    f"Chunk count: {chunk_count}",
    f"Chunk sample size: {chunk_sample_size}",
    f"Chunk count * size: {chunk_count * chunk_sample_size}",
]
print(*sampling_plan_lines, sep="\n")

In [62]:
# Columns of observations.csv that the rest of the notebook uses.
observation_columns = [
    "observation_uuid",
    "observer_id",
    "latitude",
    "longitude",
    "taxon_id",
    "quality_grade",
    "observed_on",
]

# Because `chunksize` is set, read_csv returns a lazy chunk reader rather than
# a DataFrame. The reader is consumed by iterating over it, so this cell has to
# be re-run before the sampling loop can be repeated.
all_obs = pd.read_csv(
    inat_ods_dir / "observations.csv",
    sep="\t",
    usecols=observation_columns,
    dtype={"quality_grade": "category"},
    chunksize=chunk_size,
)

In [None]:
# Draw a random sample from every chunk of observations.csv and keep only the
# sampled rows that have no missing values.

# One seeded generator shared by all chunks: the run is reproducible, and each
# chunk still gets its own independent draw (a fixed integer seed per call
# would pick the same row positions in every chunk).
sampling_rng = np.random.RandomState(42)

obs_chunks = []
# The context manager closes the progress bar even if the loop fails midway.
with tqdm(
    total=chunk_count,
    bar_format="{l_bar}{bar:30}{r_bar}{bar:-30b}",
    dynamic_ncols=True,
) as pbar:
    for chunk in all_obs:
        # The final chunk can hold fewer rows than requested; sampling without
        # replacement would raise ValueError, so cap the request at its length.
        rows_to_draw = min(chunk_sample_size, len(chunk))
        sampled_chunk = chunk.sample(rows_to_draw, random_state=sampling_rng)
        obs_chunks.append(sampled_chunk.dropna())
        pbar.update(1)

In [39]:
# Pool the per-chunk samples, then trim the pool to `sample_size` rows.
# The trim is a seeded random draw rather than `.head()`: taking the first
# rows would remove the surplus exclusively from the last chunks and leave the
# end of the file unrepresented. If fewer than `sample_size` rows survived the
# NA filter, every pooled row is kept.
obs_pool = pd.concat(obs_chunks)
obs = obs_pool.sample(
    n=min(sample_size, len(obs_pool)),
    random_state=42,
).sort_index()  # restore the pooled (file) row order after the random draw

# Sample `sample_size` good observations

In [None]:
# Inspect column dtypes, non-null counts and memory use of the sampled frame.
obs.info()

In [41]:
# Final clean-up of the sampled observations: both id columns are narrowed to
# 32-bit integers in one cast.
obs = obs.astype({"taxon_id": np.int32, "observer_id": np.int32})

# `observed_on` is deliberately left unparsed, on the unverified assumption
# that Elasticsearch infers dates itself. To parse it here instead:
#obs.observed_on = pd.to_datetime(obs.observed_on)

# join on world geo dataframe to get continent and country from lat and lng

In [42]:
# Build one point per observation (x = longitude, y = latitude) and attach the
# points as the geometry column, tagged with CRS EPSG:4326.
observation_points = gpd.points_from_xy(x=obs.longitude, y=obs.latitude)
obs_gdf = gpd.GeoDataFrame(obs, geometry=observation_points, crs="EPSG:4326")

In [43]:
# Load the 'naturalearth_lowres' dataset bundled with geopandas; the later
# cells take the `continent` and `name` columns from it.
# NOTE(review): geopandas.datasets is deprecated in recent geopandas releases —
# confirm the installed version still ships it.
naturalearth_path = gpd.datasets.get_path('naturalearth_lowres')
world = gpd.read_file(naturalearth_path)

  world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))


In [44]:
# Spatially join every observation point to the `world` geometries. The join is
# an inner join, so observations that match no geometry are dropped.
obs_gdf = gpd.sjoin(left_df=obs_gdf, right_df=world, how="inner")

In [45]:
# Keep the observation attributes plus the two columns gained from the spatial
# join; coordinates and geometry are not carried forward.
joined_output_columns = [
    "observation_uuid",
    "observer_id",
    "taxon_id",
    "quality_grade",
    "observed_on",
    "continent",
    "name",
]
obs_gdf = obs_gdf[joined_output_columns]

In [46]:
# `name` came from the world dataset; give it an explicit meaning before other
# tables that also have a `name` column are merged in.
obs_gdf = obs_gdf.rename(columns={"name": "country_name"})

In [47]:
# Sanity check: number of joined observations per continent.
obs_gdf.continent.value_counts()

continent
North America              552085
Europe                     201991
Asia                        54773
Oceania                     52116
South America               37051
Africa                      27751
Antarctica                     13
Seven seas (open ocean)         5
Name: count, dtype: int64

In [48]:
# Spot-check three random rows of the joined frame.
obs_gdf.sample(3)

Unnamed: 0,observation_uuid,observer_id,taxon_id,quality_grade,observed_on,continent,country_name
73905654,676725b2-9c0e-4272-8d55-cd7215c5e3aa,1138587,48233,research,2022-05-18,Oceania,Australia
95224968,4f88a514-4d4c-40e0-80bc-eeec28a67697,3708750,1308422,needs_id,2023-02-15,Oceania,Australia
120006233,592979ba-9265-407f-830d-c177948c606f,6709768,55576,research,2023-09-24,Europe,Latvia


# merge in the rest of the ods export to get taxonomy, photos, and observer login names

In [49]:
# Load only the taxonomy columns needed for the merge and the filters below.
taxa_columns = ["taxon_id", "ancestry", "name", "active"]
tax = pd.read_csv(inat_ods_dir / "taxa.csv", sep="\t", usecols=taxa_columns)

In [50]:
# Keep active taxa only, then drop any row with a missing value in the
# remaining columns.
is_active = tax.active.eq(True)
tax = tax[is_active].dropna()

In [51]:
# Rename `name` to `taxon_name` and keep just the three columns that are merged
# into the observations; `active` has served its purpose as a filter.
tax = tax.rename(columns={"name": "taxon_name"})
tax = tax[["taxon_id", "taxon_name", "ancestry"]]

In [None]:
# Spot-check three random taxonomy rows.
tax.sample(3)

In [53]:
# Load observer ids with their login, exposing the login as `observer_login`.
observers = pd.read_csv(
    inat_ods_dir / "observers.csv",
    sep="\t",
    usecols=["observer_id", "login"],
).rename(columns={"login": "observer_login"})


In [None]:
# Spot-check three random observer rows.
observers.sample(3)

In [None]:
# Count the data rows of photos.csv.
# - Binary mode avoids decoding every line just to count it.
# - The first line is the column header (the file is later read by column
#   name), so it is subtracted; max() keeps an empty file at 0 instead of -1.
with open(inat_ods_dir / "photos.csv", "rb") as f:
    photos_count = max(sum(1 for _ in f) - 1, 0)

In [55]:
# Compact dtypes for the photo columns.
photo_dtypes = {
    "photo_id": np.int32,
    "extension": "category",
}

# Because `chunksize` is set, this is a lazy chunk reader, not a DataFrame; it
# is consumed by the merge loop and must be recreated to iterate again.
photos = pd.read_csv(
    inat_ods_dir / "photos.csv",
    sep="\t",
    usecols=["photo_id", "observation_uuid", "extension"],
    dtype=photo_dtypes,
    chunksize=chunk_size,
)

In [56]:
# Stream photos.csv chunk by chunk and attach, to every sampled observation
# that has a photo in the chunk, its taxonomy and observer login. All merges
# are inner joins, so rows without a match on any side are dropped.

# Size the progress bar from the photos file itself; `chunk_count` describes
# observations.csv and would give the wrong total here. Ceiling division counts
# a trailing partial chunk.
photo_chunk_count = max(1, -(-photos_count // chunk_size))

obs_with_other_data_chunks = []
# The context manager closes the bar even if a merge fails midway.
with tqdm(
    total=photo_chunk_count,
    bar_format="{l_bar}{bar:30}{r_bar}{bar:-30b}",
    dynamic_ncols=True,
) as pbar:
    for photo_chunk in photos:
        obs_with_photos_chunk = pd.merge(
            obs_gdf, photo_chunk, on="observation_uuid",
        )
        obs_photos_with_taxa_chunk = pd.merge(
            obs_with_photos_chunk, tax, on="taxon_id",
        )
        obs_photos_taxa_with_observers_chunk = pd.merge(
            obs_photos_with_taxa_chunk, observers, on="observer_id",
        )
        obs_with_other_data_chunks.append(obs_photos_taxa_with_observers_chunk)
        # Advance once per processed chunk so the bar reflects real progress.
        pbar.update(1)

In [57]:
# Stack the per-chunk merge results row-wise into one frame.
obs_photos_taxa_with_observers = pd.concat(
    objs=obs_with_other_data_chunks,
    axis=0,
)

In [None]:
# Row count of the merged result (one row per observation/photo match).
len(obs_photos_taxa_with_observers)

In [59]:
# Write the enriched sample next to the source files, without the index column.
complete_sample_path = inat_ods_dir / "complete_obs_sample.csv"
obs_photos_taxa_with_observers.to_csv(complete_sample_path, index=False)