# iNaturalist API v1 Get Observations Example
- Link: https://jumear.github.io/stirpy/lab?path=iNatAPIv1_get_observations.ipynb
- GitHub Repo: https://github.com/jumear/stirpy

## Get Data from the iNaturalist API

In [None]:
# load required modules
from urllib.parse import parse_qs # used for parsing URL parameters
from pyodide.http import pyfetch # used for asynchronous fetching
import asyncio # used for asynchronous fetching
from copy import deepcopy # used for deep copying
import math # used for a ceiling method
#from datetime import datetime # used to convert string datetimes into actual datetimes

In [None]:
# request parameters, written as a URL query string and parsed into a dict
# whose values are lists of strings (the shape the functions below expect)
req_params_string = 'verifiable=true&spam=false&user_id=pisum'
req_params = parse_qs(req_params_string)

# base headers; fetchdata copies these (and adds Authorization) for authorized requests
req_headers_base = {
    'Content-Type': 'application/json',
    'Accept': 'application/json',
}

# to make authorized calls, set jwt to the "api_token" value from https://www.inaturalist.org/users/api_token.
# the JWT is valid for 24 hours. it can be used to do / access anything your iNat account can access,
# so keep it safe, and don't share it.
# you will also have to set use_authorization=True when making your API request below.
jwt = None

# description of the GET /v1/observations endpoint:
# max_records = most records the functions below will page through for one query,
# max_per_page = page size used when requesting all pages
endpoint_get_obs = dict(
    method='GET',
    url='https://api.inaturalist.org/v1/observations',
    max_records=10000,
    max_per_page=200,
)

In [None]:
# basic function to fetch from the API and convert the response to JSON
async def fetchdata(url, method='GET', use_authorization=False, delay=0):
    """Wait `delay` seconds, request `url`, and return the decoded JSON body.

    Headers are only sent when `use_authorization` is true AND the module-level
    `jwt` is set: in that case a copy of `req_headers_base` plus an
    `Authorization` header carrying the JWT is used. Otherwise the request is
    made with an empty header dict.
    """
    # the delay lets callers stagger requests that were all scheduled at once
    await asyncio.sleep(delay)
    if use_authorization and jwt:
        headers = {**deepcopy(req_headers_base), 'Authorization': jwt}
    else:
        headers = {}
    response = await pyfetch(url, method=method, headers=headers)
    payload = await response.json()
    print(f'fetch complete: {method} {url}')
    return payload

# function to GET total_results (count) from the API
async def gettotalresults(endpoint, params={}, use_authorization=False, delay=0):
    """Return the `total_results` value the endpoint reports for `params`.

    `params` is not modified; a deep copy is made and its `per_page` is forced
    to 0 because only the count is needed, not the records themselves.
    `delay` is passed through to fetchdata.
    """
    # drop any existing per_page and re-add it last, so it ends up at the end of the query string
    count_params = {k: v for k, v in deepcopy(params).items() if k != 'per_page'}
    count_params['per_page'] = ['0']
    url = urlwithparams(endpoint['url'], count_params)
    response = await fetchdata(url, use_authorization=use_authorization, delay=delay)
    total = response['total_results']
    print (f'total records: {total!s}')
    return total

# function to GET results from the API
# if get_all_pages=True, then get all records, up to the limit that the API endpoint provides.
# query pages in parallel, with each page having an incrementally delayed start.
# (iNaturalist wants you to limit requests to ~1 req/second.)
async def getresults(endpoint, params={}, get_all_pages=False, use_authorization=False):
    """Fetch records from `endpoint` and return them as one combined list.

    Args:
        endpoint: dict with at least 'url', 'max_records' and 'max_per_page'.
        params: dict of query parameters whose values are lists of strings.
            It is deep-copied for every request and never modified.
        get_all_pages: when False, a single request is made with `params`
            as given. When True, `per_page` / `page` in `params` are
            overridden and every page is requested, up to
            ceil(max_records / max_per_page) pages.
        use_authorization: passed through to fetchdata.

    Returns:
        The concatenation of the 'results' list of every page, in page order.
    """
    max_page = 1
    if get_all_pages:
        max_page = math.ceil(endpoint['max_records'] / endpoint['max_per_page'])
        # when getting all pages, make a small query first to find how many total records there are.
        # this allows us to calculate how many requests we need to make in total.
        # if total records exceeds the maximum that the API will return, then retrieve only up to the maximum.
        # (gettotalresults is a coroutine function, so it must be awaited to get the number.)
        total_results = await gettotalresults(endpoint, params, use_authorization)
        total_pages = math.ceil(total_results / endpoint['max_per_page'])
        max_page = min(max_page, total_pages)
        print (f'pages to retrieve: {str(max_page)}')
    tasks = []
    async with asyncio.TaskGroup() as tg:
        for i in range(max_page):
            rp = deepcopy(params)
            if get_all_pages:
                # if getting all pages, remove per_page and page parameters if they exist in the base params
                # and then set per_page = max and increment page for each request
                rp.pop('per_page', None)
                rp.pop('page', None)
                rp['per_page'] = [str(endpoint['max_per_page'])]
                rp['page'] = [str(i+1)]
            # delay=i staggers the requests: page i+1 starts i seconds after the tasks are created
            tasks.append(tg.create_task(fetchdata(urlwithparams(endpoint['url'], rp), use_authorization=use_authorization, delay=i)))
    # leaving the TaskGroup block waits for every task, so all results are available here
    results = []
    for t in tasks:
        results += t.result()['results']
    print (f'total records retrieved: {str(len(results))}')
    return results

# function used by another function getfieldvalue to get a particular value from a results row
def getrefvalue(rec, ref):
    """Resolve a dotted / indexed reference such as 'taxon.id' or 'photos[0].url' against `rec`.

    Args:
        rec: a dict (one record from the API results, or a nested object within it).
        ref: reference string. Segments are separated by '.', and a segment may
            carry one or more list indexes in square brackets (ex. 'photos[0]').

    Returns:
        The referenced value, or None when any key along the chain is missing
        or None, or when a list index refers past the end of (or into an empty) list.
    """
    # if the reference is chained (ex. taxon.id), then split these apart, and iterate through each object / dict.
    # if any of the references is an index for an array / list (ex. index 0 in photos[0].id), then handle those, too.
    value = rec
    for part in ref.split('.'):
        indexes = []
        if '[' in part:
            # 'photos[0]' -> key 'photos', indexes ['0']; 'grid[1][0]' -> key 'grid', indexes ['1', '0']
            key, *indexes = part.replace(']', '').split('[')
        else:
            key = part
        value = value.get(key)
        if value is None:
            return None
        for idx in map(int, indexes):
            if not value:
                # nothing to index into (empty list or None)
                return None
            try:
                value = value[idx]
            except IndexError:
                # the list is shorter than the requested index; treat it like any other missing value
                return None
        if value is None:
            return None
    return value

# helper used by getfieldvalue to decide whether one list item passes every filter in a list of filters
def _matchesfilters(item, rec, filters):
    """Return True when `item` satisfies every filter in `filters`.

    Each filter is a dict with a 'ref' (looked up in `item`) and either a
    'value' (literal to compare against) or a 'value_ref' (looked up in the
    parent record `rec`). A filter with neither never matches.
    """
    for f in filters:
        if f.get('value') is not None:
            # compare against None rather than truthiness so that falsy literals (False, 0, '') can be filtered on
            expected = f['value']
        elif f.get('value_ref'):
            expected = getrefvalue(rec, f['value_ref'])
        else:
            return False
        if not (getrefvalue(item, f.get('ref')) == expected):
            return False
    return True

# function used by another function parseresults to parse a results row and get / calculate the value for a particular field
def getfieldvalue(rec, field):
    """Compute the value of one field definition for one record.

    Args:
        rec: a record (dict) from the API results.
        field: field definition dict. 'ref' is required; 'alt' is an optional
            fallback reference used when 'ref' yields None; 'function' and
            'params' optionally post-process the referenced value:
              count        - length of the referenced list (0 when missing)
              split        - split a string on params['separator'] and take
                             params['index'], converted to int or float when possible
              join         - join the list items with params['separator']
              replace      - replace params['old_text'] with params['new_text']
              multiselect  - for each list item, fill params['template'] ('{0}', '{1}', ...)
                             with the values of params['select_refs'], then join
              filtercount  - number of list items matching params['filter']
              filterselect - params['select_ref'] of the list items matching
                             params['filter'], joined with params['separator']

    Returns:
        The computed value. When the referenced value is None, the result is
        None (or 0 for 'count'). The joining functions return None rather than
        an empty string when there is nothing to join.
    """
    value = getrefvalue(rec, field['ref'])
    if value is None and field.get('alt') is not None:
        value = getrefvalue(rec, field['alt'])
    function = field.get('function')
    if function == 'count':
        return len(value or [])
    if value is None:
        return None
    fp = field.get('params', {})
    if function == 'split':
        value = value.split(fp.get('separator'))[fp.get('index')]
        # return a number when the piece looks like one; otherwise keep the text
        for convert in (int, float):
            try:
                return convert(value)
            except ValueError:
                continue
    elif function == 'join':
        value = fp.get('separator').join(map(str, value)) if len(value) > 0 else None
    elif function == 'replace':
        value = value.replace(fp.get('old_text'), fp.get('new_text'))
    elif function == 'multiselect':
        fvalue = []
        for item in value:
            svalue = fp.get('template')
            for i, sr in enumerate(fp.get('select_refs')):
                # '{0}' is replaced by the first selected value, '{1}' by the second, and so on
                svalue = svalue.replace(f'{{{i}}}', str(getrefvalue(item, sr)))
            fvalue.append(svalue)
        value = fp.get('separator').join(map(str, fvalue)) if len(fvalue) > 0 else None
    elif function == 'filtercount':
        value = sum(1 for item in value if _matchesfilters(item, rec, fp.get('filter')))
    elif function == 'filterselect':
        fvalue = [
            getrefvalue(item, fp.get('select_ref'))
            for item in value
            if _matchesfilters(item, rec, fp.get('filter'))
        ]
        value = fp.get('separator').join(map(str, fvalue)) if len(fvalue) > 0 else None
    return value

# function used to parse a results set based on a set of field definitions
def parseresults(results, fields=[]):
    """Turn each record in `results` into a flat dict described by `fields`.

    Every field definition contributes one key (its 'label', or its 'ref' when
    it has no label) whose value is computed by getfieldvalue. Returns one
    dict per record, in the same order as `results`.
    """
    return [
        {(f.get('label') or f.get('ref')): getfieldvalue(rec, f) for f in fields}
        for rec in results
    ]

# function to get and parse observations
# get the results from API using function getresults.
# define the data we want in parse_fields.
# use function parseresults to transform the results to the data we want.
async def getobs(params={}, get_all_pages=False, use_authorization=False):
    """Fetch observations and return them as a list of flat dicts.

    The raw records come from getresults (called with endpoint_get_obs and the
    arguments given here) and are flattened by parseresults according to the
    parse_fields list defined below. Each returned dict has one key per active
    entry in parse_fields: the entry's 'label', or its 'ref' when it has no
    label. Entries that are commented out are available but not exported.
    The number of parsed observations is printed before returning.
    """
    results = await getresults(endpoint_get_obs, params, get_all_pages, use_authorization)
    # each object in the field definition must have at least a ref (reference).
    # use an optional label if you want the key to be different from the ref.
    # use an optional alt (alternative reference) if you want a fallback in case no data is found in ref.
    # use optional function + params to do more complicated parsing of the ref.
    parse_fields = [
        {'ref': 'id'},
        #{'ref': 'uuid'},
        {'ref': 'quality_grade'},
        #{'label': 'user_id', 'ref': 'user.id'},
        {'label': 'user_login', 'ref': 'user.login'},
        #{'label': 'user_name', 'ref': 'user.name'},
        {'label': 'taxon_id', 'ref': 'taxon.id'},
        {'label': 'taxon_name', 'ref': 'taxon.name'},
        {'label': 'taxon_preferred_common_name', 'ref': 'taxon.preferred_common_name'},
        {'label': 'taxon_rank', 'ref': 'taxon.rank'},
        #{'label': 'taxon_rank_level', 'ref': 'taxon.rank_level'},
        #{'label': 'taxon_ancestry', 'ref': 'taxon.ancestry'},
        #{'ref': 'observed_on_string'},
        {'ref': 'time_observed_at'},
        {'ref': 'created_at'},
        #{'ref': 'updated_at'},
        {'ref': 'place_guess'},
        #{'ref': 'location'},
        {'label': 'latitude', 'ref': 'location', 'function': 'split', 'params': {'separator': ',', 'index': 0}},
        {'label': 'longitude', 'ref': 'location', 'function': 'split', 'params': {'separator': ',', 'index': 1}},
        {'ref': 'public_positional_accuracy'},
        #{'ref': 'private_place_guess'},
        #{'ref': 'private_location'},
        #{'label': 'private_latitude', 'ref': 'private_location', 'function': 'split', 'params': {'separator': ',', 'index': 0}},
        #{'label': 'private_longitiude', 'ref': 'private_location', 'function': 'split', 'params': {'separator': ',', 'index': 1}},
        #{'ref': 'positional_accuracy'},
        {'ref': 'taxon_geoprivacy'},
        {'ref': 'privacy'},
        {'ref': 'description'},
        {'label': 'photos_count', 'ref':'photos', 'function': 'count'},
        #{'label': 'photo_1_id', 'ref': 'photos[0].id'},
        {'label': 'photo_1_url', 'ref': 'photos[0].url', 'function': 'replace', 'params': {'old_text': 'square', 'new_text': 'medium'}}, # size options are thumb, square, small, medium, large, and original
        {'label': 'photo_1_license_code', 'ref': 'photos[0].license_code'},
        {'label': 'sounds_count', 'ref':'sounds', 'function': 'count'},
        {'ref': 'comments_count'},
        #{'label': 'others_current_identifications_count', 'ref': 'identifications_count'},
        {'label': 'current_identifications_count', 'ref': 'identifications', 'function': 'filtercount', 'params': {'filter': [{'ref': 'current', 'value': True}]}},
        #{'label': 'current_identifications_by_observer', 'ref': 'identifications', 'function': 'filtercount', 'params': {'filter': [{'ref': 'current', 'value': True}, {'ref': 'user.id', 'value_ref': 'user.id'}]}},
        {'label': 'current_identification_by_observer', 'ref': 'identifications', 'function': 'filterselect', 'params': {'filter': [{'ref': 'current', 'value': True}, {'ref': 'user.id', 'value_ref': 'user.id'}], 'select_ref': 'taxon.name', 'separator': ', '}},
        {'label': 'current_identification_category_by_observer', 'ref': 'identifications', 'function': 'filterselect', 'params': {'filter': [{'ref': 'current', 'value': True}, {'ref': 'user.id', 'value_ref': 'user.id'}], 'select_ref': 'category', 'separator': ', '}},
        {'ref': 'owners_identification_from_vision'},
        {'label': 'prefers_community_taxon', 'ref': 'preferences.prefers_community_taxon', 'alt': 'user.preferences.prefers_community_taxa'},
        #{'label': 'identifier_ids', 'ref': 'identifications', 'function': 'filterselect', 'params': {'filter': [{'ref': 'current', 'value': True}], 'select_ref': 'user.id', 'separator': ', '}},
        {'label': 'identifier_logins', 'ref': 'identifications', 'function': 'filterselect', 'params': {'filter': [{'ref': 'current', 'value': True}], 'select_ref': 'user.login', 'separator': ', '}},
        {'label': 'reviewed_by_count', 'ref': 'reviewed_by', 'function': 'count'},
        #{'ref': 'reviewed_by', 'function': 'join', 'params': {'separator':', '}},
        #{'ref': 'captive'},
        {'label': 'annotations_count','ref':'annotations', 'function': 'count'},
        {'label': 'annotations', 'ref': 'annotations', 'function': 'multiselect', 'params': {'select_refs': ['controlled_attribute_id','controlled_value_id'], 'template': '{0}:{1}', 'separator': ', '}},
        {'label': 'observation_fields_count', 'ref':'ofvs', 'function': 'count'},
        {'label': 'observation_fields', 'ref': 'ofvs', 'function': 'multiselect', 'params': {'select_refs': ['name','field_id','value'], 'template': '{0} ({1}): {2}', 'separator': '; '}},
        {'label': 'tags_count', 'ref':'tags', 'function': 'count'},
        {'ref': 'tags', 'function': 'join', 'params': {'separator':', '}},
        #{'ref': 'oauth_application_id'},
        #{'ref': 'site_id'},
        {'label': 'gbif_occurence_url', 'ref': 'outlinks', 'function': 'filterselect', 'params': {'filter': [{'ref': 'source', 'value': 'GBIF'}], 'select_ref': 'url', 'separator': ', '}},
    ]
    obs = parseresults(results, parse_fields)
    print (f'observations parsed: {str(len(obs))}')
    return obs

# function to get a set of observation ids only
# if a separator string parameter is passed in, the function will return a string where the ids are separated by that separator.
# otherwise, the function will return a list of ids
async def getobsid(params={}, get_all_pages=False, use_authorization=False, separator=None):
    """Return the ids of the observations matching `params`.

    A deep copy of `params` with only_id=true is sent through getresults, so
    `params` itself is left untouched. The result is a list of ids, or a single
    string of ids joined by `separator` when a non-empty separator is given.
    """
    id_params = deepcopy(params)
    id_params['only_id'] = ['true'] # ask the API for ids only, since that is all we use
    records = await getresults(endpoint_get_obs, id_params, get_all_pages, use_authorization)
    ids = [rec.get('id') for rec in records]
    print (f'observations parsed: {str(len(ids))}')
    return separator.join(map(str, ids)) if separator else ids

# function to get a series of counts
# base_params are the (fixed) parameters that will be applied when getting the count for each item in the series.
# series_params is a list of (variable) parameters (keys) to add to base_params for each item in the series.
# series is a list of dicts, each of which defines the parameter key/value pairs for each item in the series.
# each item in the series list can contain additional attributes that are not parameters, and it does not have to contain all the keys in the series_params list.
# if add_count_to_series is set to True, the function will add the counts to the original series object; otherwise, it just returns a (deep) copy of series with counts.
async def getcountseries(endpoint, series, series_params, base_params={}, count_label='rec_count', use_authorization=False, add_count_to_series=False):
    """Get one record count per item in `series` and store it under `count_label`.

    For each item, the request parameters are a deep copy of `base_params`
    overridden by the item's values for the keys listed in `series_params`
    (keys the item lacks, or holds as None, are skipped). The counts are
    requested concurrently via gettotalresults, with the n-th request delayed
    by n seconds.

    Returns the list of items with counts added (the original `series` list
    when add_count_to_series=True, otherwise a deep copy), or None after
    printing a message when `series` or `series_params` is empty.
    """
    if len(series) == 0 or len(series_params) == 0:
        print('The series parameter must be a list of dicts with keys that include the values in the list passed in for series_params.')
        return None
    # the output looks the same either way, but with add_count_to_series=True the caller's list is the one that gets changed
    results = series if add_count_to_series else deepcopy(series)
    tasks = []
    async with asyncio.TaskGroup() as tg:
        for position, item in enumerate(results):
            rp = deepcopy(base_params)
            for sp in series_params:
                sv = item.get(sp)
                if sv is not None:
                    rp.pop(sp, None)
                    rp[sp] = [str(sv)]
            tasks.append(tg.create_task(gettotalresults(endpoint, rp, use_authorization=use_authorization, delay=position)))
    # tasks were created in series order, so pair them back up positionally
    for item, task in zip(results, tasks):
        item[count_label] = task.result()
    return results

# function to combine the base url with a set of parameters
# there's a urlencode method in urllib.parse, but it's easier to get exactly what I need using this custom code
# (multiple values for one parameter are sent as a single comma-separated value).
def urlwithparams(url_base, params={}):
    """Append `params` to `url_base` as a query string and return the URL.

    Args:
        url_base: URL to extend; it may already contain a query string.
        params: dict mapping parameter names to lists of values (the shape
            produced by urllib.parse.parse_qs). It is not modified.

    Returns:
        `url_base` followed by one name=value pair per key, where the values
        of a key are joined with commas. Names and values are percent-encoded
        (commas and colons are left as-is) so that characters such as '&',
        '#', '+', '%' or spaces inside a value cannot corrupt the query string.
    """
    from urllib.parse import quote # local import: only this function needs it

    url = url_base
    for name, values in params.items():
        separator = '&' if '?' in url else '?'
        joined = ','.join(quote(str(v), safe=',:') for v in values)
        url += f"{separator}{quote(str(name), safe=',:')}={joined}"
    return url

In [None]:
# main execution section

# get observations matching req_params, parsed into a list of dicts (one per observation).
# with get_all_pages=False, only a single request is made using the parameters as given.
# NOTE: this is a top-level await, so it presumably relies on the notebook environment allowing that.
obs = await getobs(req_params, get_all_pages=False, use_authorization=False)
#obs

# get observation ids from obs
#obs_ids = ','.join(map(str,[o.get('id') for o in obs]))

# get observation ids without first getting obs
#obs_ids = await getobsid(req_params, get_all_pages=False, use_authorization=False, separator=',')
#print(f'https://www.inaturalist.org/observations/identify?id={obs_ids}')

# get just total results (count)
#obs_count = await gettotalresults(endpoint_get_obs, req_params, use_authorization=False)
#obs_count

# get a series of counts
#obs_count_series = [
#    {'label': 'Texas 2020', 'year': 2020, 'place_id': 18},
#    {'label': 'not Texas 2020', 'year': 2020, 'not_in_place': 18},
#    {'label': 'Texas 2021', 'year': 2021, 'place_id': 18},
#    {'label': 'not Texas 2021', 'year': 2021, 'not_in_place': 18},
#]
#await getcountseries(endpoint_get_obs, obs_count_series, ['year','place_id','not_in_place'], base_params=req_params, count_label='obs_count', use_authorization=False, add_count_to_series=True)
#obs_count_series

In [None]:
# if you order by id when you get observations (this is the default behavior if you don't specify an order_by parameter), 
# then it should be possible to work around the max 10000 record limit of the API by using the id_above or id_below parameters.
# i purposely am not automating this process completely (because I don't want to make it too easy to accidentally get a ton of data),
# but i'm including this bit of code here to provide an idea of how to do it.
# to use the code below, set get_more_obs = True before running.
get_more_obs = False
#if get_more_obs and obs and len(obs) >= endpoint_get_obs['max_records'] and len(obs) % endpoint_get_obs['max_records'] == 0:
if get_more_obs and obs:
    rp = deepcopy(req_params)
    if rp.get('order_by','id') == 'id': # this only works if the records were sorted by id
        if rp.get('order') == 'asc':
            max_id = max([o.get('id') for o in obs])
            print(f'getting additional observations for id_above={max_id}')
            rp.pop('id_above', None) # remove per_page parameter, if it exists
            rp['id_above'] = [str(max_id)] # set this to the max_id so that the records we get will have ids above the obs we already have
        else:
            min_id = min([o.get('id') for o in obs])
            print(f'getting additional observations for id_below={min_id}')
            rp.pop('id_below', None) # remove per_page parameter, if it exists
            rp['id_below'] = [str(min_id)] # set this to the min_id so that the records we get will have ids below the obs we already have
        obs += await getobs(rp, get_all_pages=True, use_authorization=False)
        print(f'observations accumulated: {len(obs)}')

## Write Data to CSV

In [None]:
# load required modules
import csv # used to output CSV files

In [None]:
# function to write data to a CSV file
def datatocsv(data, csv_filename='export.csv'):
    """Write a list of dicts to `csv_filename` as UTF-8 encoded CSV.

    Args:
        data: list of dicts, one per row. The column names (and their order)
            are taken from the keys of the first dict.
        csv_filename: path of the file to create or overwrite.

    Returns:
        None. A message with the number of rows written is printed. When
        `data` is empty there are no columns to derive, so no file is created
        and a message saying so is printed instead.
    """
    if not data:
        print (f'no records to export; CSV file {csv_filename} was not created.')
        return
    csv_fields = list(data[0].keys()) # get fields from the keys of the first record in the dataset
    # newline='' is what the csv module expects; an explicit encoding keeps non-ASCII text
    # (names, descriptions) from failing on platforms whose default encoding is not UTF-8.
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=csv_fields)
        csv_writer.writeheader()
        csv_writer.writerows(data)
    print (f'created CSV file {csv_filename} with {len(data)} records.')

In [None]:
# export the observations in obs to a CSV file named observations.csv
datatocsv(obs,'observations.csv')

## Work with Data in a DataFrame

In [None]:
# load required modules
import pandas as pd

In [None]:
# load data into a DataFrame (df): one row per observation in obs, one column per key
df = pd.DataFrame(obs)

In [None]:
# Get basic summary statistics for df (shown as the cell output)
df.describe()

In [None]:
# Preview the contents of the df (shown as the cell output)
df

In [None]:
# convert datetime columns to datetimes, localized to UTC
# (errors='coerce' turns values that cannot be parsed into NaT instead of raising)
for k in ['time_observed_at','created_at','updated_at']:
    if k in df.columns:
        try:
            df[k] = pd.to_datetime(df[k], utc=True, errors='coerce')
        except Exception:
            # best effort: report the column and carry on with the others,
            # without swallowing KeyboardInterrupt / SystemExit like a bare except would
            print(f'could not convert column {k} datetime')

# get count (of id) by observed year
df.groupby(df.time_observed_at.dt.year).id.count()

# get count (of id) by created year
#df.groupby(df.created_at.dt.year).id.count()

In [None]:
# records where observation_fields are not null
# (i.e. rows whose observation_fields column holds a value)
df.loc[df.observation_fields.notnull()]

# count (of id of) records where acc > 100
# df.loc[df.public_positional_accuracy > 100].id.count()