# iNaturalist API v1 Get Observations Example
- Link: https://jumear.github.io/stirpy/lab?path=iNat_APIv1_get_observations.ipynb
- GitHub Repo: https://github.com/jumear/stirpy

## Get Data from the iNaturalist API

This example gets either a single page or multiple pages of results from the [API](https://api.inaturalist.org/). The requests are made asynchronously (in parallel, with a small incremental delay between the initiation of each page request), allowing large recordsets to be fetched in the shortest amount of time while respecting iNaturalist's [suggested request limit](https://www.inaturalist.org/pages/developers) (about 1 per second).

The example also provides a model for parsing the results, including flattening some of the items returned in the results (for use cases where the data is expected to be tabular, such as when exporting to a CSV file). For example, a single observation can be associated with multiple identifications, and this example code can flatten those multiple identifications into a single string so that all the identifications can be written out on a single line with the same observation row. This example also provides a model for client-side filtering (as an alternative when server-side filtering is not possible). The example shows how to get additional information, such as annotation descriptions and standard place information for observations.

There are also examples of how to get counts of observations or a series of counts (ex. observation counts by state).

In [None]:
# load required modules
from urllib.parse import parse_qs # used for parsing URL parameters
import asyncio # used for asynchronous fetching
import math # used for a ceiling method
from functools import partial # used for pre-loading functions with some arguments
from datetime import datetime # used to convert string datetimes into actual datetimes

# use Pyodide's pyfetch module if possible, but fall back to urllib3 outside of Pyodide
try:
    from pyodide.http import pyfetch # Pyodide's fetch function (asynchronous)
except:
    #!pip install urllib3
    import urllib3 # fall back to urllib3 if pyfetch isn't available. it can be made asynchronous using asynchio.to_thread().

In [None]:
# define custom functions used for getting data

def params_to_dict(params_string):
    """Parse a URL parameter string into a dict of value lists, expanding comma separated values.
    ex.: 'taxon_id=1&user_id=kueda,loarie' => {'taxon_id': ['1'], 'user_id': ['kueda', 'loarie']}
    """
    # parse_qs groups repeated parameters but leaves 'a,b' as a single value.
    # iNaturalist uses commas to pass multiple values for one parameter, so each parsed value is split on commas here.
    return {
        name: [item for raw in raw_values for item in (raw.split(',') if raw else raw)]
        for name, raw_values in parse_qs(params_string).items()
    }

def url_with_params(url_base, params=None):
    """Combine a base url with a set of parameters. Can handle the following types of cases:
    1. 'https://api.inaturalist.org/v1/observations' + {'taxon_id': [1], 'user_id': ['kueda','loarie']} => 'https://api.inaturalist.org/v1/observations?taxon_id=1&user_id=kueda,loarie'
    2. 'https://api.inaturalist.org/v1/places/{id}' + {'id': [1,2,3], 'admin_level': [0,10]} => 'https://api.inaturalist.org/v1/places/1,2,3?admin_level=0,10'

    params maps each parameter name to a list of values. Values may be strings or any other type
    (ex. ints); they are converted with str() and joined with commas. A parameter whose name appears
    in the url as a '{name}' placeholder is substituted into the path; any other parameter is appended
    to the query string. Values are not URL-encoded. Returns the resulting url string.
    """
    if params is None:
        params = {}
    url = url_base
    for p, v in params.items():
        pv = ','.join(map(str, v)) # str() each value so that non-string values (ex. ints) can be joined
        placeholder = f'{{{p}}}'
        if placeholder in url:
            url = url.replace(placeholder, pv)
        else:
            separator = '&' if '?' in url else '?' # the first query parameter starts with ?, later ones with &
            url += f'{separator}{p}={pv}'
    return url

async def fetch_data(url, method='GET', use_authorization=False, delay=0):
    """Wait for the requested delay, make the request, and return the response body parsed from JSON.
    When use_authorization is set and the module-level jwt value is truthy, the request is sent with a copy
    of the module-level req_headers_base plus an Authorization header. (Both names are defined outside this function.)
    """
    await asyncio.sleep(delay) # callers stagger parallel requests by passing an increasing delay
    headers = {}
    if use_authorization and jwt:
        headers = req_headers_base.copy() # copy so that the shared base headers are not modified
        headers['Authorization'] = jwt
    have_pyfetch = 'pyfetch' in globals() # pyfetch exists only when the Pyodide import succeeded
    if not have_pyfetch:
        # urllib3 is blocking, so run it in a worker thread to keep it from blocking other tasks
        response = await asyncio.to_thread(urllib3.request, method, url, headers=headers)
        data = response.json()
    else:
        response = await pyfetch(url, method=method, headers=headers)
        data = await response.json()
    print(f'Fetch complete: {method} {url}')
    return data

async def get_total_results(endpoint, params=None, use_authorization=False, delay=0):
    """GET only the total_results (count) for a query from the API, without retrieving any records."""
    source_params = {} if params is None else params
    # drop any per_page supplied by the caller, then add per_page=0 at the end, since only the count is needed
    count_params = {name: values for name, values in source_params.items() if name != 'per_page'}
    count_params['per_page'] = ['0']
    url = url_with_params(endpoint['url'], count_params)
    data = await fetch_data(url, use_authorization=use_authorization, delay=delay)
    total_results = int(data['total_results'])
    print(f'Total records: {total_results}')
    return total_results

async def get_results_single_page(endpoint, params=None, use_authorization=False, parse_function=None, pre_parse_filter_function=None, post_parse_filter_function=None, delay=0):
    """GET one page of results from the API and return the (optionally filtered and parsed) rows.
    Usually called by get_results, but can also be called directly.
    Processing order: pre_parse_filter_function => parse_function => post_parse_filter_function. Each step is skipped when not provided.
    """
    query = {} if params is None else params.copy() # work on a copy of the caller's params
    url = url_with_params(endpoint['url'], query)
    payload = await fetch_data(url, use_authorization=use_authorization, delay=delay)
    rows = payload.get('results', [])
    if pre_parse_filter_function:
        rows = [row for row in rows if pre_parse_filter_function(row)]
    if parse_function:
        rows = parse_function(rows)
    if post_parse_filter_function:
        rows = [row for row in rows if post_parse_filter_function(row)]
    return rows

async def get_results(endpoint, params=None, get_all_pages=False, use_authorization=False, parse_function=None, pre_parse_filter_function=None, post_parse_filter_function=None):
    """GET results from the API, as either a single request or (when get_all_pages=True) as multiple requests, using 1 of 2 paging methods:
    1. When the endpoint definition has a page_key, the values of that parameter are grouped into batches of up to max_per_page values, with 1 request per batch.
       Suppose: endpoint = {'url': 'https://api.inaturalist.org/v1/taxa/{id}', 'page_key': 'id', 'max_per_page': 30 } and params = {'id': ['1','2','3',...,'60']}
       Then: GET https://api.inaturalist.org/v1/taxa/1,2,3,...,30; GET https://api.inaturalist.org/v1/taxa/31,32,33,...,60
    2. Otherwise, pages of max_per_page records are requested, up to the endpoint's max_records limit.
       Suppose: endpoint = {'url': 'https://api.inaturalist.org/v1/observations', 'max_records': 10000, 'max_per_page': 200 } and params = {'taxon_id': ['1']}
       Then: GET https://api.inaturalist.org/v1/observations?taxon_id=1&per_page=200&page=1; GET https://api.inaturalist.org/v1/observations?taxon_id=1&per_page=200&page=2; etc...
    The requests run in parallel, with the start of each one delayed by 1 more second than the previous one. (iNaturalist suggests limiting requests to ~1 req/second.)
    Returns the combined list of results from all requests, or None if a page_key endpoint is queried without values for the page_key parameter.
    """
    if params is None:
        params = {}
    request_params = [] # one params dict per request to be made
    page_key = endpoint.get('page_key')
    if page_key:
        page_key_values = params.get(page_key)
        if not page_key_values:
            print(f'Cannot query from this endpoint without values for {page_key} parameter')
            return None
        # split the key values into batches no larger than the endpoint allows per request
        max_per_page = endpoint['max_per_page']
        total_key_values = len(page_key_values)
        batches = [page_key_values[start:start+max_per_page] for start in range(0, total_key_values, max_per_page)]
        print(f'There are {total_key_values} {page_key} values, requiring {len(batches)} API requests to retrieve. Retrieving {"all sets" if get_all_pages else "only the first set"}...')
        for batch in (batches if get_all_pages else batches[:1]):
            rp = params.copy() # make a copy
            rp[page_key] = batch
            request_params.append(rp)
    elif get_all_pages:
        # the API returns only up to max_records records, so that caps the number of pages
        max_page = math.ceil(endpoint['max_records'] / endpoint['max_per_page'])
        # make a small query first to find the total record count, which determines how many pages actually exist
        total_results = await get_total_results(endpoint, params, use_authorization)
        max_page = min(max_page, math.ceil(total_results / endpoint['max_per_page']))
        print(f'Pages to retrieve: {max_page}')
        for page_number in range(1, max_page + 1):
            rp = params.copy() # make a copy
            # replace any per_page and page values from the base params with the max page size and this page number
            rp.pop('per_page', None)
            rp.pop('page', None)
            rp['per_page'] = [str(endpoint['max_per_page'])]
            rp['page'] = [str(page_number)]
            request_params.append(rp)
    else:
        request_params.append(params.copy()) # a single request using the params as provided
    page_options = {
        'use_authorization': use_authorization,
        'parse_function': parse_function,
        'pre_parse_filter_function': pre_parse_filter_function,
        'post_parse_filter_function': post_parse_filter_function,
    }
    async with asyncio.TaskGroup() as tg: # available in Python 3.11+
        # the position of each request is used as its delay
        tasks = [
            tg.create_task(get_results_single_page(endpoint, params=rp, delay=position, **page_options))
            for position, rp in enumerate(request_params)
        ]
    results = []
    for task in tasks:
        results.extend(task.result())
    print(f'Total records retrieved: {len(results)}')
    return results

def get_ref_value(rec, ref):
    """Used by another function get_field_value to get a particular value from a results row (rec).
    If the reference is chained, then traverse through the chain to get to the final item (ex. ref = 'identifications.taxon.id' + rec => identifications => taxon => id).
    If the chain includes list indexes (ex. index 0 in ref = 'photos[0].id'), then handle those situations, too.
    Returns None as soon as any step in the chain has no value, including when a list index is out of range (ex. 'photos[1].id' when there is only 1 photo).
    """
    value = rec
    for part in ref.split('.'):
        indexes = []
        if '[' in part:
            # 'photos[0][1]' => key 'photos' + indexes ['0', '1']
            part, *indexes = part.replace(']', '').split('[')
        value = value.get(part)
        for i in map(int, indexes):
            if value is None:
                break
            try:
                value = value[i]
            except (IndexError, KeyError): # no item at this index => treat it the same as a missing value
                value = None
        if value is None:
            break
    return value

def filter_ref_value(rec, value, params):
    """Used by another function get_field_value to filter a nested list (value) based on certain filter parameters.
    An item is kept only when it satisfies every filter. For each filter, the item's value at 'ref' must equal
    either the value found in the results row (rec) at 'value_ref' (when value_ref is set) or the literal 'value'.
    """
    def expected_value(condition):
        value_ref = condition.get('value_ref')
        return get_ref_value(rec, value_ref) if value_ref else condition.get('value')

    return [
        item for item in value
        if all(get_ref_value(item, condition.get('ref')) == expected_value(condition) for condition in params)
    ]

def _fill_template(source, fp):
    """Replace each '{i}' placeholder in fp['template'] with the value that the i-th entry of fp['combine_refs'] refers to in source."""
    text = fp.get('template','')
    for i, cr in enumerate(fp.get('combine_refs',[])):
        text = text.replace(f'{{{i}}}', str(get_ref_value(source, cr)))
    return text

def get_field_value(rec, field):
    """Used by another function parse_results to parse a results row (rec) and get / calculate the value based on a field definition.
    Steps, in order:
    1. Get the base value from 'ref' (the whole row if ref is missing or None), falling back to 'alt' if nothing was found.
    2. Apply the optional 'function' using 'params': count, split, join, replace, combine, filter_combine, filter_count, or filter_select.
       (Other than count, functions are skipped when the base value is None.)
    3. Apply the optional 'custom_function' to the value.
    4. Cast to the optional 'type'. If the cast fails, the uncast value is kept.
    """
    # get the base value
    ref = field.get('ref') # parse_results tolerates a missing ref, so do the same here
    value = get_ref_value(rec, ref) if ref else rec
    if value is None and field.get('alt'):
        value = get_ref_value(rec, field['alt'])
    # apply a transformation / calculation based on a specified "function" + "params" in the field definition
    if (ff := field.get('function')) == 'count':
        value = len(value) if value else 0
    elif value is not None:
        fp = field.get('params',{})
        if ff == 'split':
            value = value.split(fp.get('separator'))[fp.get('index')]
        elif ff == 'join':
            value = fp.get('separator').join(map(str, value)) if value else None
        elif ff == 'replace':
            value = value.replace(fp.get('old_text'), fp.get('new_text'))
        elif ff == 'combine':
            value = _fill_template(value, fp)
        elif ff == 'filter_combine':
            filtered = filter_ref_value(rec, value, fp.get('filter',[]))
            fvalue = [_fill_template(r, fp) for r in filtered]
            value = fp.get('separator').join(fvalue) if fvalue else None
        elif ff == 'filter_count':
            filtered = filter_ref_value(rec, value, fp.get('filter',[]))
            if (dr := fp.get('distinct_ref')) is None:
                value = len(filtered)
            else: # get a distinct count if distinct_ref is defined
                value = len({get_ref_value(r, dr) for r in filtered})
        elif ff == 'filter_select':
            filtered = filter_ref_value(rec, value, fp.get('filter',[]))
            fvalue = [get_ref_value(r, fp.get('select_ref')) for r in filtered]
            value = fp.get('separator').join(map(str, fvalue)) if fvalue else None
    # transform based on a custom function
    if (cf := field.get('custom_function')):
        value = cf(value)
    # cast the final value to a particular type specified in the field definition
    if (cast_as := field.get('type')):
        try:
            value = cast_as(value)
        except Exception: # best effort: keep the uncast value (ex. float(None)), but don't swallow KeyboardInterrupt / SystemExit
            pass
    return value

def parse_simple(results, field_list):
    """Reduce each result to only the top-level keys named in field_list.
    Nested references are not supported, but whatever is nested under a selected key is returned along with it. A key that is missing from a result gets the value None.
    """
    trimmed = []
    for rec in results:
        row = {}
        for key in field_list:
            row[key] = rec.get(key)
        trimmed.append(row)
    return trimmed

def parse_results(results, parse_fields, pre_parse_functions=None):
    """Turn each result into a flat row with one entry per field definition in parse_fields.
    Each of the pre_parse_functions is first called with the result (so that it can be manipulated in place prior to parsing).
    The key for each entry is the field's label, or else its ref, or else 'col_N' (where N is the field's 1-based position).
    """
    hooks = [] if pre_parse_functions is None else pre_parse_functions

    def column_name(position, field):
        return field.get('label') or field.get('ref') or f'col_{position}'

    parsed = []
    for rec in results:
        # special processing prior to core processing
        for hook in hooks:
            hook(rec)
        # core processing
        parsed.append({column_name(position, field): get_field_value(rec, field) for position, field in enumerate(parse_fields, start=1)})
    return parsed

def refs_in_parse_fields(parse_fields):
    """Return a distinct, sorted list of field references from a list of field definitions (parse_fields).
    This may be useful when v2 of the API allows the user to specify which fields to return in the API response.
    A field's ref and alt are included as-is when the field has no function or a join / split / replace function.
    References inside params (select_ref, distinct_ref, combine_refs, and filter refs) are relative to the field's ref, so they are prefixed with it (when there is one).
    A filter's value_ref is included as-is, because filter_ref_value looks it up in the results row rather than in the nested item.
    List indexes are removed (ex. 'photos[0].url' => 'photos.url').
    """
    refs = set()
    for f in parse_fields:
        ref = f.get('ref')
        prefix = '' if ref is None else f'{ref}.' # nested references are relative to the field's ref
        if f.get('function') in (None, 'join', 'split', 'replace'):
            refs.update(r for r in (ref, f.get('alt')) if r is not None)
        params = f.get('params') or {}
        for key in ('select_ref', 'distinct_ref'):
            if (nested_ref := params.get(key)) is not None:
                refs.add(prefix + nested_ref)
        for cr in params.get('combine_refs') or []:
            refs.add(prefix + cr)
        for fp in params.get('filter') or []:
            if (filter_ref := fp.get('ref')) is not None:
                refs.add(prefix + filter_ref)
            if (filter_value_ref := fp.get('value_ref')) is not None:
                refs.add(filter_value_ref) # looked up in the results row itself, so no prefix
    cleaned = set()
    for r in refs: # remove list item indexes
        while (pos_beg := r.find('[')) >= 0 and (pos_end := r.find(']', pos_beg)) >= 0:
            r = r[:pos_beg] + r[pos_end+1:]
        cleaned.add(r)
    return sorted(cleaned)

async def get_annotations():
    """Return a cross-reference of annotation codes (controlled term ids, for both attributes and values) to their descriptions (labels).
    The cross-reference is fetched from the controlled terms endpoint the first time only. It is then stored in an attribute on this function named xref, and later calls return that stored copy.
    (Only the codes are included in the GET /v1/observations results. So the cross-reference is needed to translate the codes to plain English.)
    """
    cached = getattr(get_annotations, 'xref', None)
    if cached is not None:
        return cached
    terms = await fetch_data(endpoint_get_controlled_terms['url'])
    xref = {}
    for term in terms['results']:
        xref[term['id']] = term['label']
        # the possible values of each term go into the same cross-reference
        xref.update((term_value['id'], term_value['label']) for term_value in term['values'])
    print(f'Retrieved annnotation cross-references ({len(xref)} items)')
    get_annotations.xref = xref
    return xref

def add_annot_descr_and_total_score(r):
    """Intended to be used as pre-parse function in parse_results when parsing observations.
    For each annotation on the observation, add the descriptions of its attribute and value, looked up in get_annotations.xref. (So get_annotations must have been run beforehand.)
    Also add a total_score of vote_score + 1 (since the annotation itself should be considered a vote).
    """
    for annotation in r.get('annotations', []):
        labels = get_annotations.xref
        annotation['controlled_attribute'] = labels[annotation['controlled_attribute_id']]
        annotation['controlled_value'] = labels[annotation['controlled_value_id']]
        # a missing or empty vote_score counts as 0.
        # note: a single user can both annotate and upvote, but such upvotes get no special handling
        annotation['total_score'] = 1 + (annotation.get('vote_score') or 0)

def add_obs_field_taxon_or_value(r):
    """Intended to be used as pre-parse function in parse_results when parsing observations.
    Add a taxon_or_value entry to each observation field value (ofvs): 'name (id)' of the taxon when the datatype is taxon and a taxon is present, or the plain value otherwise.
    """
    for field_value in r.get('ofvs', []):
        if field_value['datatype'] == 'taxon' and field_value.get('taxon'):
            taxon = field_value['taxon']
            field_value['taxon_or_value'] = '{} ({})'.format(taxon['name'], taxon['id'])
        else:
            field_value['taxon_or_value'] = field_value['value']

def add_ident_vs_obs_comparison(r):
    """Intended to be used as pre-parse function in parse_results when parsing observations.
    Add a "vs_obs" entry to each identification, which tells how the identification's taxon relates to the observation taxon:
    'none' (one of the taxa is missing), 'same', 'descendant' (the observation taxon is in the identification taxon's ancestry),
    'ancestor' (the identification taxon is in the observation taxon's ancestry), or 'different'.
    """
    def ancestry_has(taxon, taxon_id):
        # ancestry is a string of taxon ids separated by slashes
        ancestry = taxon.get('ancestry')
        return ancestry is not None and taxon_id in (int(part) for part in ancestry.split('/'))

    for ident in r.get('identifications', []):
        obs_taxon = r.get('taxon')
        ident_taxon = ident.get('taxon') if obs_taxon else None
        if not obs_taxon or not ident_taxon:
            comparison = 'none'
        elif obs_taxon['id'] == ident_taxon['id']:
            comparison = 'same'
        elif ancestry_has(ident_taxon, obs_taxon['id']):
            comparison = 'descendant'
        elif ancestry_has(obs_taxon, ident_taxon['id']):
            comparison = 'ancestor'
        else:
            comparison = 'different'
        ident['vs_obs'] = comparison

def add_obs_taxon_ancestors(r):
    """Intended to be used as pre-parse function in parse_results when parsing observations.
    The observation taxon itself has an ancestor list but no detailed ancestor information; however, the taxon fields in the identifications do have ancestor details.
    So this adds ancestor details to the observation taxon, based on the ancestor details in the identifications (since the observation taxon should always be included in the identification taxa or their ancestors).
    The ancestors list that is added ends with a copy of the observation taxon itself.
    Nothing is added when the observation has no taxon, or when the taxon's rank_level is missing or above kingdom.
    """
    rank_level_kingdom = 70 # this is the highest-level taxon stored in identification[i].ancestors
    rt = r.get('taxon')
    if not rt:
        return
    rank_level = rt.get('rank_level')
    if rank_level is None or rank_level > rank_level_kingdom:
        return
    ancestors = []
    taxon_id = rt.get('id')
    if taxon_id is not None and rank_level < rank_level_kingdom: # a kingdom has no ancestors to look for
        for ident in r.get('identifications',[]):
            if not (idt := ident.get('taxon')):
                continue
            idta = idt.get('ancestors') or []
            if idt['id'] == taxon_id:
                ancestors = list(idta)
                break
            for i, a in enumerate(idta):
                if a['id'] == taxon_id:
                    ancestors = idta[0:i] # add everything above this taxon (will add this taxon later below)
                    break
            if ancestors:
                break
    ancestors.append(rt.copy()) # copy the taxon before it gets its own ancestors entry
    rt['ancestors'] = ancestors

async def get_obs(params=None, get_all_pages=False, use_authorization=False, parse_function=None, pre_parse_filter_function=None, post_parse_filter_function=None):
    """Get and parse observations from the API"""
    if params is None:
        params = {}
    pre_parse_functions = []
    post_get_functions = []
    if parse_function is None:
        if params.get('only_id',['false']) == ['true']: # if only_id=true, then don't parse fields because only id will exist in the results
            parse_function = None
        else: # if a custom parse_function is not specified, use parse_results with some default field definitions
            # each dict in the field definition must have at least a ref (reference) key. (note: if ref is set to None, the observation row will be retrieved as the value.)
            # use an optional label if you want the key to be different from the ref.
            # use an optional type to cast the field to a specific data type.
            # use an optional alt (alternative reference) if you want a fallback ref in case no data is found in ref.
            # use optional function + params to do more complicated parsing of the ref,
            # even more complicated logic can be handled with a custom_function, pre_parse_functions, or post_get_functions. 
            parse_fields = [
                {'ref': 'id'},
                #{'label': 'url', 'ref': None, 'function': 'combine', 'params': {'combine_refs': ['id'], 'template': 'https://www.inaturalist.org/observations/{0}'}},
                #{'ref': 'uuid'},
                {'ref': 'quality_grade'},
                #{'label': 'user_id', 'ref': 'user.id'},
                {'label': 'user_login', 'ref': 'user.login'},
                #{'label': 'user_login_id', 'ref': 'user', 'function': 'combine', 'params': {'combine_refs': ['login','id'], 'template': '{0} ({1})'}},
                #{'label': 'user_name', 'ref': 'user.name'},
                #{'label': 'taxon_ancestors', 'ref': 'taxon.ancestors', 'function': 'filter_combine', 'params': {'combine_refs': ['name','rank','id'], 'template': '{0} ({1}) ({2})', 'separator': ', '}},
                #{'label': 'kingdom', 'ref': 'taxon.ancestors', 'function': 'filter_select', 'params': {'filter': [{'ref': 'rank', 'value': 'kingdom'}], 'select_ref': 'name', 'separator': ', '}},
                #{'label': 'phylum', 'ref': 'taxon.ancestors', 'function': 'filter_select', 'params': {'filter': [{'ref': 'rank', 'value': 'phylum'}], 'select_ref': 'name', 'separator': ', '}},
                #{'label': 'class', 'ref': 'taxon.ancestors', 'function': 'filter_select', 'params': {'filter': [{'ref': 'rank', 'value': 'class'}], 'select_ref': 'name', 'separator': ', '}},
                #{'label': 'order', 'ref': 'taxon.ancestors', 'function': 'filter_select', 'params': {'filter': [{'ref': 'rank', 'value': 'order'}], 'select_ref': 'name', 'separator': ', '}},
                #{'label': 'family', 'ref': 'taxon.ancestors', 'function': 'filter_select', 'params': {'filter': [{'ref': 'rank', 'value': 'family'}], 'select_ref': 'name', 'separator': ', '}},
                #{'label': 'genus', 'ref': 'taxon.ancestors', 'function': 'filter_select', 'params': {'filter': [{'ref': 'rank', 'value': 'genus'}], 'select_ref': 'name', 'separator': ', '}},
                #{'label': 'species', 'ref': 'taxon.ancestors', 'function': 'filter_select', 'params': {'filter': [{'ref': 'rank', 'value': 'species'}], 'select_ref': 'name', 'separator': ', '}},
                {'label': 'taxon_id', 'ref': 'taxon.id'},
                {'label': 'taxon_name', 'ref': 'taxon.name'},
                {'label': 'taxon_preferred_common_name', 'ref': 'taxon.preferred_common_name'},
                {'label': 'taxon_rank', 'ref': 'taxon.rank'},
                #{'label': 'taxon_rank_level', 'ref': 'taxon.rank_level'},
                #{'label': 'taxon_ancestry', 'ref': 'taxon.ancestry'},
                #{'ref': 'observed_on_string'},
                {'ref': 'time_observed_at'},
                {'ref': 'created_at'},
                #{'ref': 'updated_at'},
                {'ref': 'place_guess'},
                #{'ref': 'location'},
                {'label': 'latitude', 'ref': 'location', 'type': float, 'function': 'split', 'params': {'separator': ',', 'index': 0}},
                {'label': 'longitude', 'ref': 'location', 'type': float, 'function': 'split', 'params': {'separator': ',', 'index': 1}},
                {'ref': 'public_positional_accuracy'},
                #{'ref': 'private_place_guess'},
                #{'ref': 'private_location'},
                #{'label': 'private_latitude', 'ref': 'private_location', 'function': 'split', 'params': {'separator': ',', 'index': 0}},
                #{'label': 'private_longitiude', 'ref': 'private_location', 'function': 'split', 'params': {'separator': ',', 'index': 1}},
                #{'ref': 'positional_accuracy'},
                {'ref': 'taxon_geoprivacy'},
                {'ref': 'privacy'},
                {'ref': 'description'},
                {'label': 'photos_count', 'ref':'photos', 'function': 'count'},
                #{'label': 'photo_1_id', 'ref': 'photos[0].id'},
                {'label': 'photo_1_url', 'ref': 'photos[0].url', 'function': 'replace', 'params': {'old_text': 'square', 'new_text': 'medium'}}, # size options are thumb, square, small, medium, large, and original
                {'label': 'photo_1_license_code', 'ref': 'photos[0].license_code'},
                {'label': 'sounds_count', 'ref':'sounds', 'function': 'count'},
                {'ref': 'comments_count'},
                #{'label': 'others_current_identifications_count', 'ref': 'identifications_count'},
                {'label': 'current_identifications_count', 'ref': 'identifications', 'function': 'filter_count', 'params': {'filter': [{'ref': 'current', 'value': True}]}},
                #{'label': 'current_identifications_by_observer', 'ref': 'identifications', 'function': 'filter_count', 'params': {'filter': [{'ref': 'user.id', 'value_ref': 'user.id'}, {'ref': 'current', 'value': True}]}},
                #{'label': 'current_identification_by_observer', 'ref': 'identifications', 'function': 'filter_select', 'params': {'filter': [{'ref': 'user.id', 'value_ref': 'user.id'}, {'ref': 'current', 'value': True}], 'select_ref': 'taxon.name', 'separator': ', '}},
                #{'label': 'current_identification_category_by_observer', 'ref': 'identifications', 'function': 'filter_select', 'params': {'filter': [{'ref': 'user.id', 'value_ref': 'user.id'}, {'ref': 'current', 'value': True}], 'select_ref': 'category', 'separator': ', '}},
                #{'ref': 'owners_identification_from_vision'},
                {'label': 'prefers_community_taxon', 'ref': 'preferences.prefers_community_taxon', 'alt': 'user.preferences.prefers_community_taxa'},
                #{'label': 'identifier_ids', 'ref': 'identifications', 'function': 'filter_select', 'params': {'filter': [{'ref': 'current', 'value': True}], 'select_ref': 'user.id', 'separator': ', '}},
                #{'label': 'identifier_logins', 'ref': 'identifications', 'function': 'filter_select', 'params': {'filter': [{'ref': 'current', 'value': True}], 'select_ref': 'user.login', 'separator': ', '}},
                {'label': 'identifications', 'ref': 'identifications', 'function': 'filter_combine', 'params': {'filter': [{'ref': 'current', 'value': True}], 'combine_refs': ['user.login','taxon.name','taxon.id'], 'template': '{0}: {1} ({2})', 'separator': ', '}},
                #{'label': 'days_to_first_id', 'ref': None, 'custom_function': (lambda x: (datetime.fromisoformat(first_id_date) - datetime.fromisoformat(x['created_at'])).days if (first_id_date := get_ref_value(x,'identifications[0].created_at')) is not None else None)},
                {'label': 'days_to_first_id_by_observer', 'ref': None, 'custom_function': (lambda x: (datetime.fromisoformat(dates_of_ids_by_observer[0]) - datetime.fromisoformat(x['created_at'])).days if (dates_of_ids_by_observer := [xi['created_at'] for xi in x.get('identifications') if (xi['user']['id']==x['user']['id'])]) else None)},
                {'label': 'days_to_first_id_by_others', 'ref': None, 'custom_function': (lambda x: (datetime.fromisoformat(dates_of_ids_by_others[0]) - datetime.fromisoformat(x['created_at'])).days if (dates_of_ids_by_others := [xi['created_at'] for xi in x.get('identifications') if (xi['user']['id']!=x['user']['id'])]) else None)},
                #{'label': 'identification_date_first', 'ref': 'identifications[0].created_at'},
                #{'label': 'identification_date_last', 'ref': 'identifications[-1].created_at'},
                #{'label': 'identifications_vs_obs', 'ref': 'identifications', 'function': 'filter_select', 'params': {'filter': [{'ref': 'current', 'value': True}], 'select_ref': 'vs_obs', 'separator': ', '}},
                {'label': 'identifications_vs_obs_same', 'ref': 'identifications', 'function': 'filter_count', 'params': {'filter': [{'ref': 'vs_obs', 'value': 'same'}, {'ref': 'current', 'value': True}]}},
                #{'label': 'ident_taxa_vs_obs_same', 'ref': 'identifications', 'function': 'filter_count', 'params': {'filter': [{'ref': 'vs_obs', 'value': 'same'}, {'ref': 'current', 'value': True}], 'distinct_ref': 'taxon.id'}},
                #{'label': 'identifications_vs_obs_ancestor', 'ref': 'identifications', 'function': 'filter_count', 'params': {'filter': [{'ref': 'vs_obs', 'value': 'ancestor'}, {'ref': 'current', 'value': True}]}},
                {'label': 'ident_taxa_vs_obs_ancestor', 'ref': 'identifications', 'function': 'filter_count', 'params': {'filter': [{'ref': 'vs_obs', 'value': 'ancestor'}, {'ref': 'current', 'value': True}], 'distinct_ref': 'taxon.id'}},
                #{'label': 'identifications_vs_obs_descendant', 'ref': 'identifications', 'function': 'filter_count', 'params': {'filter': [{'ref': 'vs_obs', 'value': 'descendant'}, {'ref': 'current', 'value': True}]}},
                {'label': 'ident_taxa_vs_obs_descendant', 'ref': 'identifications', 'function': 'filter_count', 'params': {'filter': [{'ref': 'vs_obs', 'value': 'descendant'}, {'ref': 'current', 'value': True}], 'distinct_ref': 'taxon.id'}},
                #{'label': 'identifications_vs_obs_different', 'ref': 'identifications', 'function': 'filter_count', 'params': {'filter': [{'ref': 'vs_obs', 'value': 'different'}, {'ref': 'current', 'value': True}]}},
                {'label': 'ident_taxa_vs_obs_different', 'ref': 'identifications', 'function': 'filter_count', 'params': {'filter': [{'ref': 'vs_obs', 'value': 'different'}, {'ref': 'current', 'value': True}], 'distinct_ref': 'taxon.id'}},
                {'label': 'ident_disagreement_vs_obs', 'ref': 'identifications', 'function': 'filter_select', 'params': {'filter': [{'ref': 'disagreement', 'value': True}, {'ref': 'current', 'value': True}], 'select_ref': 'vs_obs', 'separator': ', '}},
                {'label': 'reviewed_by_count', 'ref': 'reviewed_by', 'function': 'count'},
                #{'ref': 'reviewed_by', 'function': 'join', 'params': {'separator':', '}},
                #{'ref': 'captive'},
                {'label': 'annotations_count','ref':'annotations', 'function': 'count'},
                #{'label': 'annotations_ids', 'ref': 'annotations', 'function': 'filter_combine', 'params': {'combine_refs': ['controlled_attribute_id','controlled_value_id'], 'template': '{0}:{1}', 'separator': ', '}},
                {'label': 'annotations', 'ref': 'annotations', 'function': 'filter_combine', 'params': {'combine_refs': ['controlled_attribute','controlled_value'], 'template': '{0}: {1}', 'separator': ', '}}, # note: this relies on some pre-procesing to create the controlled_attribute and controlled_value fields
                #{'label': 'annotations_sex', 'ref': 'annotations', 'function': 'filter_count', 'params': {'filter': [{'ref': 'controlled_attribute_id', 'value': 9}]}},
                #{'label': 'annot_score_sex_female', 'ref': 'annotations', 'function': 'filter_select', 'params': {'filter': [{'ref': 'controlled_attribute_id', 'value': 9}, {'ref': 'controlled_value_id', 'value': 10}], 'select_ref': 'total_score', 'separator': ', '}},
                #{'label': 'annot_score_sex_male', 'ref': 'annotations', 'function': 'filter_select', 'params': {'filter': [{'ref': 'controlled_attribute_id', 'value': 9}, {'ref': 'controlled_value_id', 'value': 11}], 'select_ref': 'total_score', 'separator': ', '}},
                #{'label': 'annot_score_sex_cannot_det', 'ref': 'annotations', 'function': 'filter_select', 'params': {'filter': [{'ref': 'controlled_attribute_id', 'value': 9}, {'ref': 'controlled_value_id', 'value': 20}], 'select_ref': 'total_score', 'separator': ', '}},
                #{'label': 'annotations_phenology', 'ref': 'annotations', 'function': 'filter_count', 'params': {'filter': [{'ref': 'controlled_attribute_id', 'value': 12}]}},
                #{'label': 'annot_score_phen_flowering', 'ref': 'annotations', 'function': 'filter_select', 'params': {'filter': [{'ref': 'controlled_attribute_id', 'value': 12}, {'ref': 'controlled_value_id', 'value': 13}], 'select_ref': 'total_score', 'separator': ', '}},
                #{'label': 'annot_score_phen_fruiting', 'ref': 'annotations', 'function': 'filter_select', 'params': {'filter': [{'ref': 'controlled_attribute_id', 'value': 12}, {'ref': 'controlled_value_id', 'value': 14}], 'select_ref': 'total_score', 'separator': ', '}},
                #{'label': 'annot_score_phen_flower_bud', 'ref': 'annotations', 'function': 'filter_select', 'params': {'filter': [{'ref': 'controlled_attribute_id', 'value': 12}, {'ref': 'controlled_value_id', 'value': 15}], 'select_ref': 'total_score', 'separator': ', '}},
                #{'label': 'annot_score_phen_no_evid_of_flower', 'ref': 'annotations', 'function': 'filter_select', 'params': {'filter': [{'ref': 'controlled_attribute_id', 'value': 12}, {'ref': 'controlled_value_id', 'value': 21}], 'select_ref': 'total_score', 'separator': ', '}},
                #{'label': 'annotations_life_stage', 'ref': 'annotations', 'function': 'filter_count', 'params': {'filter': [{'ref': 'controlled_attribute_id', 'value': 1}]}},
                #{'label': 'annot_score_stage_adult', 'ref': 'annotations', 'function': 'filter_select', 'params': {'filter': [{'ref': 'controlled_attribute_id', 'value': 1}, {'ref': 'controlled_value_id', 'value': 2}], 'select_ref': 'total_score', 'separator': ', '}},
                #{'label': 'annot_score_stage_teneral', 'ref': 'annotations', 'function': 'filter_select', 'params': {'filter': [{'ref': 'controlled_attribute_id', 'value': 1}, {'ref': 'controlled_value_id', 'value': 3}], 'select_ref': 'total_score', 'separator': ', '}},
                #{'label': 'annot_score_stage_pupa', 'ref': 'annotations', 'function': 'filter_select', 'params': {'filter': [{'ref': 'controlled_attribute_id', 'value': 1}, {'ref': 'controlled_value_id', 'value': 4}], 'select_ref': 'total_score', 'separator': ', '}},
                #{'label': 'annot_score_stage_nymph', 'ref': 'annotations', 'function': 'filter_select', 'params': {'filter': [{'ref': 'controlled_attribute_id', 'value': 1}, {'ref': 'controlled_value_id', 'value': 5}], 'select_ref': 'total_score', 'separator': ', '}},
                #{'label': 'annot_score_stage_larva', 'ref': 'annotations', 'function': 'filter_select', 'params': {'filter': [{'ref': 'controlled_attribute_id', 'value': 1}, {'ref': 'controlled_value_id', 'value': 6}], 'select_ref': 'total_score', 'separator': ', '}},
                #{'label': 'annot_score_stage_egg', 'ref': 'annotations', 'function': 'filter_select', 'params': {'filter': [{'ref': 'controlled_attribute_id', 'value': 1}, {'ref': 'controlled_value_id', 'value': 7}], 'select_ref': 'total_score', 'separator': ', '}},
                #{'label': 'annot_score_stage_juvenile', 'ref': 'annotations', 'function': 'filter_select', 'params': {'filter': [{'ref': 'controlled_attribute_id', 'value': 1}, {'ref': 'controlled_value_id', 'value': 8}], 'select_ref': 'total_score', 'separator': ', '}},
                #{'label': 'annot_score_stage_subimago', 'ref': 'annotations', 'function': 'filter_select', 'params': {'filter': [{'ref': 'controlled_attribute_id', 'value': 1}, {'ref': 'controlled_value_id', 'value': 16}], 'select_ref': 'total_score', 'separator': ', '}},
                {'label': 'observation_fields_count', 'ref':'ofvs', 'function': 'count'},
                #{'label': 'observation_fields', 'ref': 'ofvs', 'function': 'filter_combine', 'params': {'combine_refs': ['name','field_id','value'], 'template': '{0} ({1}): {2}', 'separator': '; '}},
                {'label': 'observation_fields', 'ref': 'ofvs', 'function': 'filter_combine', 'params': {'combine_refs': ['name','field_id','taxon_or_value'], 'template': '{0} ({1}): {2}', 'separator': '; '}}, # note: this relies on pre-procesing to create a field that contains either taxon or value
                {'label': 'tags_count', 'ref':'tags', 'function': 'count'},
                {'ref': 'tags', 'function': 'join', 'params': {'separator': ', '}},
                #{'ref': 'oauth_application_id'},
                #{'ref': 'site_id'},
                {'label': 'gbif_occurence_url', 'ref': 'outlinks', 'function': 'filter_select', 'params': {'filter': [{'ref': 'source', 'value': 'GBIF'}], 'select_ref': 'url', 'separator': ', '}},
                #{'ref': 'place_ids'}, # if this is retrieved, this will be replaced with standard place information (later via a post-get function)
            ]
            parse_field_refs = refs_in_parse_fields(parse_fields)
            # use pre-parse functions to handle more complicated stiuations which can be handled per record / page
            pre_parse_function_associations = [
                {'function': add_annot_descr_and_total_score, 'new_refs': ['annotations.controlled_attribute', 'annotations.controlled_value','annotations.total_score'], 'prereq_function': get_annotations},
                {'function': add_obs_taxon_ancestors, 'new_refs': ['taxon.ancestors']},
                {'function': add_obs_field_taxon_or_value, 'new_refs': ['ofvs.taxon_or_value']},
                {'function': add_ident_vs_obs_comparison, 'new_refs': ['identifications.vs_obs']},
            ]
            for ppfa in pre_parse_function_associations: # run certain pre-parse_functions in cases where parse_fields defintiions reference fields created by these functions
                if len([nr for nr in ppfa['new_refs'] if len([ppr for ppr in parse_field_refs if ppr.startswith(nr)]) > 0]) > 0:
                    if (prereq_function := ppfa.get('prereq_function')):
                        await prereq_function()
                    pre_parse_functions.append(ppfa['function'])
            # use post-get functions to handle more complicated situations which are best handled once the entire set of observations is available
            if 'place_ids' in parse_field_refs:
                post_get_functions.append(add_std_places)
            parse_function = partial(parse_results, parse_fields=parse_fields, pre_parse_functions=pre_parse_functions) # pre-load parse_fields with these parse_fields and pre_parse_functions
    print('Getting observations...')
    results = await get_results(endpoint_get_obs, params, get_all_pages, use_authorization, parse_function, pre_parse_filter_function, post_parse_filter_function)
    for pgf in post_get_functions:
        try:
            await pgf(results) # assume async function by default
        except:
            pgf(results) # fall back to regular execution for non-async functions
    return results

async def add_std_places(obs, remove_place_ids=True):
    """Add human-friendly standard place names to (parsed) observations.

    Intended to be run as a post-get function (after getting all pages of observations, to minimize the number of requests to get place data).
    For each admin level listed in admin_level_xref below, a std_place_<level> key is added to every observation,
    holding the name of the matching place (or None when the observation has no place at that level).

    obs: list of observation dicts, each of which may carry a place_ids list. The dicts are modified in place.
    remove_place_ids: if True, the place_ids key is dropped from every observation once the place names have been added.

    Returns the same obs object that was passed in (an empty list or None is returned untouched).
    """
    if not obs:
        return obs # nothing to annotate (this also keeps a None result set from raising below)
    # the "standard" places are continents, plus country-, state-, county-, and town-equivalent places
    admin_level_xref = { # define these in the order these should be displayed in the results
        #'30': 'town', # these are available only for certain states in the USA
        '20': 'county',
        '10': 'state',
        '0': 'country',
        '-10': 'continent',
    }
    # get a unique set of place_ids associated with the observations
    place_ids = {pid for o in obs for pid in (o.get('place_ids') or [])}
    std_places = []
    if place_ids: # only make API requests when there is something to look up
        print('Adding standard place info to observations...')
        # request info from the API for only the standard places
        place_req_params = {'id': list(map(str, place_ids)), 'admin_level': list(admin_level_xref)}
        std_places = await get_results(endpoint_get_places, place_req_params, get_all_pages=True, parse_function=partial(parse_simple, field_list=['id','admin_level','name','slug']))
        if std_places is None:
            std_places = []
    # add std place info to observations
    # (this runs even when no places were found, so that every observation ends up with the same set of keys)
    for o in obs:
        obs_place_ids = set(o.get('place_ids') or []) # set membership avoids rescanning the id list for every standard place
        osp = [sp for sp in std_places if sp['id'] in obs_place_ids] # standard places associated with this obs
        for alc, ald in admin_level_xref.items():
            admin_level = int(alc)
            # use the first place found at this admin level, if any
            o[f'std_place_{ald}'] = next((sp['name'] for sp in osp if sp['admin_level'] == admin_level), None)
        if remove_place_ids:
            o.pop('place_ids', None) # remove this column from the parsed results, since it's no longer needed
    return obs

async def get_count_series(endpoint, series, series_params, base_params=None, count_label='rec_count', use_authorization=False, add_count_to_series=False):
    """Get one record count per item in a series, requesting all of the counts concurrently.

    endpoint: the endpoint definition passed through to get_total_results.
    series: a list of dicts. Each dict supplies the parameter values for one item in the series.
        Items may carry extra attributes that are not parameters, and do not need to include every key in series_params.
    series_params: a list of the (variable) parameter keys to copy from each series item into the request parameters.
    base_params: the (fixed) parameters applied to every request. A series value replaces a base value with the same key.
    count_label: the key under which each count is stored in the returned items.
    add_count_to_series: if True, the counts are written into the original series items and the original list is returned;
        otherwise, a new list holding copies of the items (with counts added) is returned, and series is left unchanged.

    Returns the list of items with counts, or None (after printing a message) when series or series_params is empty.
    """
    if not (series and series_params):
        print('The series parameter must be a list of dicts with keys that include the values in the list passed in for series_params.')
        return None
    fixed_params = {} if base_params is None else base_params
    counted = series if add_count_to_series else series.copy()
    pending = []
    async with asyncio.TaskGroup() as group: # available in Python 3.11+
        for position, item in enumerate(counted):
            request_params = fixed_params.copy()
            for key in series_params:
                value = item.get(key)
                if value is None:
                    continue # this item does not define this parameter
                request_params.pop(key, None)
                request_params[key] = [str(value)]
            # the item's position in the series is passed along as the delay for its request
            pending.append(group.create_task(get_total_results(endpoint, request_params, use_authorization=use_authorization, delay=position)))
    # every task has finished by the time the task group exits
    for position, task in enumerate(pending):
        if not add_count_to_series:
            counted[position] = counted[position].copy() # copy the item so that the caller's dict is not modified
        counted[position][count_label] = task.result()
    return counted

def items_to_batches(items, max_batch_size=500, separator=',', prefix=''):
    """Split a list of items into batches, and render each batch as a single string.

    Each batch string is the prefix followed by up to max_batch_size items (converted to strings) joined by the separator.
    Each batch is also printed, along with its (1-based) batch number.
    (The original intended use case is to create URLs linking to the iNaturalist Explore or Identification page, filtered for batches of specific observations.)

    Returns the list of batch strings, in order.
    """
    batch_starts = range(0, len(items), max_batch_size)
    batches = []
    for batch_number, start in enumerate(batch_starts, start=1):
        chunk = items[start:start + max_batch_size]
        batch_text = prefix + separator.join(str(item) for item in chunk)
        print(f'Batch {batch_number}: {batch_text}')
        batches.append(batch_text)
    return batches

In [None]:
# define the parameters needed for your request
# (write these the same way they would appear in the query string of a URL: key=value pairs separated by &)
req_params_string = 'verifiable=true&spam=false'
# convert the query string into a dict of parameters
# NOTE(review): other cells compare parameter values against lists (ex. ['id']), so values are presumably lists of strings -- confirm against params_to_dict
req_params = params_to_dict(req_params_string)
# headers to be sent with requests
req_headers_base = {'Content-Type': 'application/json', 'Accept': 'application/json'}

# to make authorized calls, set jwt to the "api_token" value from https://www.inaturalist.org/users/api_token.
# the JWT is valid for 24 hours. it can be used to do / access anything your iNat account can access. so keep it safe, and don't share it.
# you will also have to set use_authorization=True when making your API request below.
jwt = None

# define endpoints
# each endpoint is a dict that describes how to make requests to one API endpoint
endpoint_get_obs = {
    'method': 'GET',
    'url': 'https://api.inaturalist.org/v1/observations',
    # the API returns at most 10000 records for any one set of parameters
    'max_records': 10000,
    # presumably the largest per_page value to request from this endpoint -- confirm against get_results
    'max_per_page': 200,
}
endpoint_get_controlled_terms = {
    'method': 'GET' ,
    'url': 'https://api.inaturalist.org/v1/controlled_terms',
}
endpoint_get_places = {
    'method': 'GET',
    # {id} is a placeholder in the URL path
    'url': 'https://api.inaturalist.org/v1/places/{id}',
    'max_per_page': 500,
    # NOTE(review): presumably tells the paging logic to split requests by batches of id values rather than by page number -- confirm against get_results
    'page_key': 'id'
}

In [None]:
# main execution section

# get observations
# note: await is used at the top level here, which works in a notebook cell but not in a plain Python script
# (in a script, this call would need to be made from inside an async function)
# set get_all_pages=True to fetch multiple pages of results instead of a single page
obs = await get_obs(req_params, get_all_pages=False, use_authorization=False)
# uncomment the line below to display the observations in the notebook
#obs

# when possible, it's always best to filter on the server side by using filter parameters when making API requests.
# but when a particular filter is not available in the API, it may still be possible to filter on the client side (as opposed to server side)
# use pre_parse_filter_function when you can filter based on the results directly from the API response.
# use post_parse_filter_function when you must rely on the values in a parsed field to do the filtering.
# (you can always filter separately *after* getting observations, of course, but filtering *while* getting obs saves on system memory when getting multiple pages of results from the API.)
# here's an example of how to do client-side filtering for observations which have >1 (current) identification using post_parse_filter_function
#obs = await get_obs(req_params, get_all_pages=False, use_authorization=False, post_parse_filter_function=(lambda x: x['current_identifications_count'] > 1))

# get observation ids from obs
#obs_ids = [o.get('id') for o in obs]
#obs_id_sets = items_to_batches(obs_ids, prefix='https://www.inaturalist.org/observations/identify?id=')

# get just total results (count)
#obs_count = await get_total_results(endpoint_get_obs, req_params, use_authorization=False)
#obs_count

# get a series of counts
#obs_count_series = [
#    {'label': 'Texas 2020', 'year': 2020, 'place_id': 18},
#    {'label': 'not Texas 2020', 'year': 2020, 'not_in_place': 18},
#    {'label': 'Texas 2021', 'year': 2021, 'place_id': 18},
#    {'label': 'not Texas 2021', 'year': 2021, 'not_in_place': 18},
#]
#await get_count_series(endpoint_get_obs, obs_count_series, ['year','place_id','not_in_place'], base_params=req_params, count_label='obs_count', use_authorization=False, add_count_to_series=True)
#obs_count_series

In [None]:
# if you order by id when you get observations (this is the default behavior if you don't specify an order_by parameter), 
# then it should be possible to work around the max 10000 record limit of the API by using the id_above or id_below parameters.
# i purposely am not automating this process completely (because I don't want to make it too easy to accidentally get a ton of data),
# but i'm including this bit of code here to provide an idea of how to do it.
# to use the code below, set get_more_obs = True before running.
get_more_obs = False
#if get_more_obs and obs and len(obs) >= endpoint_get_obs['max_records'] and len(obs) % endpoint_get_obs['max_records'] == 0:
if get_more_obs and obs:
    rp = req_params.copy() # make a copy
    if rp.get('order_by',['id']) == ['id']: # this only works if the records were sorted by id
        if rp.get('order',['desc']) == ['asc']:
            max_id = max([o.get('id') for o in obs])
            print(f'Getting additional observations for id_above={max_id}')
            rp.pop('id_above', None) # remove per_page parameter, if it exists
            rp['id_above'] = [str(max_id)] # set this to the max_id so that the records we get will have ids above those of the obs we already have
        else:
            min_id = min([o.get('id') for o in obs])
            print(f'Getting additional observations for id_below={min_id}')
            rp.pop('id_below', None) # remove per_page parameter, if it exists
            rp['id_below'] = [str(min_id)] # set this to the min_id so that the records we get will have ids below those of the obs we already have
        obs += await get_obs(rp, get_all_pages=True, use_authorization=False)
        print(f'Observations accumulated: {len(obs)}')

## Write Data to CSV

This takes the results retrieved above and writes them to a CSV file. The file will appear in the main folder of the file tree (the topmost tab in the left pane of the JupyterLab interface). Files generated in JupyterLite are saved to the browser's storage, so they will need to be downloaded to a more permanent location if they need to be archived.

In [None]:
# load required modules
import csv # used to output CSV files

In [None]:
def data_to_csv(data, csv_filename='export.csv'):
    """Write data (a list of dict records) to a CSV file.

    The columns are the keys found across all of the records, in the order in which they are first encountered.
    A record that is missing one of the columns gets an empty value in that column.
    The file is written as UTF-8, so that non-ASCII text is handled the same way regardless of platform.
    If there are no records (an empty list or None), a message is printed, and no file is created.
    """
    if not data:
        print(f'No records to write; CSV file {csv_filename} was not created.')
        return
    # collect fields from every record (not just the first one), so that a record with an extra key does not cause an error when writing.
    # dict.fromkeys removes duplicates while keeping first-seen order.
    csv_fields = list(dict.fromkeys(field for record in data for field in record))
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csv_file: # newline='' lets the csv module control line endings
        csv_writer = csv.DictWriter(csv_file, fieldnames=csv_fields)
        csv_writer.writeheader()
        csv_writer.writerows(data)
    print(f'Created CSV file {csv_filename} with {len(data)} records.')

In [None]:
# export to CSV
# (this writes the observations retrieved above to a file named observations.csv)
data_to_csv(obs,'observations.csv')

## Work with Data in a DataFrame

Since many Python analysis / visualization modules and workflows rely on getting data into a `pandas` dataframe, this provides a very barebones example of getting the data into a dataframe. The dataframe should generally handle most of the data type conversions, but there's a little bit more effort to get dates into a datetime typed column in the dataframe.

In [None]:
# load required modules
import pandas as pd

In [None]:
# load data into a DataFrame (df)
# (each observation dict becomes a row, and the keys of the dicts become the columns)
df = pd.DataFrame(obs)

# pandas can export to CSV, too
# (index=False leaves the DataFrame's row index out of the file)
#df.to_csv('observations_from_df.csv', index=False)

In [None]:
# Preview the contents of the df
# (in a notebook, the value of the last expression in a cell is displayed as the cell's output)
df

# miscellaneous other examples:
# (uncomment one of these and make it the last expression in the cell to see its output)

# first 10 records (the end of a slice is exclusive, so 0:10 selects rows 0 through 9)
#df[0:10]

# records where observation_fields are not null
#df.loc[df.observation_fields.notnull()]

# count (of id) of records where public_positional_accuracy > 100
#df.loc[df.public_positional_accuracy > 100].id.count()

# id, lat, and long, sorted by latitude
#df[['id','latitude','longitude']].sort_values('latitude', ascending=True)

# count by taxon rank, and sort by count descending
#df.groupby('taxon_rank').id.count().sort_values(ascending=False)

In [None]:
# Get basic summary statistics for df
# (for a df with mixed column types, describe() summarizes only the numeric columns by default)
df.describe()

In [None]:
# convert datetime columns to datetimes, localized to UTC
# (columns that are not present in the df are skipped)
for k in ['time_observed_at','created_at','updated_at']:
    if k in df.columns:
        try:
            # errors='coerce' turns values that cannot be parsed into NaT instead of raising an error
            df[k] = pd.to_datetime(df[k], utc=True, errors='coerce')
        except Exception as err: # keep this best-effort, but report the reason, and don't intercept KeyboardInterrupt / SystemExit
            print(f'Could not convert column {k} to datetime: {err}')

# get count (of id) by observed year
# (this relies on time_observed_at having been converted to a datetime column above)
df.groupby(df.time_observed_at.dt.year).id.count()

# get count (of id) by created year
#df.groupby(df.created_at.dt.year).id.count()