# Test-EBird-Dec2020

# Description
Examine bulk data from eBird for December 2020

https://ebird.org/science/use-ebird-data/download-ebird-data-products  
https://ebird.org/data/download  


# Environment

## Library Imports

In [None]:
import warnings
# warnings.simplefilter('always')  # or 'error' to raise on the first warning

warnings.filterwarnings("ignore", category=RuntimeWarning, module='geopandas')

In [None]:
import sys
sys.path.append('common')
sys.path.append('textextractor')
sys.path.append('taxonomy')

import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import geopandas as gpd

from shapely import geometry
from shapely.geometry import Point

## Local Imports

In [None]:
import common_jupyter

# https://medium.com/@rrfd/cookiecutter-data-science-organize-your-projects-atom-and-jupyter-2be7862f487e
from common_paths import *

from local_translation_context import LocalTranslationContext
from taxonomy import Taxonomy
from ebird_extras import EBirdExtra
from parameters import Parameters

from count_day_tasks import summarize_checklists, create_full_circle_summary, get_participants, \
    subids_for_pete_dunten, add_bob_hirt, get_personal_checklist_details

from datetime_manipulation import create_count_week
# from checklist_manipulation import create_checklist_meta

# from write_final_checklist import write_final_checklist_spreadsheet, excel_columns, \
#     sheet_info_for_party_efforts, sheet_info_for_party_details, sheet_info_for_rarities, sheet_info_for_filers
# from autoparty import sheet_info_for_autoparty, generate_autoparty
from locations_map import create_coverage_map, create_potential_duplicates_map
from utilities_kml import build_geodata, build_location_data, update_geo_data_with_clustering, build_location_meta
from ebird_visits import transform_visits, visits_in_circle
# from utilities_clustering import generate_cluster_table, plot_elbow_curve
# from filers_matrix import create_filers_matrix
from checklist_manipulation import create_checklist_meta, write_checklist_meta, find_location_near_duplicates
# from checklist_manipulation import construct_team_details, construct_team_efforts

# Code

In [None]:
def visits_in_circle(ebirders, geo_data, circle_code, visits):
    """Return the visits that fall inside a count circle, optionally limited to given observers.

    NOTE: this notebook-local definition shadows the ``visits_in_circle``
    imported from ``ebird_visits`` earlier in the file.

    Args:
        ebirders: List-like of observer names to keep (matched against
            ``visits.Name``), or ``None`` to keep every observer.
        geo_data: Frame with ``CircleCode``, ``type`` and ``geometry`` columns.
        circle_code: ``CircleCode`` value selecting the ``type == 'circle'`` row.
        visits: Frame with ``geometry``, ``Name`` and ``locId`` columns.

    Returns:
        The matching rows of ``visits``, sorted by ``locId``.

    Raises:
        IndexError: If ``geo_data`` has no circle row for ``circle_code``.
    """
    is_circle_row = (geo_data.CircleCode == circle_code) & (geo_data.type == 'circle')
    circle_geometry = geo_data[is_circle_row].geometry.values[0]

    # Note that by construction, visits only contains data for dates we care about
    # so we don't need to filter for that.
    #
    # Use a real boolean array rather than a list: `list & Series` is deprecated in
    # pandas, and an *empty* list used as an indexer selects zero columns instead of
    # zero rows (which then breaks the sort on 'locId').
    inside = np.array([pt.within(circle_geometry) for pt in visits.geometry.values], dtype=bool)
    if ebirders is not None:
        inside &= visits.Name.isin(ebirders).to_numpy()

    return visits[inside].sort_values(by=['locId'])


In [None]:
import dateutil.parser as parser

def normalize_time_for_visits(time_str: str) -> str:
    """Convert a time string to the ``HH:MM`` form used by ``visits.obsTime``.

    Args:
        time_str: A time such as ``'17:23:00'``; anything ``dateutil`` can parse.

    Returns:
        The time formatted as ``'%H:%M'`` (no seconds), or ``''`` when
        ``time_str`` is empty or only whitespace.
    """
    # Bulk data is loaded with fillna(''), so a checklist without a start time
    # arrives here as an empty string, which dateutil refuses to parse.
    if not time_str or not time_str.strip():
        return ''

    # visits has e.g. obsTime 17:23, with no seconds
    return parser.parse(time_str).strftime('%H:%M')

In [None]:
def load_bulk_data(bulk_data_dir: 'Optional[Path]' = None) -> 'Optional[pd.DataFrame]':
    """Load an eBird bulk download, plus its ``_provisional`` companion file if present.

    Args:
        bulk_data_dir: Folder holding the unpacked download. The files are expected
            to be named ``<folder name>.txt`` and ``<folder name>_provisional.txt``.
            Defaults to the hardwired December 2020 US-CA folder under ``raw_data_path``.

    Returns:
        A frame with every column read as ``str`` and missing values replaced
        by ``''``; provisional rows are appended after the main rows.
        ``None`` when the main file does not exist.
    """
    if bulk_data_dir is None:
        # This is really specific, so hardwire paths for now
        bulk_data_dir = raw_data_path / 'ebd_US-CA_202012_202101_prv_relDec-2020'
    bulk_data_dir = Path(bulk_data_dir)

    release_name = bulk_data_dir.name
    bulk_data_path = bulk_data_dir / f'{release_name}.txt'
    if not bulk_data_path.exists():
        return None

    def _read_tsv(path):
        # Tab-separated with a header row; keep everything as text
        return pd.read_csv(path, dtype=str, header=0, sep='\t', low_memory=False).fillna('')

    bulk_data = _read_tsv(bulk_data_path)
    provisional_data_path = bulk_data_dir / f'{release_name}_provisional.txt'
    if provisional_data_path.exists():
        bulk_data = pd.concat([bulk_data, _read_tsv(provisional_data_path)], axis=0, ignore_index=True)

    return bulk_data

In [None]:
from typing import List, Optional, Union
def find_missing_subids(visits: pd.DataFrame, bulk_data: Optional[pd.DataFrame],
                        xdates: Union[str, List[str]],
                        region_codes: Union[str, List[str]]) -> List[str]:
    """List checklist ids that are in the bulk data but absent from ``visits``.

    Args:
        visits: Frame with a ``subId`` column.
        bulk_data: Frame with ``OBSERVATION DATE``, ``COUNTY CODE`` and
            ``SAMPLING EVENT IDENTIFIER`` columns, or ``None``.
        xdates: Observation date(s) to consider; a single string is accepted.
        region_codes: County code(s) to consider; a single string is accepted.

    Returns:
        Sorted ids found in ``bulk_data`` for those dates and regions that do not
        occur in ``visits.subId``; an empty list when ``bulk_data`` is ``None``.
    """
    if bulk_data is None:
        return []

    # Series.isin() rejects a bare string, so promote a single value to a list
    if isinstance(xdates, str):
        xdates = [xdates]
    if isinstance(region_codes, str):
        region_codes = [region_codes]

    mask = (bulk_data['OBSERVATION DATE'].isin(xdates)) & (bulk_data['COUNTY CODE'].isin(region_codes))
    bulk_subids = set(bulk_data.loc[mask, 'SAMPLING EVENT IDENTIFIER'])
    base_subids = set(visits.subId)

    return sorted(bulk_subids - base_subids)

In [None]:
def use_basic_dataset(visits: pd.DataFrame, xdates: List[str], region_codes: List[str]) -> pd.DataFrame:
    """Append checklists that appear only in the eBird Basic Dataset (EBD) bulk data to ``visits``.

    Args:
        visits: Checklist-level frame; appended rows use the columns ``locId``,
            ``subId``, ``Name``, ``numSpecies``, ``obsDt``, ``obsTime``,
            ``loc_name``, ``latitude``, ``longitude`` and ``geometry``.
        xdates: Observation dates to look up in the bulk data.
        region_codes: County codes to look up in the bulk data.

    Returns:
        ``visits`` itself when the bulk file is unavailable or contributes no
        new checklist; otherwise a new frame with the de-duplicated bulk rows
        appended (index reset).
    """
    # Takes about 13s to load BDS for Dec 2020
    bulk_data = load_bulk_data()
    if bulk_data is None:
        return visits

    missing_subids = find_missing_subids(visits, bulk_data, xdates, region_codes)
    is_missing = bulk_data['SAMPLING EVENT IDENTIFIER'].isin(missing_subids)
    bds = bulk_data[is_missing].copy().reset_index(drop=True)
    if bds.empty:
        return visits

    # Names match those in visits
    new_col_names = {
        'LOCALITY ID': 'locId', 'SAMPLING EVENT IDENTIFIER': 'subId', 'OBSERVER ID': 'Name',
        'OBSERVATION DATE': 'obsDt', 'TIME OBSERVATIONS STARTED': 'obsTime',
        'LOCALITY': 'loc_name', 'LATITUDE': 'latitude', 'LONGITUDE': 'longitude',
    }
    bds.rename(columns=new_col_names, inplace=True)

    # numSpecies is the number of bulk rows per checklist. Attach it to every row
    # *before* de-duplicating so each count stays with its own subId; assigning a
    # separately sorted table by position would mis-pair counts whenever the bulk
    # rows are not already ordered by subId.
    bds['numSpecies'] = bds.groupby('subId')['subId'].transform('size')

    bds = bds.drop_duplicates(['subId', 'obsDt', 'obsTime', 'latitude', 'longitude']).reset_index(drop=True)
    bds.obsTime = bds.obsTime.apply(normalize_time_for_visits)

    new_col_order = ['locId', 'subId', 'Name', 'numSpecies', 'obsDt', 'obsTime', 'loc_name', 'latitude', 'longitude']
    bds = bds[new_col_order].sort_values(by=['subId']).reset_index(drop=True)

    # Coordinates were read as text; blanks become 0
    for col in ['latitude', 'longitude']:
        bds[col] = pd.to_numeric(bds[col]).fillna(0).astype(float)

    bds['geometry'] = [Point(lon, lat) for lon, lat in zip(bds.longitude, bds.latitude)]  # Longitude first

    # We could fix 'Name' with 'userDisplayName' field from get_details, but not important here

    return pd.concat([visits, bds], axis=0, ignore_index=True)

# Initializations

In [None]:
# Initializations: project paths, credentials, parameters and the shared singletons
print(f'Start : {datetime.now():%Y-%m-%d %H:%M:%S}')
print('Initializing...')

create_project_paths()

# Overrides - Credentials
# See Samples/eBirdCredentials.yml for an example
my_credentials_storage_path = Path('/Volumes/TSecure3/other/')
eBirdCredential_path = my_credentials_storage_path / 'eBirdCredentials.yml'

# -----------------------------------------------------------------------------------------
# Override - This will find the correct parameter file out of many in Local folder
# -----------------------------------------------------------------------------------------
circle_prefix = 'CACR-2020-'

# Parameters
parameters = Parameters(local_parameters_path, system_parameters_path, circle_prefix, False)

local_translation_context = LocalTranslationContext(local_parameters_path, system_parameters_path)
# DEBUG; allows test/edit cycle without restarting kernel (singleton)
local_translation_context.reload()

# Singletons
country = parameters.parameters.get('NationalCode', 'US')
ebird_extra = EBirdExtra(eBirdCredential_path, cache_path, country)
taxonomy = Taxonomy(cache_path, ebird_extra)

# Convenient Parameters (the count week collapses to the count day when not configured)
circle_code = parameters.parameters.get('CircleAbbrev', 'XXXX')
date_of_count = parameters.parameters['CountDate']
count_week_start = parameters.parameters.get('CountWeekStart', date_of_count)
count_week_end = parameters.parameters.get('CountWeekEnd', date_of_count)
region_codes = [
    region.strip()
    for region in parameters.parameters['eBirdRegion'].split(',')
]

# Will drop any dates in the future
count_week = create_count_week(count_week_start, count_week_end)

print('Initialization complete')

# Main

In [None]:
if __name__ == '__main__':

    geo_data = build_geodata(parameters)

    # May need bootstrapping
    participants = get_participants(circle_prefix)

    xdates = [date_of_count]  # if count_day_only else count_week
    visits = ebird_extra.get_visits_for_dates(region_codes, xdates)
    # This count is for the whole queried region(s); the circle filter is only applied below
    print(f'Checklists filed in region(s): {visits.shape[0]}')
    visits = transform_visits(visits)

    # Add bulk data extras here
    visits = use_basic_dataset(visits, xdates, region_codes)
    print(f'Checklists after basic dataset: {visits.shape[0]}')

    visits_of_interest = visits_in_circle(participants, geo_data, circle_code, visits)
    # A bare expression inside an `if` block is never displayed, so print the result instead
    print(f'Checklists in count circle: {visits_of_interest.shape[0]}')

In [None]:
# NOTE(review): presumably a deliberate ZeroDivisionError so that "Run All" stops
# here, before the Experiments cells below — confirm
1/0

# Experiments

In [None]:
%%time
bulk_data = load_bulk_data()

In [None]:
# xdates must be a list of dates; a bare string is rejected by Series.isin()
find_missing_subids(visits, bulk_data, [date_of_count], region_codes)

In [None]:
!ls {raw_data_path / 'ebd_US-CA_202012_202101_prv_relDec-2020'}

In [None]:
ebird_dec2020_path = raw_data_path / 'ebd_US-CA-085_202012_202101_prv_relDec-2020'
!ls {ebird_dec2020_path}

In [None]:
# Load the county-level (US-CA-085) dump as text, with blanks instead of NaN
raw_dec2020_path = ebird_dec2020_path / 'ebd_US-CA-085_202012_202101_prv_relDec-2020.txt'
raw_dec2020 = pd.read_csv(
    raw_dec2020_path,
    sep='\t',
    header=0,
    dtype=str,
    low_memory=False,
).fillna('')
raw_dec2020.shape

In [None]:
raw_dec2020.columns

In [None]:
raw_dec2020.head(3)

In [None]:
cacr_subset = raw_dec2020[raw_dec2020['OBSERVATION DATE']==date_of_count].copy()

In [None]:
# ', '.join(sorted(list(set(casj_subset['SAMPLING EVENT IDENTIFIER'].values) - set(visits.subId.values))))

In [None]:
len(set(cacr_subset['SAMPLING EVENT IDENTIFIER'].values))

In [None]:
# Checklist ids present in the county dump for the count date but not in visits
', '.join(sorted(set(cacr_subset['SAMPLING EVENT IDENTIFIER']) - set(visits.subId)))

In [None]:
# Build one row per checklist (subId) for the count date, using the column names of `visits`
raw_geo_columns = ['LOCALITY', 'LOCALITY ID',
                   'LATITUDE', 'LONGITUDE', 'OBSERVATION DATE',
                   'TIME OBSERVATIONS STARTED', 'SAMPLING EVENT IDENTIFIER',
                   'OBSERVER ID', 'OBSERVATION COUNT']
count_subset = raw_dec2020[raw_dec2020['OBSERVATION DATE'] == date_of_count].copy()

# numSpecies is the number of bulk rows per checklist (as in use_basic_dataset).
# OBSERVATION COUNT belongs to a single row, so it must not be relabelled as numSpecies.
rows_per_checklist = count_subset.groupby('SAMPLING EVENT IDENTIFIER').size()

count_geo = count_subset[raw_geo_columns].drop_duplicates(['SAMPLING EVENT IDENTIFIER']).reset_index(drop=True)
count_geo = count_geo.drop(columns=['OBSERVATION COUNT'])

count_geo.rename(columns={'LOCALITY': 'loc_name', 'LOCALITY ID': 'locId',
                          'LATITUDE': 'latitude', 'LONGITUDE': 'longitude', 'OBSERVATION DATE': 'obsDt',
                          'TIME OBSERVATIONS STARTED': 'obsTime', 'SAMPLING EVENT IDENTIFIER': 'subId',
                          'OBSERVER ID': 'Name'}, inplace=True)
count_geo['numSpecies'] = count_geo.subId.map(rows_per_checklist)

new_col_order = ['locId', 'subId', 'obsDt', 'obsTime', 'loc_name', 'latitude', 'longitude', 'Name', 'numSpecies']
count_geo = count_geo[new_col_order]

# Coordinates were read as text; blanks become 0
for col in ['latitude', 'longitude']:
    count_geo[col] = pd.to_numeric(count_geo[col]).fillna(0).astype(float)

vgeometry = [Point(x, y) for x, y in zip(count_geo.longitude, count_geo.latitude)]  # Longitude first
count_geo['geometry'] = vgeometry

In [None]:
count_geo.shape

In [None]:
count_geo.columns

In [None]:
visits_of_interest.head(3)

In [None]:
visits_of_interest.columns

In [None]:
# count_geo.rename(columns={'LOCALITY': 'loc_name', 'LOCALITY ID': 'locId', 
#         'LATITUDE': 'latitude', 'LONGITUDE': 'longitude', 'OBSERVATION DATE': 'obsDt',
#        'TIME OBSERVATIONS STARTED': 'obsTime', 'SAMPLING EVENT IDENTIFIER': 'subId',
#                          'OBSERVER ID': 'Name', 'OBSERVATION COUNT': 'numSpecies'}, inplace=True)

# new_col_order = ['locId', 'subId', 'obsDt', 'obsTime', 'loc_name', 'latitude', 'longitude', 'Name', 'numSpecies']
# count_geo = count_geo[new_col_order]

# for col in ['latitude', 'longitude']:
#     count_geo[col] = count_geo[col].apply(pd.to_numeric).fillna(0).astype(int)

# vgeometry = [Point(x, y) for x, y in zip(count_geo.longitude, count_geo.latitude)]  # Longitude first
# count_geo['geometry'] = vgeometry

In [None]:
voi = visits_in_circle(None, geo_data, circle_code, count_geo)
voi.shape

In [None]:
circle_code

In [None]:
# Debugging aid: repeats the geometry test from visits_in_circle on count_geo,
# one step at a time and without the participant filter
circle_geometry = geo_data[(geo_data.CircleCode == circle_code) &
                               (geo_data.type == 'circle')].geometry.values[0]

# # Note that by construction, visits only contains data for dates we care about
# # so we don't need to filter for that. We pass them to get_details grouped by date though.
# One boolean per count_geo row: True when the point lies within the circle
mask = [pt.within(circle_geometry) for pt in count_geo.geometry.values]
# if ebirders is not None:
#     mask &= visits.Name.isin(ebirders)
# visits_of_interest = visits[mask].sort_values(by=['locId'])

In [None]:
sum(mask)

In [None]:
geo_data

In [None]:
count_geo.geometry.values[0]

In [None]:
# Same count as sum(mask), asked from the circle's side (contains) instead of the point's (within)
sum(circle_geometry.contains(pt) for pt in count_geo.geometry.values)

In [None]:
count_geo.geometry.values[0]

In [None]:
count_geo.columns

In [None]:
create_coverage_map(count_geo, parameters, geo_data, None, None)

In [None]:
parameters.parameters

In [None]:
count_geo.head()

In [None]:
# Checklist ids inside the circle per the bulk data that visits_of_interest lacks
', '.join(sorted(set(voi.subId) - set(visits_of_interest.subId)))

In [None]:
# Split the pasted, comma-separated ids before counting; len() of a one-string list is always 1
len('S77994527, S77994556, S77995886, S78003733, S78011449, S78012810, S78016009, S78035225, S78036994, S78037576, S78038313'.split(', '))

In [None]:
raw_dec2020.APPROVED.value_counts()

In [None]:
raw_dec2020['HAS MEDIA'].value_counts()

# Compare to eBird API

In [None]:
region_codes

In [None]:
raw_visits = ebird_extra.ebird_client.get_visits('US-CA-085', date_of_count)

In [None]:
raw_visits.columns

In [None]:
pd.DataFrame(raw_visits)

In [None]:
# ['locId', 'subId', 'userDisplayName', 'numSpecies', 'obsDt', 'obsTime', 'subID', 'loc']
pd.DataFrame(raw_visits).columns

In [None]:
rawdf = pd.DataFrame(raw_visits)

In [None]:
rawdf.iloc[0]['loc']

In [None]:
raw_dec2020.columns

In [None]:
subid = 'S78043369'
s78043369 = ebird_extra.ebird_client.get_checklist(subid)
s78043369 #.columns

In [None]:
# Flatten the checklist: lift the fields of its first observation into the top-level dict
rowdict = s78043369.copy()
obs = rowdict.pop('obs')
rowdict.update(obs[0])
rowdict

In [None]:
list(rowdict.keys())

In [None]:
# A pandas Index has no to_csv(); convert it to a Series first
raw_dec2020.columns.to_series().to_csv('/tmp/raw_dec2020_columns.csv')

In [None]:
pd.Series(list(rowdict.keys()))

In [None]:
# Side-by-side list of bulk-dump column names and API checklist keys
colmap = pd.DataFrame()
colmap['raw'] = list(raw_dec2020.columns)
# Series.pad() is deprecated and filled nothing here (the key list has no gaps);
# reindex makes the intended alignment to the 'raw' rows explicit, leaving NaN
# where the API has fewer keys.
colmap['api'] = pd.Series(list(rowdict.keys())).reindex(colmap.index)
colmap.to_csv('/tmp/raw_dec2020_columns.csv')

In [None]:
# The data in the raw data dump is most similar to the data returned from the eBird API
# get_details. Our main purpose here is to find any missing subIds. We could morph the
# data in the bulk dump, but for the small number of missing records, it is easier to
# just add them to visits

In [None]:
visits.head(3)

In [None]:
raw_dec2020['COUNTY CODE'].value_counts()

In [None]:
base_subids = visits.subId.values
len(base_subids)

In [None]:
bulk_subids = set(raw_dec2020[raw_dec2020['OBSERVATION DATE']==date_of_count]['SAMPLING EVENT IDENTIFIER'].values)
len(bulk_subids)

In [None]:
sorted(list(set(base_subids) ^ bulk_subids))

In [None]:
sorted(list(bulk_subids - set(base_subids)))

In [None]:
# https://ebird.org/checklist/S79107962
sorted(list(set(base_subids) - bulk_subids ))

In [None]:
# xdates must be a list of dates; a bare string is rejected by Series.isin()
find_missing_subids(visits, raw_dec2020, [date_of_count], region_codes)

In [None]:
raw_dec2020.head(3)

In [None]:
# Build one row per checklist (subId) for the count date, using the column names of `visits`
raw_geo_columns = ['LOCALITY', 'LOCALITY ID',
                   'LATITUDE', 'LONGITUDE', 'OBSERVATION DATE',
                   'TIME OBSERVATIONS STARTED', 'SAMPLING EVENT IDENTIFIER',
                   'OBSERVER ID', 'OBSERVATION COUNT']
count_subset = raw_dec2020[raw_dec2020['OBSERVATION DATE'] == date_of_count].copy()

# numSpecies is the number of bulk rows per checklist (as in use_basic_dataset).
# OBSERVATION COUNT belongs to a single row, so it must not be relabelled as numSpecies.
rows_per_checklist = count_subset.groupby('SAMPLING EVENT IDENTIFIER').size()

count_geo = count_subset[raw_geo_columns].drop_duplicates(['SAMPLING EVENT IDENTIFIER']).reset_index(drop=True)
count_geo = count_geo.drop(columns=['OBSERVATION COUNT'])

count_geo.rename(columns={'LOCALITY': 'loc_name', 'LOCALITY ID': 'locId',
                          'LATITUDE': 'latitude', 'LONGITUDE': 'longitude', 'OBSERVATION DATE': 'obsDt',
                          'TIME OBSERVATIONS STARTED': 'obsTime', 'SAMPLING EVENT IDENTIFIER': 'subId',
                          'OBSERVER ID': 'Name'}, inplace=True)
count_geo['numSpecies'] = count_geo.subId.map(rows_per_checklist)

new_col_order = ['locId', 'subId', 'obsDt', 'obsTime', 'loc_name', 'latitude', 'longitude', 'Name', 'numSpecies']
count_geo = count_geo[new_col_order]

# Coordinates were read as text; blanks become 0
for col in ['latitude', 'longitude']:
    count_geo[col] = pd.to_numeric(count_geo[col]).fillna(0).astype(float)

vgeometry = [Point(x, y) for x, y in zip(count_geo.longitude, count_geo.latitude)]  # Longitude first
count_geo['geometry'] = vgeometry

In [None]:
raw_data_path

# Can we make fake visits entries with bulk data?

In [None]:
# Basic Dataset (EBD)

In [None]:
# xdates must be a list of dates; a bare string is rejected by Series.isin()
missing_subids = find_missing_subids(visits, bulk_data, [date_of_count], region_codes)

In [None]:
visits.head(3)

In [None]:
visits.columns

In [None]:
# All bulk rows belonging to the missing checklists.
# drop=True keeps the old row labels from being added as an extra 'index' column.
bds = bulk_data[bulk_data['SAMPLING EVENT IDENTIFIER'].isin(missing_subids)].copy().reset_index(drop=True)
bds.shape

In [None]:
bds.head(2)

In [None]:
bds.groupby(['SAMPLING EVENT IDENTIFIER'])

In [None]:
bds.groupby(['SAMPLING EVENT IDENTIFIER']).size().reset_index(name='numSpecies').head(5)

In [None]:
# The loop had no body (a SyntaxError); show each checklist's rows, like the later grouped display cell
for ix, (subid, grp) in enumerate(bds.groupby(['SAMPLING EVENT IDENTIFIER'])):
    display(grp)


In [None]:
# Turn the bulk rows of the missing checklists into rows shaped like `visits`
bds = bulk_data[bulk_data['SAMPLING EVENT IDENTIFIER'].isin(missing_subids)].copy().reset_index(drop=True)

# Names match those in visits
new_col_names = {
    'LOCALITY ID': 'locId', 'SAMPLING EVENT IDENTIFIER': 'subId', 'OBSERVER ID': 'Name',
    'OBSERVATION DATE': 'obsDt', 'TIME OBSERVATIONS STARTED': 'obsTime',
    'LOCALITY': 'loc_name', 'LATITUDE': 'latitude', 'LONGITUDE': 'longitude',
}
bds.rename(columns=new_col_names, inplace=True)

# numSpecies = number of bulk rows per checklist, attached per row so it cannot be
# mis-paired later. This replaces the placeholder rename of OBSERVATION COUNT and the
# positional assignment that only worked because of the sort order.
bds['numSpecies'] = bds.groupby('subId')['subId'].transform('size')

new_col_order = ['locId', 'subId', 'Name', 'numSpecies', 'obsDt', 'obsTime', 'loc_name', 'latitude', 'longitude']
bds = bds[new_col_order].sort_values(by=['subId']).reset_index(drop=True)

bds = bds.drop_duplicates(['subId', 'obsDt', 'obsTime', 'latitude', 'longitude']).reset_index(drop=True)

bds.obsTime = bds.obsTime.apply(normalize_time_for_visits)

# Coordinates were read as text; blanks become 0
for col in ['latitude', 'longitude']:
    bds[col] = pd.to_numeric(bds[col]).fillna(0).astype(float)

vgeometry = [Point(x, y) for x, y in zip(bds.longitude, bds.latitude)]  # Longitude first
bds['geometry'] = vgeometry

# We could fix 'Name' with 'userDisplayName' field from get_details, but not important here

In [None]:
bds.head()

In [None]:
bds.shape

In [None]:
len(missing_subids)

In [None]:
for ix, (subid, grp) in enumerate(bds.groupby(['subId', 'obsDt', 'obsTime', 'latitude', 'longitude'])):
    display(grp)

In [None]:
bds.obsTime.apply(normalize_time_for_visits)

In [None]:
subid = 'S77994527'
s77994527 = ebird_extra.ebird_client.get_checklist(subid)
s77994527