# IMOS realtime data to DwC Event Core - SMRU Example

Plan: Convert the realtime QCed IMOS marine mammal position data to DwC, and then publish the result to the IPT.

Contemporary notes from our meet w/ Ian Jonsen here: https://docs.google.com/document/d/1hibIxBbyGwa7b5-LRpnKIyjnr41EkUPKBzdUJoyAfaU/edit#heading=h.6bqw4binj5hq

### Inputs / configuration parameters:

* QCed data for a given campaign or project as the exported CSVs with appended position correction data as per https://github.com/ianjonsen/ArgosQC
* credentials for IPT and 
* corresponding project ID to associate new data with
* minimum quality hit to keep

In [1]:
import pandas as pd

metadata_df = pd.read_csv('input/ct180/metadata_ct180_nrt.csv')
loc_df = pd.read_csv('input/ct180/diag_ct180_nrt.csv')

In [2]:
metadata_df[0:10]

Unnamed: 0,sattag_program,device_id,ptt,body,device_wmo_ref,tag_type,common_name,species,release_longitude,release_latitude,...,release_date,recovery_date,age_class,sex,length,estimated_mass,actual_mass,state_country,qc_start_date,qc_end_date
0,ct180,ct180-156-BAT-15,14156,196997,Q9902018,CTD_QUOT_23A,Southern elephant seal,Mirounga leonina,70.218,-49.3496,...,2023-12-21T00:00:00Z,,,m,2.36,,255.5,French Overseas Territory,2024-01-02T12:00:00Z,2024-01-06T18:00:00Z
1,ct180,ct180-172-BAT3-15,14172,153719,Q9902019,CTD_QUOT_23A,Southern elephant seal,Mirounga leonina,70.218,-49.3496,...,2023-12-23T00:00:00Z,,,m,2.65,,344.5,French Overseas Territory,2023-12-24T10:00:00Z,2024-04-24T16:00:00Z
2,ct180,ct180-946-BAT-18,14946,221816,Q9902020,CTD_QUOT_23A,Southern elephant seal,Mirounga leonina,70.218,-49.3496,...,2023-12-21T00:00:00Z,,,m,3.02,,577.0,French Overseas Territory,2024-01-10T13:00:00Z,2024-03-31T07:00:00Z
3,ct180,ct180-963-BAT-18,14963,221821,Q9902021,CTD_QUOT_23A,Southern elephant seal,Mirounga leonina,70.218,-49.3496,...,2023-12-21T00:00:00Z,,,m,2.55,,328.5,French Overseas Territory,2023-12-24T14:00:00Z,2024-09-30T20:00:00Z
4,ct180,ct180-183-BAT2-20,15183,196991,Q9902022,CTD_QUOT_23A,Southern elephant seal,Mirounga leonina,70.218,-49.3496,...,2024-01-09T00:00:00Z,,,f,2.15,,249.5,French Overseas Territory,2024-01-09T12:00:00Z,2024-04-06T00:00:00Z
5,ct180,ct180-C184-BAT-20,15184,221802,Q9902023,CTD_QUOT_23A,Southern elephant seal,Mirounga leonina,70.218,-49.3496,...,2023-12-20T00:00:00Z,,,m,2.52,,373.0,French Overseas Territory,2024-01-07T18:00:00Z,2024-01-13T18:00:00Z
6,ct180,ct180-266-BAT-20,15266,204698,Q9902038,CTD_QUOT_23A,Southern elephant seal,Mirounga leonina,70.218,-49.3496,...,2024-02-11T00:00:00Z,,,f,2.38,,250.5,French Overseas Territory,2024-02-18T11:00:00Z,2024-10-06T23:00:00Z
7,ct180,ct180-302-BAT-20,15302,204693,Q9902024,CTD_QUOT_23A,Southern elephant seal,Mirounga leonina,70.218,-49.3496,...,2023-12-22T00:00:00Z,,,m,2.97,,608.5,French Overseas Territory,2024-01-14T12:00:00Z,2024-01-15T00:00:00Z
8,ct180,ct180-304-BAT-20,15304,204688,Q9902025,CTD_QUOT_23A,Southern elephant seal,Mirounga leonina,70.218,-49.3496,...,2023-12-23T00:00:00Z,,,m,2.53,,315.5,French Overseas Territory,2024-01-09T18:00:00Z,2024-07-01T00:00:00Z
9,ct180,ct180-306-BAT-20,15306,204699,Q9902033,CTD_QUOT_23A,Southern elephant seal,Mirounga leonina,70.218,-49.3496,...,2024-02-08T00:00:00Z,,,f,2.19,,248.0,French Overseas Territory,2024-02-08T19:00:00Z,2024-11-19T19:00:00Z


## Metadata - create events and occurrences for each row

Process the metadata csv into Event Core (animal releases + tag attachments) + Occurrences (HumanObservations) + emofs for same (biological measurements are here)

In [3]:
metadata_df

# event entries: eventID = [body]-[release_date]
#                eventDate =  [release_date]
#                latitude = [release_latitude]
#                longitude = [release_longitude]
#                modified = current_date()
#                geodeticDatum = EPSG:4326
#                country = state_country  (error in current dataset, French Overseas Territory should be French Southern Lands)

column_map = {'release_date':'eventDate',
              'release_latitude':'decimalLatitude',
              'release_longitude':'decimalLongitude',
              'state_country':'country'}

event_df = metadata_df.rename(columns=column_map)
event_df['modified'] = pd.to_datetime('now', utc=True).round(freq='s')
# eventID is instrument serial number (body) + release datetime (eventDate)
event_df['eventID'] = event_df['body'].astype(str).str.cat(event_df['eventDate'].astype(str), sep='-')
event_df['geodeticDatum'] = 'EPSG:4326'
# Optional: truncate the extra columns from the core
event_df =  event_df[['eventID', 'eventDate', 'decimalLatitude', 'decimalLongitude', 'modified', 'geodeticDatum', 'country']]

In [4]:
event_df[0:5]

Unnamed: 0,eventID,eventDate,decimalLatitude,decimalLongitude,modified,geodeticDatum,country
0,196997-2023-12-21T00:00:00Z,2023-12-21T00:00:00Z,-49.3496,70.218,2024-11-20 17:16:46+00:00,EPSG:4326,French Overseas Territory
1,153719-2023-12-23T00:00:00Z,2023-12-23T00:00:00Z,-49.3496,70.218,2024-11-20 17:16:46+00:00,EPSG:4326,French Overseas Territory
2,221816-2023-12-21T00:00:00Z,2023-12-21T00:00:00Z,-49.3496,70.218,2024-11-20 17:16:46+00:00,EPSG:4326,French Overseas Territory
3,221821-2023-12-21T00:00:00Z,2023-12-21T00:00:00Z,-49.3496,70.218,2024-11-20 17:16:46+00:00,EPSG:4326,French Overseas Territory
4,196991-2024-01-09T00:00:00Z,2024-01-09T00:00:00Z,-49.3496,70.218,2024-11-20 17:16:46+00:00,EPSG:4326,French Overseas Territory


In [5]:
# EMOFs to harvest
# for the release events
# instrument manufacturer and model  (SMRU + [tag type])
# PTT
# device id
# WMO ref


In [6]:
# occ ext. entries:    occurrenceID = [body]-[release_date]
#                      species = [species]
#                      sex = [sex]
#                      eventID = [body]-[release_date]
#                      organismID = [body]-[release_date]

occ_column_map = {'release_date':'eventDate',
                  'species':'scientificName'}
occ_df = metadata_df.rename(columns=occ_column_map)
occ_df['occurrenceID'] = occ_df['body'].astype(str).str.cat(occ_df['eventDate'].astype(str), sep='-')
occ_df['eventID'] = occ_df['body'].astype(str).str.cat(occ_df['eventDate'].astype(str), sep='-')
occ_df['organismID'] = occ_df['body'].astype(str).str.cat(occ_df['eventDate'].astype(str), sep='-')
occ_df['basisOfRecord'] = 'HumanObservation'
occ_df = occ_df[['occurrenceID', 'organismID','eventID', 'sex', 'scientificName', 'basisOfRecord']]

In [7]:
occ_df[0:5]

Unnamed: 0,occurrenceID,organismID,eventID,sex,scientificName,basisOfRecord
0,196997-2023-12-21T00:00:00Z,196997-2023-12-21T00:00:00Z,196997-2023-12-21T00:00:00Z,m,Mirounga leonina,HumanObservation
1,153719-2023-12-23T00:00:00Z,153719-2023-12-23T00:00:00Z,153719-2023-12-23T00:00:00Z,m,Mirounga leonina,HumanObservation
2,221816-2023-12-21T00:00:00Z,221816-2023-12-21T00:00:00Z,221816-2023-12-21T00:00:00Z,m,Mirounga leonina,HumanObservation
3,221821-2023-12-21T00:00:00Z,221821-2023-12-21T00:00:00Z,221821-2023-12-21T00:00:00Z,m,Mirounga leonina,HumanObservation
4,196991-2024-01-09T00:00:00Z,196991-2024-01-09T00:00:00Z,196991-2024-01-09T00:00:00Z,f,Mirounga leonina,HumanObservation


In [8]:
# EMOFs to harvest
# for the occurrences:
# sex
# length
# weight

In [9]:
# Create event and occurrence entries from the locations data file
# 
# Event entries:  eventID = organismID + date_detected
#                 latitude = ssm_lat if exists else lat
#                 longitude = ssm_lon if exists else lon
#                 eventDate = d_date
#                 geodeticDatum = EPSG:4326
#                 coordinateUncertaintyInMeters = max(ssm_x_se, ssm_y_se)  -- 1 SE or 2 SE?
#                 

# add the relevant columns to loc_df from metadata_df to create organismID
loc_df = loc_df.merge(metadata_df[['device_id', 'body', 'release_date', 'species']], 
                      how='left', left_on='ref', right_on='device_id')


In [10]:
# combine the organismID + the detection date into the eventID
loc_df['eventID'] = loc_df['body'].astype(str).str.cat(loc_df[['release_date', 'd_date']], sep='-')

In [11]:

# Check: is this correct to do in all cases?
# where there has been no correction made (corrected positions = NA, 
#       then use the raw position data
loc_df['decimalLatitude'] = loc_df['ssm_lat'].fillna(loc_df['lat'])
loc_df['decimalLongitude'] = loc_df['ssm_lon'].fillna(loc_df['lon'])
loc_df['eventDate'] = loc_df['d_date']
loc_df['modified'] = pd.to_datetime('now', utc=True).round(freq='s') 

# constant
loc_df['geodeticDatum'] = 'EPSG:4326'

# Ian's got his ssm_x and ssm_y in km, not in m
loc_df['coordinateUncertaintyInMeters'] = loc_df[['ssm_x_se', 'ssm_y_se']].max(axis=1) * 1000

# revisit uncertainty - make a radius based on the max, but uncertainty is an ellipse
# OBIS doesn't know about it but we can make a Polygon and include it somewhere to preserve the better
# knowledge that we have.

# Where there are multiple hits for a given time step (many satellites have opinions on position at once), 

loc_df = loc_df.sort_values(['ref', 'd_date', 'lq'], ascending=False)
# drop all but the best of the location qualities
loc_df = loc_df.drop_duplicates(subset=['ref','d_date'], keep='first', inplace=False)


In [12]:
loc_df['lq'].unique()

array([-2, -1,  0, -9,  1,  2,  3])

In [13]:
# fallback - where coordinateUncertaintyInMeters is still null (un-QCed), 
# let's do something with the class of fix from Argos. set a lookup table as in the ATN example?

# Ian recommends - dan costa, accuracy of argos locations at sea pinnipeds
# in that article they compared GPS to Argos locations.
# https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0008677

# While this paper recommends different regimes per species due to differences in surfacing behaviour, 
# we don't have that kind of broad data in real-time land.

# So we take their recommendations for marine mammals here, and maybe we'd leave the door open to use a non-mammal error chart
# for non-airbreathers

# Other methodologies have thrown out the A and B quality hits altogther. 
# I'm not opposed to doing that but i'll confirm it with the SME beforehand
# because the QC algorithm is re-positioning the bad hits for us already.
# and if they didn't throw them out, we may not want to either.

# 68th percentile location error distances from Costa et al, in metres
# old LQ designations - CLS moved to a Kalman-filtered location set in ~2011
# error ellipses are now de riguer - semimajor and 
# semiminor ellipse axis/orientation to quantify uncertainty
# So we could harvest those first.

error_table = {3:490,
               2:1010,
               1:1200,
               0:4180,
               -1:6190,
               -2:10280,
               -9:10280} # TODO : What is the corresponding code to -9 LQ? 
                         # AniMotum thinks it's a class B
missing_errors = loc_df['coordinateUncertaintyInMeters'].isna()
loc_df.loc[missing_errors, 'coordinateUncertaintyInMeters'] = loc_df.loc[missing_errors, 'lq'].map(error_table)

In [14]:
loc_df['coordinateUncertaintyInMeters'].describe()

count    55639.000000
mean      4479.525698
std       3434.655844
min          3.273000
25%       1739.659500
50%       3002.481000
75%       6190.000000
max      18024.848000
Name: coordinateUncertaintyInMeters, dtype: float64

In [15]:
loc_df['modified']

13026   2024-11-20 17:16:59+00:00
13025   2024-11-20 17:16:59+00:00
13024   2024-11-20 17:16:59+00:00
13023   2024-11-20 17:16:59+00:00
13022   2024-11-20 17:16:59+00:00
                   ...           
4       2024-11-20 17:16:59+00:00
3       2024-11-20 17:16:59+00:00
2       2024-11-20 17:16:59+00:00
1       2024-11-20 17:16:59+00:00
0       2024-11-20 17:16:59+00:00
Name: modified, Length: 55639, dtype: datetime64[ns, UTC]

In [16]:
# Select the columns and append to the event_df

event_df = pd.concat([event_df, loc_df[['eventID', 'eventDate', 'decimalLatitude', 'decimalLongitude', 'modified','geodeticDatum', 'coordinateUncertaintyInMeters']]])

In [17]:
event_df

Unnamed: 0,eventID,eventDate,decimalLatitude,decimalLongitude,modified,geodeticDatum,country,coordinateUncertaintyInMeters
0,196997-2023-12-21T00:00:00Z,2023-12-21T00:00:00Z,-49.34960,70.21800,2024-11-20 17:16:46+00:00,EPSG:4326,French Overseas Territory,
1,153719-2023-12-23T00:00:00Z,2023-12-23T00:00:00Z,-49.34960,70.21800,2024-11-20 17:16:46+00:00,EPSG:4326,French Overseas Territory,
2,221816-2023-12-21T00:00:00Z,2023-12-21T00:00:00Z,-49.34960,70.21800,2024-11-20 17:16:46+00:00,EPSG:4326,French Overseas Territory,
3,221821-2023-12-21T00:00:00Z,2023-12-21T00:00:00Z,-49.34960,70.21800,2024-11-20 17:16:46+00:00,EPSG:4326,French Overseas Territory,
4,196991-2024-01-09T00:00:00Z,2024-01-09T00:00:00Z,-49.34960,70.21800,2024-11-20 17:16:46+00:00,EPSG:4326,French Overseas Territory,
...,...,...,...,...,...,...,...,...
4,196997-2023-12-21T00:00:00Z-2023-12-21T18:30:36Z,2023-12-21T18:30:36Z,-49.35232,70.22904,2024-11-20 17:16:59+00:00,EPSG:4326,,4180.0
3,196997-2023-12-21T00:00:00Z-2023-12-21T18:29:15Z,2023-12-21T18:29:15Z,-49.35449,70.22380,2024-11-20 17:16:59+00:00,EPSG:4326,,490.0
2,196997-2023-12-21T00:00:00Z-2023-12-21T16:49:47Z,2023-12-21T16:49:47Z,-49.35438,70.20778,2024-11-20 17:16:59+00:00,EPSG:4326,,490.0
1,196997-2023-12-21T00:00:00Z-2023-12-21T16:49:17Z,2023-12-21T16:49:17Z,-49.35458,70.20625,2024-11-20 17:16:59+00:00,EPSG:4326,,1010.0


In [18]:
# Occurrence entries: occurrenceID = eventID
#                     eventID = eventID
#                     species = species
#                     organismID = body + release_date

loc_df['occurrenceID'] = loc_df['eventID']
loc_df['organismID'] =  loc_df['body'].astype(str).str.cat(loc_df['release_date'].astype(str), sep='-') 

In [19]:
# Decimate to first each hour per animal. Acoustics would also use per-receiver location, argos and sat won't need that.
dets_df = loc_df
dets_df['scientificName'] = dets_df['species']
dets_df['basisOfRecord'] = 'MachineObservation'
dets_df['Date'] = pd.to_datetime(dets_df['d_date']).dt.date
dets_df['hr'] = pd.to_datetime(dets_df['d_date']).dt.hour
dets_df['binsize'] = dets_df.groupby(['organismID', 'Date', 'hr']).size().reset_index(name='binsize')['binsize']
dets_df.drop_duplicates(subset=['organismID','Date', 'hr'], keep='first', inplace=True)
dets_df.drop('hr', axis=1, inplace=True)
dets_df

Unnamed: 0,ref,ptt,d_date,lq,lat,lon,alt_lat,alt_lon,n_mess,n_mess_120,...,eventDate,modified,geodeticDatum,coordinateUncertaintyInMeters,occurrenceID,organismID,scientificName,basisOfRecord,Date,binsize
13026,ct180-C184-BAT-20,221802,2024-04-30T06:41:08Z,-2,-49.40065,70.15018,-49.40065,70.15018,2,0,...,2024-04-30T06:41:08Z,2024-11-20 17:16:59+00:00,EPSG:4326,10280.0,221802-2023-12-20T00:00:00Z-2024-04-30T06:41:08Z,221802-2023-12-20T00:00:00Z,Mirounga leonina,MachineObservation,2024-04-30,2.0
13025,ct180-C184-BAT-20,221802,2024-04-10T02:13:44Z,-1,-49.36508,70.24834,-57.83478,17.69246,3,0,...,2024-04-10T02:13:44Z,2024-11-20 17:16:59+00:00,EPSG:4326,6190.0,221802-2023-12-20T00:00:00Z-2024-04-10T02:13:44Z,221802-2023-12-20T00:00:00Z,Mirounga leonina,MachineObservation,2024-04-10,2.0
13024,ct180-C184-BAT-20,221802,2024-03-15T17:26:21Z,-2,-50.22177,67.71829,-50.22177,67.71829,1,0,...,2024-03-15T17:26:21Z,2024-11-20 17:16:59+00:00,EPSG:4326,10280.0,221802-2023-12-20T00:00:00Z-2024-03-15T17:26:21Z,221802-2023-12-20T00:00:00Z,Mirounga leonina,MachineObservation,2024-03-15,2.0
13023,ct180-C184-BAT-20,221802,2024-03-15T07:13:38Z,-2,-52.70706,69.24513,-47.82350,81.83484,1,0,...,2024-03-15T07:13:38Z,2024-11-20 17:16:59+00:00,EPSG:4326,10280.0,221802-2023-12-20T00:00:00Z-2024-03-15T07:13:38Z,221802-2023-12-20T00:00:00Z,Mirounga leonina,MachineObservation,2024-03-15,1.0
13022,ct180-C184-BAT-20,221802,2024-03-14T16:49:33Z,-2,-56.07488,71.43531,-48.75767,82.61287,2,0,...,2024-03-14T16:49:33Z,2024-11-20 17:16:59+00:00,EPSG:4326,10280.0,221802-2023-12-20T00:00:00Z-2024-03-14T16:49:33Z,221802-2023-12-20T00:00:00Z,Mirounga leonina,MachineObservation,2024-03-14,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11,ct180-156-BAT-15,196997,2023-12-21T21:03:26Z,2,-49.36094,70.22439,-49.36094,70.22439,5,0,...,2023-12-21T21:03:26Z,2024-11-20 17:16:59+00:00,EPSG:4326,1010.0,196997-2023-12-21T00:00:00Z-2023-12-21T21:03:26Z,196997-2023-12-21T00:00:00Z,Mirounga leonina,MachineObservation,2023-12-21,1.0
10,ct180-156-BAT-15,196997,2023-12-21T20:13:51Z,2,-49.36058,70.22365,-49.36058,70.22365,4,0,...,2023-12-21T20:13:51Z,2024-11-20 17:16:59+00:00,EPSG:4326,1010.0,196997-2023-12-21T00:00:00Z-2023-12-21T20:13:51Z,196997-2023-12-21T00:00:00Z,Mirounga leonina,MachineObservation,2023-12-21,1.0
7,ct180-156-BAT-15,196997,2023-12-21T19:26:17Z,2,-49.35243,70.23752,-49.35243,70.23752,9,0,...,2023-12-21T19:26:17Z,2024-11-20 17:16:59+00:00,EPSG:4326,1010.0,196997-2023-12-21T00:00:00Z-2023-12-21T19:26:17Z,196997-2023-12-21T00:00:00Z,Mirounga leonina,MachineObservation,2023-12-21,3.0
5,ct180-156-BAT-15,196997,2023-12-21T18:33:18Z,1,-49.34763,70.22586,-49.34763,70.22586,7,0,...,2023-12-21T18:33:18Z,2024-11-20 17:16:59+00:00,EPSG:4326,1200.0,196997-2023-12-21T00:00:00Z-2023-12-21T18:33:18Z,196997-2023-12-21T00:00:00Z,Mirounga leonina,MachineObservation,2023-12-21,1.0


In [20]:
dets_df['binsize'].describe()

count    20479.000000
mean         1.584599
std          0.830232
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max          7.000000
Name: binsize, dtype: float64

In [21]:
dets_df['dataGeneralizations'] = dets_df['binsize'].apply(lambda x: 'subsampled by hour, first of {} record(s)'.format(x))

In [22]:
occ_df = pd.concat([occ_df, dets_df[['occurrenceID', 'eventID', 'scientificName', 'organismID', 'basisOfRecord']]])

In [23]:
# flesh out the occurrence taxonomic entries with kingdom, phylum, class, order, family
import pyworms

lookup_dict = {}
for name in occ_df['scientificName'].unique():
    resp = pyworms.aphiaRecordsByMatchNames(name)
    if len(resp[0]) == 0:
        print('\nNo match for name "{}"'.format(name))
        continue
    elif len(resp[0]) > 1:
        print('\nMultiple matches for name "{}"'.format(name))
        pprint.pprint(resp[0], indent=4)
        continue
    else:
        worms = resp[0][0]
        lookup_dict[name]={'scientificName': name,
                           'scientificNameID': worms['lsid'],
                           'taxonRank': worms['rank'],
                           'kingdom': worms['kingdom'],
                           'phylum': worms['phylum'],
                           'class': worms['class'],
                           'order': worms['order'],
                           'family': worms['family']}
        
lookup_df = pd.DataFrame.from_dict(lookup_dict, orient='index')

In [24]:
lookup_df

Unnamed: 0,scientificName,scientificNameID,taxonRank,kingdom,phylum,class,order,family
Mirounga leonina,Mirounga leonina,urn:lsid:marinespecies.org:taxname:231413,Species,Animalia,Chordata,Mammalia,Carnivora,Phocidae


In [25]:
occ_df = occ_df.join(lookup_df, how='left', on='scientificName', rsuffix='_worms')

In [26]:
occ_df

Unnamed: 0,occurrenceID,organismID,eventID,sex,scientificName,basisOfRecord,scientificName_worms,scientificNameID,taxonRank,kingdom,phylum,class,order,family
0,196997-2023-12-21T00:00:00Z,196997-2023-12-21T00:00:00Z,196997-2023-12-21T00:00:00Z,m,Mirounga leonina,HumanObservation,Mirounga leonina,urn:lsid:marinespecies.org:taxname:231413,Species,Animalia,Chordata,Mammalia,Carnivora,Phocidae
1,153719-2023-12-23T00:00:00Z,153719-2023-12-23T00:00:00Z,153719-2023-12-23T00:00:00Z,m,Mirounga leonina,HumanObservation,Mirounga leonina,urn:lsid:marinespecies.org:taxname:231413,Species,Animalia,Chordata,Mammalia,Carnivora,Phocidae
2,221816-2023-12-21T00:00:00Z,221816-2023-12-21T00:00:00Z,221816-2023-12-21T00:00:00Z,m,Mirounga leonina,HumanObservation,Mirounga leonina,urn:lsid:marinespecies.org:taxname:231413,Species,Animalia,Chordata,Mammalia,Carnivora,Phocidae
3,221821-2023-12-21T00:00:00Z,221821-2023-12-21T00:00:00Z,221821-2023-12-21T00:00:00Z,m,Mirounga leonina,HumanObservation,Mirounga leonina,urn:lsid:marinespecies.org:taxname:231413,Species,Animalia,Chordata,Mammalia,Carnivora,Phocidae
4,196991-2024-01-09T00:00:00Z,196991-2024-01-09T00:00:00Z,196991-2024-01-09T00:00:00Z,f,Mirounga leonina,HumanObservation,Mirounga leonina,urn:lsid:marinespecies.org:taxname:231413,Species,Animalia,Chordata,Mammalia,Carnivora,Phocidae
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11,196997-2023-12-21T00:00:00Z-2023-12-21T21:03:26Z,196997-2023-12-21T00:00:00Z,196997-2023-12-21T00:00:00Z-2023-12-21T21:03:26Z,,Mirounga leonina,MachineObservation,Mirounga leonina,urn:lsid:marinespecies.org:taxname:231413,Species,Animalia,Chordata,Mammalia,Carnivora,Phocidae
10,196997-2023-12-21T00:00:00Z-2023-12-21T20:13:51Z,196997-2023-12-21T00:00:00Z,196997-2023-12-21T00:00:00Z-2023-12-21T20:13:51Z,,Mirounga leonina,MachineObservation,Mirounga leonina,urn:lsid:marinespecies.org:taxname:231413,Species,Animalia,Chordata,Mammalia,Carnivora,Phocidae
7,196997-2023-12-21T00:00:00Z-2023-12-21T19:26:17Z,196997-2023-12-21T00:00:00Z,196997-2023-12-21T00:00:00Z-2023-12-21T19:26:17Z,,Mirounga leonina,MachineObservation,Mirounga leonina,urn:lsid:marinespecies.org:taxname:231413,Species,Animalia,Chordata,Mammalia,Carnivora,Phocidae
5,196997-2023-12-21T00:00:00Z-2023-12-21T18:33:18Z,196997-2023-12-21T00:00:00Z,196997-2023-12-21T00:00:00Z-2023-12-21T18:33:18Z,,Mirounga leonina,MachineObservation,Mirounga leonina,urn:lsid:marinespecies.org:taxname:231413,Species,Animalia,Chordata,Mammalia,Carnivora,Phocidae


In [27]:
occ_df['organismID'].unique()

array(['196997-2023-12-21T00:00:00Z', '153719-2023-12-23T00:00:00Z',
       '221816-2023-12-21T00:00:00Z', '221821-2023-12-21T00:00:00Z',
       '196991-2024-01-09T00:00:00Z', '221802-2023-12-20T00:00:00Z',
       '204698-2024-02-11T00:00:00Z', '204693-2023-12-22T00:00:00Z',
       '204688-2023-12-23T00:00:00Z', '204699-2024-02-08T00:00:00Z',
       '204703-2024-02-09T00:00:00Z', '204702-2023-12-20T00:00:00Z',
       '204701-2023-12-22T00:00:00Z', '204717-2024-02-09T00:00:00Z',
       '204704-2024-01-26T00:00:00Z', '204739-2023-12-23T00:00:00Z',
       '204732-2024-02-09T00:00:00Z', '204738-2023-12-31T00:00:00Z',
       '221824-2024-02-11T00:00:00Z', '221820-2023-12-19T00:00:00Z',
       '221798-2023-12-20T00:00:00Z', '221799-2024-02-11T00:00:00Z',
       '221801-2024-01-26T00:00:00Z', '221814-2023-12-19T00:00:00Z',
       '221810-2023-12-21T00:00:00Z'], dtype=object)

In [28]:
# Any EMOFs to harvest from detection occurrences?
# 
# 

In [29]:
# Push them out to files and an archive:

occ_df.to_csv('output/occurrences.csv', date_format='%Y-%m-%dT%H:%M:%S')
event_df.to_csv('output/events.csv', date_format='%Y-%m-%dT%H:%M:%S')
# emof_df.to_csv('output/emof.csv', date_format='%Y-%m-%dT%H:%M:%S')

In [30]:
# Zip and ship to an IPT

# Either via a form fill-in, or via depositing the archive on the IPT's filesystem?

# TODO: Try the form-fill first - use the OTN IPT workflows from ipython-utilities
# import requests  # session with the forms themselves
# import selenium  # or pick-n-click

### Debugging cells:

In [31]:
# throwing out all but max lq will help us de-duplicate these same-time-same-tag hits?
loc_df['lq'].describe()

count    35098.000000
mean        -1.341415
std          1.056915
min         -9.000000
25%         -2.000000
50%         -2.000000
75%         -1.000000
max          3.000000
Name: lq, dtype: float64