---
# Creating occurrence file

Read in the datafile we just created from `sound_production_to_presence.ipynb`

In [2]:
import pandas as pd
import numpy as np

fname = 'data/sanctsound_presence.zip'

# Load the presence table produced by `sound_production_to_presence.ipynb`.
# NOTE(review): the captured output of this cell ends with a truncated warning that
# points at this read_csv call (possibly a mixed-dtype warning) - confirm, and pass
# explicit dtypes if that is the case.
df_pres = pd.read_csv(fname, compression='zip')

# dataset_id looks like "noaaSanctSound_MB01_09_bluewhale"; fields 1 and 2 joined by "_"
# form the station label (e.g. "MB01_09"). Split once and reuse the pieces instead of
# splitting the whole column twice.
dataset_parts = df_pres['dataset_id'].str.split("_", expand=True)
df_pres['station'] = dataset_parts[1] + "_" + dataset_parts[2]

df_pres.sample(n=5)

  df_pres = pd.read_csv(fname,


Unnamed: 0,start_time,dolphin_presence,dataset_id,WKT,decimalLatitude,decimalLongitude,vernacularName,time,bluewhale_presence,bluewhale_manual_presence,...,minkewhale_presence,plainfinmidshipman_presence,northatlanticrightwhale_presence,scientificName,scientificNameID,taxonRank,kingdom,propagationFrequency,eventDate,station
486495,2021-08-27 19:18:03.080000000,,noaaSanctSound_MB01_09_bluewhale,POINT (36.798 -122.9758),36.798,-122.9758,blue whale,,1.0,,...,,,,Balaenoptera musculus,urn:lsid:marinespecies.org:taxname:137090,Species,Animalia,63,2021-08-27 19:18:03.080000000,MB01_09
256671,,,noaaSanctSound_CI04_06_bluewhale,POINT (33.8489 -120.1174),33.8489,-120.1174,blue whale,2020-10-27 20:53:57.440000000,1.0,,...,,,,Balaenoptera musculus,urn:lsid:marinespecies.org:taxname:137090,Species,Animalia,63,2020-10-27 20:53:57.440000000,CI04_06
80736,,,noaaSanctSound_CI02_07_bluewhale,POINT (34.0855 -120.5224),34.0855,-120.5224,blue whale,2021-07-27 18:20:24.048000000,1.0,,...,,,,Balaenoptera musculus,urn:lsid:marinespecies.org:taxname:137090,Species,Animalia,63,2021-07-27 18:20:24.048000000,CI02_07
611812,2019-11-15 12:52:43.911000064,,noaaSanctSound_CI03_02_bocaccio,POINT (33.48687 -119.01609),33.48687,-119.01609,bocaccio,,,,...,,,,Sebastes paucispinis,urn:lsid:marinespecies.org:taxname:274833,Species,Animalia,300,2019-11-15 12:52:43.911000064,CI03_02
261304,,,noaaSanctSound_CI04_06_bluewhale,POINT (33.8489 -120.1174),33.8489,-120.1174,blue whale,2020-11-04 16:37:37.984000000,1.0,,...,,,,Balaenoptera musculus,urn:lsid:marinespecies.org:taxname:137090,Species,Animalia,63,2020-11-04 16:37:37.984000000,CI04_06


# Truncate presence observations down to the day

What we need to end up with is a table with the following information.

**Species `x` made an acoustic sound at `y` location on `z` day.**

Our sound detections are high resolution, so we want to group them into daily observations while preserving the species and location information.

Using [pandas.groupby()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html) we can regroup the data down to the day. This is doing the work of capturing daily observations (truncate by day).

Here we are grouping the dataframe by the species, spatial coordinates, and date.

In [34]:
# Parse the timestamp strings into datetimes, then order the rows chronologically.
df_pres['eventDate'] = pd.to_datetime(
    df_pres['eventDate'],
    format='%Y-%m-%d %H:%M:%S.%f',
)
df_pres = df_pres.sort_values(by='eventDate')

# One group per calendar day, station, taxon description, position and frequency.
group_keys = [
    df_pres.eventDate.dt.date,
    'station',
    'vernacularName',
    'scientificNameID',
    'scientificName',
    'taxonRank',
    'kingdom',
    'decimalLatitude',
    'decimalLongitude',
    'propagationFrequency',
]
group = df_pres.groupby(group_keys)

# Presence columns first, followed by the detection-count columns.
presence_cols = [name for name in df_pres.columns if 'presence' in name]
detection_cols = [name for name in df_pres.columns if 'detection_count' in name]
cols = presence_cols + detection_cols

# Count the non-null entries per group; all we need to know is whether the species
# was present on that date at that location.
counts = group[cols].count()

# Turn every grouping level back into an ordinary column.
counts = counts.reset_index(level=list(range(len(group_keys))), allow_duplicates=True)

counts.sample(n=5)

Unnamed: 0,eventDate,station,vernacularName,scientificNameID,scientificName,taxonRank,kingdom,decimalLatitude,decimalLongitude,propagationFrequency,...,pinniped_presence,seiwhale_presence,atlanticcod_presence,humpbackwhale_presence,killerwhale_presence,minkewhale_presence,plainfinmidshipman_presence,northatlanticrightwhale_presence,redgrouper_detection_count,blackgrouper_detection_count
6742,2019-09-14,SB01_05,minke whale,urn:lsid:marinespecies.org:taxname:137087,Balaenoptera acutorostrata,Species,Animalia,42.436785,-70.546435,20,...,0,0,0,0,0,1,0,0,0,0
14901,2020-07-24,CI04_05,blue whale,urn:lsid:marinespecies.org:taxname:137090,Balaenoptera musculus,Species,Animalia,33.8489,-120.1171,63,...,0,0,0,0,0,0,0,0,0,0
10152,2019-12-30,CI03_02,bocaccio,urn:lsid:marinespecies.org:taxname:274833,Sebastes paucispinis,Species,Animalia,33.48687,-119.01609,300,...,0,0,0,0,0,0,0,0,0,0
20088,2021-03-26,MB03_04,humpback whale,urn:lsid:marinespecies.org:taxname:137092,Megaptera novaeangliae,Species,Animalia,36.37021,-122.314903,300,...,0,0,0,1,0,0,0,0,0,0
20098,2021-03-27,SB01_15,dolphin,urn:lsid:marinespecies.org:taxname:2688,Cetacea,Infraorder,Animalia,42.438427,-70.54542,5000,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# Convert the eventDate column to datetimes and promote it to the index.
# Setting the index from a column name removes that column from the frame.
counts = (
    counts
    .assign(eventDate=pd.to_datetime(counts['eventDate']))
    .set_index('eventDate')
)

counts.sample(n=5)

Unnamed: 0_level_0,station,vernacularName,scientificNameID,scientificName,taxonRank,kingdom,decimalLatitude,decimalLongitude,propagationFrequency,dolphin_presence,...,pinniped_presence,seiwhale_presence,atlanticcod_presence,humpbackwhale_presence,killerwhale_presence,minkewhale_presence,plainfinmidshipman_presence,northatlanticrightwhale_presence,redgrouper_detection_count,blackgrouper_detection_count
eventDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-10,HI03_02,dolphin,urn:lsid:marinespecies.org:taxname:2688,Cetacea,Infraorder,Animalia,21.28542,-157.60012,5000,16,...,0,0,0,0,0,0,0,0,0,0
2020-01-17,MB03_02,dolphin,urn:lsid:marinespecies.org:taxname:2688,Cetacea,Infraorder,Animalia,36.37021,-122.314903,5000,12,...,0,0,0,0,0,0,0,0,0,0
2019-08-17,CI01_03,bocaccio,urn:lsid:marinespecies.org:taxname:274833,Sebastes paucispinis,Species,Animalia,34.04383,-120.081,300,0,...,0,0,0,0,0,0,0,0,0,0
2018-11-03,CI04_01,fin whale,urn:lsid:marinespecies.org:taxname:137091,Balaenoptera physalus,Species,Animalia,33.849,-120.118,20,0,...,0,0,0,0,0,0,0,0,0,0
2019-07-31,GR02_02,dolphin,urn:lsid:marinespecies.org:taxname:2688,Cetacea,Infraorder,Animalia,31.376133,-80.839133,5000,2,...,0,0,0,0,0,0,0,0,0,0


Let's confirm that we have unique location, date, and species.

For one species, check whether any of its rows also register occurrences of other species. For this we use the `describe()` function, which gives us a summary of all the presence variables. If the statistics are all 0 for the other species' columns, then we are doing this correctly.

In [36]:
# Select the bocaccio (Sebastes paucispinis) rows by scientificNameID and summarise
# every presence / detection-count column for them.
bocaccio_id = 'urn:lsid:marinespecies.org:taxname:274833'
is_bocaccio = counts['scientificNameID'] == bocaccio_id

counts.loc[is_bocaccio, cols].describe()

Unnamed: 0,dolphin_presence,bluewhale_presence,bluewhale_manual_presence,bocaccio_presence,finwhale_presence,pinniped_presence,seiwhale_presence,atlanticcod_presence,humpbackwhale_presence,killerwhale_presence,minkewhale_presence,plainfinmidshipman_presence,northatlanticrightwhale_presence,redgrouper_detection_count,blackgrouper_detection_count
count,1391.0,1391.0,1391.0,1391.0,1391.0,1391.0,1391.0,1391.0,1391.0,1391.0,1391.0,1391.0,1391.0,1391.0,1391.0
mean,0.0,0.0,0.0,39.652049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,78.589241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,0.0,0.0,741.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Cool! Looks like we are doing what we think we're doing. Now we can drop all of the presence columns, as we're identifying presence by indicating scientificNameID in the row. 

In [37]:
# Each row already names its taxon through scientificNameID, so the per-species
# presence / detection-count columns can be removed.
df_occur = counts.drop(labels=cols, axis='columns')

df_occur.sample(n=5)

Unnamed: 0_level_0,station,vernacularName,scientificNameID,scientificName,taxonRank,kingdom,decimalLatitude,decimalLongitude,propagationFrequency
eventDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-04-12,CI03_03,dolphin,urn:lsid:marinespecies.org:taxname:2688,Cetacea,Infraorder,Animalia,33.48687,-119.01609,5000
2019-07-29,GR02_02,dolphin,urn:lsid:marinespecies.org:taxname:2688,Cetacea,Infraorder,Animalia,31.376133,-80.839133,5000
2020-08-07,CI04_05,blue whale,urn:lsid:marinespecies.org:taxname:137090,Balaenoptera musculus,Species,Animalia,33.8489,-120.1171,63
2020-06-09,SB01_09,humpback whale,urn:lsid:marinespecies.org:taxname:137092,Megaptera novaeangliae,Species,Animalia,42.437243,-70.54595,300
2019-11-03,GR03_03,dolphin,urn:lsid:marinespecies.org:taxname:2688,Cetacea,Infraorder,Animalia,31.367517,-80.8949,5000


Now we have an occurrence table! We have a species at a time and location.

## Build `occurrenceID`

Let's make an appropriate `occurrenceID` by concatenating the site, the station, the `vernacularName`, and the `eventDate`.

Second, we want to check that every `occurrenceID` built this way is unique. If the resulting table is empty, we have made locally unique identifiers.

In [38]:
# Split the combined label (e.g. "MB01_09") a single time, reading it from `counts`
# (the frame `df_occur` was derived from, so both share the same index) rather than from
# `df_occur` itself. `df_occur['station']` is overwritten below; splitting that shortened
# value again on a re-run would put the wrong value in `site` and then raise a KeyError.
station_parts = counts['station'].str.split("_", expand=True)

df_occur['site'] = station_parts[0]
df_occur['station'] = station_parts[1]

# occurrenceID layout: site<site>_station<station>_<vernacular_name>_<YYYY-MM-DD>
# Spaces in the vernacular name are replaced literally (regex=False) with underscores.
df_occur['occurrenceID'] = (
    'site' + df_occur['site']
    + "_" + 'station' + df_occur['station']
    + '_' + df_occur['vernacularName'].str.replace(' ', '_', regex=False)
    + "_" + df_occur.index.strftime('%Y-%m-%d')
)

df_occur.sample(n=5)

Unnamed: 0_level_0,station,vernacularName,scientificNameID,scientificName,taxonRank,kingdom,decimalLatitude,decimalLongitude,propagationFrequency,site,occurrenceID
eventDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-06-13,2,plainfin midshipman,urn:lsid:marinespecies.org:taxname:275658,Porichthys notatus,Species,Animalia,36.798,-121.976,300,MB01,siteMB01_station02_plainfin_midshipman_2019-06-13
2021-01-10,13,dolphin,urn:lsid:marinespecies.org:taxname:2688,Cetacea,Infraorder,Animalia,42.471128,-70.241957,5000,SB02,siteSB02_station13_dolphin_2021-01-10
2020-03-28,4,plainfin midshipman,urn:lsid:marinespecies.org:taxname:275658,Porichthys notatus,Species,Animalia,34.04363,-120.08073,300,CI01,siteCI01_station04_plainfin_midshipman_2020-03-28
2021-02-12,4,blue whale,urn:lsid:marinespecies.org:taxname:137090,Balaenoptera musculus,Species,Animalia,36.37021,-122.314903,63,MB03,siteMB03_station04_blue_whale_2021-02-12
2020-05-10,4,pinniped,urn:lsid:marinespecies.org:taxname:148736,Pinnipedia,Infraorder,Animalia,34.08532,-120.523,1000,CI02,siteCI02_station04_pinniped_2020-05-10


### Check `occurrenceID` for duplicates

In [40]:
# Flag every row whose occurrenceID appears more than once (keep=False marks all copies).
# An empty result means the identifiers are unique.
is_duplicated = df_occur['occurrenceID'].duplicated(keep=False)

df_occur[is_duplicated]

Unnamed: 0_level_0,station,vernacularName,scientificNameID,scientificName,taxonRank,kingdom,decimalLatitude,decimalLongitude,propagationFrequency,site,occurrenceID
eventDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


Let's write it out as an occurrence file.

In [41]:
fname_occur = 'data/occurrence.zip'

# eventDate is the index, so index=True writes it out as the first column, formatted as
# a plain date. The CSV inside the archive is named explicitly; with compression='zip'
# alone, pandas derives the member name from the archive's own file name.
df_occur.to_csv(
    fname_occur,
    date_format='%Y-%m-%d',
    index=True,
    compression={'method': 'zip', 'archive_name': 'occurrence.csv'},
)