In [29]:
# constructs the level 2 summary file (level_2_manual.csv) from the level 1b file using manual ifcb data
# requires: ../auto_join/level_1b.csv, ../auto_join/resolved_manual_matched_matchIDs_LOOKUPsorted.csv,
#           volumes.csv, geographic_subset.csv
import pandas as pd
import numpy as np

In [30]:
# Load the level 1b file, keeping only the columns the manual summary uses.
columns = [
    'associatedMedia',
    'data_provider_category_HumanObservation',
    'higherClassification_group',
    'scientificName_HumanObservation',
    'scientificNameID_HumanObservation',
    'Biovolume',
]
# `samples` is the working data frame that the following cells extend and aggregate
samples = pd.read_csv("../auto_join/level_1b.csv", usecols=columns)

In [33]:
# display the rows that have no Biovolume value
missing_biovolume = samples['Biovolume'].isna()
samples.loc[missing_biovolume]

Unnamed: 0,associatedMedia,data_provider_category_HumanObservation,scientificName_HumanObservation,scientificNameID_HumanObservation,higherClassification_group,Biovolume
5762,http://ifcb-data.whoi.edu/NESLTER_transect/D20...,detritus,NotApplicable,NotApplicable,NotApplicable,
16905,http://ifcb-data.whoi.edu/NESLTER_transect/D20...,mix,Eukaryota,urn:lsid:algaebase.org:taxname:86701,other than diatoms dinoflagellates or haptophytes,
17161,http://ifcb-data.whoi.edu/NESLTER_transect/D20...,detritus,NotApplicable,NotApplicable,NotApplicable,
18288,http://ifcb-data.whoi.edu/NESLTER_transect/D20...,detritus,NotApplicable,NotApplicable,NotApplicable,
27649,http://ifcb-data.whoi.edu/NESLTER_transect/D20...,detritus,NotApplicable,NotApplicable,NotApplicable,
79470,http://ifcb-data.whoi.edu/NESLTER_transect/D20...,detritus,NotApplicable,NotApplicable,NotApplicable,
162275,http://ifcb-data.whoi.edu/NESLTER_transect/D20...,detritus,NotApplicable,NotApplicable,NotApplicable,
190024,http://ifcb-data.whoi.edu/NESLTER_transect/D20...,,,,,
203885,http://ifcb-data.whoi.edu/NESLTER_transect/D20...,,,,,
218655,http://ifcb-data.whoi.edu/NESLTER_transect/D20...,,,,,


In [22]:
# read in taxonomic information (lookup keyed by the manual category name)
man_taxon_info = pd.read_csv(
    '../auto_join/resolved_manual_matched_matchIDs_LOOKUPsorted.csv',
    usecols=['name', 'resolved_names', 'alt_datasource'],
)
# Left-merge so every row of `samples` is kept even when its category has no lookup entry.
# validate='many_to_one' makes pandas raise a MergeError if a `name` occurs more than once
# in the lookup; without the check, duplicated names would silently duplicate sample rows
# and inflate the counts and Biovolume sums computed in the summary cell.
samples = pd.merge(
    samples,
    man_taxon_info,
    how='left',
    left_on='data_provider_category_HumanObservation',
    right_on='name',
    validate='many_to_one',
).rename(columns={'higherClassification_group': 'higherClassification_group_manual'})

In [23]:
# Split each associatedMedia value into the roi id and the associatedMedia of its sample.
# NOTE(review): the fixed character offsets below assume every associatedMedia string has
# the same fixed-width layout -- confirm against level_1b.csv.
media_url = samples['associatedMedia']
# characters 68-73 are taken as the roi id; leading zeros are stripped
samples['roi'] = media_url.str[68:74].str.lstrip("0")
# the first 67 characters are kept as the associatedMedia of the sample
samples['associatedMedia'] = media_url.str[:67]

In [24]:
# read in volume data and rename its permalink column to match the key used in `samples`
volumes = pd.read_csv("volumes.csv").rename(columns={'permalink': 'associatedMedia'})
# Left-merge the volume columns onto every row of `samples`.
# validate='many_to_one' makes pandas raise a MergeError if an associatedMedia value occurs
# more than once in volumes.csv; without the check, duplicates would silently duplicate
# sample rows and inflate the counts and Biovolume sums computed in the summary cell.
samples = pd.merge(samples, volumes, how='left', on='associatedMedia', validate='many_to_one')

In [25]:
# read in geolocation data
geo_columns = ['pid', 'date', 'gps_furuno_latitude', 'gps_furuno_longitude']
geo_data = pd.read_csv("geographic_subset.csv", usecols=geo_columns)
# The join uses associatedMedia as it is at this point; '.html' is only appended to it in a
# later cell, so no suffix is added to pid here.
# validate='many_to_one' makes pandas raise a MergeError if a pid occurs more than once in
# the geolocation file; without the check, duplicates would silently duplicate sample rows
# and inflate the counts and Biovolume sums computed in the summary cell.
samples = pd.merge(
    samples,
    geo_data,
    how='left',
    left_on='associatedMedia',
    right_on='pid',
    validate='many_to_one',
)

In [26]:
# append .html to associatedMedia
# Only values that do not already end in '.html' are changed, so re-running this cell does
# not produce '.html.html'. Missing values stay missing.
media_url = samples['associatedMedia']
already_suffixed = media_url.str.endswith('.html', na=False)
samples['associatedMedia'] = media_url.where(already_suffixed, media_url + '.html')

In [27]:
# Build the level 2 summary from the manual data: one row per
# (associatedMedia, data_provider_category_HumanObservation) pair.
# Abundance is the number of non-null roi values in the group and Biovolume is the group
# sum; every other column takes the first non-null value found in the group.
# NOTE(review): groupby drops rows whose key is NaN by default, and 'sum' skips NaN
# Biovolume values -- confirm this is the intended handling of the rows with missing data.
group_keys = ['associatedMedia', 'data_provider_category_HumanObservation']
level_2 = (
    samples
    .groupby(group_keys)
    .agg(
        scientificName_HumanObservation=('scientificName_HumanObservation', 'first'),
        scientificNameID_HumanObservation=('scientificNameID_HumanObservation', 'first'),
        higherClassification_group_manual=('higherClassification_group_manual', 'first'),
        Abundance=('roi', 'count'),
        Biovolume=('Biovolume', 'sum'),
        volume_imaged=('volume_imaged', 'first'),
        date=('date', 'first'),
        gps_furuno_latitude=('gps_furuno_latitude', 'first'),
        gps_furuno_longitude=('gps_furuno_longitude', 'first'),
    )
    .reset_index()
)

In [28]:
# write the summary to level_2_manual.csv with a header row and without the index column
level_2.to_csv('level_2_manual.csv', index=False)