In [52]:
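# Constructs the level 2 summary file from the level 1b file, using the manual IFCB annotations.
# Requires: ../auto_join/level_1b_manual.csv, ../auto_join/zero_features.csv,
#           ../auto_join/resolved_manual_matched_matchIDs_LOOKUPsorted.csv,
#           volumes.csv, geographic_subset.csv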
import pandas as pd
import numpy as np

In [120]:
# columns of the level 1b file that are needed for the level 2 summary
columns = [
    'associatedMedia',
    'data_provider_category_HumanObservation',
    'higherClassification_group',
    'scientificName_HumanObservation',
    'scientificNameID_HumanObservation',
    'Biovolume',
]
# load only those columns; `samples` is the working data frame for the rest of the notebook
samples = pd.read_csv("../auto_join/level_1b_manual.csv", usecols=columns)

In [132]:
# load the zero-feature rows and split each associatedMedia URL by fixed character position
# NOTE(review): the offsets below assume every URL shares the same fixed-length prefix — confirm
zeros = pd.read_csv("../auto_join/zero_features.csv")
# characters 68..73 of the URL are taken as the roi id, with leading zeros stripped
zeros['roi'] = zeros['associatedMedia'].str[68:74].str.lstrip("0")
# the first 67 characters are kept as the sample-level associatedMedia
zeros['associatedMedia'] = zeros['associatedMedia'].str[0:67]
zeros

Unnamed: 0,associatedMedia,data_provider_category_MachineObservation,scientificNameID_MachineObservation,data_provider_category_HumanObservation,scientificName_HumanObservation,scientificNameID_HumanObservation,higherClassification_group,Area,Biovolume,maxFeretDiameter,minFeretDiameter,roi
0,http://ifcb-data.whoi.edu/NESLTER_transect/D20...,detritus,-999999,detritus,NotApplicable,NotApplicable,NotApplicable,,,,,1018.0
1,http://ifcb-data.whoi.edu/NESLTER_transect/D20...,mix,urn:lsid:algaebase.org:taxname:86701,mix,Eukaryota,urn:lsid:algaebase.org:taxname:86701,other than diatoms dinoflagellates or haptophytes,,,,,384.0
2,http://ifcb-data.whoi.edu/NESLTER_transect/D20...,Bacillariophyceae,148899,detritus,NotApplicable,NotApplicable,NotApplicable,,,,,790.0
3,http://ifcb-data.whoi.edu/NESLTER_transect/D20...,detritus,-999999,detritus,NotApplicable,NotApplicable,NotApplicable,,,,,472.0
4,http://ifcb-data.whoi.edu/NESLTER_transect/D20...,camera_spot,1372979,detritus,NotApplicable,NotApplicable,NotApplicable,,,,,1165.0
5,http://ifcb-data.whoi.edu/NESLTER_transect/D20...,detritus,-999999,detritus,NotApplicable,NotApplicable,NotApplicable,,,,,4270.0
6,http://ifcb-data.whoi.edu/NESLTER_transect/D20...,detritus,-999999,detritus,NotApplicable,NotApplicable,NotApplicable,,,,,2710.0
7,http://ifcb-data.whoi.edu/NESLTER_transect/D20...,detritus,-999999,,,,,,,,,1905.0
8,http://ifcb-data.whoi.edu/NESLTER_transect/D20...,detritus,-999999,,,,,,,,,751.0
9,http://ifcb-data.whoi.edu/NESLTER_transect/D20...,Bacillariophyceae,148899,,,,,,,,,2504.0


In [121]:
# taxonomic lookup table (only the three columns used here)
man_taxon_info = pd.read_csv(
    '../auto_join/resolved_manual_matched_matchIDs_LOOKUPsorted.csv',
    usecols=['name', 'resolved_names', 'alt_datasource'],
)
# left-join the lookup onto the samples by manual category name, then mark the
# higher-classification column as coming from the manual annotations
samples = (
    samples
    .merge(
        man_taxon_info,
        how='left',
        left_on='data_provider_category_HumanObservation',
        right_on='name',
    )
    .rename(columns={'higherClassification_group': 'higherClassification_group_manual'})
)

In [122]:
# split each associatedMedia URL by fixed character position
# NOTE(review): the offsets below assume every URL shares the same fixed-length prefix — confirm
# characters 68..73 of the URL are taken as the roi id, with leading zeros stripped
samples['roi'] = samples['associatedMedia'].str[68:74].str.lstrip("0")
# the first 67 characters are kept as the sample-level associatedMedia
samples['associatedMedia'] = samples['associatedMedia'].str[0:67]

In [123]:
# volume table; its 'permalink' column is renamed so it can serve as the join key
volumes = pd.read_csv("volumes.csv").rename(columns={'permalink': 'associatedMedia'})
# attach the volume columns to every ROI row of the matching sample
samples = samples.merge(volumes, how='left', on='associatedMedia')

In [124]:
# geolocation table: position, date and sample pid only
geo_data = pd.read_csv(
    "geographic_subset.csv",
    usecols=['gps_furuno_latitude', 'gps_furuno_longitude', 'date', 'pid'],
)
# join on the sample URL; presumably pid carries no '.html' suffix, so this must run
# before '.html' is appended to associatedMedia — TODO confirm
samples = samples.merge(geo_data, how='left', left_on='associatedMedia', right_on='pid')

In [125]:
# append .html to associatedMedia, but only where the suffix is not already present,
# so re-running this cell does not produce '...html.html'
# (missing values are treated as "no suffix" and stay missing after the concatenation)
samples['associatedMedia'] = samples['associatedMedia'].where(
    samples['associatedMedia'].str.endswith('.html', na=False),
    samples['associatedMedia'] + '.html',
)

In [155]:
# Sanity check: does a ROI with Biovolume = nan change the summed biovolume?
# choose one known sample containing a ROI (detritus) that has Biovolume = nan
check = 'http://ifcb-data.whoi.edu/NESLTER_transect/D20180201T112345_IFCB102.html'
# all ROI rows of that sample (samples = level_1b dataframe)
sample = samples[samples['associatedMedia'] == check]
# number of ROIs in this sample, the nan row included
print("# of ROIs including nan: ", len(sample))
# total detritus biovolume of the sample with the nan row still present
print(np.sum(sample[sample.data_provider_category_HumanObservation == 'detritus'].Biovolume))
# exclude the ROI(s) with nan Biovolume and repeat both numbers
no_nan = sample[sample.Biovolume.notna()]
print("# of ROIs without nan: ", len(no_nan))
print(np.sum(no_nan[no_nan.data_provider_category_HumanObservation == 'detritus'].Biovolume))

# of ROIs without nan:  2384
787301.2619999999
# of ROIs without nan:  2383
787301.2620000001


In [77]:
# make level_2 summary file from manual data: one row per (sample, manual category)
# NOTE(review): with pandas defaults, rows whose group key is NaN are dropped by groupby,
# and 'sum' skips NaN (an all-NaN group sums to 0) — confirm this is intended
group_keys = ['associatedMedia', 'data_provider_category_HumanObservation']

# aggregation per column; insertion order fixes the output column order
agg_spec = {
    col: 'first'
    for col in (
        'scientificName_HumanObservation',
        'scientificNameID_HumanObservation',
        'higherClassification_group_manual',
    )
}
agg_spec['roi'] = 'count'        # number of ROIs in the group
agg_spec['Biovolume'] = 'sum'    # total biovolume of the group
for col in ('volume_imaged', 'date', 'gps_furuno_latitude', 'gps_furuno_longitude'):
    # sample-level values: take the first one seen in the group
    agg_spec[col] = 'first'

level_2 = (
    samples
    .groupby(group_keys)
    .agg(agg_spec)
    .reset_index()
    # the roi count is reported as abundance
    .rename(columns={'roi': 'Abundance'})
)

In [60]:
# write the level 2 summary to disk, with a header row and without the row index
level_2.to_csv('level_2_manual.csv', index=False, header=True)