In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import subprocess

In [None]:
# Ask which kind of classification data is being processed.
# Keeps prompting until the answer is 'manual' or 'automated'; the accepted
# value is stored in lower case in `dataset_type`.
while True:
    dataset_type = input("Please enter 'manual' or 'automated' for the type of data set you are processing: ")
    # make the comparison insensitive to case and to stray surrounding whitespace
    dataset_type = dataset_type.strip().lower()
    if dataset_type in ("manual", "automated"):
        break
    print("Invalid argument")

In [None]:
# First, call the bash script to get the necessary input files.
# For the geographic script, choose query_samples (underway GPS data alignment) as the input script.
# For the WoRMS script, choose names_ids.csv as the input script.
# subprocess.call(['./exec.sh'])

In [None]:
# Pick the level 1b columns to load. The taxonomy columns depend on the
# dataset type (anything other than 'manual' is treated as automated);
# the size measurements are needed in both cases.
if dataset_type == 'manual':
    columns = ['permalink', 'namespace_manual', 'worms_higher_order_manual']
else:
    columns = ['permalink', 'namespace_automated']
columns += ['Biovolume', 'MajorAxisLength']

In [None]:
# Load the level 1b ROI table, keeping only the columns selected for this dataset type
all_rois = pd.read_csv(filepath_or_buffer='level_1b.csv', usecols=columns)

In [None]:
# NOTE: this cell is disabled -- its code is wrapped in a triple-quoted string,
# so nothing below is executed (the string is just the cell's value).
# It sketches a prompt for the analysis type and, for 'species', a listing of
# the names that have a higher-order rank.
# NOTE(review): before re-enabling, fix the misspelled assignment target
# `analsyis_type`; as written the lower-cased value is never used by the check
# on the following line.
'''# Prompt user for type of analsyis
while (True):
    analysis_type = str(input("Please enter 'species', 'higher ranks', or 'all' for the type analysis: "))
    # make case insenstive
    analsyis_type = analysis_type.lower()
    if (analysis_type == 'species' or analysis_type == 'higher ranks' or analysis_type == 'all'):
        break
    print("Invalid argument")
# Show drop down list if 'species' is chosen
if (analysis_type == 'species'):
    higher_rank = 'worms_higher_order_{}'.format(dataset_type)
    all_species = set(all_rois.loc[all_rois[higher_rank].notna(), 
                                  'namespace_{}'.format(dataset_type)])
    print(all_species)'''

In [None]:
# Prompt for the minimum Major Axis Length (micrometers) to analyze; keep at 20 for now.
# Re-prompts until the answer parses as a finite number, so a typo does not
# abort the notebook and NaN/inf cannot silently filter out every row.
# The accepted value is stored as a float in `threshold`.
while True:
    threshold = input('Please enter minimum Major Axis Length to analyze (micrometers): ')
    try:
        threshold = float(threshold)
    except ValueError:
        print("Invalid argument")
        continue
    if np.isfinite(threshold):
        break
    print("Invalid argument")

In [None]:
# Attach the resolved taxonomy to every ROI in `all_rois`.
# - automated: resolved_auto.csv supplies both the resolved name and the
#   higher-order rank, which is renamed to 'worms_higher_order_automated'.
# - manual: the higher-order rank is already loaded with the level 1b columns;
#   resolved_manual.csv only adds the resolved name.
# Each lookup table is reduced to one row per 'name' before the left merge so
# that a repeated name cannot duplicate ROI rows (which would double-count
# biovolume in the later sums).
if dataset_type == 'automated':
    auto_taxon_info = pd.read_csv('resolved_auto.csv',
                                  usecols=['name', 'resolved_names', 'resolved_higher_order_fromgnr'])
    auto_taxon_info = auto_taxon_info.drop_duplicates(subset='name')
    # merge to get taxa data
    all_rois = pd.merge(all_rois, auto_taxon_info, how='left', left_on='namespace_automated', right_on='name')
    # rename resolved_higher_order column to match the manual naming scheme
    all_rois = all_rois.rename(columns={'resolved_higher_order_fromgnr': 'worms_higher_order_automated'})
else:
    man_taxon_info = pd.read_csv('resolved_manual.csv',
                                 usecols=['name', 'resolved_names'])
    man_taxon_info = man_taxon_info.drop_duplicates(subset='name')
    # merge to get taxa data; the result is stored back in all_rois, mirroring
    # the automated branch, so the resolved names actually reach the ROI table
    all_rois = pd.merge(all_rois, man_taxon_info, how='left', left_on='namespace_manual', right_on='name')

In [None]:
# Split each ROI permalink into a sample permalink and a ROI id.
# The split is purely positional: characters 68-73 become the ROI id and
# characters 0-66 the sample permalink (character 67 is dropped).
# NOTE(review): assumes every permalink has the same fixed-width layout -- confirm.
full_permalink = all_rois['permalink']
# ROI id, with leading zeros removed
all_rois['roi'] = full_permalink.str.slice(68, 74).str.lstrip("0")
# keep only the sample part of the permalink
all_rois['permalink'] = full_permalink.str.slice(0, 67)

In [None]:
# Load the per-sample GPS latitude table
coords = pd.read_csv(filepath_or_buffer='comparison.csv', usecols=['pid', 'gps_furuno_latitude'])
# Attach the latitude to every ROI by matching the sample permalink against pid;
# ROIs whose sample has no entry are kept (left merge)
all_rois = all_rois.merge(coords, how='left', left_on='permalink', right_on='pid')

In [None]:
# Sum the biovolume of all ROIs in each sample (before any size filtering)
total = (
    all_rois
    .groupby('permalink')['Biovolume']
    .sum()
    .rename('total_biovolume')
    .reset_index()
)
# Give every ROI row its sample's total via a left merge on the sample permalink
all_rois = all_rois.merge(total, how='left', on='permalink')

In [None]:
# Keep only ROIs whose major axis length is strictly greater than the threshold
is_large_enough = all_rois['MajorAxisLength'].gt(threshold)
all_rois = all_rois[is_large_enough]

In [None]:
# Aggregate the remaining ROIs per (sample, higher-order rank):
# biovolume is summed, while latitude and the sample's total biovolume are
# per-sample values, so the first one in each group is taken.
aggregations = {
    'Biovolume': 'sum',
    'gps_furuno_latitude': 'first',
    'total_biovolume': 'first',
}
group_keys = ['permalink', 'worms_higher_order_{}'.format(dataset_type)]
rois = all_rois.groupby(group_keys).agg(aggregations).reset_index()
# Share of the sample's total biovolume held by each rank.
# Note: despite the column name this is a fraction, not multiplied by 100.
rois['percent_biovolume'] = rois['Biovolume'].div(rois['total_biovolume'])

In [None]:
# Plot the biovolume share of each higher-order rank against latitude.
fig, ax = plt.subplots(figsize=(15,7))
# Samples that share a latitude are combined by adding their shares together.
# The column is selected before aggregating so that only 'percent_biovolume'
# is summed, rather than summing every column (including the string
# permalinks) and discarding all but one afterwards.
# NOTE: `rois` is rebound here from the per-sample DataFrame to a Series
# indexed by (latitude, rank).
rois = rois.groupby(
    ['gps_furuno_latitude', 'worms_higher_order_{}'.format(dataset_type)]
)['percent_biovolume'].sum()
# unstack() turns the rank level into columns, giving one line per rank
rois.unstack().plot(ax=ax)
# add titles and axes labels
ax.set_xlabel('LTER Stations')
ax.set_ylabel('% Biovolume in targets with Major Axis Length > {} micrometers'.format(threshold))
ax.set_title('MVCO Phytoplankton Biovolume ({} classifications)'.format(dataset_type))
ax.grid(True)
# place a tick at each of these 11 latitudes and label them 1 to 11
ax.set_xticks([41.1967, 41.03, 40.8633, 40.6967, 40.5133, 40.3633, 40.2267, 40.1367, 40.0983, 39.94, 39.7733])
ax.set_xticklabels(np.arange(1,12))
# annotate the two ends of the transect (y = -0.15 is in data coordinates)
ax.text(41.1967, -0.15, "Coast", size = 15, ha = 'center')
ax.text(39.7733, -0.15, "Offshore", size = 15, ha = 'center')
# invert x axis so the highest latitude (labelled "Coast") is on the left
ax.invert_xaxis()
plt.show()