In [1]:
# clears memory in case want to run script again with new threshold
for name in dir():
    if not name.startswith('_'):
        del globals()[name]

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import subprocess

In [None]:
# first call bash script to get necessary file inputs
# for geographic script, choose query_samples for underway gps data alignment as input script
# for worms script, choose names_ids.csv for input script
subprocess.call(['./exec.sh'])

In [3]:
# read in level 1_b file
all_rois = pd.read_csv('file_1b.csv')
# prompt user to ask for desired size distribution, keep at 20 for now
threshold = input('Please enter minimum Major Axis Length to analyze (micrometers): ')
threshold = float(threshold)

Please enter minimum Major Axis Length to analyze (micrometers): 20


In [4]:
# read in gps coordinates file
coords = pd.read_csv('comparison.csv', usecols=['pid', 'gps_furuno_latitude', 'gps_furuno_longitude'])
# merge with all_rois based on smaple ids
all_rois = pd.merge(all_rois, coords, how='left', left_on='sample_identifier', right_on='pid')
print(all_rois)

         Area     Biovolume  EquivDiameter  MajorAxisLength  MinorAxisLength  \
0        1623  2.435732e+04      45.458418        77.822688        29.890917   
1        9433  2.591726e+05     109.592284       202.786943        90.134400   
2         274  1.897522e+03      18.677999        27.889806        13.014720   
3         504  7.636426e+03      25.332049        25.485570        25.282862   
4       20913  3.260394e+05     163.178609      1132.735167        24.406576   
5         240  6.346017e+02      17.480775        50.928938         6.284877   
6         254  1.835194e+03      17.983405        25.001681        15.419562   
7         201  1.526814e+03      15.997536        19.599726        13.603451   
8         381  3.818606e+03      22.025083        28.950308        17.518677   
9         463  4.228584e+03      24.279825        39.014287        16.430956   
10        421  5.426316e+03      23.152405        25.348423        23.647674   
11        224  1.303761e+03      16.8880

In [None]:
# calculate total biovolume from samples

In [None]:
# filter out rows below threshold
all_rois = all_rois[all_rois.MajorAxisLength > threshold]

In [None]:
# group by sample identifiers and higher ranks to calculate total biovolume per ranking
rois = all_rois.groupby(
    ['sample_identifier', 'resolved_higher_order_fromgnr']).agg(
    {
        'Biovolume': 'sum',
        'gps_furuno_latitude': 'first'
    }
).reset_index()
# calculate total biovolume per sample
total = rois.groupby('sample_identifier')['Biovolume'].sum().reset_index()
total.rename(columns={'Biovolume':'total_biovolume'}, inplace=True)
# merge 2 data frames based on sample_identifier
rois = pd.merge(rois, total, how='left', on='sample_identifier')
# calculate percent biovolume
rois['percent_biovolume'] = rois['Biovolume']/rois['total_biovolume']

In [None]:
# take duplicate latitudes and add their concentrations together
fig, ax = plt.subplots(figsize=(15,7))
# use unstack()
rois = rois.groupby(['gps_furuno_latitude','resolved_higher_order_fromgnr']).sum()['percent_biovolume']
rois.unstack().plot(ax=ax)
# add titles and axes labels
plt.xlabel('LTER Stations')
plt.ylabel('% Biovolume in targets with Major Axis Length > {} micrometers'.format(threshold))
plt.title('MVCO Phytoplankton Biovolume')
plt.grid(True)
# set stations as tick marks
ax.set_xticks([41.1967, 41.03, 40.8633, 40.6967, 40.5133, 40.3633, 40.2267, 40.1367, 40.0983, 39.94, 39.7733])
ax.set_xticklabels(np.arange(1,12))
# set comments
ax.text(41.1967, -0.15, "Coast", size = 15, ha = 'center')
ax.text(39.7733, -0.15, "Offshore", size = 15, ha = 'center')
# invert x axis
ax.invert_xaxis()
plt.show()