In [43]:
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
import pandas as pd
import numpy as np

In [129]:
# Switch plotly into offline notebook mode so that py.iplot() figures render inline.
# NOTE(review): per the plotly docs, connected=True loads plotly.js from the CDN
# instead of embedding it in the notebook -- confirm that network access is acceptable.
py.init_notebook_mode(connected=True)

In [195]:
# WARNING: the CSV export writes numbers exactly as the spreadsheet *displays* them,
# so any digits hidden by cell formatting are lost.
# E.g. the coefficient 0.022112 can be exported as 0.2, which results in huge errors!
# MAKE SURE YOUR SPREADSHEET FORMATTING SHOWS ALL THE PRECISION YOU WANT EXPORTED!
#
# Load the raw survey sheet: only the first 40 columns are read, and
# 'Water Temp' is forced to float32.
fish = pd.read_csv(
    '/Users/pawelsz/Downloads/mcp/mcp analytics/Random Data Sheet- Fish.csv',
    usecols=range(40),
    dtype={u'Water Temp': np.float32},
)
# Row count of the raw sheet, as a quick sanity check of the import.
print(len(fish))

7191


In [196]:
# Show the raw column headers exactly as read from the CSV. Several of them contain
# an embedded '\r' (see the output), which the column look-ups below must reproduce.
fish.columns

Index([u'Species', u'Scientifical name (automatic)', u'Family (automatic)',
       u'Number', u'Avg number', u'Size', u'Max Size',
       u'Average size \r(automatic)', u'Coeff a', u'Coeff b', u'Biomass kg',
       u'Primary Function (automatic)', u'Secondary Function (automatic)',
       u'Commercial (automatic)', u'Validation\r(automatic)', u'Site',
       u'Protection (automatic)', u'Municipality (automatic)', u'Month',
       u'Season (automatic)', u'Date', u'Year', u'Depth Range', u'Zone',
       u'Water Temp', u'Visibilty', u'Visibilty avg', u'Name', u'Name.1',
       u'OBSERVED', u'ROW N'\r How many rows ?', u'COUNT  (automatic)',
       u'Survey N' \r(per day per team)',
       u'Replicate number per site, season, depth \r(don't fill up)',
       u'Total replicates per site, season, per depth \r(don't fill up)',
       u'Replicas per site, season \r(automatic)', u'Unnamed: 36',
       u'Unnamed: 37', u'Unnamed: 38', u'Unnamed: 39'],
      dtype='object')

In [197]:
# Build `observations`: the sanitized table holding all the data, with the
# spreadsheet's verbose headers mapped onto short snake_case column names.
def title(x):
    """Title-case a string, e.g. 'new yOrk' -> 'New York'."""
    return x.title()


observations = pd.DataFrame({
    # --- What was observed ---
    'species': fish['Species'].apply(title),
    'scientific_name': fish['Scientifical name (automatic)'],
    'family': fish['Family (automatic)'],
    'number': fish['Number'],
    'size': fish['Size'],
    # Survey identifier: date, per-day survey number and first observer name,
    # concatenated as strings.
    'survey_key': (fish['Date'].apply(str)
                   + fish["Survey N' \r(per day per team)"].apply(str)
                   + fish['Name']),
    'max_size': fish['Max Size'],
    'average_size': fish['Average size \r(automatic)'],
    # Do not compute from coefficients until we have coefficients at high precision:
    #   biomass_kg = fish['Number'] * fish['Coeff a']
    #                * (fish['Average size \r(automatic)'] ** fish['Coeff b']) / 1000
    # Instead, take the spreadsheet's biomass scaled by the replicate count.
    'biomass_kg': (fish['Biomass kg']
                   * fish["Total replicates per site, season, per depth \r(don't fill up)"]),
    'primary_function': fish['Primary Function (automatic)'],
    'secondary_function': fish['Secondary Function (automatic)'],
    'commercial': fish['Commercial (automatic)'],
    # --- Where and when ---
    'site': fish['Site'].apply(title),
    'protection': fish['Protection (automatic)'],
    'municipality': fish['Municipality (automatic)'],
    'season': fish['Season (automatic)'],
    'date': fish['Date'],
    'depth_range': fish['Depth Range'],
    'zone': fish['Zone'],
    # --- Survey conditions and bookkeeping ---
    'water_temp': fish['Water Temp'],
    'visibility': fish['Visibilty'],
    'visibility_avg': fish['Visibilty avg'],
    'survey_id': fish["Survey N' \r(per day per team)"],
    'observer1': fish['Name'],
    'observer2': fish['Name.1'],
    'depth_replicas': fish["Total replicates per site, season, per depth \r(don't fill up)"],
})

In [198]:
# Insert biomass=0 observations for all surveys for all species.
# This is to account for fish that were not observed during some surveys: without
# the explicit zero rows, per-species means would only average over the surveys
# in which the species happened to be seen.

# One row per distinct survey, with zero biomass and a constant fake 'key' to join on.
obs_empty = (
    observations[['survey_key', 'site', 'depth_range', 'season']]
    .drop_duplicates()
    .assign(biomass_kg=0.0, key=1)
)
# One row per distinct species, carrying the same fake join key.
all_species = observations[['species']].drop_duplicates().assign(key=1)
# Cross product of survey data and all fish species (every row matches on 'key').
obs_empty = pd.merge(obs_empty, all_species, on='key')
# Append the zero rows to the observations. pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
observations = pd.concat([observations, obs_empty])
# Number of zero rows added (= distinct surveys x distinct species).
print(len(obs_empty))

23920


In [199]:
# Remove cases where we have multiple entries for one species within a survey:
# collapse them into a single row per (survey, species) by summing the biomass.
surveys_species = observations.groupby(
    ['survey_key', 'season', 'site', 'depth_range', 'species'])['biomass_kg'].sum().reset_index()
# Print the number of distinct surveys. This is derived from `surveys_species` itself:
# the `surveys` table is only built in a later cell, so referencing it here raised a
# NameError on a fresh kernel (Restart & Run All).
print(len(surveys_species[['survey_key', 'season', 'site', 'depth_range']].drop_duplicates()))

299


In [227]:
# Biomass aggregated by species and depth.

# cisq(x) is the squared half-width of the confidence interval of the mean of x at
# the 95% confidence level: (1.96 * standard error)^2 = 1.96^2 * variance / n.
# np.var(x, ddof=1) is the sample variance and corresponds to the VAR_SAMP SQL function.
# NOTE: keep this a lambda -- the aggregations below rename the resulting column
# by the function's name, '<lambda>'.
cisq = lambda x: ((1.96 * 1.96) * np.var(x, ddof=1)) / len(x)

# Per (season, site, depth range, species): aggregate the per-survey biomass with
# three functions -- mean (average biomass), len (number of surveys) and
# cisq (squared confidence range) -- then give the result columns readable names.
biomass_species_depth = (
    surveys_species
    .groupby(['season', 'site', 'depth_range', 'species'])['biomass_kg']
    .agg([np.mean, len, cisq])
    .rename(columns={
        'mean': 'biomass_kg',
        'len': 'num_surveys',
        '<lambda>': 'biomass_ci_sq',  # column produced by the cisq lambda
    })
    .reset_index()
)
biomass_species_depth.head()

Unnamed: 0,season,site,depth_range,species,biomass_kg,num_surveys,biomass_ci_sq
0,Aug-Oct,Andulay,15-19,Angelfish,0.34,6.0,0.033751
1,Aug-Oct,Andulay,15-19,Barred Thicklip,0.0,6.0,0.0
2,Aug-Oct,Andulay,15-19,Blackeye Thicklip,0.0,6.0,0.0
3,Aug-Oct,Andulay,15-19,Blackfin Barracuda,0.0,6.0,0.0
4,Aug-Oct,Andulay,15-19,Bleekers Parrotfish,0.0,6.0,0.0


In [228]:
# Biomass per species at each (season, site): add up the per-depth figures.
# The numeric columns are selected explicitly so the string column `depth_range`
# is never "summed" (pandas >= 2.0 would concatenate the strings rather than
# silently dropping the column).
biomass_specie = (
    biomass_species_depth
    .groupby(['season', 'site', 'species'])[['biomass_kg', 'num_surveys', 'biomass_ci_sq']]
    .sum()
    .reset_index()
)
# The squared confidence ranges were summed above; take the root to get the range itself.
biomass_specie['biomass_ci'] = np.sqrt(biomass_specie['biomass_ci_sq'])
biomass_specie.head()

Unnamed: 0,season,site,species,biomass_kg,num_surveys,biomass_ci_sq,biomass_ci
0,Aug-Oct,Andulay,Angelfish,1.406,18.0,0.180168,0.424462
1,Aug-Oct,Andulay,Barred Thicklip,0.002,18.0,1.5e-05,0.00392
2,Aug-Oct,Andulay,Blackeye Thicklip,0.004,18.0,6.1e-05,0.00784
3,Aug-Oct,Andulay,Blackfin Barracuda,0.0,18.0,0.0,0.0
4,Aug-Oct,Andulay,Bleekers Parrotfish,0.091,18.0,0.031812,0.17836


In [229]:
# Total biomass per survey: sum `biomass_kg` over all species within each survey.
# `biomass_kg` is selected explicitly so the string column `species` is never
# "summed" (pandas >= 2.0 would concatenate the names rather than dropping the column).
surveys = (
    surveys_species
    .groupby(['survey_key', 'season', 'site', 'depth_range'])['biomass_kg']
    .sum()
    .reset_index()
)
surveys.head()

Unnamed: 0,survey_key,season,site,depth_range,biomass_kg
0,01/09/20171Izzy Morgante,Aug-Oct,Guinsuan,9-13,1.584
1,01/09/20172Izzy Morgante,Aug-Oct,Guinsuan,9-13,6.016
2,01/09/20173Izzy Morgante,Aug-Oct,Guinsuan,9-13,2.28
3,01/09/20174Izzy Morgante,Aug-Oct,Guinsuan,9-13,3.824
4,02/01/20181Alex Clancy,Nov-Jan,Kookoos,15-19,10.532


In [236]:
# DEBUG ONE SITE
#surveys[surveys['site']=='Turtle Heaven'].reset_index().sort_values(by=['survey_key'])

In [237]:
# Sanity check: number of rows in `surveys` (one row per survey).
len(surveys)

299

In [238]:
# Per (season, site, depth range): aggregate the per-survey total biomass with
# three functions -- mean (average biomass), len (number of surveys) and
# cisq (squared confidence range) -- then give the result columns readable names.
biomass_depth = (
    surveys
    .groupby(['season', 'site', 'depth_range'])['biomass_kg']
    .agg([np.mean, len, cisq])
    .rename(columns={
        'mean': 'biomass_kg',
        'len': 'num_surveys',
        '<lambda>': 'biomass_ci_sq',  # column produced by the cisq lambda
    })
    .reset_index()
)
biomass_depth.head()

Unnamed: 0,season,site,depth_range,biomass_kg,num_surveys,biomass_ci_sq
0,Aug-Oct,Andulay,15-19,15.617,6.0,63.119572
1,Aug-Oct,Andulay,3-7,8.283,6.0,4.661014
2,Aug-Oct,Andulay,9-13,9.191,6.0,24.987033
3,Aug-Oct,Dauin,15-19,38.569,6.0,156.801039
4,Aug-Oct,Dauin,3-7,17.131,6.0,44.695824


In [239]:
# Biomass per (season, site): add up the per-depth figures.
# The numeric columns are selected explicitly so the string column `depth_range`
# is never "summed" (pandas >= 2.0 would concatenate the strings rather than
# silently dropping the column).
biomass_per_site = (
    biomass_depth
    .groupby(['season', 'site'])[['biomass_kg', 'num_surveys', 'biomass_ci_sq']]
    .sum()
    .reset_index()
    .rename(columns={'biomass_kg': 'biomass_sum'})
)
# The squared confidence ranges were summed above; take the root to get the range itself.
biomass_per_site['biomass_ci'] = np.sqrt(biomass_per_site['biomass_ci_sq'])
biomass_per_site

Unnamed: 0,season,site,biomass_sum,num_surveys,biomass_ci_sq,biomass_ci
0,Aug-Oct,Andulay,33.091,18.0,92.76762,9.631595
1,Aug-Oct,Dauin,81.869,18.0,436.651434,20.896206
2,Aug-Oct,Guinsuan,24.595,10.0,305.576685,17.480752
3,Aug-Oct,Kookoos,15.038,11.0,14.861925,3.855117
4,Aug-Oct,Lutoban Pier,29.262,19.0,173.178822,13.159742
5,Aug-Oct,Lutoban South,28.715,12.0,102.006751,10.099839
6,Aug-Oct,Malatapay,40.488,12.0,559.321309,23.649975
7,Aug-Oct,Masaplod,139.178,14.0,2972.106398,54.517029
8,Aug-Oct,Turtle Heaven,62.0245,12.0,4208.237797,64.870932
9,Aug-Oct,Unity Point,17.066,12.0,252.162959,15.87964


In [240]:
# Grouped bar chart of biomass per site, one bar series per season, with the
# confidence range drawn as error bars. Sites are ordered by descending biomass.
biomass_per_site = biomass_per_site.sort_values(['biomass_sum'], ascending=[False])


def _season_bar(season):
    """Return the bar trace (with error bars) for the rows of one season."""
    rows = biomass_per_site[biomass_per_site['season'] == season]
    return go.Bar(
        x=rows.site,
        y=rows.biomass_sum,
        error_y=dict(type='data', array=rows.biomass_ci, visible=True),
        name=season,
    )


season1 = _season_bar("Aug-Oct")
season2 = _season_bar("Nov-Jan")
layout = go.Layout(
    title="Biomass per Site",
    xaxis=dict(title='Site'),
    yaxis=dict(title='Average biomass per survey [kg]'),
)
fig = go.Figure(data=[season1, season2], layout=layout)
py.iplot(fig, show_link=False)
# Use py.plot(..., image='png', filename=...) to export an image.