In [1]:
import datascience as ds
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plots

In [2]:
# Load values.csv (RAM-Legacy-DB directory of the espm-88b repository) into a
# datascience Table straight from its raw GitHub URL.
values = ds.Table.read_table(
    "https://raw.githubusercontent.com/boettiger-lab/espm-88b/master/modules/fish/data/RAM-Legacy-DB/values.csv"
)

In [3]:
def collapsed(an_array, threshold=0.1):
    """Report whether a series has collapsed by its final observation.

    A series counts as collapsed when its last value is strictly below
    ``threshold`` times its largest non-NaN value.

    Parameters
    ----------
    an_array : array-like of numbers
        Observations in order; the last element is the one tested.
    threshold : float, optional
        Fraction of the maximum below which the final value counts as
        collapsed. Defaults to 0.1.

    Returns
    -------
    numpy.bool_ or float
        The result of the comparison, or ``np.nan`` when the input is empty
        or its final value is NaN (which includes the all-NaN case).
    """
    observations = np.asarray(an_array)
    if observations.size == 0:
        # Nothing to compare; report "unknown" rather than raising.
        return np.nan
    final_value = observations[-1]
    if np.isnan(final_value):
        # A NaN final value also covers the all-NaN case, so nanmax below
        # always sees at least one real number and never warns.
        return np.nan
    return final_value < threshold * np.nanmax(observations)

In [4]:
# Keep the assessment id plus the four measured columns, then reduce each
# assessment's rows to one collapse flag per column.
measure_columns = ['assessid', 'ssb', 'r', 'total', 'catch_landings']
df_tables = values.select(measure_columns).group('assessid', collapsed)
df_tables



assessid,ssb collapsed,r collapsed,total collapsed,catch_landings collapsed
ADFG-HERRPWS-1980-2006-COLLIE,1,,1.0,1.0
ADFG-HERRSITKA-1978-2007-COLLIE,0,,0.0,0.0
AFSC-ALPLAICBSAI-1972-2008-MELNYCHUK,0,,0.0,
AFSC-ARFLOUNDBSAI-1970-2008-STANTON,0,,0.0,0.0
AFSC-ARFLOUNDGA-1958-2010-STANTON,0,,,
AFSC-ATKABSAI-1976-2009-STANTON,0,,0.0,
AFSC-BKINGCRABPI-1960-2008-JENSEN,1,,1.0,1.0
AFSC-BKINGCRABSMI-1960-2008-JENSEN,0,,,1.0
AFSC-CABEZNCAL-1916-2005-STANTON,0,,0.0,
AFSC-CABEZSCAL-1932-2005-STANTON,0,,0.0,


In [5]:
# Spot-check one assessment: pull its ssb series and apply collapsed() directly.
ssb_by_assessment = values.select(["assessid", "ssb"])
x = ssb_by_assessment.where("assessid", "AFSC-BKINGCRABPI-1960-2008-JENSEN")
collapsed(x["ssb"])

True

----------------


# Pandas-based version

In [6]:
import pandas as pd
# Limit rendered DataFrames to 5 rows so displayed output stays short.
pd.options.display.max_rows = 5

In [40]:

# Same values.csv as above, this time loaded as a pandas DataFrame.
# A local copy can be read instead with: pd.read_csv("values.csv")
values2 = pd.read_csv(
    "https://raw.githubusercontent.com/boettiger-lab/espm-88b/master/modules/fish/data/RAM-Legacy-DB/values.csv"
)

def collapsed(x):
    """Return whether the last entry of Series `x` is below 10% of its maximum.

    No NaN special-casing is done here: if the last entry (or the maximum)
    is NaN the comparison simply evaluates to False.
    """
    # Use positional access via `iloc`; inside a groupby each group keeps its
    # original index labels, so label-based `x[-1]` would not find the last row.
    final_value = x.iloc[-1]
    peak = np.max(x)
    return final_value < 0.1 * peak

# Reduce each assessment's rows to one collapse flag per measured column.
indicator_columns = ['assessid', 'ssb', 'r', 'total', 'catch_landings']
grouped_by_assessment = values2[indicator_columns].groupby('assessid')
df_pandas = grouped_by_assessment.aggregate(collapsed)

df_pandas

Unnamed: 0_level_0,ssb,r,total,catch_landings
assessid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ADFG-HERRPWS-1980-2006-COLLIE,1,0,1,1
ADFG-HERRSITKA-1978-2007-COLLIE,0,0,0,0
...,...,...,...,...
WGSSDS-SOLEVIIe-1968-2006-JENNINGS,0,0,0,0
WGSSDS-WHITVIIek-1982-2007-JENNINGS,0,0,0,0


-----------------------

# R-based version

In [8]:
# Enable R execution: load the rpy2 IPython extension so that cells
# starting with the %%R magic can be run.
%load_ext rpy2.ipython


In [19]:
%%R -o df_R

## R cells share one R environment with each other, but are isolated from the Python cells.
## The `-o df_R` option on the magic line above exports the df_R object to the rest of the (Python) notebook.
## That export is not a very compatible type, so the result is also written to CSV;
## reading it back in from the CSV is the better route.

suppressPackageStartupMessages(library("dplyr"))

## download.file() returns a status code (0 on success), not the data, so keep it
## out of `values` and stop here if the download did not succeed.
status <- download.file("https://raw.githubusercontent.com/boettiger-lab/espm-88b/master/modules/fish/data/RAM-Legacy-DB/values.csv", "values.csv", method="wget")
stopifnot(status == 0)
values <- read.csv("values.csv")

## TRUE when the last element of x is below 10% of the maximum of x
## (NAs are dropped when taking the maximum).
collapsed <- function(x) 
  x[length(x)] < 0.1 * max(x, na.rm=TRUE)

## One row per assessid; the summary column is left unnamed, so it is labelled `collapsed(ssb)`.
df_R <- values %>% group_by(assessid) %>% summarise(collapsed(ssb))
write.csv(df_R, "df_R.csv")
    
    

# Compare

We read in the R data frame and compare the results from each method.  Note that `np.array(x, dtype=bool)` will coerce the 1's and 0's in `x` into boolean `True`/`False`, but will also convert `nan` (like any non-`0` value) to `True`.

In [46]:
# Read the CSV written by the R cell back in as a datascience Table.
# This rebinds `df_R`, replacing the object exported by the `%%R -o df_R` magic.
df_R = ds.Table.read_table("df_R.csv")

In [45]:
# Line up the ssb collapse flags from the three implementations side by side.
# NOTE(review): columns are matched purely by row position, which assumes all
# three results list the assessments in the same order -- confirm.
# Table().with_columns(...) replaces the deprecated two-argument
# Table(columns, labels) constructor.
ds.Table().with_columns(
    "tables", df_tables["ssb collapsed"],
    "pandas", np.array(df_pandas["ssb"]),
    "R", df_R["collapsed(ssb)"],
).show()

tables,pandas,R
1.0,1,True
0.0,0,False
0.0,0,False
0.0,0,False
0.0,0,False
0.0,0,False
1.0,1,True
0.0,0,False
0.0,0,False
0.0,0,False


The only difference here is that our `pandas` method does not handle the all-NaN case: instead of returning NaN, it classifies these stocks as `False` (`0`).  The `True`s all agree.