In [1]:
import numpy as np
import pandas as pd

import statsmodels.api
import scipy.ndimage
import scipy.stats

import bokeh.io
import bokeh.layouts
import bokeh.models
import bokeh.plotting
import bokeh.transform

import numba

#local .py file for some plotting functions and non-parametric bootstrapping utils
import plotting_utils

# Configure bokeh to render figures inline in the notebook's output cells.
bokeh.io.output_notebook()

We can now read the data into a dataframe for analysis.

In [2]:
# Raw per-specimen measurements (one row per specimen and spiracle).
data_path = "./20190322_supp_table_2.csv"
df = pd.read_csv(data_path)

We add a `species_underscore` column (the species name with spaces replaced by underscores) and take a look at the format of the data.

In [3]:
# Species names with spaces replaced by underscores.
# Uses the vectorized string accessor rather than a Python-level loop;
# regex=False makes the literal-space replacement explicit on every pandas
# version, and missing names stay missing instead of raising.
df['species_underscore'] = df['species'].str.replace(" ", "_", regex=False)
df.head()

Unnamed: 0.1,Unnamed: 0,subfamily,species,sex,mass (g),spiracle,area (mm^2),depth (mm),species_underscore
0,0,Cetoniinae,Goliathus goliathus,M,16.28,6,0.274408,2.512648,Goliathus_goliathus
1,1,Cetoniinae,Goliathus goliathus,F,18.15,6,0.134949,1.606189,Goliathus_goliathus
2,2,Cetoniinae,Coelorrhina hornimani,M,1.13,6,0.212131,0.553833,Coelorrhina_hornimani
3,3,Cetoniinae,Dicronorrhina derbyana,M,2.12,6,0.039532,0.473369,Dicronorrhina_derbyana
4,4,Cetoniinae,Dicronorrhina derbyana,F,2.145,6,0.049701,0.49632,Dicronorrhina_derbyana


For some of this analysis, we will look at the averages of our measurements for each species and spiracle. To get this, we use a simple aggregate function on the dataframe and take a look at the results.

In [4]:
# Average every numeric measurement within each (species, spiracle) group.
# The frame is grouped once and the mean is restricted to numeric columns
# explicitly: averaging the string columns (subfamily, sex) only "worked" by
# being silently dropped, which raises a TypeError on pandas >= 2.0.
# NOTE(review): unlike np.average, mean() skips NaN values - identical for
# complete data; confirm the table has no missing measurements.
group_keys = ['species', 'species_underscore', 'spiracle']
by_species_spiracle = df.groupby(group_keys, as_index=False)

df_averages = by_species_spiracle.mean(numeric_only=True)

# Re-attach the subfamily label. Both results come from the same grouping, so
# their rows are in the same order and align on the default integer index.
df_averages['subfamily'] = by_species_spiracle[['subfamily']].max()['subfamily']
df_averages.head()

Unnamed: 0.1,species,species_underscore,spiracle,Unnamed: 0,mass (g),area (mm^2),depth (mm),subfamily
0,Coelorrhina hornimani,Coelorrhina_hornimani,1,87.0,1.13,0.135347,0.416717,Cetoniinae
1,Coelorrhina hornimani,Coelorrhina_hornimani,2,70.0,1.13,0.084207,0.451409,Cetoniinae
2,Coelorrhina hornimani,Coelorrhina_hornimani,3,53.0,1.13,0.106693,0.325444,Cetoniinae
3,Coelorrhina hornimani,Coelorrhina_hornimani,4,36.0,1.13,0.115574,0.481558,Cetoniinae
4,Coelorrhina hornimani,Coelorrhina_hornimani,5,19.0,1.13,0.119145,0.506751,Cetoniinae


Let's take a look at the number of species per subfamily in the data.

In [5]:
# Count the rows of df_averages in each (subfamily, spiracle) group ...
counts_by_spiracle = df_averages.groupby(['subfamily', 'spiracle'], as_index=False).count()

# ... and keep, per subfamily, the largest count found over its spiracles.
max_counts = counts_by_spiracle.groupby('subfamily').aggregate(max).reset_index()

species_per_subfam = (
    max_counts[['subfamily', 'species']]
    .rename(columns={'species': 'subfam_count'})
)
species_per_subfam

Unnamed: 0,subfamily,subfam_count
0,Cetoniinae,6
1,Dynastinae,3
2,Rutelinae,1


In [6]:
# Attach the per-subfamily species count to every row of df_averages.
# Any 'subfam_count' column left over from a previous run of this cell is
# dropped first, so re-running the cell does not produce the suffixed
# 'subfam_count_x' / 'subfam_count_y' columns.
df_averages = (
    df_averages
    .drop(columns='subfam_count', errors='ignore')
    .merge(species_per_subfam, on='subfamily')
)

For our plots, we will log-transform the data. We add columns to the dataframe holding the base-10 logarithm of each measurement, together with the log-transformed ratios area/depth and area²/depth that we will need later.

In [7]:
# Base-10 logs of the averaged measurements and of the two area/depth ratios.
avg_area = df_averages['area (mm^2)']
avg_depth = df_averages['depth (mm)']

avg_log_inputs = {
    'log area (mm^2)': avg_area,
    'log dist': avg_depth,
    'log mass (g)': df_averages['mass (g)'],
    'log area/dist': avg_area/avg_depth,
    'log area^2/dist': avg_area**2/avg_depth,
}
# Columns are appended in the order listed above.
for avg_log_name, avg_log_input in avg_log_inputs.items():
    df_averages[avg_log_name] = np.log10(avg_log_input)

df_averages.head()

Unnamed: 0.1,species,species_underscore,spiracle,Unnamed: 0,mass (g),area (mm^2),depth (mm),subfamily,subfam_count,log area (mm^2),log dist,log mass (g),log area/dist,log area^2/dist
0,Coelorrhina hornimani,Coelorrhina_hornimani,1,87.0,1.13,0.135347,0.416717,Cetoniinae,6,-0.868551,-0.380159,0.053078,-0.488392,-1.356943
1,Coelorrhina hornimani,Coelorrhina_hornimani,2,70.0,1.13,0.084207,0.451409,Cetoniinae,6,-1.074651,-0.34543,0.053078,-0.729221,-1.803872
2,Coelorrhina hornimani,Coelorrhina_hornimani,3,53.0,1.13,0.106693,0.325444,Cetoniinae,6,-0.971862,-0.487524,0.053078,-0.484339,-1.456201
3,Coelorrhina hornimani,Coelorrhina_hornimani,4,36.0,1.13,0.115574,0.481558,Cetoniinae,6,-0.937142,-0.317351,0.053078,-0.61979,-1.556932
4,Coelorrhina hornimani,Coelorrhina_hornimani,5,19.0,1.13,0.119145,0.506751,Cetoniinae,6,-0.923923,-0.295205,0.053078,-0.628717,-1.55264


In addition to log transforming the species averaged data, we will do the same for the whole data set.

In [8]:
# Same log10 transforms as for the averaged data, on the per-specimen table.
raw_area = df['area (mm^2)']
raw_depth = df['depth (mm)']

raw_log_inputs = [
    ('log area (mm^2)', raw_area),
    ('log dist', raw_depth),
    ('log mass (g)', df['mass (g)']),
    ('log area/dist', raw_area/raw_depth),
    ('log area^2/dist', raw_area**2/raw_depth),
]
for raw_log_name, raw_log_input in raw_log_inputs:
    df[raw_log_name] = np.log10(raw_log_input)
df.head()

Unnamed: 0.1,Unnamed: 0,subfamily,species,sex,mass (g),spiracle,area (mm^2),depth (mm),species_underscore,log area (mm^2),log dist,log mass (g),log area/dist,log area^2/dist
0,0,Cetoniinae,Goliathus goliathus,M,16.28,6,0.274408,2.512648,Goliathus_goliathus,-0.561603,0.400132,1.211654,-0.961735,-1.523338
1,1,Cetoniinae,Goliathus goliathus,F,18.15,6,0.134949,1.606189,Goliathus_goliathus,-0.869831,0.205797,1.258877,-1.075628,-1.945459
2,2,Cetoniinae,Coelorrhina hornimani,M,1.13,6,0.212131,0.553833,Coelorrhina_hornimani,-0.673395,-0.256621,0.053078,-0.416774,-1.090169
3,3,Cetoniinae,Dicronorrhina derbyana,M,2.12,6,0.039532,0.473369,Dicronorrhina_derbyana,-1.403054,-0.3248,0.326336,-1.078254,-2.481309
4,4,Cetoniinae,Dicronorrhina derbyana,F,2.145,6,0.049701,0.49632,Dicronorrhina_derbyana,-1.303635,-0.304238,0.331427,-0.999397,-2.303033


In [9]:
# Work on a copy so df_averages itself is left untouched.
df_test = df_averages.copy()

# Geometric factors with rescaled units.
# NOTE(review): /10 looks like mm -> cm and /1e9 like mm^3 -> m^3 - confirm.
df_test['area/depth'] = (df_test['area (mm^2)']/df_test['depth (mm)'])/10
df_test['area^2/depth'] = (df_test['area (mm^2)']**2/df_test['depth (mm)'])/(1000*1000*1000)

# Diffusive and advective conductance of each (species, spiracle) row.
# NOTE(review): 0.178 and 404 are presumably a diffusion coefficient and a
# capacitance/unit factor, and 1/(8*pi*1.86e-8) has the form of a
# Poiseuille-flow prefactor with a viscosity of 1.86e-8 - confirm the sources
# and units of these constants.
df_test['g_diff'] = df_test['area/depth']*0.178*404
df_test['g_adv'] = df_test['area^2/depth']*(1/(np.pi*8*1.86*10**(-8)))

# One row per species: median log mass, and conductances summed over spiracles.
# A single aggregation over just the three columns that are kept replaces
# three separate groupbys that aggregated every column (including the string
# columns, which breaks on pandas >= 2.0).
df_summed = (
    df_test
    .groupby('species')
    .agg({'log mass (g)': 'median', 'g_diff': 'sum', 'g_adv': 'sum'})
    .reset_index()
)

In [31]:
plots = []
resample_size = 10_000  # bootstrap replicates drawn per panel
lw = 2                  # line width of the fitted lines (also read by later cells)
cs = 12                 # marker size of the species points (also read by later cells)
slope_comp = 0.75       # slope of the grey guide lines and of the bootstrap comparison


def _scaling_panel(log_mass, log_g, *, y_pad, intercept_offset, guide_divisor,
                   around_line, guide_top_pad, count_above):
    """Build one panel of log10(conductance) against log10(mass).

    The panel contains, from back to front: grey guide lines of slope
    ``slope_comp``, the 95% pairs-bootstrap band of the regression line, the
    species points and the least-squares line. The title reports the 95%
    bootstrap interval of the slope followed by the number of bootstrap slopes
    above (``count_above=True``) or below (``count_above=False``) ``slope_comp``.

    Parameters
    ----------
    log_mass, log_g : pandas.Series
        x and y values, one entry per species.
    y_pad : tuple of float
        (lower, upper) padding added around the data for the y-range.
    intercept_offset : float
        Added to the intercept of the first guide line.
    guide_divisor : float
        The vertical spacing of the guide lines is the y-extent of the fitted
        line divided by this value.
    around_line : float
        Margin by which the guide-line bounds extend beyond the data.
    guide_top_pad : float
        Extra margin on the upper y bound of the guide lines.
    count_above : bool
        Direction of the bootstrap-slope comparison reported in the title.

    Returns
    -------
    bokeh figure with ``output_backend`` set to ``'svg'``.
    """
    slope, intercept = np.polyfit(log_mass, log_g, deg=1)
    x = np.array([log_mass.min(), log_mass.max()])
    y = slope * x + intercept

    p = bokeh.plotting.figure(plot_height=300, plot_width=400,
                              x_range=(x[0]-0.1, x[1]+0.1),
                              y_range=(log_g.min()-y_pad[0], log_g.max()+y_pad[1]))
    p.outline_line_color = None
    p.yaxis.minor_tick_line_color = None
    p.xaxis.minor_tick_line_color = None
    p.grid.grid_line_color = None

    # Grey guide lines: 30 candidate intercepts, evenly spaced.
    intercept1 = plotting_utils.first_intercept(slope_comp, x.max(), log_g.min()) + intercept_offset
    line_scale = (y.max() - y.min())/guide_divisor
    bounds = (x[0]-around_line, x[1]+around_line,
              log_g.min()-around_line, log_g.max()+around_line+guide_top_pad)
    for i in line_scale*np.arange(30)+intercept1:
        try:
            lx, ly = plotting_utils.generate_line(intercept=i, slope=slope_comp,
                                                   bounds=bounds, point=x[1])
        except Exception:
            # Best effort: candidates for which no line can be generated are
            # skipped (presumably those that miss the bounds - TODO confirm
            # and narrow the exception type). Unlike a bare `except`, this
            # lets KeyboardInterrupt/SystemExit through.
            continue
        else:
            # Drawn outside the `try` so genuine plotting errors are not hidden.
            p.line(lx, ly, color='grey', alpha=0.3)

    # Pairs bootstrap of the regression; band = pointwise 2.5-97.5 percentiles.
    bs_slope_reps, bs_intercept_reps, _ = plotting_utils.draw_bs_pairs_linreg(
        log_mass, log_g, size=resample_size)
    if count_above:
        n_beyond = np.sum(bs_slope_reps > slope_comp)
    else:
        n_beyond = np.sum(bs_slope_reps < slope_comp)
    slope_ci = [round(j, 3) for j in np.percentile(bs_slope_reps, [2.5, 97.5])]
    p.title.text = ' slope 95% CI: ' + str(slope_ci) + ' ' + str(n_beyond)

    x_boot = np.linspace(log_mass.min(), log_mass.max(), 200)
    # Broadcasting the intercepts down the rows gives the same matrix as
    # stacking 200 copies of them, without materialising the copies.
    y_boot = np.outer(bs_slope_reps, x_boot) + np.asarray(bs_intercept_reps)[:, np.newaxis]
    low, high = np.percentile(y_boot, [2.5, 97.5], axis=0)
    # Closed polygon: lower envelope left-to-right, upper envelope back.
    p.patch(np.append(x_boot, x_boot[::-1]), np.append(low, high[::-1]),
            alpha=0.5, color='lightgrey')

    p.circle(log_mass, log_g, color='black', size=cs)
    p.line(x, y, color='black', line_width=lw, line_cap='round')
    p.output_backend = 'svg'
    return p


m = df_summed['log mass (g)']
# NOTE(review): the factor 2 presumably accounts for the two body sides - confirm.
g_diff = np.log10(df_summed['g_diff']*2)
g_adv = np.log10(df_summed['g_adv']*2)

# plots[0]: diffusive conductance (title counts bootstrap slopes > slope_comp).
plots.append(_scaling_panel(m, g_diff, y_pad=(0.1, 0.2), intercept_offset=-0.5,
                            guide_divisor=5, around_line=0.2, guide_top_pad=0.4,
                            count_above=True))

# plots[1]: advective conductance (title counts bootstrap slopes < slope_comp).
plots.append(_scaling_panel(m, g_adv, y_pad=(0.2, 0.5), intercept_offset=0.0,
                            guide_divisor=7, around_line=0.4, guide_top_pad=0.5,
                            count_above=False))

In [32]:
# Display the first two panels, one after the other.
for panel_index in (0, 1):
    bokeh.io.show(plots[panel_index])

In [11]:
def _required_po2(mult, log_mass, slope, intercept):
    """Return ``mult`` * allometric demand / fitted conductance.

    The demand term is ``10**(3.20 + 0.75*log_mass) / 20.7 / 24.5`` and the
    conductance is the regression line ``10**(slope*log_mass + intercept)``.
    NOTE(review): presumably a metabolic rate converted to an oxygen flux, so
    that the ratio is the oxygen partial-pressure difference needed to supply
    it - confirm the meaning and units of 3.20, 0.75, 20.7 and 24.5.
    """
    return mult*((10**(3.20 + 0.75*log_mass))*(1/20.7)*(1/24.5))/(10**(slope * log_mass + intercept))


# Regression of log10(total diffusive conductance) on log10(mass).
m = df_summed['log mass (g)']
g_diff = np.log10(df_summed['g_diff']*2)
slope, intercept = np.polyfit(m, g_diff, deg=1)

# log10 of the same ratio per species, using each species' measured conductance
# instead of the fitted line.
po2 = np.log10((1*((10**(3.20 + 0.75*m))*(1/20.7)*(1/24.5))/(df_summed['g_diff']*2)))
print(slope)

# Sample the ratio on an N x N grid: N masses by N multipliers in [1, y_top].
N = 800
y_top = 90
x = np.linspace(m.min(), m.max(), N)
y = np.linspace(1, y_top, N)

grid_values = np.array([_required_po2(yi, x, slope, intercept) for yi in y])  # row j <-> y[j]
x_flat = np.tile(x, N)
y_flat = np.log10(grid_values).ravel()
v_flat = grid_values.ravel()

x_min, y_min, v_min = (x_flat.min(), y_flat.min(), v_flat.min())
x_stride, y_stride = ((x_flat.max() - x_min)/N, (y_flat.max() - y_min)/N)

# Rasterise the samples onto a regular (log10 value, log10 mass) image.
# The index ceil(offset/stride - 1) is -1 for the smallest sample, which would
# wrap around to the last row/column, and can round up to N for the largest;
# clipping keeps every sample inside the image.
rows = np.clip(np.ceil((y_flat - y_min)/y_stride - 1), 0, N - 1).astype(int)
cols = np.clip(np.ceil((x_flat - x_min)/x_stride - 1), 0, N - 1).astype(int)

im = np.full((N, N), v_min)  # pixels that receive no sample keep the minimum value
# Explicit loop: when several samples share a pixel the last one written wins,
# and NumPy does not guarantee that order for fancy-index assignment.
for row, col, value in zip(rows, cols, v_flat):
    im[row, col] = value
im = scipy.ndimage.gaussian_filter(im, sigma=1)

p = bokeh.plotting.figure(tooltips=[("x", "$x"), ("y", "$y"), ("value", "@image")], plot_height=300, plot_width=400)

# Colour scale spans the per-species values at multipliers 1 to 90.
cmap_low, cmap_high = (np.min(1*(10**po2)), np.max(90*(10**po2)))
cmap = bokeh.models.LinearColorMapper(palette='Viridis256', low=cmap_low, high=cmap_high)

log_im_min, log_im_max = np.log10(im.min()), np.log10(im.max())
p.image(image=[im], x=x.min(), y=log_im_min, dw=x.max()-x.min(), dh=log_im_max-log_im_min, color_mapper=cmap, level="image", )

color_bar = bokeh.models.ColorBar(color_mapper=cmap, location=(0,0), ticker=bokeh.models.BasicTicker(desired_num_ticks=12, base=10))
p.add_layout(color_bar, 'right')
p.grid.grid_line_color = None

# White masks: from the x8 curve's end points down to the bottom of the image,
# and from the x90 curve's end points up to the top of the image.
for mask_mult, image_edge in ((8, log_im_min), (90, log_im_max)):
    mask_curve = np.log10(_required_po2(mask_mult, x, slope, intercept))
    p.patch([x.min(), x.max(), x.max(), x.min()],
            [mask_curve.min(), mask_curve.max(), image_edge, image_edge],
            color='white', line_width=2)

# Horizontal reference at log10(21).
# NOTE(review): presumably atmospheric oxygen partial pressure - confirm.
p.line([x.min(), x.max()], [np.log10(21), np.log10(21)], color='lightgrey', line_width=2, alpha=0.75)

# Fitted curves at the three multipliers of interest.
line_color='black'
for curve_mult in [1, 8, 90]:
    p.line(x, np.log10(_required_po2(curve_mult, x, slope, intercept)), color=line_color, line_width=lw, line_cap='round')

# Per-species points at the same multipliers, filled on the image's colour scale.
dot_line='black'
dot_width = 1
cmapper = bokeh.transform.linear_cmap('c', palette='Viridis256', low=cmap_low, high=cmap_high)
for mult in [1, 8, 90]:
    source = bokeh.models.ColumnDataSource(data=dict(x=m, y=np.log10(mult*(10**po2)), c=mult*(10**po2)))
    p.circle('x', 'y', source = source, size=cs, line_color=dot_line, line_width = dot_width, fill_color=cmapper)
p.outline_line_color = None
p.yaxis.minor_tick_line_color = None
p.xaxis.minor_tick_line_color = None
p.output_backend='svg'
plots.append(p)

0.39370488659751485


In [12]:
# Lay the three panels out side by side, with panel 2 in the middle.
panel_order = (0, 2, 1)
panel_grid = bokeh.layouts.gridplot([plots[k] for k in panel_order], ncols=3)
bokeh.io.show(panel_grid)

In [13]:
# Write panel 2 to disk as SVG; the call is the cell's last expression, so
# its return value (the list of written files) is displayed.
svg_panel = plots[2]
bokeh.io.export_svgs(svg_panel, filename="./ptest.svg")

['./ptest.svg']

In [14]:
# Record the bokeh version used for this run (needs the `watermark` IPython extension).
%reload_ext watermark
%watermark -p bokeh

bokeh: 2.2.3

