In [3]:
import numpy as np
import pandas as pd

import bokeh_catplot

import bokeh.io
import bokeh.plotting

%load_ext blackcellmagic

bokeh.io.output_notebook()

# Hacker stats

In [9]:
# Load the data set; `comment='#'` tells pandas to ignore any text that
# follows a '#' character in the file.
df = pd.read_csv('data/grant_complete.csv', comment='#')

# Preview the first rows
df.head()

Unnamed: 0,band,beak depth (mm),beak length (mm),species,year
0,20123,8.05,9.25,fortis,1973
1,20126,10.45,11.35,fortis,1973
2,20128,9.55,10.15,fortis,1973
3,20129,8.75,9.95,fortis,1973
4,20133,10.15,11.55,fortis,1973


In [10]:
# Keep only the `scandens` rows from 1975 and 2012. Note that this rebinds
# `df`; the trailing `.copy()` makes it an independent DataFrame rather than
# a slice of the frame that was loaded.
df = df.loc[(df['species']=='scandens') & (df['year'].isin([1975, 2012]))].copy()

In [11]:
# ECDFs of beak depth, with `year` as the categorical variable
p = bokeh_catplot.ecdf(
    data=df,
    cats='year',
    val='beak depth (mm)'
)

bokeh.io.show(p)

In [13]:
# Pull out the beak depths for each year as NumPy arrays
bd_1975 = df.loc[df['year']==1975, 'beak depth (mm)'].values
bd_2012 = df.loc[df['year']==2012, 'beak depth (mm)'].values

In [14]:
# Mean beak depth in 1975 and in 2012
np.mean(bd_1975), np.mean(bd_2012)

(8.959999999999999, 9.188492063492063)

In [15]:
# Number of beak depth measurements from 1975
len(bd_1975)

87

## What is a confidence interval?

What is a 95% confidence interval? It can be thought of as follows. If we were to repeat the experiment over and over and over again, 95% of the time, the observed mean would lie in the 95% confidence interval. So, if the confidence intervals of the means of measurements from 1975 and from 2012 overlapped, we might not be so sure that the beaks got deeper due to some underlying selective pressure; it could instead be that we just happened to observe deeper beaks as a result of natural variability.

## Bootstrap confidence intervals

We can't repeat the experiment over and over again. Instead, we will have our computer simulate doing the experiment over and over again. Hacker statistics! We have one set of measurements. We “repeat” the experiment by drawing measurements out of the ones we have again and again. Here’s what we do to compute a bootstrap estimate of the mean of a set of $n$ data points.

1. Draw $n$ data points out of the original data set **with replacement**. This set of data points is called a **bootstrap sample**.
1. Compute the mean of the bootstrap sample. This is called a **bootstrap replicate** of the mean.
1. Do this over and over again, storing the results.

Bootstrapping works well if you have $n > 15$, without extreme outliers.


In [22]:
# Seed the RNG so that the numbers discussed below are reproducible;
# normally you would not seed the generator.

rg = np.random.default_rng(3252)

In [23]:
# Get a bootstrap sample: draw len(bd_1975) values out of bd_1975
# with replacement

bs_sample = rg.choice(bd_1975, replace=True, size=len(bd_1975))

In [24]:
# ECDF of the measured 1975 beak depths
p = bokeh_catplot.ecdf(
    bd_1975,
    x_axis_label='beak depth (mm)'
)

# ECDF of the bootstrap sample, drawn with unfilled gray markers; the
# existing figure is passed in via `p=p`
p = bokeh_catplot.ecdf(
    bs_sample,
    marker_kwargs=dict(
        fill_color=None,
        line_color='gray'
    ),
    p=p
)

bokeh.io.show(p)

In [25]:
# Compare the mean of the data with the mean of the bootstrap sample
np.mean(bd_1975), np.mean(bs_sample)

(8.959999999999999, 8.849770114942531)

In [27]:
# Get the bootstrap replicate of the mean, i.e. the mean of the
# bootstrap sample

bs_replicate = np.mean(bs_sample)

bs_replicate

8.849770114942531

Now we have 1 bootstrap replicate. We need to get a lot. A good number is about 2000.

In [28]:
# Number of bootstrap replicates to draw
n_reps = 2000

# Preallocate storage for the replicates
bs_reps_1975 = np.empty(n_reps)

# Each pass draws a fresh bootstrap sample (rebinding `bs_sample`) and
# stores its mean as one replicate
for i in range(n_reps):
    bs_sample = rg.choice(bd_1975, replace=True, size=len(bd_1975))
    bs_reps_1975[i] = np.mean(bs_sample)

In [29]:
# Display the array of replicates
bs_reps_1975

array([8.97034483, 8.95344828, 8.97758621, ..., 8.97609195, 8.93425287,
       8.95781609])

In [30]:
# Get the 95% confidence interval: the 2.5th and 97.5th percentiles
# of the bootstrap replicates

np.percentile(bs_reps_1975, [2.5, 97.5])

array([8.8466523 , 9.07979023])

In [31]:
# Plot the ECDF of the bootstrap replicates of the mean

p = bokeh_catplot.ecdf(
    bs_reps_1975,
    x_axis_label='beak depth (mm)'
)

bokeh.io.show(p)

We can make this a bit more elegant.

In [32]:
def draw_bs_rep(data, func, rg):
    """Draw a single bootstrap replicate of a statistic.

    Parameters
    ----------
    data : array_like
        Data set to resample.
    func : callable
        Statistic to evaluate; called with the bootstrap sample as its
        only argument.
    rg : numpy.random.Generator
        Random number generator used to draw the bootstrap sample.

    Returns
    -------
    The value of `func` evaluated on a sample of ``len(data)`` entries
    drawn from `data` with replacement.
    """
    n_draws = len(data)
    resampled = rg.choice(data, size=n_draws, replace=True)
    statistic = func(resampled)
    return statistic

In [34]:
# Use list comprehensions to make `n_reps` bootstrap replicates of the
# mean for each year

bs_reps_1975 = np.array(
    [draw_bs_rep(bd_1975, np.mean, rg) for _ in range(n_reps)]
)

bs_reps_2012 = np.array(
    [draw_bs_rep(bd_2012, np.mean, rg) for _ in range(n_reps)]
)

In [35]:
# 95% bootstrap confidence intervals of the mean for each year
conf_int_1975 = np.percentile(bs_reps_1975, [2.5, 97.5])
conf_int_2012 = np.percentile(bs_reps_2012, [2.5, 97.5])

conf_int_1975, conf_int_2012

(array([8.83929023, 9.08116954]), array([9.07537698, 9.30914683]))

These barely overlap, so there is probably a real difference in the mean beak depth from 1975 to 2012.

In [36]:
def coeff_var(data):
    """Return the coefficient of variation of `data`.

    This is the standard deviation, as computed by `np.std` with its
    default arguments, divided by the mean.
    """
    spread = np.std(data)
    center = np.mean(data)
    return spread / center

# Bootstrap replicates of the coefficient of variation for each year.
# NOTE: this rebinds `bs_reps_1975` and `bs_reps_2012`, which previously
# held replicates of the mean.
bs_reps_1975 = np.array(
    [draw_bs_rep(bd_1975, coeff_var, rg) for _ in range(n_reps)]
)

bs_reps_2012 = np.array(
    [draw_bs_rep(bd_2012, coeff_var, rg) for _ in range(n_reps)]
)

In [37]:
# 95% bootstrap confidence intervals computed from the current contents of
# `bs_reps_1975` / `bs_reps_2012`; this rebinds `conf_int_1975` and
# `conf_int_2012`
conf_int_1975 = np.percentile(bs_reps_1975, [2.5, 97.5])
conf_int_2012 = np.percentile(bs_reps_2012, [2.5, 97.5])

conf_int_1975, conf_int_2012

(array([0.05373038, 0.07136381]), array([0.06368108, 0.0809896 ]))

In [42]:
# Plot the confidence intervals

# The year labels are used as the categories of the y-axis
years = ['2012', '1975']
p = bokeh.plotting.figure(
    frame_height=100,
    frame_width=250,
    x_axis_label='coeff. of var. of beak depth',
    y_range=years,
)

# Coefficient of variation of the measured data; the order of the values
# matches the order of `years`
p.circle(
    [coeff_var(bd_2012), coeff_var(bd_1975)],
    years,
    size=5
)
# Horizontal line spanning the confidence interval for 1975
p.line(
    conf_int_1975,
    ['1975']*2,
    line_width=3
)
# Horizontal line spanning the confidence interval for 2012
p.line(
    conf_int_2012,
    ['2012']*2,
    line_width=3
)

bokeh.io.show(p)

*Notice that this confidence interval is **not** symmetric.* If you report $1.4 \pm 0.3$, it implies symmetry and requires that you assumed a Gaussian distribution and calculated the maximum likelihood estimate.

## More complex summaries of data sets

In [43]:
# Difference of means (2012 minus 1975) of the measured beak depths

np.mean(bd_2012) - np.mean(bd_1975)

0.2284920634920642

In [44]:
# Fresh bootstrap replicates of the mean for each year
bs_reps_1975 = np.array(
    [draw_bs_rep(bd_1975, np.mean, rg) for _ in range(n_reps)]
)

bs_reps_2012 = np.array(
    [draw_bs_rep(bd_2012, np.mean, rg) for _ in range(n_reps)]
)

# Replicates of the difference of means, formed element-wise from the two
# sets of replicates.
# NOTE(review): the name is `bd_reps_diff`; presumably `bs_reps_diff` was
# intended -- confirm before renaming.
bd_reps_diff = bs_reps_2012 - bs_reps_1975

# 95% confidence interval of the difference of means
np.percentile(bd_reps_diff, [2.5, 97.5])

array([0.05526834, 0.39176731])

In [45]:
# Confidence intervals on the ECDF itself are built into
# bokeh_catplot.ecdf(); request them with `conf_int=True`

p = bokeh_catplot.ecdf(
    data=df,
    cats='year',
    val='beak depth (mm)',
    style='staircase',
    conf_int=True,
)

bokeh.io.show(p)

In [46]:
# Beak lengths from 1975, selected from the same rows as `bd_1975`
bl_1975 = df.loc[df['year']==1975, 'beak length (mm)'].values

p = bokeh.plotting.figure(
    frame_width=250,
    frame_height=250,
    x_axis_label='beak depth (mm)',
    y_axis_label='beak length (mm)',
)

# Beak depth on the x-axis, beak length on the y-axis
p.circle(bd_1975, bl_1975)

bokeh.io.show(p)

In [47]:
# ECDF of the element-wise ratio of beak length to beak depth for 1975
p = bokeh_catplot.ecdf(
    bl_1975 / bd_1975,
    x_axis_label='length/depth'
)

bokeh.io.show(p)

How do we get the mean and bootstrap CI for this ratio?

In [49]:
# Straightforward way: compute the array of ratios, then bootstrap its mean

ratio = bl_1975 / bd_1975

# Stored as a NumPy array, like the other sets of bootstrap replicates in
# this notebook
bs_reps_ratio = np.array(
    [draw_bs_rep(ratio, np.mean, rg) for _ in range(n_reps)]
)

In [50]:
# Another way: pairs bootstrap

def draw_bs_pairs(x, y, rg):
    """Draw a pairs bootstrap sample out of `x` and `y`.

    The same randomly drawn indices are applied to both inputs, so entries
    at the same position of the two returned arrays come from the same
    position of the original data.

    Parameters
    ----------
    x, y : array_like
        Paired measurements; must have the same length.
    rg : numpy.random.Generator
        Random number generator used to draw the indices.

    Returns
    -------
    x_bs, y_bs : ndarray
        Resampled `x` and `y`, each with ``len(x)`` entries.

    Raises
    ------
    ValueError
        If `x` and `y` differ in length.
    """
    # Coerce to arrays so that lists and other sequences can be indexed
    # with an array of indices
    x = np.asarray(x)
    y = np.asarray(y)

    # Mismatched inputs cannot be paired; fail loudly instead of silently
    # ignoring the tail of the longer one
    if len(x) != len(y):
        raise ValueError(
            f"x and y must have the same length, got {len(x)} and {len(y)}."
        )

    bs_inds = rg.choice(np.arange(len(x)), len(x), replace=True)

    return x[bs_inds], y[bs_inds]

In [51]:
# Preallocate storage for the replicates of the mean ratio
ratio_bs_reps = np.empty(n_reps)

# Each pass draws a pairs bootstrap sample of (depth, length) and stores the
# mean of the element-wise length/depth ratio
for i in range(n_reps):
    bd, bl = draw_bs_pairs(bd_1975, bl_1975, rg)
    ratio_bs_reps[i] = np.mean(bl / bd)

In [53]:
# 95% confidence interval of the mean length/depth ratio from the pairs
# bootstrap replicates
np.percentile(ratio_bs_reps, [2.5, 97.5])

array([1.5627399 , 1.59492332])

This is somewhat redundant here, because we saw that we can simply compute the array of ratios first, but the pairs bootstrap is more useful for more complicated statistics, like the correlation.

In [56]:
def corr(x, y):
    """Return the correlation coefficient of `x` and `y`.

    Computed from the covariance matrix given by `np.cov(x, y)` as the
    covariance divided by the square root of the product of the variances.
    """
    covariance = np.cov(x, y)
    var_x, var_y = covariance[0, 0], covariance[1, 1]
    cov_xy = covariance[0, 1]
    return cov_xy / np.sqrt(var_x * var_y)

# Pairs bootstrap replicates of the correlation, stored as a NumPy array
# like the other sets of bootstrap replicates in this notebook
correlation_bs_reps = np.array(
    [corr(*draw_bs_pairs(bd_1975, bl_1975, rg)) for _ in range(n_reps)]
)

In [57]:
# 95% confidence interval of the correlation
np.percentile(correlation_bs_reps, [2.5, 97.5])

array([0.45326638, 0.75054959])

In [58]:
%load_ext watermark
%watermark -v -p numpy,pandas,bokeh,bokeh_catplot,jupyterlab

CPython 3.7.7
IPython 7.13.0

numpy 1.18.1
pandas 0.24.2
bokeh 2.0.2
bokeh_catplot 0.1.7
jupyterlab 1.2.6
