Bootstrap resampling with Python

In [34]:
import numpy as np, pandas as pd

Implementing the bootstrap in Python is quite simple: draw random positions with replacement and use them to index the data.

In [146]:
def bootstrap_resample(X, n=None):
    """Draw a bootstrap resample (sampling with replacement) from X.

    Parameters
    ----------
    X : array_like
        Data to resample.  Lists and tuples are converted to a numpy array
        first.  Any other object is indexed directly as ``X[positions]`` with
        an integer array, so it must support that kind of indexing.  For a
        pandas Series that lookup goes by index label rather than by
        position, so a Series whose index is not 0..len(X)-1 is not resampled
        positionally.
    n : int, optional
        Length of the resampled array; defaults to ``len(X)`` when None.

    Returns
    -------
    X_resample
        The entries of X found at n uniformly drawn random positions.
    """
    # Plain Python sequences cannot be indexed with an integer array.
    if isinstance(X, (list, tuple)):
        X = np.asarray(X)

    # Identity test: `== None` would be routed through n's own __eq__.
    if n is None:
        n = len(X)

    # rand() is uniform on [0, 1), so floor(rand * len(X)) gives positions
    # between 0 and len(X) - 1.
    resample_i = np.floor(np.random.rand(n) * len(X)).astype(int)
    return X[resample_i]
    

In [147]:
# Resample half as many points as the original data and compare the two means.
X = np.arange(10000)
X_resample = bootstrap_resample(X, n=5000)
# print(...) around a single %-formatted string emits the same text on
# Python 2 and Python 3.
print('original mean: %s' % X.mean())
print('resampled mean: %s' % X_resample.mean())

original mean: 4999.5
resampled mean: 5024.5518


In [148]:
# Count missing values in the resample; .sum() on the boolean mask is
# vectorised, unlike the builtin sum() which iterates element by element.
pd.isnull(X_resample).sum()

0

In [149]:
def test_bsr_shape():
    """Check the length of the array returned by bootstrap_resample."""
    data = np.arange(10000)

    # no n given: the resample is as long as the input
    default_draw = bootstrap_resample(data)
    assert default_draw.shape == (10000,), 'resampled length should be 10000'

    # explicit n: the resample has exactly n entries
    n = 5000
    sized_draw = bootstrap_resample(data, n=n)
    assert sized_draw.shape == (n,), 'resampled length should be %d' % n
test_bsr_shape()

In [150]:
def test_bsr_mean():
    """Check that a resample's mean is within 1% of the original mean."""
    # the draw is random, so pin the seed to keep the check repeatable
    np.random.seed(123456)
    data = np.arange(10000)
    draw = bootstrap_resample(data, 5000)
    data_mean = data.mean()
    relative_gap = abs(draw.mean() - data_mean) / data_mean
    assert relative_gap < 1e-2, 'means should be approximately equal'
test_bsr_mean()

I want to use this easily with Pandas, but there is a little bit of trouble with indexing to watch out for:

In [153]:
# Demonstration: the same mean check on a pd.Series whose index labels
# (0, 10, 20, ...) differ from its positions.  bootstrap_resample indexes its
# input as X[positions]; in the recorded run the assertion below fails.
np.random.seed(123456)  # set seed so that randomness does not lead to failed test
X = pd.Series(np.arange(10000), index=np.arange(10000)*10)

X_resample = bootstrap_resample(X, 5000)
# print(...) around a single %-formatted string emits the same text on
# Python 2 and Python 3.
print('%s %s' % (X_resample.mean(), X.mean()))
assert abs(X_resample.mean() - X.mean()) / X.mean() < 1e-2, 'means should be approximately equal'
    

504.862785863 4999.5


AssertionError: means should be approximately equal

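Why didn't that work?  Because indexing a Series with an array of integers looks them up as index labels, not as positions, and here only the multiples of 10 exist as labels.  Pandas has silently dealt with the rows for which the indices are missing, returning NaNs, and then silently dealt with the NaNs, dropping them from the average calculation.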