# Test bootstrapping functions

In [1]:
import pandas as pd
import polyclonal.bootstrap as boot

For these tests, I'll use the same dataframe from the variant filtering notebook

In [2]:
# Toy variant table, built column by column: seven barcoded variants,
# four measured at concentration 0.5 and three at concentration 1.
train_df = pd.DataFrame(
    {
        'barcode': ['var1', 'var2', 'var3', 'var4', 'var5', 'var6', 'var7'],
        'aa_substitutions': ['', 'M1A', 'M1A G2A', 'M1A G2C', 'G2A', 'M1A', 'M1L'],
        'concentration': [0.5, 0.5, 0.5, 0.5, 1, 1, 1],
        'prob_escape': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.3],
    }
)
train_df

Unnamed: 0,barcode,aa_substitutions,concentration,prob_escape
0,var1,,0.5,0.1
1,var2,M1A,0.5,0.2
2,var3,M1A G2A,0.5,0.3
3,var4,M1A G2C,0.5,0.4
4,var5,G2A,1.0,0.5
5,var6,M1A,1.0,0.6
6,var7,M1L,1.0,0.3


We want to bootstrap each concentration separately -- so we should get four samples from `concentration=0.5` and three samples from `concentration=1` in all bootstraps

In [3]:
# Draw one bootstrap replicate of train_df. No `group_by_col` is passed, so the
# function's default grouping column is used.
# NOTE(review): no seed is passed either, so the rows displayed below are only
# reproducible if create_bootstrap_sample seeds its sampling internally -- confirm.
boot_df = boot.create_bootstrap_sample(train_df)
boot_df

Unnamed: 0,barcode,aa_substitutions,concentration,prob_escape
2,var3,M1A G2A,0.5,0.3
1,var2,M1A,0.5,0.2
3,var4,M1A G2C,0.5,0.4
2,var3,M1A G2A,0.5,0.3
4,var5,G2A,1.0,0.5
5,var6,M1A,1.0,0.6
6,var7,M1L,1.0,0.3


Now we test to make sure we have:
* The same number of rows in the bootstrapped dataset
* The same number of observations for each concentration in the bootstrapped dataset

In [4]:
# Invariant 1: resampling with replacement must not change the total row count.
assert len(train_df) == len(boot_df), 'bootstrap sample changed the number of rows'

# Invariant 2: every concentration keeps its original number of observations.
# value_counts orders its result by frequency, so two concentrations with tied
# counts could come back in a different order for the two frames. Sorting by the
# concentration index makes the comparison independent of that ordering, and
# assert_series_equal reports the differing entries if the check fails.
pd.testing.assert_series_equal(
    boot_df.value_counts('concentration').sort_index(),
    train_df.value_counts('concentration').sort_index(),
)

This should raise a `KeyError` when the column specified for grouping doesn't exist:

In [5]:
# The KeyError is the behaviour under test. Catch it so a fresh
# "Restart & Run All" completes, and fail loudly if the call succeeds.
try:
    boot.create_bootstrap_sample(train_df, group_by_col='this_is_not_a_column')
except KeyError as err:
    # str() of a KeyError is the repr of its message, so this prints the error
    # in the usual "KeyError: '<message>'" form.
    print(f"{type(err).__name__}: {err}")
else:
    raise AssertionError('expected a KeyError for a group_by_col that is not in the data frame')

KeyError: 'this_is_not_a_column is not in supplied data frame.'