# Notebook for getting started with `polyclonal` bootstrapping

In [1]:
import pandas as pd
from polyclonal import Polyclonal, PolyclonalCollection
import polyclonal.bootstrap as boot
import numpy as np

Below is some simulated data from Jesse:
* One simulation where we have the same variants for each concentration
* One simulation where we do not have the same variants for each concentration
* These two situations are handled differently in `polyclonal` objects

In [2]:
# Wild-type activity for each of the two epitopes, one row per epitope.
activity_wt_df = pd.DataFrame(
    [('e1', 2.0), ('e2', 1.0)],
    columns=['epitope', 'activity'],
)

# Escape value of every mutation at every epitope, one row per
# (mutation, epitope) pair.
mut_escape_df = pd.DataFrame(
    [
        ('M1C', 'e1', 2.0),
        ('M1C', 'e2', 0.0),
        ('G2A', 'e1', 3.0),
        ('G2A', 'e2', 0.0),
        ('A4K', 'e1', 0.0),
        ('A4K', 'e2', 2.5),
        ('A4L', 'e1', 0.0),
        ('A4L', 'e2', 1.5),
    ],
    columns=['mutation', 'epitope', 'escape'],
)

# Model built directly from the known activities and escape values above;
# it is used below to simulate escape probabilities.
polyclonal_sim = Polyclonal(
    activity_wt_df=activity_wt_df,
    mut_escape_df=mut_escape_df,
)

# Twelve barcoded variants: the unmutated one (empty substitution string),
# single mutants, and double/triple combinations. 'M1C' appears twice, under
# two different barcodes.
variants_df = pd.DataFrame({
    'barcode': [
        'AA', 'AC', 'AG', 'AT', 'TA', 'CA',
        'CG', 'CC', 'TC', 'CT', 'TG', 'GA',
    ],
    'aa_substitutions': [
        '',
        'M1C',
        'G2A',
        'A4K',
        'A4L',
        'M1C G2A',
        'M1C A4K',
        'G2A A4K',
        'G2A A4L',
        'M1C G2A A4K',
        'M1C G2A A4L',
        'M1C',
    ],
})

# Simulate escape probabilities for every variant at three concentrations.
escape_probs = polyclonal_sim.prob_escape(
    variants_df=variants_df,
    concentrations=[1.0, 2.0, 4.0],
)

# Relabel the simulated column so the frame can be passed as `data_to_fit`.
data_to_fit = escape_probs.rename(
    columns={'predicted_prob_escape': 'prob_escape'},
)

def _initial_site_escape_df():
    """Build a new frame of the site-level escape values given to each model.

    A fresh frame is created on every call so the two models below never
    share the same DataFrame object.
    """
    return pd.DataFrame.from_records(
        [
            ('e1', 1, 1.0),
            ('e1', 4, 0.0),
            ('e2', 1, 0.0),
            ('e2', 4, 2.0),
        ],
        columns=['epitope', 'site', 'escape'],
    )


# Model to be fit on the complete simulated dataset.
polyclonal_data = Polyclonal(
    data_to_fit=data_to_fit,
    activity_wt_df=activity_wt_df,
    site_escape_df=_initial_site_escape_df(),
    data_mut_escape_overlap='fill_to_data',
)

# Same setup, but restricted to the first 30 rows of the data; the intent is
# that some mutations are then missing from this model's dataset.
polyclonal_data2 = Polyclonal(
    data_to_fit=data_to_fit.head(30),
    activity_wt_df=activity_wt_df,
    site_escape_df=_initial_site_escape_df(),
    data_mut_escape_overlap='fill_to_data',
)

#### Test basic functionality of helper methods outside of class

In [3]:
# Draw a single bootstrap replicate of the dataset used for fitting.
boot_df = boot.create_bootstrap_sample(data_to_fit)

# The replicate must contain exactly as many rows as the original dataset.
assert len(boot_df) == len(data_to_fit)
# Sampling with replacement should repeat at least one row, so the replicate
# must contain strictly fewer unique rows than the original has rows.
assert len(boot_df.drop_duplicates()) < len(data_to_fit)
# Each concentration must be sampled as many times as it appears in the
# original dataset. Both counts are sorted by concentration first:
# `value_counts` orders by frequency, and the order among tied counts depends
# on row order, which a resampled frame need not share with the original.
boot_counts = boot_df.concentration.value_counts().sort_index()
orig_counts = data_to_fit.concentration.value_counts().sort_index()
assert boot_counts.equals(orig_counts)

#### Test initialization of `PolyclonalCollection` objects

In [4]:
n_samps = 3
pc = PolyclonalCollection(
    root_polyclonal=polyclonal_data,
    n_samples=n_samps,
)

# One model per requested sample should have been created and stored.
assert len(pc.models) == n_samps
# The requested sample count should be recorded on the collection.
assert pc.n_samples == n_samps
# No stored model may carry data identical to the root model's data.
root_data = pc.root_polyclonal.data_to_fit
for boot_model in pc.models:
    assert not root_data.equals(boot_model.data_to_fit)

#### Test `PolyclonalCollection` results

In [5]:
# Check that fitting actually changes each model's parameters.
for model in pc.models:
    # Take a copy rather than a reference: if fitting updates the parameter
    # array in place, a plain reference would alias the live values and the
    # before/after comparison below would compare the array with itself.
    old_params = np.array(model._params, copy=True)
    boot._fit_polyclonal_model(model)
    # The fitted parameters must differ from the pre-fit snapshot.
    assert not np.array_equal(model._params, old_params)

# Optimization FAILED at Tue Jan 11 14:24:57 2022.
# Optimization FAILED at Tue Jan 11 14:24:58 2022.


In [6]:
# Ask every model in the collection for predictions on the bootstrap sample.
# The result is iterated below as one frame-like object per model.
test_list = pc.make_predictions(variants_df=boot_df)
# TODO: However, we need to do something about failed optimizations -- should we discard models that fail?
for pred_df in test_list:
    # Every model must return one prediction row per input row.
    assert len(pred_df) == len(boot_df)
    # The output must not have the same number of columns as the input.
    assert pred_df.shape[1] != boot_df.shape[1]