# Getting started with `polyclonal` bootstrapping

## Imports and setup

In [1]:
import pandas as pd
from polyclonal import Polyclonal, PolyclonalBootstrap
import polyclonal.polyclonal_collection
import polyclonal.plot
import numpy as np

The cell below sets up some simulated data from Jesse:
* One simulation where we have the same variants for each concentration
* One simulation where we do not have the same variants for each concentration
* These two situations are handled differently in `polyclonal` objects

In [2]:
# Wild-type activity for each of the two epitopes ("1" and "2").
activity_wt_df = pd.DataFrame(
    {
        "epitope": ["1", "2"],
        "activity": [2.0, 1.0],
    }
)

# Escape value of every mutation at every epitope. Each mutation is listed
# twice in a row (once per epitope), so `escape` reads as consecutive
# (epitope "1", epitope "2") pairs.
mut_escape_df = pd.DataFrame(
    {
        "mutation": [m for m in ("M1C", "G2A", "A4K", "A4L") for _ in range(2)],
        "epitope": ["1", "2"] * 4,
        "escape": [2.0, 0.0, 3.0, 0.0, 0.0, 2.5, 0.0, 1.5],
    }
)

# Model defined directly from the activities and escape values above; no data
# is fit here.
polyclonal_sim = Polyclonal(
    mut_escape_df=mut_escape_df,
    activity_wt_df=activity_wt_df,
)

# Barcoded variants with their space-separated amino-acid substitutions.
# "AA" carries no substitutions, and "M1C" appears under two different
# barcodes ("AC" and "GA").
variants_df = pd.DataFrame(
    [
        ("AA", ""),
        ("AC", "M1C"),
        ("AG", "G2A"),
        ("AT", "A4K"),
        ("TA", "A4L"),
        ("CA", "M1C G2A"),
        ("CG", "M1C A4K"),
        ("CC", "G2A A4K"),
        ("TC", "G2A A4L"),
        ("CT", "M1C G2A A4K"),
        ("TG", "M1C G2A A4L"),
        ("GA", "M1C"),
    ],
    columns=["barcode", "aa_substitutions"],
)

# Ask the simulated model for escape probabilities of those variants at three
# concentrations.
escape_probs = polyclonal_sim.prob_escape(
    concentrations=[1.0, 2.0, 4.0],
    variants_df=variants_df,
)

# Relabel the predictions so the frame has the `prob_escape` column used as
# input data for the models fit below.
data_to_fit = escape_probs.rename(
    {"predicted_prob_escape": "prob_escape"},
    axis="columns",
)

def _make_site_escape_df():
    """Return a fresh frame of the site-level escape values given to both models.

    A new DataFrame is built on every call so the two models below never share
    the same object.
    """
    return pd.DataFrame.from_records(
        [("1", 1, 1.0), ("1", 4, 0.0), ("2", 1, 0.0), ("2", 4, 2.0)],
        columns=["epitope", "site", "escape"],
    )


# Model built from the complete simulated data set, then fit.
polyclonal_data = Polyclonal(
    data_to_fit=data_to_fit,
    activity_wt_df=activity_wt_df,
    site_escape_df=_make_site_escape_df(),
    data_mut_escape_overlap="fill_to_data",
)
polyclonal_data.fit(reg_uniqueness2_weight=0, reg_escape_weight=0.01)

# Now only use the first 20 rows of `data_to_fit` to get rid of some mutations.
# This second model is only constructed here; it is not fit in this cell.
polyclonal_data2 = Polyclonal(
    data_to_fit=data_to_fit.head(20),
    activity_wt_df=activity_wt_df,
    site_escape_df=_make_site_escape_df(),
    data_mut_escape_overlap="fill_to_data",
)

## Test basic functionality of helper methods outside of class

In [3]:
boot_df = polyclonal.polyclonal_collection.create_bootstrap_sample(data_to_fit)

# Make sure we got an appropriate number of samples
assert len(boot_df) == len(data_to_fit)
# Make sure we did sample with replacement: at least one row must be repeated,
# so de-duplicating has to shrink the sample below the original size.
assert len(boot_df.drop_duplicates()) < len(data_to_fit)
# Make sure we sampled the same number of variants at each concentration as the
# original dataset has. Both counts are sorted by concentration before being
# compared, because `Series.equals` is order-sensitive and `value_counts()`
# orders by count -- with equal counts per concentration the order of the
# resampled frame is not guaranteed to match the original.
boot_counts = boot_df["concentration"].value_counts().sort_index()
orig_counts = data_to_fit["concentration"].value_counts().sort_index()
assert boot_counts.equals(orig_counts)

## Test initialization of `PolyclonalBootstrap` objects

In [4]:
# Size of the bootstrap collection and the number of worker threads requested.
n_samps = 5
n_threads = 4

pc = PolyclonalBootstrap(
    seed=0,
    root_polyclonal=polyclonal_data,
    n_bootstrap_samples=n_samps,
    n_threads=n_threads,
)

# One model per requested bootstrap sample.
assert n_samps == len(pc.models)
# The thread count is kept on the object.
assert n_threads == pc.n_threads
# No bootstrap model may simply carry the root model's data unchanged.
assert not any(
    pc.root_polyclonal.data_to_fit.equals(model.data_to_fit) for model in pc.models
)

## Test random seeding

In [5]:
# Do two different seeds generate different objects?
pc2 = PolyclonalBootstrap(
    root_polyclonal=polyclonal_data,
    n_bootstrap_samples=n_samps,
    n_threads=n_threads,
    seed=10,
)

# Actually check the question above instead of only constructing the object:
# the collection has the requested size, and each of its models holds different
# resampled data than the corresponding model of `pc` (built with seed=0).
assert len(pc2.models) == n_samps
for model_seed0, model_seed10 in zip(pc.models, pc2.models):
    assert not model_seed0.data_to_fit.equals(model_seed10.data_to_fit)

In [6]:
# What if we use the same seed with multiple threads?
pc_copy = PolyclonalBootstrap(
    root_polyclonal=polyclonal_data,
    n_bootstrap_samples=n_samps,
    n_threads=n_threads,
    seed=0,
)

# Actually check the question above instead of only constructing the object:
# with the same seed as `pc`, every model must hold the same resampled data,
# even though the samples are created using several threads.
assert len(pc_copy.models) == n_samps
for model_original, model_repeat in zip(pc.models, pc_copy.models):
    assert model_original.data_to_fit.equals(model_repeat.data_to_fit)

## Test `PolyclonalCollection` bootstrapping results

In [7]:
# Fit every bootstrap model through the module-level helper.
# NOTE(review): the third return value is discarded. If `fit_models` returns
# the fitted models rather than fitting `pc.models` in place, later cells use
# unfit models -- confirm against the `fit_models` documentation.
n_fit, n_failed, _ = polyclonal.polyclonal_collection.fit_models(pc.models, n_threads=2)
# Compare against `n_samps` rather than a hard-coded 5, and check the two
# counts separately so a failure shows which one was wrong.
assert n_fit == n_samps
assert n_failed == 0

In [8]:
# Test `fit_models()` with kwargs -- shouldn't throw any errors
fit_counts = pc_copy.fit_models(fix_hill_coefficient=True, fix_non_neutralized_frac=True)
# Check the (n_fit, n_failed) pair instead of only eyeballing the cell output:
# every model should fit and none should fail.
assert tuple(fit_counts) == (n_samps, 0)
# Keep the result as the last expression so the cell still displays it.
fit_counts

(5, 0)

### Test neutralization concentration values (ICXX) and predictions for probability of escape across all models

In [9]:
# Replicate-level frames are expected to hold one row per input row for every
# model that is not None.
n_predictions_expected = sum(model is not None for model in pc.models) * len(boot_df)

ic90_df = pc.icXX(boot_df, x=0.9, col="IC90")
ic90_replicates_df = pc.icXX_replicates(boot_df, x=0.9)

# The summarized frame has one row per distinct input row; the replicate frame
# has one row per input row and model.
assert len(boot_df.drop_duplicates()) == len(ic90_df)
assert n_predictions_expected == len(ic90_replicates_df)

In [10]:
# Summarized escape-probability predictions, ordered by variant and then by
# concentration.
preds_df = (
    pc.prob_escape(boot_df)
    .sort_values(by=["aa_substitutions", "concentration"])
)
preds_replicates_df = pc.prob_escape_replicates(boot_df)

# Same size expectations as for the IC90 frames: one row per distinct input row
# in the summary, one row per input row and model in the replicates.
assert len(boot_df.drop_duplicates()) == len(preds_df)
assert n_predictions_expected == len(preds_replicates_df)