# Bootstrapping the simulated RBD dataset

## Imports and setup

In [1]:
import pandas as pd
from polyclonal import Polyclonal, PolyclonalCollection
import polyclonal.polyclonal_collection as boot
import numpy as np

# Load the simulated noisy RBD escape data and keep only the "avg2muts" library
# at concentrations 0.25, 1 and 4; this frame is what the model is fit to.
rbd_data = (
    pd.read_csv(
        "../notebooks/RBD_variants_escape_noisy.csv",
        # `na_filter` is a boolean flag: turning NA detection off keeps blank
        # `aa_substitutions` entries (unmutated variants) as empty strings
        # rather than converting them to NaN.
        na_filter=False,
    )
    .query('library == "avg2muts"')
    .query("concentration in [0.25, 1, 4]")
    .reset_index(drop=True)
)

rbd_data

Unnamed: 0,library,aa_substitutions,concentration,prob_escape,IC90
0,avg2muts,,0.25,0.050440,0.1128
1,avg2muts,,0.25,0.143100,0.1128
2,avg2muts,,0.25,0.054520,0.1128
3,avg2muts,,0.25,0.084730,0.1128
4,avg2muts,,0.25,0.041740,0.1128
...,...,...,...,...,...
89995,avg2muts,Y396T Y473L,4.00,0.000000,0.5832
89996,avg2muts,Y421W S359K,4.00,0.044600,0.5777
89997,avg2muts,Y449L V503T L335M,4.00,0.000000,1.0520
89998,avg2muts,Y473E L518F D427L,4.00,0.002918,1.1600


In [2]:
# Build the root Polyclonal model on `rbd_data`: three epitopes ("1", "2", "3"),
# each given a wild-type activity and an escape value of 10.0 at a single site.
rbd_poly = Polyclonal(
    data_to_fit=rbd_data,
    activity_wt_df=pd.DataFrame(
        {
            "epitope": ["1", "2", "3"],
            "activity": [1.0, 3.0, 2.0],
        }
    ),
    site_escape_df=pd.DataFrame(
        {
            "epitope": ["1", "2", "3"],
            "site": [417, 484, 444],
            "escape": [10.0, 10.0, 10.0],
        }
    ),
    data_mut_escape_overlap="fill_to_data",
)

In [3]:
# Fit the root model, logging optimization progress every 100 steps.
# The trailing semicolon suppresses the cell's output without assigning to `_`,
# which IPython uses for the most recent output (and stops updating once a
# user rebinds it).
rbd_poly.fit(logfreq=100);

# First fitting site-level model.
# Starting optimization of 522 parameters at Wed Feb 23 16:56:44 2022.
       step   time_sec       loss   fit_loss reg_escape  regspread
          0   0.048631     9144.4     9144.2    0.29701          0
        100     6.3463     1336.8     1333.2     3.5443          0
        200     12.488     1313.2     1308.8     4.3872          0
        300     18.944       1305     1299.8     5.1347          0
        400      24.83     1301.8     1296.2     5.6246          0
        500     30.785     1298.4     1292.5     5.8941          0
        600     36.821     1297.8     1291.7     6.0372          0
        700     43.116     1296.9     1290.4     6.5236          0
        800     49.112     1296.4     1289.8     6.6654          0
        900     55.105     1296.2     1289.4      6.759          0
       1000     61.119     1295.7     1288.8     6.8687          0
       1100     67.247     1295.4     1288.6      6.848          0
       1200     73.265  

In [4]:
# Build two bootstrap collections from the same root model; they share every
# setting and differ only in the random seed (0 for A, 10 for B).
n_samps = 5
n_threads = 16
rbd_pc_a, rbd_pc_b = (
    PolyclonalCollection(
        root_polyclonal=rbd_poly,
        n_bootstrap_samples=n_samps,
        n_threads=n_threads,
        seed=seed,
    )
    for seed in (0, 10)
)

In [5]:
# Run `fit_models` on each collection in turn (A first, then B).
for pc_collection in (rbd_pc_a, rbd_pc_b):
    pc_collection.fit_models()

In [6]:
# Spot-check a single bootstrap replicate: show the per-mutation escape table
# of the model stored at index 1 of collection A.
rbd_pc_a.models[1].mut_escape_df

Unnamed: 0,epitope,site,wildtype,mutant,mutation,escape
0,1,331,N,A,N331A,0.0
1,1,331,N,D,N331D,0.0
2,1,331,N,E,N331E,0.0
3,1,331,N,F,N331F,0.0
4,1,331,N,G,N331G,0.0
...,...,...,...,...,...,...
5791,3,531,T,R,T531R,0.0
5792,3,531,T,S,T531S,0.0
5793,3,531,T,V,T531V,0.0
5794,3,531,T,W,T531W,0.0


In [7]:
# The mutation-frequency-dictionary test isn't suitable for this dataset: with
# so many multi-mutant variants, we may never get a case where a mutation isn't
# sampled by all models. For the seed test we instead check that the two
# collections (seeds 0 and 10) produce different summary statistics.
# Note that `_summarize_bootstrapped_params` is an underscore-prefixed (private)
# method of the collection.
(
    rbd_pc_a_escape_dict,
    rbd_pc_a_activity_wt_dict,
) = rbd_pc_a._summarize_bootstrapped_params()
(
    rbd_pc_b_escape_dict,
    rbd_pc_b_activity_wt_dict,
) = rbd_pc_b._summarize_bootstrapped_params()

# Make the seed check explicit instead of relying on eyeballing the tables that
# are displayed in the following cells.
assert not rbd_pc_a_escape_dict["mean"].equals(
    rbd_pc_b_escape_dict["mean"]
), "collections built with different seeds gave identical mean escape summaries"

In [8]:
# Mean escape summary across the bootstrap models of collection A (seed 0).
# NOTE(review): in the saved output the `wildtype` column holds the mutant
# residue (e.g. `A` for N331A) — looks like a column-labeling issue in the
# summary table; confirm against the library.
rbd_pc_a_escape_dict["mean"]

Unnamed: 0,mutation,epitope,site,wildtype,mean
0,N331A,1,331,A,0.0
1,N331D,1,331,D,0.0
2,N331E,1,331,E,0.0
3,N331F,1,331,F,0.0
4,N331G,1,331,G,0.0
...,...,...,...,...,...
5791,T531R,3,531,R,0.0
5792,T531S,3,531,S,0.0
5793,T531V,3,531,V,0.0
5794,T531W,3,531,W,0.0


In [9]:
# Last rows of the "mean" escape summary for collection B (seed 10).
# NOTE(review): the saved output for this cell labels the value column
# `median` even though the "mean" entry is requested — confirm whether this is
# a stale output or a labeling bug in the summary table.
rbd_pc_b_escape_dict["mean"].tail()

Unnamed: 0,mutation,epitope,site,wildtype,median
5791,T531R,3,531,R,0.0
5792,T531S,3,531,S,0.0
5793,T531V,3,531,V,0.0
5794,T531W,3,531,W,0.0
5795,T531Y,3,531,Y,0.0
