# What library size should we use?

We will test library sizes varying from 1000, 5000, 10000, 200000, 30000 variants to see how these improve model training. We will use a library with 3 mutations on average, and a concentration set = [0.25, 1, 4], which were previously determined to be optimal here and here. 

In [1]:
import pandas as pd
import polyclonal
import pickle
import random
import altair as alt
import numpy

In [2]:
noisy_data = (
    pd.read_csv('RBD_variants_escape_noisy.csv', na_filter=None)
    .query("library == 'avg3muts'")
    .query('concentration in [0.25, 1, 4]')
    .reset_index(drop=True)
    )

noisy_data

Unnamed: 0,library,aa_substitutions,concentration,prob_escape,IC90
0,avg3muts,,0.25,0.00000,0.1128
1,avg3muts,,0.25,0.01090,0.1128
2,avg3muts,,0.25,0.01458,0.1128
3,avg3muts,,0.25,0.09465,0.1128
4,avg3muts,,0.25,0.03299,0.1128
...,...,...,...,...,...
89995,avg3muts,Y449I L518Y C525R L461I,4.00,0.02197,2.3100
89996,avg3muts,Y449V K529R N394R,4.00,0.04925,0.9473
89997,avg3muts,Y451L N481T F490V,4.00,0.02315,0.9301
89998,avg3muts,Y453R V483G L492V N501P I332P,4.00,0.00000,5.0120


In [3]:
library_sizes = [1000, 5000, 10000, 20000, 30000]

for size in library_sizes:
    poly_abs = polyclonal.Polyclonal(data_to_fit=(noisy_data.groupby('concentration')
                                                            .apply(lambda x: x.sample(n=size, random_state=123))
                                                            .reset_index(drop = True)),
                                     activity_wt_df=pd.DataFrame.from_records(
                                         [('1', 1.0),
                                          ('2', 3.0),
                                          ('3', 2.0),
                                          ],
                                         columns=['epitope', 'activity'],
                                         ),
                                     site_escape_df=pd.DataFrame.from_records(
                                         [('1', 417, 10.0),
                                          ('2', 484, 10.0),
                                          ('3', 444, 10.0),
                                          ],
                                         columns=['epitope', 'site', 'escape'],
                                         ),
                                     data_mut_escape_overlap='fill_to_data',
                                 )
    
    opt_res = poly_abs.fit(logfreq=500)
    pickle.dump(poly_abs, open(f'scipy_results/libsize{size}_noisy_3conc_3muts.pkl', 'wb'))
    print(f"Model fit on library with {size} variants to scipy_results/libsize{size}_noisy_3conc_3muts.pkl")

# First fitting site-level model.
# Starting optimization of 519 parameters at Fri Nov 26 11:41:39 2021.
       step   time_sec       loss   fit_loss reg_escape  regspread
          0   0.011101     609.84     609.54    0.29701          0
        500     5.3137     56.556     51.293     5.2629          0
       1000     10.359     55.669     50.234     5.4356          0
       1239     12.641     55.516      50.03     5.4861          0
# Successfully finished at Fri Nov 26 11:41:52 2021.
# Starting optimization of 5448 parameters at Fri Nov 26 11:41:52 2021.
       step   time_sec       loss   fit_loss reg_escape  regspread
          0   0.010517     112.21     51.685     60.526 1.4313e-29
        500     6.2786     36.836     13.726     16.696      6.414
        776     9.6045     36.248     14.414     15.558     6.2759
# Successfully finished at Fri Nov 26 11:42:02 2021.
Model fit on library with 1000 variants to scipy_results/libsize1000_noisy_3conc_3muts.pkl
# First fitting site-le

## Get correlation between predicted and true beta coefficients for each trained model

In [4]:
library_sizes = [1000, 5000, 10000, 20000, 30000]
all_corrs = pd.DataFrame({'epitope' : [], 
                          'correlation' : [], 
                          'num_variants' : []}
                        )

for size in library_sizes:
    model = pickle.load(open(f'scipy_results/libsize{size}_noisy_3conc_3muts.pkl', 'rb'))

    mut_escape_pred = (
        pd.read_csv('RBD_mut_escape_df.csv')
        .merge((model.mut_escape_df
                .assign(epitope=lambda x: 'class ' + x['epitope'].astype(str))
                .rename(columns={'escape': 'predicted escape'})
                ),
               on=['mutation', 'epitope'],
               validate='one_to_one',
               )
        )

    corr = (mut_escape_pred
            .groupby('epitope')
            .apply(lambda x: x['escape'].corr(x['predicted escape']))
            .rename('correlation')
            .reset_index()
            )
    all_corrs = pd.concat([all_corrs, 
                           corr.assign(num_variants = [size] * len(corr.index))]
                         )
all_corrs.head()

Unnamed: 0,epitope,correlation,num_variants
0,class 1,0.024152,1000.0
1,class 2,0.619466,1000.0
2,class 3,0.142082,1000.0
0,class 1,0.231682,5000.0
1,class 2,0.809985,5000.0


In [6]:
base = alt.Chart(all_corrs).mark_point().encode(
    alt.X('num_variants:Q'),
    alt.Y('correlation:Q'),
    alt.Color('epitope:N'),
    tooltip=['num_variants', 'correlation', 'epitope']
)

chart = base + base.transform_loess('num_variants', 'correlation', groupby=['epitope']).mark_line(size=2.5)
chart.save('scipy_results/figures/library_size.pdf')
chart