# Library size

We’ll use simulated data to show how the number of variants in a DMS library affects the performance of `Polyclonal` models.

In [1]:
import time
import os

import pandas as pd
import altair as alt
import numpy as np
import polyclonal

We have simulated a noisy DMS library containing 30,000 variants, each measured at three different serum concentrations. The number of mutations per variant in this library was drawn from a Poisson distribution with a mean of 3.

In [3]:
# Read the full simulated (noisy) variant table. `na_filter=None` is falsy, so
# pandas skips missing-value detection and blank `aa_substitutions` entries are
# not converted to NaN.
all_noisy_variants = pd.read_csv('RBD_variants_escape_noisy.csv', na_filter=None)

# Restrict to the avg3muts library at the three concentrations used for fitting,
# then renumber the rows from 0.
noisy_data = (
    all_noisy_variants
    .query("library == 'avg3muts'")
    .query('concentration in [0.25, 1, 4]')
    .reset_index(drop=True)
)

noisy_data

Unnamed: 0,library,aa_substitutions,concentration,prob_escape,IC90
0,avg3muts,,0.25,0.00000,0.1128
1,avg3muts,,0.25,0.01090,0.1128
2,avg3muts,,0.25,0.01458,0.1128
3,avg3muts,,0.25,0.09465,0.1128
4,avg3muts,,0.25,0.03299,0.1128
...,...,...,...,...,...
89995,avg3muts,Y449I L518Y C525R L461I,4.00,0.02197,2.3100
89996,avg3muts,Y449V K529R N394R,4.00,0.04925,0.9473
89997,avg3muts,Y451L N481T F490V,4.00,0.02315,0.9301
89998,avg3muts,Y453R V483G L492V N501P I332P,4.00,0.00000,5.0120


We’ll randomly subsample smaller fractions of variants in our library and fit a `Polyclonal` model to each of the subsets.

In [7]:
library_sizes = [1000, 2500, 5000, 10000, 20000, 30000]

# Fitted models keyed by '<n>variants', kept so later cells can look them up
fit_models = {}

for n_variants in library_sizes:
    # Draw `n_variants` rows from each concentration group. The same fixed seed
    # is re-applied to every group because `sample` is called once per group.
    subsampled_data = (
        noisy_data
        .groupby('concentration')
        .apply(lambda grp: grp.sample(n=n_variants, random_state=123))
        .reset_index(drop=True)
    )

    # Per-epitope wild-type activities handed to the model
    # (presumably starting values for the fit -- confirm against polyclonal docs).
    activity_wt = pd.DataFrame.from_records(
        [('1', 1.0), ('2', 3.0), ('3', 2.0)],
        columns=['epitope', 'activity'],
    )

    # One site per epitope with a large escape value
    # (presumably starting values as well -- confirm against polyclonal docs).
    site_escape = pd.DataFrame.from_records(
        [('1', 417, 10.0), ('2', 484, 10.0), ('3', 444, 10.0)],
        columns=['epitope', 'site', 'escape'],
    )

    poly_model = polyclonal.Polyclonal(
        data_to_fit=subsampled_data,
        activity_wt_df=activity_wt,
        site_escape_df=site_escape,
        data_mut_escape_overlap='fill_to_data',
    )

    print(f"Fitting model on library with {n_variants} variants.")
    poly_model.fit()

    fit_models[f'{n_variants}variants'] = poly_model

Fitting model on library with 1000 variants.
Fitting model on library with 2500 variants.
Fitting model on library with 5000 variants.
Fitting model on library with 10000 variants.
Fitting model on library with 20000 variants.
Fitting model on library with 30000 variants.


We can look at the correlation between predicted and true beta coefficients (mutation effects at each epitope) for the fit models.

In [8]:
# The true (simulated) per-mutation escape table is the same for every library
# size, so read it once instead of re-reading the CSV on each iteration.
true_mut_escape = pd.read_csv('RBD_mut_escape_df.csv')

# Collect one small frame per library size and concatenate once at the end.
# This avoids quadratic re-copying from concat-in-a-loop and avoids seeding the
# result with an empty frame, whose handling in `pd.concat` is deprecated.
corr_frames = []

for size in library_sizes:
    model = fit_models[f'{size}variants']

    # Pair each true escape value with the model's prediction. The model's
    # epitope labels are prefixed with 'class ' so they match the labels in the
    # true table; `validate` asserts one row per (mutation, epitope) pair.
    mut_escape_pred = true_mut_escape.merge(
        (model.mut_escape_df
         .assign(epitope=lambda x: 'class ' + x['epitope'].astype(str))
         .rename(columns={'escape': 'predicted escape'})
         ),
        on=['mutation', 'epitope'],
        validate='one_to_one',
    )

    # Correlation of true vs. predicted escape within each epitope. Selecting
    # the two value columns keeps `apply` off the grouping column (operating on
    # it is deprecated in recent pandas) without changing the result.
    corr = (mut_escape_pred
            .groupby('epitope')[['escape', 'predicted escape']]
            .apply(lambda x: x['escape'].corr(x['predicted escape']))
            .rename('correlation')
            .reset_index()
            )

    # Library size is stored as a string, as before; the plotting cell encodes
    # it as quantitative.
    corr_frames.append(corr.assign(num_variants=str(size)))

# Columns: epitope, correlation, num_variants (one row per epitope per size)
all_corrs = pd.concat(corr_frames)

In [9]:
# NBVAL_IGNORE_OUTPUT
# Points: per-epitope correlation against library size
base = (
    alt.Chart(all_corrs)
    .mark_point()
    .encode(
        alt.X('num_variants:Q'),
        alt.Y('correlation:Q'),
        alt.Color('epitope:N'),
        tooltip=[
            'num_variants',
            alt.Tooltip('correlation', format='.3f'),
            'epitope',
        ],
    )
)

# LOESS-smoothed trend line per epitope, drawn over the points
loess_trend = (
    base
    .transform_loess('num_variants', 'correlation', groupby=['epitope'])
    .mark_line(size=2.5)
    .properties(title='predicted vs. true beta coefficients')
)

base + loess_trend

Additionally, we’ll look at the correlation between predicted and true IC90s for each of the fit models. To do this, we’ll predict the IC90s of variants in a different simulated library that our models were not fit on.

In [10]:
# Read the exact (noise-free, per the file name) simulated variant table.
# `na_filter=None` is falsy, so pandas skips missing-value detection and blank
# `aa_substitutions` entries are not converted to NaN.
all_exact_variants = pd.read_csv('RBD_variants_escape_exact.csv', na_filter=None)

# Use the avg4muts library, which the models above were not fit on, at a single
# concentration, then renumber the rows from 0.
exact_data = (
    all_exact_variants
    .query('library == "avg4muts"')
    .query('concentration in [1]')
    .reset_index(drop=True)
)

We’ll make the comparison on a log scale, and clip IC90s at values >50 as that is likely to be way outside the dynamic range given the concentrations used.

In [12]:
# Upper bound applied to both the true IC90s (via clip) and the predicted
# IC90s (via `max_c`).
max_ic90 = 50

# Collect one single-row frame per library size and concatenate once at the
# end. This avoids growing a DataFrame inside the loop and avoids seeding the
# result with an empty frame, whose handling in `pd.concat` is deprecated.
ic90_corr_frames = []

for size in library_sizes:
    model = fit_models[f'{size}variants']

    # True IC90 per unique variant in the held-out library, clipped at max_ic90
    ic90s = (exact_data[['aa_substitutions', 'IC90']]
         .assign(IC90=lambda x: x['IC90'].clip(upper=max_ic90))
         .drop_duplicates()
         )
    # Filter to variants this model can score (by the method name, those whose
    # mutations were all seen during fitting), then add the predicted IC90.
    ic90s = model.filter_variants_by_seen_muts(ic90s)
    ic90s = model.icXX(ic90s, x=0.9, col='predicted_IC90', max_c=max_ic90)

    # Compare on a log10 scale
    ic90s = (
        ic90s
        .assign(log_IC90=lambda x: np.log10(x['IC90']),
            predicted_log_IC90=lambda x: np.log10(x['predicted_IC90']),
            )
    )

    corr = ic90s['log_IC90'].corr(ic90s['predicted_log_IC90'])

    # Library size is stored as a string, as before; the plotting cell encodes
    # it as quantitative.
    ic90_corr_frames.append(
        pd.DataFrame({'correlation' : corr,
                      'num_variants' : [str(size)]})
    )

# Columns: correlation, num_variants (one row per library size)
ic90_corrs = pd.concat(ic90_corr_frames)

In [13]:
# NBVAL_IGNORE_OUTPUT
# Points: correlation of log IC90s against library size
base = (
    alt.Chart(ic90_corrs)
    .mark_point()
    .encode(
        alt.X('num_variants:Q'),
        alt.Y('correlation:Q'),
        tooltip=[
            'num_variants',
            alt.Tooltip('correlation', format='.3f'),
        ],
    )
)

# LOESS-smoothed trend line drawn over the points
loess_trend = (
    base
    .transform_loess('num_variants', 'correlation')
    .mark_line(size=2.5)
    .properties(title='predicted vs. true IC90')
)

base + loess_trend

## Summary

Library size is an important factor. Having greater than 10,000 (functional) multiply-mutated variants led to higher correlation between predicted and true beta coefficients for each epitope, particularly the subdominant epitopes.