# What mutation rate should we use?

We have simulated noisy data with average of 1, 2, 3, and 4 mutations. If we fit a model to each of them, how good are the fits? 
Each library has 30,000 variants and we will use the same parameters in each model.

In [2]:
import pandas as pd
import polyclonal
import pickle
import random
import altair as alt
import numpy

In [3]:
noisy_data = (
    pd.read_csv('RBD_variants_escape_noisy.csv', na_filter=None)
    .query('concentration in [0.25, 1, 4]')
    .reset_index(drop=True)
    )

noisy_data

Unnamed: 0,library,aa_substitutions,concentration,prob_escape,IC90
0,avg1muts,,0.25,0.087480,0.1128
1,avg1muts,,0.25,0.034240,0.1128
2,avg1muts,,0.25,0.037880,0.1128
3,avg1muts,,0.25,0.035730,0.1128
4,avg1muts,,0.25,0.000000,0.1128
...,...,...,...,...,...
359995,avg2muts,Y473E L518F D427L,4.00,0.002918,1.1600
359996,avg1muts,Y473S G413Q,4.00,0.000000,0.5780
359997,avg1muts,Y473V P479R F392W,4.00,0.160200,1.4550
359998,avg3muts,Y489Q N501Y,4.00,0.000000,0.5881


# Polyclonal model simulations

In [7]:
random.seed(123)
for n in [1,2,3,4]:
    poly_abs = polyclonal.Polyclonal(data_to_fit=noisy_data.query(f"library == 'avg{n}muts'"),
                                     activity_wt_df=pd.DataFrame.from_records(
                                         [('1', 1.0),
                                          ('2', 3.0),
                                          ('3', 2.0),
                                          ],
                                         columns=['epitope', 'activity'],
                                         ),
                                     site_escape_df=pd.DataFrame.from_records(
                                         [('1', 417, 10.0),
                                          ('2', 484, 10.0),
                                          ('3', 444, 10.0),
                                          ],
                                         columns=['epitope', 'site', 'escape'],
                                         ),
                                     data_mut_escape_overlap='fill_to_data',
                                 )
    
    opt_res = poly_abs.fit(logfreq=500)
    pickle.dump(poly_abs, open(f'scipy_results/noisy_3conc_{n}muts.pkl', 'wb'))
    print(f"Model fit on library with {n} mutation per variant on average written to scipy_results/noisy_3conc_{n}muts.pkl")

# First fitting site-level model.
# Starting optimization of 522 parameters at Wed Nov 24 12:15:19 2021.
       step   time_sec       loss   fit_loss reg_escape  regspread
          0   0.019013     2659.6     2659.3    0.29701          0
        500     10.095      450.7     445.11     5.5853          0
       1000     19.934     446.89     440.67     6.2131          0
       1500     30.185     445.25     438.71     6.5333          0
       1633     32.894     445.15     438.58      6.571          0
# Successfully finished at Wed Nov 24 12:15:52 2021.
# Starting optimization of 5796 parameters at Wed Nov 24 12:15:52 2021.
       step   time_sec       loss   fit_loss reg_escape  regspread
          0   0.024132      785.7     705.71      79.99  1.819e-29
        500     13.547     397.01     326.36     52.328     18.324
       1000     27.127      379.6     318.79     38.518     22.295
       1500     40.875     375.65     316.88     35.772         23
       1782     48.548     375.08

## Get correlation between predicted and true beta coefficients for each trained model

In [3]:
all_corrs = pd.DataFrame({'epitope' : [], 
                          'correlation' : [], 
                          'library' : []}
                        )

for n in [1,2,3,4]:
    model = pickle.load(open(f'scipy_results/noisy_3conc_{n}muts.pkl', 'rb'))

    mut_escape_pred = (
        pd.read_csv('RBD_mut_escape_df.csv')
        .merge((model.mut_escape_df
                .assign(epitope=lambda x: 'class ' + x['epitope'].astype(str))
                .rename(columns={'escape': 'predicted escape'})
                ),
               on=['mutation', 'epitope'],
               validate='one_to_one',
               )
        )

    corr = (mut_escape_pred
            .groupby('epitope')
            .apply(lambda x: x['escape'].corr(x['predicted escape']))
            .rename('correlation')
            .reset_index()
            )
    all_corrs = pd.concat([all_corrs, 
                           corr.assign(library = [f"avg{n}muts"] * len(corr.index))]
                         )
all_corrs.head()

Unnamed: 0,epitope,correlation,library
0,class 1,0.12796,avg1muts
1,class 2,0.857029,avg1muts
2,class 3,0.653858,avg1muts
0,class 1,0.812462,avg2muts
1,class 2,0.956589,avg2muts


In [5]:
chart = alt.Chart(all_corrs).mark_bar().encode(
    x= alt.X('library:O', axis=alt.Axis(labels=False)),
    y='correlation:Q',
    color='library:N',
    column='epitope:N',
    tooltip = ['library', 'correlation']
).properties(width=125, height=200)
chart.save('scipy_results/figures/mutation_rate.pdf')
chart

## Get correlation between predicted and true IC90's for each trained model

To make a fair comparison, we will read in the "exact" simulated data from a library measured at a set of concentrations that none of the above models were trained on.

In [18]:
exact_data = (
    pd.read_csv('RBD_variants_escape_exact.csv', na_filter=None)
    .query('library == "avg4muts"')
    .query('concentration in [0.125, 0.5, 2]')
    .reset_index(drop=True)
    )

In [22]:
exact_data.query('aa_substitutions.str.contains("P384R")')

Unnamed: 0,library,aa_substitutions,concentration,prob_escape,IC90
1824,avg4muts,A344K S371R P384R G447C L452E E471V H519T,0.125,0.9930,157.5000
1825,avg4muts,A344K S371R P384R G447C L452E E471V H519T,0.500,0.9726,157.5000
1826,avg4muts,A344K S371R P384R G447C L452E E471V H519T,2.000,0.8986,157.5000
2238,avg4muts,A344R V382L P384R N448T F464W Q493R S494R,0.125,0.9999,4323.0000
2239,avg4muts,A344R V382L P384R N448T F464W Q493R S494R,0.500,0.9994,4323.0000
...,...,...,...,...,...
87475,avg4muts,Y369M P384R G447A N450L,0.500,0.5510,4.3600
87476,avg4muts,Y369M P384R G447A N450L,2.000,0.2177,4.3600
88104,avg4muts,Y369T P384R G526V,0.125,0.6116,0.9127
88105,avg4muts,Y369T P384R G526V,0.500,0.2183,0.9127


In [24]:
ic90s.query('aa_substitutions.str.contains("P384R")')

Unnamed: 0,aa_substitutions,IC90
1824,A344K S371R P384R G447C L452E E471V H519T,50.0000
2238,A344R V382L P384R N448T F464W Q493R S494R,50.0000
2814,A348H K378V P384R Y396T V445A Q498L G504N,47.3100
3129,A348P P384R R408W V445Y G482S F490T G504C K528Q,50.0000
3915,A352G S373E P384R A520D,2.3430
...,...,...
85911,Y365I P384R I468T F490D,5.0220
86838,Y369E P384R K424R I468A,2.3590
87249,Y369K P384R T385P R408S F486I,35.0600
87474,Y369M P384R G447A N450L,4.3600


To compare the true simulated IC90 values to those predicted by the fit model, we make the comparison on a log scale, and clip IC90s at values >50 as likely to be way outside the dynamic range given the concentrations used.

In [19]:
test_model = pickle.load(open(f'scipy_results/noisy_3conc_1muts.pkl', 'rb'))

In [20]:
import numpy
from plotnine import *

max_ic90 = 50

# we only need the variants, not the concentration for the IC90 comparison
ic90s = (exact_data[['aa_substitutions', 'IC90']]
         .assign(IC90=lambda x: x['IC90'].clip(upper=max_ic90))
         .drop_duplicates()
         )

ic90s = test_model.icXX(ic90s, x=0.9, col='predicted_IC90', max_c=max_ic90)

ic90s = (
    ic90s
    .assign(log_IC90=lambda x: numpy.log10(x['IC90']),
            predicted_log_IC90=lambda x: numpy.log10(x['predicted_IC90']),
            )
    )

corr = ic90s['log_IC90'].corr(ic90s['predicted_log_IC90'])
print(f"Correlation is {corr:.2f}")

ic90_corr_plot = (
    ggplot(ic90s) +
    aes('log_IC90', 'predicted_log_IC90') +
    geom_point(alpha=0.1, size=1) +
    theme_classic() +
    theme(figure_size=(3, 3))
    )

_ = ic90_corr_plot.draw()

ValueError: substitutions not in `allowed_subs`: ['P384W']