# What mutation rate should we use?

We have simulated noisy data for libraries with an average of 1, 2, 3, and 4 mutations per variant. If we fit a model to each library, how good are the fits?
Each library has 30,000 variants, and we will use the same parameters in each model.

In [2]:
import pandas as pd
import polyclonal
import pickle
import random
import altair as alt
import numpy

In [3]:
# Load the simulated noisy escape measurements and keep only the three
# concentrations (0.25, 1, and 4) used for fitting.
# `na_filter` is falsy, so pandas leaves empty fields as empty strings rather
# than NaN (unmutated variants have an empty `aa_substitutions`).
noisy_data = (
    pd.read_csv('RBD_variants_escape_noisy.csv', na_filter=None)
    .loc[lambda df: df['concentration'].isin([0.25, 1, 4])]
    .reset_index(drop=True)
)

noisy_data

Unnamed: 0,library,aa_substitutions,concentration,prob_escape,IC90
0,avg1muts,,0.25,0.087480,0.1128
1,avg1muts,,0.25,0.034240,0.1128
2,avg1muts,,0.25,0.037880,0.1128
3,avg1muts,,0.25,0.035730,0.1128
4,avg1muts,,0.25,0.000000,0.1128
...,...,...,...,...,...
359995,avg2muts,Y473E L518F D427L,4.00,0.002918,1.1600
359996,avg1muts,Y473S G413Q,4.00,0.000000,0.5780
359997,avg1muts,Y473V P479R F392W,4.00,0.160200,1.4550
359998,avg3muts,Y489Q N501Y,4.00,0.000000,0.5881


# Polyclonal model simulations

In [7]:
# Fit one polyclonal model per library (average of 1-4 mutations per variant)
# and pickle each fitted model under `scipy_results/`.

# NOTE(review): this seeds only Python's `random` module; confirm whether
# polyclonal's fitting actually draws from it (vs. numpy) if reproducibility matters.
random.seed(123)

# The same model setup is used for every library, so define the records once.
# (epitope, activity) and (epitope, site, escape) rows passed to Polyclonal.
activity_wt_records = [
    ('1', 1.0),
    ('2', 3.0),
    ('3', 2.0),
]
site_escape_records = [
    ('1', 417, 10.0),
    ('2', 484, 10.0),
    ('3', 444, 10.0),
]

for n in [1, 2, 3, 4]:
    # Build fresh DataFrames for each model so no object is shared between fits.
    poly_abs = polyclonal.Polyclonal(
        data_to_fit=noisy_data.query(f"library == 'avg{n}muts'"),
        activity_wt_df=pd.DataFrame.from_records(
            activity_wt_records,
            columns=['epitope', 'activity'],
        ),
        site_escape_df=pd.DataFrame.from_records(
            site_escape_records,
            columns=['epitope', 'site', 'escape'],
        ),
        data_mut_escape_overlap='fill_to_data',
    )

    # `logfreq=500` matches the 500-step spacing of the optimization log below.
    opt_res = poly_abs.fit(logfreq=500)

    # Use a context manager so the pickle file is flushed and closed even if
    # dumping fails (the bare `open(...)` previously leaked the handle).
    model_path = f'scipy_results/noisy_3conc_{n}muts.pkl'
    with open(model_path, 'wb') as model_file:
        pickle.dump(poly_abs, model_file)
    print(f"Model fit on library with {n} mutation per variant on average written to {model_path}")

# First fitting site-level model.
# Starting optimization of 522 parameters at Wed Nov 24 12:15:19 2021.
       step   time_sec       loss   fit_loss reg_escape  regspread
          0   0.019013     2659.6     2659.3    0.29701          0
        500     10.095      450.7     445.11     5.5853          0
       1000     19.934     446.89     440.67     6.2131          0
       1500     30.185     445.25     438.71     6.5333          0
       1633     32.894     445.15     438.58      6.571          0
# Successfully finished at Wed Nov 24 12:15:52 2021.
# Starting optimization of 5796 parameters at Wed Nov 24 12:15:52 2021.
       step   time_sec       loss   fit_loss reg_escape  regspread
          0   0.024132      785.7     705.71      79.99  1.819e-29
        500     13.547     397.01     326.36     52.328     18.324
       1000     27.127      379.6     318.79     38.518     22.295
       1500     40.875     375.65     316.88     35.772         23
       1782     48.548     375.08

## Get correlation between predicted and true beta coefficients for each trained model

In [4]:
# Correlate the escape values in `RBD_mut_escape_df.csv` with each fitted
# model's predicted escape, per epitope, and collect the results for all libraries.

# This table does not depend on the library, so read it once
# instead of once per loop iteration.
true_mut_escape = pd.read_csv('RBD_mut_escape_df.csv')

corrs_by_library = []
for n in [1, 2, 3, 4]:
    # NOTE: unpickling executes arbitrary code; only load pickles written by
    # this notebook. The context manager closes the file handle promptly.
    with open(f'scipy_results/noisy_3conc_{n}muts.pkl', 'rb') as model_file:
        model = pickle.load(model_file)

    # The model's epitopes are prefixed with 'class ' so they match the
    # `epitope` values in the CSV; `validate` guards against duplicated
    # (mutation, epitope) pairs silently inflating the merge.
    mut_escape_pred = true_mut_escape.merge(
        (model.mut_escape_df
         .assign(epitope=lambda x: 'class ' + x['epitope'].astype(str))
         .rename(columns={'escape': 'predicted escape'})
         ),
        on=['mutation', 'epitope'],
        validate='one_to_one',
    )

    corr = (mut_escape_pred
            .groupby('epitope')
            .apply(lambda x: x['escape'].corr(x['predicted escape']))
            .rename('correlation')
            .reset_index()
            )
    # A scalar is broadcast to every row, so no list multiplication is needed.
    corrs_by_library.append(corr.assign(library=f"avg{n}muts"))

# Concatenate once (rather than growing a frame inside the loop) and renumber
# the rows so the index has no duplicate labels.
all_corrs = pd.concat(corrs_by_library, ignore_index=True)
all_corrs.head()

Unnamed: 0,epitope,correlation,library
0,class 1,0.12796,avg1muts
1,class 2,0.857029,avg1muts
2,class 3,0.653858,avg1muts
0,class 1,0.812462,avg2muts
1,class 2,0.956589,avg2muts


In [5]:
# Bar chart of the per-epitope correlations: one facet column per epitope,
# one bar per library. Library labels on the x axis are hidden because the
# bar color already identifies the library.
(
    alt.Chart(all_corrs)
    .mark_bar()
    .encode(
        x=alt.X('library:O', axis=alt.Axis(labels=False)),
        y=alt.Y('correlation:Q'),
        color=alt.Color('library:N'),
        column=alt.Column('epitope:N'),
        tooltip=['library', 'correlation'],
    )
    .properties(width=125, height=200)
)

# Torchdms simulations

In [None]:
# Load the exact (noise-free) simulated escape measurements and keep only the
# three concentrations (0.25, 1, and 4) used for fitting.
# `na_filter` is falsy, so pandas leaves empty fields as empty strings rather
# than NaN (unmutated variants have an empty `aa_substitutions`).
clean_data = (
    pd.read_csv('RBD_variants_escape_exact.csv', na_filter=None)
    .loc[lambda df: df['concentration'].isin([0.25, 1, 4])]
    .reset_index(drop=True)
)

clean_data

Unnamed: 0,library,aa_substitutions,concentration,prob_escape,IC90
0,avg1muts,,0.25,0.025120,0.1128
1,avg1muts,,0.25,0.025120,0.1128
2,avg1muts,,0.25,0.025120,0.1128
3,avg1muts,,0.25,0.025120,0.1128
4,avg1muts,,0.25,0.025120,0.1128
...,...,...,...,...,...
359995,avg4muts,Y508W C525F,1.00,0.019370,0.4073
359996,avg4muts,Y508W C525F,4.00,0.000655,0.4073
359997,avg4muts,Y508W G526L,0.25,0.197600,0.4122
359998,avg4muts,Y508W G526L,1.00,0.019800,0.4122


Write the dataset for each library into its own output directory (first the noisy data, then the clean data).

In [42]:
import torchdms
import Bio.SeqIO
import pickle

# Wild-type protein sequence, obtained by translating the sequence in the FASTA file.
wtseq_dna = Bio.SeqIO.read('RBD_seq.fasta', 'fasta').seq
wtseq_aa = str(wtseq_dna.translate())
assert len(wtseq_aa) == 201

# Pickle [variants DataFrame, wild-type sequence] for each noisy library.
for n in [1, 2, 3, 4]:
    avg_n_data = noisy_data.query(f"library == 'avg{n}muts'")
    # torchdms uses 1-indexed mutations, so shift every site number by -330
    # (presumably the RBD numbering here starts at 331 -- TODO confirm).
    shifted_subs = avg_n_data['aa_substitutions'].apply(
        polyclonal.utils.shift_mut_site, shift=-330
    )
    formatted_data = avg_n_data.assign(aa_substitutions=shifted_subs)
    # 30,000 variants per library x 3 concentrations.
    assert len(formatted_data) == 90000
    with open(f"torchdms_results/noisy_3conc_{n}muts/noisy_3conc_{n}muts_data.pkl", "wb") as f:
        pickle.dump([formatted_data, wtseq_aa], f)
    print(f"Dataset written to torchdms_results/noisy_3conc_{n}muts.")

Dataset written to torchdms_results/noisy_3conc_1muts.
Dataset written to torchdms_results/noisy_3conc_2muts.
Dataset written to torchdms_results/noisy_3conc_3muts.
Dataset written to torchdms_results/noisy_3conc_4muts.


In [4]:
import torchdms
import Bio.SeqIO
import pickle

# Translate the sequence in the FASTA file to get the wild-type protein sequence.
wtseq_dna = Bio.SeqIO.read('RBD_seq.fasta', 'fasta').seq
wtseq_aa = str(wtseq_dna.translate())
assert len(wtseq_aa) == 201

# Pickle [variants DataFrame, wild-type sequence] for each clean library.
for n in [1, 2, 3, 4]:
    avg_n_data = clean_data.query(f"library == 'avg{n}muts'")
    # torchdms uses 1-indexed mutations, so every site number is shifted by -330
    # (presumably the RBD numbering here starts at 331 -- TODO confirm).
    formatted_data = avg_n_data.assign(
        aa_substitutions=avg_n_data['aa_substitutions'].apply(
            polyclonal.utils.shift_mut_site, shift=-330
        )
    )
    # Expect 30,000 variants per library at each of 3 concentrations.
    assert formatted_data.shape[0] == 90000
    with open(f"torchdms_results/clean_3conc_{n}muts/clean_3conc_{n}muts_data.pkl", "wb") as f:
        pickle.dump([formatted_data, wtseq_aa], f)
    print(f"Dataset written to torchdms_results/clean_3conc_{n}muts.")

Dataset written to torchdms_results/clean_3conc_1muts.
Dataset written to torchdms_results/clean_3conc_2muts.
Dataset written to torchdms_results/clean_3conc_3muts.
Dataset written to torchdms_results/clean_3conc_4muts.


Train the `torchdms` models

In [35]:
# Prepare and train a torchdms model for each noisy library by running the
# `tdms` command-line tool from inside that library's results directory.
# NOTE(review): the saved output of this cell is the cell text as a string,
# which suggests it was last run wrapped in quotes (i.e. disabled) -- confirm.
for n in [1,2,3,4]:
    # Per-library thresholds passed to `tdms prep`; entry n-1 is used for the
    # library with an average of n mutations.
    min_test_per_stratum = [200,250,300,350]
    min_count_per_stratum = [800,1200,1600,2000]
    
    # `%cd` changes the kernel's working directory, so the relative arguments
    # below (`*data.pkl`, `prepped`, `config.json`) are resolved in this directory.
    %cd torchdms_results/noisy_3conc_{n}muts
    
    !echo "Prepping dataset."
    !tdms prep --per-stratum-variants-for-test {min_test_per_stratum[n-1]} --skip-stratum-if-count-is-smaller-than {min_count_per_stratum[n-1]} \
    --partition-by library *data.pkl prepped prob_escape
    
    !echo "Training model."
    !tdms go --config config.json

    # Go back up two levels before the next iteration.
    # NOTE(review): if the `%cd` above fails, this still moves up two levels
    # from wherever the kernel is -- confirm the directories exist before running.
    %cd ../.. 

'\nfor n in [1,2,3,4]:\n    min_test_per_stratum = [200,250,300,350]\n    min_count_per_stratum = [800,1200,1600,2000]\n    \n    %cd torchdms_results/noisy_3conc_{n}muts\n    \n    !echo "Prepping dataset."\n    !tdms prep --per-stratum-variants-for-test {min_test_per_stratum[n-1]} --skip-stratum-if-count-is-smaller-than {min_count_per_stratum[n-1]}     --partition-by library *data.pkl prepped prob_escape\n    \n    !echo "Training model."\n    !tdms go --config config.json\n\n    %cd ../.. \n'

In [36]:
# Prepare and train a torchdms model for each clean (noise-free) library by
# running the `tdms` command-line tool from inside that library's results directory.
# NOTE(review): the saved output of this cell is the cell text as a string,
# which suggests it was last run wrapped in quotes (i.e. disabled) -- confirm.
for n in [1,2,3,4]:
    # Per-library thresholds passed to `tdms prep`; entry n-1 is used for the
    # library with an average of n mutations.
    min_test_per_stratum = [200,250,300,350]
    min_count_per_stratum = [800,1200,1600,2000]
    
    # `%cd` changes the kernel's working directory, so the relative arguments
    # below (`*data.pkl`, `prepped`, `config.json`) are resolved in this directory.
    %cd torchdms_results/clean_3conc_{n}muts
    
    !echo "Prepping dataset."
    !tdms prep --per-stratum-variants-for-test {min_test_per_stratum[n-1]} --skip-stratum-if-count-is-smaller-than {min_count_per_stratum[n-1]} \
    --partition-by library *data.pkl prepped prob_escape
    
    !echo "Training model."
    !tdms go --config config.json

    # Go back up two levels before the next iteration.
    # NOTE(review): if the `%cd` above fails, this still moves up two levels
    # from wherever the kernel is -- confirm the directories exist before running.
    %cd ../.. 

'\nfor n in [1,2,3,4]:\n    min_test_per_stratum = [200,250,300,350]\n    min_count_per_stratum = [800,1200,1600,2000]\n    \n    %cd torchdms_results/clean_3conc_{n}muts\n    \n    !echo "Prepping dataset."\n    !tdms prep --per-stratum-variants-for-test {min_test_per_stratum[n-1]} --skip-stratum-if-count-is-smaller-than {min_count_per_stratum[n-1]}     --partition-by library *data.pkl prepped prob_escape\n    \n    !echo "Training model."\n    !tdms go --config config.json\n\n    %cd ../.. \n'