In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Ignore the cell below for now

In [2]:
# I don't think we need this barcode look up table, though it may be useful if we want to speed up this script
# codon_variant_table = pd.read_csv('codon_variant_table.txt')
# target = 'SARS-CoV-2'
# library = 'lib1'
# rows_to_select = (codon_variant_table['target']==target) & (codon_variant_table['library']==library)
# codon_variant_table_sars_cov_2 = codon_variant_table[rows_to_select].reset_index(drop=True)

In [3]:
# Raw variant counts, covering every target, library and sample in one long table.
# Source: https://github.com/jbloomlab/SARS-CoV-2-RBD_DMS/blob/master/results/counts/variant_counts.csv
# The downloaded CSV was gzip-compressed and is read here as 'variant_counts.txt.gz';
# pandas infers the compression from the '.gz' extension.
variant_counts = pd.read_csv('variant_counts.txt.gz')

This notebook can be run for library = 'lib1' or 'lib2' -- see below

In [4]:
# Choose which slice of `variant_counts` to process.
# `library` is also used later to name the output CSV file.
target, library = 'SARS-CoV-2', 'lib1'

# Boolean row mask: True where both the target and the library match.
is_target = variant_counts['target'].eq(target)
is_library = variant_counts['library'].eq(library)
rows_to_select_variant_counts = is_target & is_library

In [5]:
# Keep only the selected target/library rows and renumber them from 0.
selected_rows = variant_counts.loc[rows_to_select_variant_counts]
variant_counts_sars_cov_2_df = selected_rows.reset_index(drop=True)

In [6]:
# not sure if this merging is necessary
#merged_df = variant_counts_sars_cov_2_df.merge(codon_variant_table_sars_cov_2,how='inner',on='barcode')
#merged_df = merged_df.reset_index(drop=True).copy()

Filtering on a single barcode, I get 68 rows: four rows corresponding to counts in the 4 Sort-seq bins (Figure 2A), and 64 rows corresponding to counts in the 16x4 Tite-seq bins, i.e., 16 different concentrations and 4 different bins (Figure 2B).

In [7]:
# Show every row recorded for one example barcode.
example_barcode = 'AAAAAAAACAAGCAGA'
is_example_barcode = variant_counts_sars_cov_2_df['barcode'] == example_barcode
variant_counts_sars_cov_2_df[is_example_barcode]

Unnamed: 0,target,library,sample,barcode,count,variant_call_support,codon_substitutions,aa_substitutions,n_codon_substitutions,n_aa_substitutions


In [8]:
# Keep only the Sort-seq samples (Figure 2A): rows whose sample name contains 'SortSeq_bin'.
# The table is still in long format here (one row per barcode and bin); the following
# cells reshape it to one row per barcode with one count column per bin.
is_sort_seq = variant_counts_sars_cov_2_df['sample'].str.contains('SortSeq_bin')
sort_seq_df = (
    variant_counts_sars_cov_2_df[is_sort_seq]
    .reset_index(drop=True)
    .copy()
)
sort_seq_df.head()

Unnamed: 0,target,library,sample,barcode,count,variant_call_support,codon_substitutions,aa_substitutions,n_codon_substitutions,n_aa_substitutions
0,SARS-CoV-2,lib2,SortSeq_bin1,GAGTTGTGTACTACCT,7999,56,ATC2AAC AGA136TAG,I2N R136*,2,2
1,SARS-CoV-2,lib2,SortSeq_bin1,GATAATACTCGCTGAT,7782,36,AGA73CAC CCG177TAG,R73H P177*,2,2
2,SARS-CoV-2,lib2,SortSeq_bin1,AAGGTTCCTAGCCTAT,6530,3,CTT5GTG GCA81ACC GAC90TAC GAC98AAC AAT120TTG A...,L5V A81T D90Y D98N N120L I142F Y165T Y175* L183N,9,9
3,SARS-CoV-2,lib2,SortSeq_bin1,GAAACGTTGACTAATA,5633,23,CTT5ACG TTC47ACC ATA104TCC GAT112CAC GCA145TAG...,L5T F47T I104S D112H A145* Y175R,6,6
4,SARS-CoV-2,lib2,SortSeq_bin1,GTTTAATGATACACAA,5295,26,TGT31TAG TCC41GAC,C31* S41D,2,2


Get the counts for each Sort-seq bin, along with the barcodes and amino-acid substitutions

In [9]:
def _sorted_sort_seq_bin(sample_name):
    """Return the rows of `sort_seq_df` for one Sort-seq bin, ordered by barcode."""
    return sort_seq_df[sort_seq_df['sample'] == sample_name].sort_values(by='barcode')


# Filter and sort each bin once, instead of repeating the work for every extracted column.
sort_seq_bins = {
    name: _sorted_sort_seq_bin(name)
    for name in ('SortSeq_bin1', 'SortSeq_bin2', 'SortSeq_bin3', 'SortSeq_bin4')
}

# Barcodes and amino-acid substitutions are read from bin 1 and serve as the row labels.
barcodes = sort_seq_bins['SortSeq_bin1']['barcode'].values
aa_substitutions = sort_seq_bins['SortSeq_bin1']['aa_substitutions'].values

# The per-bin count arrays are combined purely by position, so every bin must list
# exactly the same barcodes in the same order. Fail loudly instead of silently
# pairing counts with the wrong barcode.
for name, bin_df in sort_seq_bins.items():
    assert np.array_equal(bin_df['barcode'].values, barcodes), (
        f'{name} does not list the same barcodes as SortSeq_bin1'
    )

cts_1 = sort_seq_bins['SortSeq_bin1']['count'].values
cts_2 = sort_seq_bins['SortSeq_bin2']['count'].values
cts_3 = sort_seq_bins['SortSeq_bin3']['count'].values
cts_4 = sort_seq_bins['SortSeq_bin4']['count'].values


In [10]:
# Assemble the wide Sort-seq table: one row per barcode, one count column per bin.
sort_seq_columns = dict(
    barcodes=barcodes,
    aa_substitutions=aa_substitutions,
    SortSeq_bin1=cts_1,
    SortSeq_bin2=cts_2,
    SortSeq_bin3=cts_3,
    SortSeq_bin4=cts_4,
)
sort_seq_df2 = pd.DataFrame(sort_seq_columns)

In [11]:
# Preview the wide Sort-seq table (barcode, substitutions, and one count column per bin).
sort_seq_df2

Unnamed: 0,barcodes,aa_substitutions,SortSeq_bin1,SortSeq_bin2,SortSeq_bin3,SortSeq_bin4
0,AAAAAAAAAAAAGTAA,T3L,32,14,95,468
1,AAAAAAAAAACAACAT,L5Y A42S G83L F156Y,2,24,48,95
2,AAAAAAAAAATAAGGA,L38S C49L P96H T148S,52,26,17,9
3,AAAAAAAAAATACAGA,G51W K87G P96I,372,353,113,11
4,AAAAAAAAAATGCGGT,E10L L95R E186Y,91,53,56,0
...,...,...,...,...,...,...
96507,TTTTTTTACAGTCTTG,Y165S,262,184,449,185
96508,TTTTTTTACCAGACTG,G83V D98L,596,462,376,78
96509,TTTTTTTCTTTAAAAA,V71H S147Q,234,186,134,14
96510,TTTTTTTGAAGCCCCA,,6,8,44,316


In [12]:
#sort_seq_df2.to_csv('sort_seq_df.csv',index=False)

### Get 4x16 Tite-seq concentration specific bin counts

In [13]:
# Keep only the Tite-seq samples: rows whose sample name contains 'TiteSeq'.
is_tite_seq = variant_counts_sars_cov_2_df['sample'].str.contains('TiteSeq')
tite_seq_df = (
    variant_counts_sars_cov_2_df[is_tite_seq]
    .reset_index(drop=True)
    .copy()
)

In [14]:
# Maps each Tite-seq sample name to its array of counts; filled in by the loop in the next cell.
tite_seq_counts_dict = dict()

In [15]:
# Sort by barcode once, up front, rather than re-sorting inside each of the 64 iterations.
# Only the three columns used below are kept. As long as a barcode occurs at most once
# per sample, the per-sample order is the same as filtering first and sorting afterwards.
tite_seq_sorted = tite_seq_df[['sample', 'barcode', 'count']].sort_values(by='barcode')

# Sample names in concentration-major order: TiteSeq_01_bin1, ..., TiteSeq_16_bin4.
# `:02d` zero-pads the concentration index to two digits.
tite_seq_samples = [
    f'TiteSeq_{concentration:02d}_bin{bin_}'
    for concentration in range(1, 16 + 1)
    for bin_ in range(1, 4 + 1)
]

for counter, sample in enumerate(tite_seq_samples, start=1):
    # Counts for this sample, ordered by barcode.
    cts_ = tite_seq_sorted.loc[tite_seq_sorted['sample'] == sample, 'count'].values
    tite_seq_counts_dict[sample] = cts_

    print(f'counter = {counter}, sample = {sample}')

counter = 1, sample = TiteSeq_01_bin1
counter = 2, sample = TiteSeq_01_bin2
counter = 3, sample = TiteSeq_01_bin3
counter = 4, sample = TiteSeq_01_bin4
counter = 5, sample = TiteSeq_02_bin1
counter = 6, sample = TiteSeq_02_bin2
counter = 7, sample = TiteSeq_02_bin3
counter = 8, sample = TiteSeq_02_bin4
counter = 9, sample = TiteSeq_03_bin1
counter = 10, sample = TiteSeq_03_bin2
counter = 11, sample = TiteSeq_03_bin3
counter = 12, sample = TiteSeq_03_bin4
counter = 13, sample = TiteSeq_04_bin1
counter = 14, sample = TiteSeq_04_bin2
counter = 15, sample = TiteSeq_04_bin3
counter = 16, sample = TiteSeq_04_bin4
counter = 17, sample = TiteSeq_05_bin1
counter = 18, sample = TiteSeq_05_bin2
counter = 19, sample = TiteSeq_05_bin3
counter = 20, sample = TiteSeq_05_bin4
counter = 21, sample = TiteSeq_06_bin1
counter = 22, sample = TiteSeq_06_bin2
counter = 23, sample = TiteSeq_06_bin3
counter = 24, sample = TiteSeq_06_bin4
counter = 25, sample = TiteSeq_07_bin1
counter = 26, sample = TiteSeq_07_

In [16]:
# Wide Tite-seq table: one column per sample name in `tite_seq_counts_dict`.
# Rows follow the barcode-sorted order in which the counts were extracted.
tite_seq_df2 = pd.DataFrame(tite_seq_counts_dict)

In [17]:
# `join` aligns the two tables on their row index, i.e. by position here; both were
# built from barcode-sorted arrays. A left join would silently pad with NaN (or drop
# rows) if the tables had different lengths, so check that explicitly first.
assert len(sort_seq_df2) == len(tite_seq_df2), (
    f'Sort-seq table has {len(sort_seq_df2)} rows but Tite-seq table has {len(tite_seq_df2)}'
)
final_df = sort_seq_df2.join(tite_seq_df2).copy()

In [18]:
# Preview the combined Sort-seq + Tite-seq count table.
final_df

Unnamed: 0,barcodes,aa_substitutions,SortSeq_bin1,SortSeq_bin2,SortSeq_bin3,SortSeq_bin4,TiteSeq_01_bin1,TiteSeq_01_bin2,TiteSeq_01_bin3,TiteSeq_01_bin4,...,TiteSeq_14_bin3,TiteSeq_14_bin4,TiteSeq_15_bin1,TiteSeq_15_bin2,TiteSeq_15_bin3,TiteSeq_15_bin4,TiteSeq_16_bin1,TiteSeq_16_bin2,TiteSeq_16_bin3,TiteSeq_16_bin4
0,AAAAAAAAAAAAGTAA,T3L,32,14,95,468,0,1,0,406,...,41,0,321,39,56,0,302,15,0,0
1,AAAAAAAAAACAACAT,L5Y A42S G83L F156Y,2,24,48,95,0,0,2,141,...,0,0,81,3,0,0,47,0,235,0
2,AAAAAAAAAATAAGGA,L38S C49L P96H T148S,52,26,17,9,30,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AAAAAAAAAATACAGA,G51W K87G P96I,372,353,113,11,96,11,0,11,...,0,0,108,4,0,0,79,24,0,0
4,AAAAAAAAAATGCGGT,E10L L95R E186Y,91,53,56,0,7,2,0,0,...,0,0,9,0,0,0,32,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96507,TTTTTTTACAGTCTTG,Y165S,262,184,449,185,0,29,211,45,...,1,1,147,10,0,0,210,30,0,0
96508,TTTTTTTACCAGACTG,G83V D98L,596,462,376,78,0,24,394,13,...,0,0,388,27,0,0,366,40,0,0
96509,TTTTTTTCTTTAAAAA,V71H S147Q,234,186,134,14,49,17,1,0,...,0,0,88,19,0,0,119,5,0,0
96510,TTTTTTTGAAGCCCCA,,6,8,44,316,19,0,10,141,...,71,0,194,23,0,0,215,28,0,0


In [19]:
#final_df.to_csv(f'ace2rbd_raw_counts_{library}.csv',index=False)

### change aa_substitutions to full protein sequences

In [20]:
# Three-letter -> one-letter codes for the 20 standard amino acids.
# The two sequences below are paired position by position.
_three_letter_codes = [
    'Ala', 'Arg', 'Asn', 'Asp', 'Cys',
    'Glu', 'Gln', 'Gly', 'His', 'Ile',
    'Leu', 'Lys', 'Met', 'Phe', 'Pro',
    'Ser', 'Thr', 'Trp', 'Tyr', 'Val',
]
_one_letter_codes = 'ARNDCEQGHILKMFPSTWYV'
abreviation_dict = dict(zip(_three_letter_codes, _one_letter_codes))

# One-letter codes that count as recognized residues ('*' is deliberately not among them).
aas = list(abreviation_dict.values())

In [21]:
# Wild-type protein sequence (one-letter amino-acid codes).
# Protein sequence determined from the wt coding DNA sequence in Snapgene
# 'AATATTACAAACTTGTGCCCTTTTGGTGAAGTTTTTAACGCCACCAGATTTGCATCTGTTTATGCTTGGAACAGGAAGAGAATCAGCAACTGTGTTGCTGATTATTCTGTCCTATATAATTCCGCATCATTTTCCACTTTTAAGTGTTATGGAGTGTCTCCTACTAAATTAAATGATCTCTGCTTTACTAATGTCTATGCAGATTCATTTGTAATTAGAGGTGATGAAGTCAGACAAATCGCTCCAGGGCAAACTGGAAAGATTGCTGATTATAATTATAAATTACCAGATGATTTTACAGGCTGCGTTATAGCTTGGAATTCTAACAATCTTGATTCTAAGGTTGGTGGTAATTATAATTACCTGTATAGATTGTTTAGGAAGTCTAATCTCAAACCTTTTGAGAGAGATATTTCAACTGAAATCTATCAGGCCGGTAGCACACCTTGTAATGGTGTTGAAGGTTTTAATTGTTACTTTCCTTTACAATCATATGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACTTTCTTTTGAACTTCTACATGCACCAGCAACTGTTTGTGGACCTAAAAAGTCTACT'
wt_seq = 'NITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKST'
# Display the sequence length as a sanity check; the mutation positions parsed later
# (0-based after subtracting 1) presumably index into this string — TODO confirm.
len(wt_seq)

201

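Drop NaNs. About 5k variants have no entry in `aa_substitutions`, i.e. no amino-acid mutations. This looked strange at first, but these are presumably wild-type (or synonymous-only) variants rather than bad data, so check whether dropping them is really intended.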

In [22]:
# Drop every row that contains a NaN and renumber the remaining rows,
# reporting the table shape before and after.
# NOTE(review): the NaNs seen in the previews are empty `aa_substitutions` entries,
# i.e. variants with no amino-acid substitutions (presumably wild type) — confirm
# that discarding them is intended.
shape_before = final_df.shape
final_df = final_df.dropna()
final_df = final_df.reset_index(drop=True).copy()
print(shape_before)
print(final_df.shape)

(96512, 70)
(91049, 70)


In [23]:
# Preview the table after rows containing NaN were dropped.
final_df

Unnamed: 0,barcodes,aa_substitutions,SortSeq_bin1,SortSeq_bin2,SortSeq_bin3,SortSeq_bin4,TiteSeq_01_bin1,TiteSeq_01_bin2,TiteSeq_01_bin3,TiteSeq_01_bin4,...,TiteSeq_14_bin3,TiteSeq_14_bin4,TiteSeq_15_bin1,TiteSeq_15_bin2,TiteSeq_15_bin3,TiteSeq_15_bin4,TiteSeq_16_bin1,TiteSeq_16_bin2,TiteSeq_16_bin3,TiteSeq_16_bin4
0,AAAAAAAAAAAAGTAA,T3L,32,14,95,468,0,1,0,406,...,41,0,321,39,56,0,302,15,0,0
1,AAAAAAAAAACAACAT,L5Y A42S G83L F156Y,2,24,48,95,0,0,2,141,...,0,0,81,3,0,0,47,0,235,0
2,AAAAAAAAAATAAGGA,L38S C49L P96H T148S,52,26,17,9,30,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AAAAAAAAAATACAGA,G51W K87G P96I,372,353,113,11,96,11,0,11,...,0,0,108,4,0,0,79,24,0,0
4,AAAAAAAAAATGCGGT,E10L L95R E186Y,91,53,56,0,7,2,0,0,...,0,0,9,0,0,0,32,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91044,TTTTTTTAAGTCCCAA,Y66L G174N,3,8,50,236,0,0,0,239,...,0,0,246,5,0,0,183,26,0,0
91045,TTTTTTTACAGTCTTG,Y165S,262,184,449,185,0,29,211,45,...,1,1,147,10,0,0,210,30,0,0
91046,TTTTTTTACCAGACTG,G83V D98L,596,462,376,78,0,24,394,13,...,0,0,388,27,0,0,366,40,0,0
91047,TTTTTTTCTTTAAAAA,V71H S147Q,234,186,134,14,49,17,1,0,...,0,0,88,19,0,0,119,5,0,0


In [24]:
# Parse aa_substitutions
import re

# Each substitution token looks like 'T3L': original residue, 1-based position, new
# residue. '*' is accepted as a residue symbol too (it appears in the data, e.g. 'R136*').
# A raw string is used so that '\*' reaches the regex engine unchanged instead of being
# an invalid Python string escape (which triggers a DeprecationWarning/SyntaxWarning).
aa_substitution_pattern = re.compile(r'([A-Za-z\*]+)([0-9]+)([A-Za-z\*]+)')

# One list of (original, position, new) string tuples per row of final_df.
matches_list = [aa_substitution_pattern.findall(s) for s in final_df['aa_substitutions']]

In [25]:
# Preview the parsed (original residue, position, new residue) tuples for the first 10 variants.
matches_list[0:10]

[[('T', '3', 'L')],
 [('L', '5', 'Y'), ('A', '42', 'S'), ('G', '83', 'L'), ('F', '156', 'Y')],
 [('L', '38', 'S'), ('C', '49', 'L'), ('P', '96', 'H'), ('T', '148', 'S')],
 [('G', '51', 'W'), ('K', '87', 'G'), ('P', '96', 'I')],
 [('E', '10', 'L'), ('L', '95', 'R'), ('E', '186', 'Y')],
 [('G', '83', 'Y'), ('G', '155', 'S'), ('T', '201', 'K')],
 [('Y', '50', 'H'), ('V', '103', 'S')],
 [('F', '99', 'D')],
 [('K', '48', 'L'),
  ('T', '55', 'V'),
  ('G', '117', 'E'),
  ('T', '140', 'S'),
  ('P', '149', 'S')],
 [('T', '3', 'Q'), ('Y', '165', 'L')]]

In [26]:
# Number of parsed substitutions per variant.
# NOTE(review): this counts every parsed substitution, including ones whose new residue
# is '*', which are later filtered out of `mut_df`; it may therefore exceed the number
# of positions at which the reconstructed sequence differs from the wild type — confirm.
hamming_dist = [len(m) for m in matches_list]

# `DataFrame.insert` raises if the column already exists, so overwrite the column in
# place when this cell is re-run; on the first run it is inserted as the first column.
if 'hamming_dist' in final_df.columns:
    final_df['hamming_dist'] = hamming_dist
else:
    final_df.insert(loc=0, column='hamming_dist', value=hamming_dist)

In [27]:
# Preview the table with the `hamming_dist` column added.
final_df

Unnamed: 0,hamming_dist,barcodes,aa_substitutions,SortSeq_bin1,SortSeq_bin2,SortSeq_bin3,SortSeq_bin4,TiteSeq_01_bin1,TiteSeq_01_bin2,TiteSeq_01_bin3,...,TiteSeq_14_bin3,TiteSeq_14_bin4,TiteSeq_15_bin1,TiteSeq_15_bin2,TiteSeq_15_bin3,TiteSeq_15_bin4,TiteSeq_16_bin1,TiteSeq_16_bin2,TiteSeq_16_bin3,TiteSeq_16_bin4
0,1,AAAAAAAAAAAAGTAA,T3L,32,14,95,468,0,1,0,...,41,0,321,39,56,0,302,15,0,0
1,4,AAAAAAAAAACAACAT,L5Y A42S G83L F156Y,2,24,48,95,0,0,2,...,0,0,81,3,0,0,47,0,235,0
2,4,AAAAAAAAAATAAGGA,L38S C49L P96H T148S,52,26,17,9,30,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,AAAAAAAAAATACAGA,G51W K87G P96I,372,353,113,11,96,11,0,...,0,0,108,4,0,0,79,24,0,0
4,3,AAAAAAAAAATGCGGT,E10L L95R E186Y,91,53,56,0,7,2,0,...,0,0,9,0,0,0,32,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91044,2,TTTTTTTAAGTCCCAA,Y66L G174N,3,8,50,236,0,0,0,...,0,0,246,5,0,0,183,26,0,0
91045,1,TTTTTTTACAGTCTTG,Y165S,262,184,449,185,0,29,211,...,1,1,147,10,0,0,210,30,0,0
91046,2,TTTTTTTACCAGACTG,G83V D98L,596,462,376,78,0,24,394,...,0,0,388,27,0,0,366,40,0,0
91047,2,TTTTTTTCTTTAAAAA,V71H S147Q,234,186,134,14,49,17,1,...,0,0,88,19,0,0,119,5,0,0


In [28]:
### Create mut_df

# One record per parsed substitution in 'aa_substitutions':
#   id - position of the variant in `matches_list` (its row number in final_df)
#   l  - 0-based sequence position (the parsed position is 1-based)
#   c  - new residue
# The table is built directly in memory: no temporary file is written and read back,
# so there is no file handle to leak and no CSV re-parsing of the residue strings.
mut_records = [
    (i, int(l) - 1, c)
    for i, matches in enumerate(matches_list)
    for _, l, c in matches
]
mut_df = pd.DataFrame(mut_records, columns=['id', 'l', 'c'])

# Map long-form aa to short-form aa
#mut_df['c'] = mut_df['c'].map(abreviation_dict).astype(str)

# Remove all unrecognized 'c' (anything that is not one of the codes in `aas`, e.g. '*')
ix = mut_df['c'].isin(aas)
mut_df = mut_df[ix]

# preview mut_df
print(f'min l: {mut_df["l"].min()}')
print(f'max l: {mut_df["l"].max()}')
print(f'max id: {mut_df["id"].max()}')
mut_df.head()

min l: 0
max l: 200
max id: 91048


Unnamed: 0,id,l,c
0,0,2,L
1,1,4,Y
2,1,41,S
3,1,82,L
4,1,155,Y


In [29]:
# old MAVE-NN utility functions
from dev import mutations_to_dataset

In [30]:
# Build the variant protein sequences from the wild-type sequence and the mutation table.
# NOTE(review): the 'x' column of the result is later inserted into final_df by position,
# so this presumably returns exactly one row per variant id, in id order — confirm this
# also holds for variants whose substitutions were all filtered out of `mut_df`.
protein_sequences = mutations_to_dataset(wt_seq=wt_seq, mut_df=mut_df)

In [31]:
# Add the protein sequence of every variant as column 'x'.
# `DataFrame.insert` raises if the column already exists, so overwrite the column in
# place when this cell is re-run; on the first run it is inserted as the first column.
x_values = protein_sequences['x'].values
if 'x' in final_df.columns:
    final_df['x'] = x_values
else:
    final_df.insert(loc=0, column='x', value=x_values)

In [32]:
# Preview the final table with the sequence column `x` added.
final_df

Unnamed: 0,x,hamming_dist,barcodes,aa_substitutions,SortSeq_bin1,SortSeq_bin2,SortSeq_bin3,SortSeq_bin4,TiteSeq_01_bin1,TiteSeq_01_bin2,...,TiteSeq_14_bin3,TiteSeq_14_bin4,TiteSeq_15_bin1,TiteSeq_15_bin2,TiteSeq_15_bin3,TiteSeq_15_bin4,TiteSeq_16_bin1,TiteSeq_16_bin2,TiteSeq_16_bin3,TiteSeq_16_bin4
0,NILNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFST...,1,AAAAAAAAAAAAGTAA,T3L,32,14,95,468,0,1,...,41,0,321,39,56,0,302,15,0,0
1,NITNYCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSSSFST...,4,AAAAAAAAAACAACAT,L5Y A42S G83L F156Y,2,24,48,95,0,0,...,0,0,81,3,0,0,47,0,235,0
2,NITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVSYNSASFST...,4,AAAAAAAAAATAAGGA,L38S C49L P96H T148S,52,26,17,9,30,0,...,0,0,0,0,0,0,0,0,0,0
3,NITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFST...,3,AAAAAAAAAATACAGA,G51W K87G P96I,372,353,113,11,96,11,...,0,0,108,4,0,0,79,24,0,0
4,NITNLCPFGLVFNATRFASVYAWNRKRISNCVADYSVLYNSASFST...,3,AAAAAAAAAATGCGGT,E10L L95R E186Y,91,53,56,0,7,2,...,0,0,9,0,0,0,32,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91044,NITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFST...,2,TTTTTTTAAGTCCCAA,Y66L G174N,3,8,50,236,0,0,...,0,0,246,5,0,0,183,26,0,0
91045,NITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFST...,1,TTTTTTTACAGTCTTG,Y165S,262,184,449,185,0,29,...,1,1,147,10,0,0,210,30,0,0
91046,NITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFST...,2,TTTTTTTACCAGACTG,G83V D98L,596,462,376,78,0,24,...,0,0,388,27,0,0,366,40,0,0
91047,NITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFST...,2,TTTTTTTCTTTAAAAA,V71H S147Q,234,186,134,14,49,17,...,0,0,88,19,0,0,119,5,0,0


In [33]:
# Write the final table to a CSV named after the selected library, without the row index.
output_csv = f'ace2rbd_raw_counts_{library}.csv'
final_df.to_csv(output_csv, index=False)

In [39]:
#manual test
#wt_seq[174-1],final_df.loc[91044]['x'][174-1]

('G', 'N')