# Process the data from [Cao et al. (2021)](https://www.biorxiv.org/content/10.1101/2021.12.07.470392v1)

First read in the escape data:

In [1]:
import collections

import Bio.SeqIO

import pandas as pd

# Antibodies that go by two names: each key is the main name used in this
# notebook and each value is its alias. Any input that refers to an antibody
# by its alias is renamed to the main (non-alias) name.
aliases = {
    'S309': 'VIR-7831',
    'COV2-2196': 'AZD8895',
    'COV2-2130': 'AZD1061',
    'LY-CoV1404': 'bebtelovimab',
    'BRII-196': 'amubarvimab',
    'REGN10933': 'casirivimab',
    'REGN10987': 'imdevimab',
    'LY-CoV555': 'bamlanivimab',
    'LY-CoV016': 'etesevimab',
}

# Reverse lookup: alias -> main name.
alias_to_name = dict(zip(aliases.values(), aliases.keys()))

# Read the escape measurements and rename any condition given by its alias;
# conditions that are not aliases pass through unchanged.
raw_escape = pd.read_csv('All_NAbs_Mutation.csv')
data = raw_escape.assign(
    condition=raw_escape['condition'].map(lambda n: alias_to_name.get(n, n))
)

In [2]:
# Preview the escape data after conditions were renamed to their main names.
data

Unnamed: 0,condition,site,wildtype,mutation,escape
0,LY-CoV1404,336,C,G,0.032955
1,LY-CoV1404,347,F,N,0.042724
2,LY-CoV1404,347,F,W,0.017269
3,LY-CoV1404,366,S,K,0.001917
4,LY-CoV1404,366,S,P,0.152756
...,...,...,...,...,...
30653,COV2-2308,487,N,Q,0.004714
30654,COV2-2308,487,N,R,0.288806
30655,COV2-2308,487,N,S,0.003046
30656,COV2-2308,487,N,T,0.002914


Mutations without measured escape are missing from the data, so we need to fill them in with zeros.
First make a data frame that lists every amino acid at every RBD site for every condition:

In [3]:
# First and last spike positions (1-based, inclusive) of the region to tile.
spike_start = 331
spike_end = 531

# Slice that region out of the spike sequence; `spike_start - 1` converts the
# 1-based start into a 0-based slice index.
spike_record = Bio.SeqIO.read('spike.fasta', 'fasta')
rbd = str(spike_record.seq[spike_start - 1: spike_end])

# The 20 amino acids in alphabetical one-letter order.
aas = tuple('ACDEFGHIKLMNPQRSTVWY')

# One row per (condition, site, amino acid) combination.
# NOTE(review): the row where `mut` equals the wildtype residue is included
# for every site (e.g. T -> T at site 531) -- confirm that is intended.
rbd_records = []
for condition in data['condition'].unique():
    for site, wildtype in enumerate(rbd, spike_start):
        rbd_records.extend((condition, site, wildtype, mut) for mut in aas)

rbd_df = pd.DataFrame.from_records(
    rbd_records,
    columns=['condition', 'site', 'wildtype', 'mutation'],
)
rbd_df

Unnamed: 0,condition,site,wildtype,mutation
0,LY-CoV1404,331,N,A
1,LY-CoV1404,331,N,C
2,LY-CoV1404,331,N,D
3,LY-CoV1404,331,N,E
4,LY-CoV1404,331,N,F
...,...,...,...,...
992935,COV2-2308,531,T,S
992936,COV2-2308,531,T,T
992937,COV2-2308,531,T,V
992938,COV2-2308,531,T,W


In [4]:
# Combine the measured escape values with the full grid of
# (condition, site, wildtype, mutation) combinations, giving an escape of 0
# to every combination that has no measurement.
merge_keys = ['condition', 'site', 'wildtype', 'mutation']

completed_data = (
    data.merge(rbd_df,
               how='outer',
               # name the keys explicitly rather than relying on whichever
               # columns the two frames happen to share
               on=merge_keys,
               # fail loudly if a key is repeated on either side instead of
               # silently duplicating rows
               validate='one_to_one',
               )
    .assign(escape=lambda x: x['escape'].fillna(0))
    .rename(columns={'escape': 'mut_escape'})
    .sort_values(['condition', 'site'])
    .reset_index(drop=True)
    )

# An outer merge also keeps measured rows that have no partner in `rbd_df`
# (a site outside the tiled region, or a wildtype residue that disagrees with
# the reference sequence). Such rows would make the result longer than the
# grid, so check that none slipped in.
assert len(completed_data) == len(rbd_df), (
    f"{len(completed_data) - len(rbd_df)} measured rows do not match any "
    "(condition, site, wildtype, mutation) combination in rbd_df"
)

completed_data

Unnamed: 0,condition,site,wildtype,mutation,mut_escape
0,1-57,331,N,A,0.0
1,1-57,331,N,C,0.0
2,1-57,331,N,D,0.0
3,1-57,331,N,E,0.0
4,1-57,331,N,F,0.0
...,...,...,...,...,...
992935,WIBP-2B11,531,T,S,0.0
992936,WIBP-2B11,531,T,T,0.0
992937,WIBP-2B11,531,T,V,0.0
992938,WIBP-2B11,531,T,W,0.0


Write the completed data to `data.csv`:

In [5]:
# Save the zero-filled escape table; `index=False` leaves out the row index.
completed_data.to_csv('data.csv', index=False)

Next read in the antibodies:

In [6]:
# Load the per-antibody metadata table.
antibodies = pd.read_csv('antibodies.csv')

# Preview the metadata as read from the file.
antibodies

Unnamed: 0,name,source,epitope group,Omicron-binding,D614G_IC50,SARS_IC50,Beta_IC50,Omicron_IC50,Hchain,Lchain
0,BD-739,vaccine,A,No,0.4690,>10,5.1,>10,MDWTWRFLYVVAAATGVQSQVQLVQSGAEVKKPGSSVKVSCKASGG...,METPAQLLFLLLLWLPDTTGEIVLTQSPGTLSLSPGERATLSCRAS...
1,BRII-196,DOI: 10.1038/s41467-020-20501-9,A,No,0.0527,>10,0.0054,>10,MGWSLILLFLVAVATRVLSEVQLVESGGGLVQPGGSLRLSCAASGI...,MGWSCIILFLVATATGVHSEIVLTQSPGTLSLSPGERATLSCRASQ...
2,C093,DOI: 10.1038/s41586-021-03207-w,A,No,0.0215,>10,0.0359,>10,QVQLQESGPGLVKPSETLSLTCTVSGGSISSYYWSWIRQPPGKGLE...,QSVLTQPPSVSAAPGQKVTISCSGSSSNIGNNYVSWYQQLPGTAPK...
3,COVOX-150,DOI: 10.1016/j.cell.2021.02.032,A,No,0.0840,>10,0.0096,>10,MGWSLILLFLVAVATRVLSQVQLVESGGGLIQPGGSLRLSCAASGV...,MGWSCIILFLVATATGVHSEIVMTQSPSSLSASVGDRVTITCRASQ...
4,BD-822,long-term convalescent,A,No,0.0030,>10,>10,>10,QVTLRESGPALVKPTQTLTLTCSFSGFSLTTRGMCVSWIRQSPGKA...,DIQMTQSPSSLSASVGDRVTITCRASHNINKYLNWYQQKPGKAPKL...
...,...,...,...,...,...,...,...,...,...,...
242,BD55-5226,SARS convalescent,F,Yes,0.7192,0.0363,2.6621,>10,MELGLRWVFLVAILEGVQCEVQLVESGGGLVKPGGSLRLSCAASGF...,DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKL...
243,BD55-5267,SARS convalescent,F,Yes,0.1224,0.0236,0.2434,>10,MKHLWFFLLLVAAPRWVLSQVQLQESGPGLVKPSGTLSLTCAVSGG...,DIQMTQSPSSLSASVGDRVTITCRASQGISSSLAWYQQKPGKAPDL...
244,BD-708,vaccine,F,Yes,0.4680,>10,1.4,>10,MEFGLSWVFLVALLRGVQCQVQLVESGGGVVQPGRSLRLSCAASGF...,MDMRVPAQLLGLLLLWLPGTRCDIQMTQSPSSLSASVGDRVTITCR...
245,BD55-3304,SARS convalescent,F,Yes,0.4793,0.2159,0.0885,3.2,MGWSLILLFLVAVATRVLSQVQLVESGGGVVQPERSLRLSCAASGF...,MGWSCIILFLVATATGVHSDIVMTQSPDSLAVSLGERATISCKSSQ...


Classify antibodies by eliciting virus and move source to notes:

In [7]:
def _eliciting_virus_from_source(source):
    """Return the eliciting-virus label implied by an antibody's source."""
    if source == 'SARS convalescent':
        return 'SARS-CoV-1 then SARS-CoV-2'
    return 'SARS-CoV-2'


# Rename antibodies listed under an alias to their main name first, because
# the override below matches on main names.
main_names = antibodies['name'].map(lambda n: alias_to_name.get(n, n))

# Classify by source, then override S304 and S309 to 'SARS-CoV-1'.
eliciting_virus = (
    antibodies['source']
    .map(_eliciting_virus_from_source)
    .mask(main_names.isin({'S304', 'S309'}), 'SARS-CoV-1')
)

# NOTE: `antibodies` is rebound and its 'source' column is renamed to 'notes',
# so re-running this cell requires re-reading antibodies.csv first.
antibodies = (
    antibodies
    .assign(eliciting_virus=eliciting_virus, name=main_names)
    .rename(columns={'source': 'notes'})
)

# Every antibody must have escape data and vice versa.
assert set(antibodies['name']) == set(data['condition'])

# Number of antibodies per eliciting virus.
antibodies.groupby('eliciting_virus').agg({'name': 'count'})

Unnamed: 0_level_0,name
eliciting_virus,Unnamed: 1_level_1
SARS-CoV-1,2
SARS-CoV-1 then SARS-CoV-2,35
SARS-CoV-2,210


Add other classifications:

In [8]:
# Lookup from epitope group letter to Barnes class.
epitope_group_to_class = dict(
    A='class 1',
    B='class 1',
    C='class 2',
    D='class 3',
    E='class 3',
    F='class 4',
)

# Columns kept in the final metadata table, in output order.
metadata_columns = ['name', 'type', 'subtype', 'year',
                    'eliciting_virus', 'neutralizes_Omicron', 'notes']

annotated = antibodies.assign(
    type='antibody',
    year=2021,
    subtype=antibodies['epitope group'].map(epitope_group_to_class),
    # True unless the Omicron IC50 is the string '>10'
    neutralizes_Omicron=antibodies['Omicron_IC50'].ne('>10'),
    # drop colons from the notes (presumably so they can be written as
    # unquoted YAML values later -- confirm)
    notes=antibodies['notes'].str.replace(':', ''),
)
antibodies = annotated[metadata_columns]

antibodies

Unnamed: 0,name,type,subtype,year,eliciting_virus,neutralizes_Omicron,notes
0,BD-739,antibody,class 1,2021,SARS-CoV-2,False,vaccine
1,BRII-196,antibody,class 1,2021,SARS-CoV-2,False,DOI 10.1038/s41467-020-20501-9
2,C093,antibody,class 1,2021,SARS-CoV-2,False,DOI 10.1038/s41586-021-03207-w
3,COVOX-150,antibody,class 1,2021,SARS-CoV-2,False,DOI 10.1016/j.cell.2021.02.032
4,BD-822,antibody,class 1,2021,SARS-CoV-2,False,long-term convalescent
...,...,...,...,...,...,...,...
242,BD55-5226,antibody,class 4,2021,SARS-CoV-1 then SARS-CoV-2,False,SARS convalescent
243,BD55-5267,antibody,class 4,2021,SARS-CoV-1 then SARS-CoV-2,False,SARS convalescent
244,BD-708,antibody,class 4,2021,SARS-CoV-2,False,vaccine
245,BD55-3304,antibody,class 4,2021,SARS-CoV-1 then SARS-CoV-2,True,SARS convalescent


Write `study.yml`:

In [9]:
# Study-level fields, one "key: value" line each.
study_header = (
    'study_title: B.1.1.529 escapes the majority of SARS-CoV-2 neutralizing antibodies of diverse epitopes',
    'study_first_author: Cao',
    'study_year: 2021',
    'study_journal: bioRxiv',
    'study_url: https://www.biorxiv.org/content/10.1101/2021.12.07.470392v1.full',
    'lab: Xie_XS',
    'spike: Wuhan-Hu-1',
    'notes: data from Y. Cao by e-mail',
)

# Per-antibody fields written under each condition, in this order.
condition_fields = ('type', 'subtype', 'year', 'eliciting_virus',
                    'neutralizes_Omicron', 'notes')

with open('study.yml', 'w') as study_file:
    study_file.writelines(f"{line}\n" for line in study_header)
    study_file.write('conditions:\n')
    for antibody in antibodies.itertuples(index=False):
        # Condition name at two-space indent, its fields at four spaces.
        entry = [f"  {antibody.name}:\n"]
        entry.extend(
            f"    {field}: {getattr(antibody, field)}\n"
            for field in condition_fields
        )
        # Only antibodies with a known alias get an `alias` field.
        if antibody.name in aliases:
            entry.append(f"    alias: {aliases[antibody.name]}\n")
        study_file.writelines(entry)