# Download and process the data

Download the files from the GitHub repo: https://github.com/jianfcpku/SARS-CoV-2-RBD-DMS-broad

In [1]:
import os
import requests

# from most recent tagged commit
# (every URL embeds the same commit hash, so the downloaded versions are fixed)
files_to_download = [
    "https://raw.githubusercontent.com/jianfcpku/SARS-CoV-2-RBD-DMS-broad/53eb95b075972e2ed738614010e3e807d83fbaa8/data/Neutral_info.csv",
    "https://raw.githubusercontent.com/jianfcpku/SARS-CoV-2-RBD-DMS-broad/53eb95b075972e2ed738614010e3e807d83fbaa8/data/BD45_names.csv",
    "https://github.com/jianfcpku/SARS-CoV-2-RBD-DMS-broad/raw/53eb95b075972e2ed738614010e3e807d83fbaa8/outputs/antibody_clusters.csv",
    "https://github.com/jianfcpku/SARS-CoV-2-RBD-DMS-broad/raw/53eb95b075972e2ed738614010e3e807d83fbaa8/outputs/antibody_clusters_BA1_specific.csv",
    "https://github.com/jianfcpku/SARS-CoV-2-RBD-DMS-broad/raw/53eb95b075972e2ed738614010e3e807d83fbaa8/outputs/results_clean.csv",
    "https://github.com/jianfcpku/SARS-CoV-2-RBD-DMS-broad/raw/53eb95b075972e2ed738614010e3e807d83fbaa8/outputs/results_clean_BA1_specific.csv",
]

# Seconds to wait on the server before giving up, so that a stalled
# connection cannot hang the notebook indefinitely.
download_timeout = 60

for url in files_to_download:
    # Files are saved in the working directory under the last URL path component.
    f = os.path.basename(url)
    if os.path.isfile(f):
        print(f"{f} already exists")
    else:
        print(f"Downloading {f} from {url}")
        response = requests.get(url, timeout=download_timeout)
        # Fail on HTTP errors (404, 500, ...) BEFORE creating the file. Otherwise
        # the error page would be written to disk and, because of the
        # `os.path.isfile` check above, silently reused as data on every later run.
        response.raise_for_status()
        with open(f, "wb") as f_out:
            f_out.write(response.content)

Neutral_info.csv already exists
BD45_names.csv already exists
antibody_clusters.csv already exists
antibody_clusters_BA1_specific.csv already exists
results_clean.csv already exists
results_clean_BA1_specific.csv already exists


Read antibody name mapping:

In [2]:
import pandas as pd

# Mapping from antibody id to a display name (columns `id` and `name`, as shown
# in the preview below); it is merged into the antibody table in a later cell.
names = pd.read_csv("BD45_names.csv")
names.head()

Unnamed: 0,id,name
0,BD45-1,S309
1,BD45-2,CB6
2,BD45-3,REGN10933
3,BD45-4,REGN10987
4,BD45-7,P2B-2F6


Read neutralization data:

In [3]:
# Map the IC50 column names of the neutralization table to the virus names
# reported downstream.
neutralize_renames = {
    "D614G_IC50": "Wuhan-Hu-1",
    "SARS_IC50": "SARS-CoV-1",
    "Omicron_IC50": "Omicron BA.1",
    "BA2_IC50": "Omicron BA.2",
    # The variant is BA.2.12.1 (see the column name and the study title);
    # the label previously read "BA.12.1", which is not a real lineage.
    "BA2_12_1_IC50": "Omicron BA.2.12.1",
    "BA4_IC50": "Omicron BA.4/BA.5",
}

def known_to_neutralize(row, ic50_cutoff=10):
    """Return the viruses neutralized by one antibody as a bracketed list string.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series from ``DataFrame.apply(..., axis=1)``)
        Must be indexable by every virus name in ``neutralize_renames.values()``,
        i.e. the IC50 columns must already have been renamed.
    ic50_cutoff : number, optional
        A virus counts as neutralized when its value is strictly below this
        cutoff. Defaults to 10, the threshold this notebook has always used.

    Returns
    -------
    str
        ``"[virus1, virus2]"`` in the order of ``neutralize_renames``, or ``"[]"``
        if nothing passes the cutoff. Missing (NaN) values never pass because
        ``NaN < cutoff`` is False.
    """
    neutralized = [
        virus for virus in neutralize_renames.values() if row[virus] < ic50_cutoff
    ]
    return "[" + ", ".join(neutralized) + "]"

# convert epitope groups to Barnes classes
epitope_group_to_class = {
    'A': 'class 1',
    'B': 'class 1',
    'C': 'class 2',
    'D': 'class 3',
    'E': 'class 3',
    'F': 'class 4',
    }

# Build one metadata row per antibody from the neutralization table.
antibodies = (
    pd.read_csv("Neutral_info.csv")
    # the antibody id is stored in the unnamed first column of the CSV
    .rename(columns={"Unnamed: 0": "id"})
    # Join on an explicit key and validate cardinality (as the later merge of
    # the escape data does): a duplicated id in `names` now raises here instead
    # of silently duplicating antibody rows.
    .merge(names, how="left", on="id", validate="many_to_one")
    # antibodies without an entry in `names` fall back to their id as the name
    .assign(name=lambda x: x["name"].where(x["name"].notnull(), x["id"]))
    # must happen before `known_to_neutralize`, which looks up the renamed columns
    .rename(columns=neutralize_renames)
    .assign(
        known_to_neutralize=lambda x: x.apply(known_to_neutralize, axis=1),
        # `source_group` values not listed here map to NaN
        eliciting_virus=lambda x: x["source_group"].map(
            {
                "WT": "SARS-CoV-2",
                "BA.1": "SARS-CoV-2;Omicron BA.1",
                "SARS": "SARS-CoV-1 then SARS-CoV-2",
            }
        ),
        # only the first character of `group` is used to pick the class
        subtype=lambda x: x["group"].str[0].map(epitope_group_to_class),
        type="antibody",
        year=2022,
        notes="",
    )
    [["id", "name", "type", "subtype", "year", "eliciting_virus", "known_to_neutralize", "notes"]]
)

Read the escape data for most antibodies:

In [4]:
import Bio.SeqIO

# First and last spike positions (1-based, inclusive) of the region kept as `rbd`.
spike_start = 331
spike_end = 531
spike_record = Bio.SeqIO.read('spike.fasta', 'fasta')
# 1-based inclusive [spike_start, spike_end] -> 0-based half-open slice
rbd = str(spike_record.seq[spike_start - 1: spike_end])

# the 20 amino acids as one-letter codes, in alphabetical order
aas = tuple("ACDEFGHIKLMNPQRSTVWY")

data = pd.read_csv("results_clean.csv").rename(columns={"antibody": "id"})

# ids that have escape data but no row in the antibody table
missing_ids = set(data["id"]).difference(antibodies["id"])
print(f"The following {len(missing_ids)} ids are missing and will be excluded:\n{missing_ids}")

# The default inner merge drops the missing ids reported above; each id must
# match at most one antibody row.
id_to_name = antibodies[["id", "name"]]
data = data.merge(id_to_name, validate="many_to_one", on="id")
data = data.rename(columns={"name": "condition"})
data = data[["condition", "site", "mutation", "mut_escape"]]

# Full grid of (condition, site, amino acid) over the RBD, carrying the
# wildtype residue at each site. Note it includes the wildtype amino acid itself.
site_wildtypes = list(enumerate(rbd, spike_start))
rbd_df = pd.DataFrame.from_records(
    [
        (condition, site, wildtype, mut)
        for condition in data['condition'].unique()
        for site, wildtype in site_wildtypes
        for mut in aas
    ],
    columns=['condition', 'site', 'wildtype', 'mutation'],
)

# The outer merge keeps every measured row and every grid row; the join keys
# are the three columns the two frames share.
data = data.merge(rbd_df, how="outer", on=["condition", "site", "mutation"])
# grid entries with no measurement are assigned an escape of 0
data["mut_escape"] = data["mut_escape"].fillna(0)
data = data.sort_values(["condition", "site"])

data

The following 0 ids are missing and will be excluded:
set()


Unnamed: 0,condition,site,mutation,mut_escape,wildtype
1123456,1-57,331,A,0.0,N
1123457,1-57,331,C,0.0,N
1123458,1-57,331,D,0.0,N
1123459,1-57,331,E,0.0,N
1123460,1-57,331,F,0.0,N
...,...,...,...,...,...
6126475,XGv-422,531,S,0.0,T
6126476,XGv-422,531,T,0.0,T
6126477,XGv-422,531,V,0.0,T
6126478,XGv-422,531,W,0.0,T


Drop antibodies that don't neutralize any viruses:

In [5]:
# An empty bracketed list means no virus passed the neutralization cutoff.
is_non_neutralizing = antibodies["known_to_neutralize"] == "[]"
not_neutralizing = antibodies.loc[is_non_neutralizing, "name"].tolist()
print(f"{len(not_neutralizing)} of {len(antibodies)} are non-neutralizing")

# Filter both tables by antibody name so that they stay consistent.
antibodies = antibodies[~antibodies["name"].isin(not_neutralizing)]

data = data[~data["condition"].isin(not_neutralizing)]

186 of 1538 are non-neutralizing


Drop antibodies with no data:

In [6]:
# Keep only antibodies whose name occurs as a condition in the escape data.
conditions_with_data = data["condition"].unique()
antibodies = antibodies[antibodies["name"].isin(conditions_with_data)]

Read the data for the Omicron BA.1 specific antibodies:

In [7]:
data_ba1 = pd.read_csv("results_clean_BA1_specific.csv").rename(columns={"antibody": "id"})

# None of these should be in our antibodies data frame,
# so make a new antibody data frame that contains them
assert set(data_ba1["id"]).isdisjoint(antibodies["id"])

ba1_metadata_cols = [
    "name", "type", "subtype", "year", "eliciting_virus", "known_to_neutralize", "notes",
]
antibodies_ba1 = data_ba1.rename(columns={"show_name": "name"})
antibodies_ba1 = antibodies_ba1.assign(
    type="antibody",
    # only the first character of `group` is used to pick the class
    subtype=antibodies_ba1["group"].str[0].map(epitope_group_to_class),
    year=2022,
    eliciting_virus="SARS-CoV-2;Omicron BA.1",
    known_to_neutralize="[Omicron BA.1]",
    notes="",
)
# The escape table has many rows per antibody; collapse to the distinct
# metadata rows.
antibodies_ba1 = (
    antibodies_ba1[ba1_metadata_cols]
    .drop_duplicates()
    .reset_index(drop=True)
)
display(antibodies_ba1)

# Conditions in the BA.1 data are labelled by `show_name`.
data_ba1 = data_ba1.rename(columns={"show_name": "condition"})
data_ba1 = data_ba1[["condition", "site", "mutation", "mut_escape"]]

# Substitutions written as <parent aa><site><new aa>, keyed by site number.
ba1_substitutions = [
    "G339D", "S371L", "S373P", "S375F", "K417N", "N440K", "G446S",
    "S477N", "T478K", "E484A", "Q493R", "G496S", "Q498R", "N501Y",
    "Y505H",
]
rbd_ba1_muts = {int(sub[1: -1]): sub for sub in ba1_substitutions}

# Apply the substitutions to `rbd`, checking that the parent residue of each
# substitution matches the sequence at that site.
ba1_residues = []
for position, parent_aa in enumerate(rbd, spike_start):
    substitution = rbd_ba1_muts.get(position)
    if substitution is None:
        ba1_residues.append(parent_aa)
        continue
    assert parent_aa == substitution[0]
    ba1_residues.append(substitution[-1])
rbd_ba1 = "".join(ba1_residues)

# Full grid of (condition, site, amino acid) over the BA.1 RBD; here the
# `wildtype` column holds the BA.1 residue at each site.
ba1_site_wildtypes = list(enumerate(rbd_ba1, spike_start))
rbd_ba1_df = pd.DataFrame.from_records(
    [
        (condition, site, wildtype, mut)
        for condition in data_ba1['condition'].unique()
        for site, wildtype in ba1_site_wildtypes
        for mut in aas
    ],
    columns=['condition', 'site', 'wildtype', 'mutation'],
)

# The outer merge keeps every measured row and every grid row; the join keys
# are the three columns the two frames share.
data_ba1 = data_ba1.merge(rbd_ba1_df, how="outer", on=["condition", "site", "mutation"])
# grid entries with no measurement are assigned an escape of 0
data_ba1["mut_escape"] = data_ba1["mut_escape"].fillna(0)
data_ba1 = data_ba1.sort_values(["condition", "site"]).reset_index(drop=True)

data_ba1

Unnamed: 0,name,type,subtype,year,eliciting_virus,known_to_neutralize,notes
0,BD56-235,antibody,class 1,2022,SARS-CoV-2;Omicron BA.1,[Omicron BA.1],
1,BD56-236,antibody,class 3,2022,SARS-CoV-2;Omicron BA.1,[Omicron BA.1],
2,BD56-266,antibody,class 4,2022,SARS-CoV-2;Omicron BA.1,[Omicron BA.1],
3,BD56-267,antibody,class 4,2022,SARS-CoV-2;Omicron BA.1,[Omicron BA.1],
4,BD56-268,antibody,class 3,2022,SARS-CoV-2;Omicron BA.1,[Omicron BA.1],
...,...,...,...,...,...,...,...
97,BD56-883,antibody,class 4,2022,SARS-CoV-2;Omicron BA.1,[Omicron BA.1],
98,BD56-887,antibody,class 1,2022,SARS-CoV-2;Omicron BA.1,[Omicron BA.1],
99,BD56-888,antibody,class 3,2022,SARS-CoV-2;Omicron BA.1,[Omicron BA.1],
100,BD56-890,antibody,class 1,2022,SARS-CoV-2;Omicron BA.1,[Omicron BA.1],


Unnamed: 0,condition,site,mutation,mut_escape,wildtype
0,BD56-235,331,A,0.0,N
1,BD56-235,331,C,0.0,N
2,BD56-235,331,D,0.0,N
3,BD56-235,331,E,0.0,N
4,BD56-235,331,F,0.0,N
...,...,...,...,...,...
410035,BD56-894,531,S,0.0,T
410036,BD56-894,531,T,0.0,T
410037,BD56-894,531,V,0.0,T
410038,BD56-894,531,W,0.0,T


Write the data:

In [8]:
# Stack the two escape tables and write them out without the index column.
combined_escape = pd.concat([data, data_ba1])
combined_escape.to_csv("data.csv", index=False)

Write the YAML about the study:

In [9]:
# Study-level fields. The final entry ends with a newline so that the
# per-condition entries start on their own line.
study_header = [
    "study_title: BA.2.12.1, BA.4 and BA.5 escape antibodies elicited by Omicron infection",
    "study_first_author: Cao",
    "study_year: 2022",
    "study_journal: bioRxiv",
    "study_url: https://www.biorxiv.org/content/10.1101/2022.04.30.489997v1",
    "lab: Xie_XS",
    "notes: data from https://github.com/jianfcpku/SARS-CoV-2-RBD-DMS-broad",
    "conditions:\n",
]
# Metadata fields written under each condition, in this order.
condition_fields = ['type', 'subtype', 'year', 'eliciting_virus', 'known_to_neutralize', 'notes']

with open('study.yml', 'w') as f:
    f.write('\n'.join(study_header))
    # One two-space-indented entry per antibody, keyed by its name; values are
    # written with plain string formatting (no YAML quoting or escaping).
    all_antibodies = pd.concat([antibodies, antibodies_ba1])
    for record in all_antibodies.itertuples(index=False):
        f.write(f"  {record.name}:\n")
        f.writelines(
            f"    {col}: {getattr(record, col)}\n" for col in condition_fields
        )