# Process the deep mutational scanning data

The data come from the following papers from Yunlong Cao's group:

 - [Imprinted SARS-CoV-2 humoral immunity induces convergent Omicron RBD evolution (2022)](https://www.nature.com/articles/s41586-022-05644-7)
 
 - [Repeated Omicron infection alleviates SARS-CoV-2 immune imprinting (2023)](https://www.biorxiv.org/content/10.1101/2023.05.01.538516v2)

In [1]:
import os

import pandas as pd

## Process data from [Repeated Omicron infection alleviates SARS-CoV-2 immune imprinting (2023)](https://www.biorxiv.org/content/10.1101/2023.05.01.538516v2)

In [2]:
# Load the antibody table for the 2023 study. The column that pandas labels
# "Unnamed: 0" (i.e. one with no header in the CSV) is renamed to "antibody",
# and every row is tagged with the study it came from.
repeated_2023_info = pd.read_csv(
    "Cao_data/SARS-CoV-2-reinfection-DMS/antibody_info.csv"
)
repeated_2023_info = repeated_2023_info.rename(columns={"Unnamed: 0": "antibody"})
repeated_2023_info = repeated_2023_info.assign(study="repeated_2023")

# sanity check: each antibody occupies exactly one row
assert repeated_2023_info["antibody"].nunique() == len(repeated_2023_info)

# Keep only the columns that say where each antibody came from.
repeated_2023_source = repeated_2023_info[["antibody", "source", "study"]]

# Tabulate the number of distinct antibodies per source, largest first.
print("Number of antibodies from each source:")
display(
    repeated_2023_source.groupby(["study", "source"])
    .aggregate(n_antibodies=pd.NamedAgg(column="antibody", aggfunc="nunique"))
    .sort_values(by="n_antibodies", ascending=False)
)

# Translation from the column prefix used in the antibody table (the part of
# the column name before "_IC50") to the conventional name of the virus.
repeated_2023_virus_names = {
    "D614G": "D614G",
    "BA1": "BA.1",
    "BA2": "BA.2",
    "BA3": "BA.3",
    "SARS": "SARS",
    "BA1_1": "BA.1.1",
    "BA2_12_1": "BA.2.12.1",
    "BA2_75": "BA.2.75",
    "BA5": "BA.5",
    "BF7": "BF.7",
    "BA2_13": "BA.2.13",
    "BQ1_1": "BQ.1.1",
    "XBB": "XBB",
    "XBB1_5": "XBB.1.5",
    "XBB1_5_10": "XBB.1.5.10",
}

# get the IC50s: reshape the wide "<virus>_IC50" columns into one row per
# (antibody, virus) measurement
repeated_2023_ic50s = (
    repeated_2023_info
    .melt(
        id_vars=["study", "antibody"],
        value_vars=[c for c in repeated_2023_info if c.endswith("_IC50")],
        var_name="virus",
        value_name="IC50",
    )
    # drop missing measurements; a value of 10 means non-neutralizing
    .query("IC50.notnull()")
    # regex=False: "_IC50" is a literal string, not a pattern
    .assign(virus=lambda x: x["virus"].str.replace("_IC50", "", regex=False))
)

# `Series.map` turns unknown keys into NaN without complaint, so check up
# front that every virus with data has a name defined above.
unmapped_viruses = sorted(
    set(repeated_2023_ic50s["virus"]) - set(repeated_2023_virus_names)
)
if unmapped_viruses:
    raise ValueError(
        f"no virus name defined for IC50 columns with prefix: {unmapped_viruses}"
    )

repeated_2023_ic50s = repeated_2023_ic50s.assign(
    virus=lambda x: x["virus"].map(repeated_2023_virus_names)
)

# Summarize which combinations of viruses have IC50s, and how many antibodies
# have each combination.
print("\nNumber of IC50s per antibody:")
display(
    repeated_2023_ic50s
    .groupby("antibody", as_index=False)
    .aggregate(viruses_w_IC50s=pd.NamedAgg("virus", lambda s: "; ".join(s)))
    .groupby("viruses_w_IC50s")
    .aggregate(n_antibodies=pd.NamedAgg("antibody", "nunique"))
    .sort_values("n_antibodies", ascending=False)
)

Number of antibodies from each source:


Unnamed: 0_level_0,Unnamed: 1_level_0,n_antibodies
study,source,Unnamed: 2_level_1
repeated_2023,BA.5 convalescents,445
repeated_2023,BA.1 convalescents reinfection,284
repeated_2023,BF.7 convalescents,243
repeated_2023,BA.2 convalescents reinfection,232
repeated_2023,BA.2 convalescents,53
repeated_2023,long-term BA.1 convalescents,38
repeated_2023,BA.1 convalescents,36
repeated_2023,WT convalescents,12
repeated_2023,SARS convalescents,5
repeated_2023,WT mouse,2



Number of IC50s per antibody:


Unnamed: 0_level_0,n_antibodies
viruses_w_IC50s,Unnamed: 1_level_1
D614G; BA.1; BA.2; SARS; BA.2.75; BA.5; BF.7; BQ.1.1; XBB; XBB.1.5; XBB.1.5.10,428
D614G; BA.1; BA.2; SARS; BA.2.75; BA.5; BF.7; BQ.1.1; XBB; XBB.1.5,181
D614G; BA.1; BA.2; BA.3; SARS; BA.1.1; BA.2.12.1; BA.2.75; BA.5; BA.2.13; BQ.1.1; XBB; XBB.1.5; XBB.1.5.10,137
D614G; BA.1; BA.2; SARS; BA.2.75; BA.5; BQ.1.1; XBB; XBB.1.5; XBB.1.5.10,130
D614G; BA.1; BA.2; BA.2.75; BA.5; BF.7; BQ.1.1; XBB; XBB.1.5; XBB.1.5.10,129
D614G; BA.1; BA.2; SARS; BA.2.75; BA.5; BQ.1.1; XBB; XBB.1.5,92
D614G; BA.1; BA.2; BA.2.75; BA.5; BQ.1.1; XBB; XBB.1.5,48
D614G; SARS; BA.5; BQ.1.1; XBB; XBB.1.5; XBB.1.5.10,42
D614G; BA.1; BA.2; BA.2.75; BA.5; BQ.1.1; XBB; XBB.1.5; XBB.1.5.10,34
D614G; BA.1; BA.2; BA.3; SARS; BA.1.1; BA.2.12.1; BA.2.75; BA.5; BA.2.13; BQ.1.1; XBB; XBB.1.5,17


## Process data from [Imprinted SARS-CoV-2 humoral immunity induces convergent Omicron RBD evolution (2022)](https://www.nature.com/articles/s41586-022-05644-7)

In [3]:
# read the data, which is in multiple sheets: `sheet_name=None` returns a dict
# of sheet name -> data frame, `header=1` takes column names from the second
# row of each sheet, and columns whose name starts with "Unnamed" are skipped
imprinted_2022_raw_info = pd.read_excel(
    "Cao_data/convergent_RBD_evolution/antibody_info.xlsx",
    header=1,
    sheet_name=None,
    usecols=lambda c: not c.startswith("Unnamed"),
)

# Find the columns present in every sheet. After each sheet the running list
# holds the shared columns in the order they appear in that sheet.
common_cols = None

for sheet in imprinted_2022_raw_info.values():
    columns = sheet.columns.tolist()
    if common_cols is None:
        common_cols = columns
    else:
        # build the lookup set once per sheet rather than once per column
        shared_so_far = set(common_cols)
        common_cols = [c for c in columns if c in shared_so_far]

# Stack the sheets on their shared columns and tag the rows with the study.
imprinted_2022_info = pd.concat(
    [sheet[common_cols] for sheet in imprinted_2022_raw_info.values()],
    ignore_index=True,
).rename(columns={"Antibody  Name": "antibody"}).assign(study="imprinted_2022")

# sanity check: each antibody occupies exactly one row across all sheets
assert len(imprinted_2022_info) == imprinted_2022_info["antibody"].nunique()

# Keep only the columns that say where each antibody came from, and tabulate
# the number of distinct antibodies per source.
imprinted_2022_source = imprinted_2022_info[["antibody", "source", "study"]]
display(
    imprinted_2022_source.groupby(["study", "source"]).aggregate(
        n_antibodies=pd.NamedAgg(column="antibody", aggfunc="nunique")
    )
)

# Reshape the per-virus IC50 columns into one row per (antibody, virus).
imprinted_2022_ic50s = imprinted_2022_info.melt(
    id_vars=["study", "antibody"],
    value_vars=["D614G", "BA.1", "BA.2", "BA.5", "BA.2.75", "BQ.1.1", "XBB"],
    var_name="virus",
    value_name="IC50",
)
# Discard rows whose IC50 is the placeholder string "--".
imprinted_2022_ic50s = imprinted_2022_ic50s.query("IC50 != '--'")
# Make the IC50s numeric, recording the censored value ">10" as 10.
imprinted_2022_ic50s = imprinted_2022_ic50s.assign(
    IC50=lambda x: x["IC50"].map(
        lambda ic50: float(ic50) if ic50 != ">10" else 10
    )
)

# Summarize which combinations of viruses have IC50s, and how many antibodies
# have each combination.
print("\nNumber of IC50s per antibody:")
display(
    imprinted_2022_ic50s.groupby("antibody", as_index=False)
    .aggregate(
        viruses_w_IC50s=pd.NamedAgg(
            column="virus", aggfunc=lambda names: "; ".join(names)
        )
    )
    .groupby("viruses_w_IC50s")
    .aggregate(n_antibodies=pd.NamedAgg(column="antibody", aggfunc="nunique"))
    .sort_values(by="n_antibodies", ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,n_antibodies
study,source,Unnamed: 2_level_1
imprinted_2022,BA.1 convalescents,752
imprinted_2022,BA.2 convalescents,751
imprinted_2022,BA.5 convalescents,269
imprinted_2022,SARS convalescents,881
imprinted_2022,WT convalescents,438
imprinted_2022,WT vaccinees,240
imprinted_2022,WT-engineered,2



Number of IC50s per antibody:


Unnamed: 0_level_0,n_antibodies
viruses_w_IC50s,Unnamed: 1_level_1
D614G; BA.1; BA.2; BA.5; BA.2.75; BQ.1.1; XBB,3309
D614G; BA.1; BA.2; BA.5; BA.2.75,13
D614G; BA.1; BA.2; BA.5; BA.2.75; BQ.1.1,3
D614G; BA.1; BA.2; BA.5; BA.2.75; XBB,3
BQ.1.1; XBB,2
BA.1; BA.2; BA.5; BA.2.75; BQ.1.1; XBB,1
BA.1; BQ.1.1; XBB,1
D614G; BQ.1.1; XBB,1


## Aggregate the antibody information from the different studies
First the antibody sources:

In [4]:
# Stack the per-study source tables. The 2023 study is listed first, so its
# row is the one retained for any antibody that appears in both studies.
antibody_source = pd.concat(
    [repeated_2023_source, imprinted_2022_source],
    ignore_index=True,
)

# Flag every occurrence of an antibody after its first one.
is_repeat = antibody_source.duplicated("antibody", keep="first")

print("Dropping the following antibodies duplicated across studies:")
display(antibody_source[is_repeat].reset_index(drop=True))

# keep just one of antibodies repeated across studies. The whole first row is
# kept: `groupby(...).first()` would instead take the first *non-null* value of
# each column, which can combine values from different studies into one row.
antibody_source = (
    antibody_source[~is_repeat]
    .sort_values("antibody")
    .reset_index(drop=True)
)

print("\nSources of retained antibodies:")
display(antibody_source)

os.makedirs("results", exist_ok=True)
antibody_source.to_csv("results/antibody_sources.csv", index=False)

Dropping the following antibodies duplicated across studies:


Unnamed: 0,antibody,source,study
0,BD55-1205,WT convalescents,imprinted_2022
1,BD55-3372,SARS convalescents,imprinted_2022
2,BD55-4637,SARS convalescents,imprinted_2022
3,BD55-5483,SARS convalescents,imprinted_2022
4,BD55-5514,SARS convalescents,imprinted_2022
5,BD55-5840,SARS convalescents,imprinted_2022



Sources of retained antibodies:


Unnamed: 0,antibody,source,study
0,1-57,WT convalescents,imprinted_2022
1,2-15,WT convalescents,imprinted_2022
2,7D6,WT convalescents,imprinted_2022
3,ADG-2,SARS convalescents,imprinted_2022
4,B38,WT convalescents,imprinted_2022
...,...,...,...
4672,XGv-416,WT vaccinees,imprinted_2022
4673,XGv-418,WT vaccinees,imprinted_2022
4674,XGv-420,WT vaccinees,imprinted_2022
4675,XGv-421,WT vaccinees,imprinted_2022


Now the IC50s:

In [5]:
# Stack the per-study IC50 tables. The 2023 study is listed first, so its
# measurement is the one retained for any antibody / virus pair that appears
# in both studies.
antibody_ic50s = pd.concat(
    [repeated_2023_ic50s, imprinted_2022_ic50s],
    ignore_index=True,
)

# Flag every occurrence of an antibody / virus pair after its first one.
is_repeat = antibody_ic50s.duplicated(["antibody", "virus"], keep="first")

print("Dropping the following IC50s duplicated across studies:")
display(antibody_ic50s[is_repeat].reset_index(drop=True))

# keep just one of IC50s repeated across studies. The whole first row is kept:
# `groupby(...).first()` would instead take the first *non-null* value of each
# column, which can pair an IC50 from one study with the label of another.
antibody_ic50s = (
    antibody_ic50s[~is_repeat]
    .sort_values(["antibody", "virus"])
    .reset_index(drop=True)
    [["antibody", "virus", "study", "IC50"]]  # key columns first in the output
)

print("\nIC50s of retained antibodies:")
display(antibody_ic50s)

# An antibody counts as non-neutralizing when every one of its IC50s equals 10.
print("\nNumber of antibodies with no neutralization on any virus:")
assert antibody_ic50s["IC50"].dtype == float
display(
    antibody_ic50s
    .assign(non_neut=lambda x: x["IC50"] == 10)
    .groupby("antibody", as_index=False)
    .aggregate({"non_neut": "all"})
    .groupby("non_neut")
    .aggregate(n_antibodies=pd.NamedAgg("antibody", "nunique"))
)

antibody_ic50s.to_csv("results/antibody_IC50s.csv", index=False)

Dropping the following IC50s duplicated across studies:


Unnamed: 0,study,antibody,virus,IC50
0,imprinted_2022,BD55-1205,D614G,0.00331
1,imprinted_2022,BD55-3372,D614G,0.0068
2,imprinted_2022,BD55-4637,D614G,0.0201
3,imprinted_2022,BD55-5483,D614G,0.0138
4,imprinted_2022,BD55-5514,D614G,0.0107
5,imprinted_2022,BD55-5840,D614G,0.001
6,imprinted_2022,BD55-1205,BA.1,0.0052
7,imprinted_2022,BD55-3372,BA.1,0.0203
8,imprinted_2022,BD55-4637,BA.1,0.0227
9,imprinted_2022,BD55-5483,BA.1,0.00647



IC50s of retained antibodies:


Unnamed: 0,antibody,virus,study,IC50
0,1-57,BA.1,imprinted_2022,10.000
1,1-57,BA.2,imprinted_2022,10.000
2,1-57,BA.2.75,imprinted_2022,10.000
3,1-57,BA.5,imprinted_2022,10.000
4,1-57,BQ.1.1,imprinted_2022,10.000
...,...,...,...,...
37180,XGv-422,BA.2.75,imprinted_2022,0.495
37181,XGv-422,BA.5,imprinted_2022,0.681
37182,XGv-422,BQ.1.1,imprinted_2022,1.060
37183,XGv-422,D614G,imprinted_2022,0.394



Number of antibodies with no neutralization on any virus:


Unnamed: 0_level_0,n_antibodies
non_neut,Unnamed: 1_level_1
False,3431
True,1246
