# Extract antibody information
Extract the antibody information provided by Cao et al. in an Excel file into a pandas DataFrame and save it to a CSV file.

Import Python modules:

In [1]:
import pandas as pd

Read the data in the Excel file:

In [2]:
def _is_named_column(colname):
    """Return True unless the column name starts with "Unnamed"."""
    return not colname.startswith("Unnamed")


# sheet_name=None loads every sheet into a dict keyed by sheet name;
# header=1 takes the column names from the second row of each sheet.
raw_info = pd.read_excel(
    "convergent_RBD_evolution/antibody_info.xlsx",
    sheet_name=None,
    header=1,
    usecols=_is_named_column,
)

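Look at the columns in each sheet, get the columns shared by all sheets, concatenate the sheets on those shared columns, check that antibody names are unique, and save the result to a CSV: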

In [3]:
# Find the columns present in every sheet, printing each sheet's columns
# along the way. The resulting order follows the last sheet processed.
common_cols = None

for sheetname, sheet in raw_info.items():
    columns = sheet.columns.tolist()
    print(f"\n{sheetname=}\n{columns=}")
    if common_cols is None:
        common_cols = columns
    else:
        # Build the lookup set once per sheet instead of once per column;
        # every `c` already comes from `columns`, so membership in the
        # running set of shared columns is the full intersection test.
        shared = set(common_cols)
        common_cols = [c for c in columns if c in shared]

print(f"\nCommon columns: {common_cols}")

# Stack all sheets on their shared columns, tagging each row with the name
# of the sheet it came from.
info = pd.concat(
    [
        sheet[common_cols].assign(omicron_specificity=sheetname)
        for sheetname, sheet in raw_info.items()
    ],
    ignore_index=True,
)

# Every row must carry a distinct antibody name (the column header in the
# Excel file really does contain two spaces).
n_unique_names = info["Antibody  Name"].nunique()
assert len(info) == n_unique_names, (
    f"expected one row per antibody, got {len(info)} rows "
    f"but {n_unique_names} unique names"
)

print(f"\nRead information for {len(info)} antibodies")

# Number of distinct antibodies contributed by each source.
display(
    info.groupby("source")
    .aggregate(n_antibodies=pd.NamedAgg("Antibody  Name", "nunique"))
)

info.to_csv("antibody_info.csv", index=False)


sheetname='cross'
columns=['Antibody  Name', 'Epitope Group', 'source', 'D614G', 'BA.1', 'BA.2', 'BA.2.75', 'BA.5', 'BQ.1.1', 'XBB', 'Heavy chain V gene', 'Heavy chain J gene', 'Light chain V gene', 'Light chain J gene', 'Heavy chain AA', 'Light chain AA']

sheetname='specific'
columns=['Antibody  Name', 'source', 'D614G', 'BA.1', 'BA.2', 'BA.5', 'BA.2.75', 'BA.2.75.2', 'CA.1', 'BQ.1.1', 'BR.2', 'BM.1.1.1', 'XBB', 'Heavy chain V gene', 'Heavy chain J gene', 'Light chain V gene', 'Light chain J gene', 'Heavy chain AA', 'Light chain AA']

Common columns: ['Antibody  Name', 'source', 'D614G', 'BA.1', 'BA.2', 'BA.5', 'BA.2.75', 'BQ.1.1', 'XBB', 'Heavy chain V gene', 'Heavy chain J gene', 'Light chain V gene', 'Light chain J gene', 'Heavy chain AA', 'Light chain AA']

Read information for 3333 antibodies


Unnamed: 0_level_0,n_antibodies
source,Unnamed: 1_level_1
BA.1 convalescents,752
BA.2 convalescents,751
BA.5 convalescents,269
SARS convalescents,881
WT convalescents,438
WT vaccinees,240
WT-engineered,2


In [5]:
# Number of rows (antibodies) for each omicron_specificity value.
specificity_groups = info.groupby(by="omicron_specificity")
specificity_groups.size()

omicron_specificity
cross       3051
specific     282
dtype: int64