# Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

from utils import *

%matplotlib inline

# Data import

## Epitope pairwise comparisons

In [2]:
# maps (antigen, species) -> dataframe of pairwise epitope comparisons
epitope_pairs_dict = {}

# directory with all pairwise epitope comparisons for top antigens in IEDB;
# built once here instead of on every loop iteration
epitope_comparison_dir = os.path.dirname(os.getcwd()) + "/data/IEDB/epitope_comparison/"

# os.listdir returns entries in arbitrary, platform-dependent order; sort
# case-insensitively so the dictionary (and everything derived from it)
# is built in the same order on every machine
for filename in sorted(os.listdir(epitope_comparison_dir), key=str.lower):

    # hidden entries (e.g. .DS_Store, .ipynb_checkpoints) are not comparison tables
    if filename.startswith("."):
        continue

    filepath = os.path.join(epitope_comparison_dir, filename)
    df = pd.read_csv(filepath)

    # extract epitope ID
    df["Epitope_ID_A"] = df["Epitope_ID_A"].apply(get_IEDB_ID)
    df["Epitope_ID_B"] = df["Epitope_ID_B"].apply(get_IEDB_ID)

    # retrieve antigen and species name from file name; the unpacking requires
    # exactly four "_"-separated fields, the last two being antigen and species
    _, _, antigen, species = filename.split("_")

    # remove file extension from species string (everything after the first ".")
    species, _ = species.split(".", maxsplit=1)

    # add dataframe to dictionary
    epitope_pairs_dict[(antigen, species)] = df

## IEDB assays

In [3]:
# IEDB assay table, read from the data folder next to the current working directory
assays_csv_path = os.path.dirname(os.getcwd()) + "/data/IEDB/IEDB_assays_discontinuous.csv"
assays = pd.read_csv(assays_csv_path)

## SAbDab summary 

In [4]:
# tab-separated SAbDab summary file, read from the data folder next to the current working directory
sabdab_summary_path = os.path.dirname(os.getcwd()) + "/data/SAbDab/Summary_all_structures_all_organisms.tsv"
sabdab = pd.read_csv(sabdab_summary_path, sep="\t")

n_sabdab_entries = len(sabdab)
print("Number of all PDB antibody structures on SAbDab:", n_sabdab_entries)

Number of all PDB antibody structures on SAbDab: 16105


# Epitope pair filtering for high overlap

In [5]:
# deliberately permissive overlap cutoff; antibodies can still be filtered more strictly later
epitope_cutoff = 0.75
epitope_ids = []
epitope_pairs_similar_dict = {}

for key, current_epitope_pairs in epitope_pairs_dict.items():

    # keys are (antigen, species) tuples
    antigen_name, species_name = key
    print("Antigen:", antigen_name)
    print("Species:", species_name)
    print("\tNumber of compared epitope pairs:", len(current_epitope_pairs))

    # keep only the epitope pairs of this antigen selected by the overlap filter
    epitope_pairs_similar = filter_epitopes_for_high_overlap(current_epitope_pairs, cutoff=epitope_cutoff)
    epitope_pairs_similar_dict[key] = epitope_pairs_similar
    print("\tNumber of epitope pairs with >%.2f overlap: %i" %(epitope_cutoff, len(epitope_pairs_similar)))

    # the helper receives the IDs collected so far and returns the updated list
    epitope_ids = create_list_of_epitope_IDs(epitope_ids, epitope_pairs_similar)

    print("-----------------")

Antigen: ADP-ribosylcyclasecyclicADP-ribosehydrolase1
Species: Homosapiens
	Number of compared epitope pairs: 21
	Number of epitope pairs with >0.75 overlap: 0
-----------------
Antigen: Beta-2adrenergicreceptor
Species: Homosapiens
	Number of compared epitope pairs: 21
	Number of epitope pairs with >0.75 overlap: 2
-----------------
Antigen: Beta-secretase2
Species: Homosapiens
	Number of compared epitope pairs: 15
	Number of epitope pairs with >0.75 overlap: 5
-----------------
Antigen: BotulinumneurotoxintypeA
Species: Clostridiumbotulinum
	Number of compared epitope pairs: 78
	Number of epitope pairs with >0.75 overlap: 1
-----------------
Antigen: CapsidproteinVP1
Species: Norwalkvirus
	Number of compared epitope pairs: 78
	Number of epitope pairs with >0.75 overlap: 2
-----------------
Antigen: CD81antigen
Species: Homosapiens
	Number of compared epitope pairs: 15
	Number of epitope pairs with >0.75 overlap: 1
-----------------
Antigen: Circumsporozoiteprotein
Species: Plasmodium

In [6]:
# Combine the per-antigen tables of similar epitope pairs into one dataframe.
# All frames are concatenated in a single call: growing a DataFrame inside a
# loop re-copies the accumulated rows on every iteration, and concatenating
# onto an empty DataFrame is deprecated behaviour in recent pandas versions.
similar_pair_frames = list(epitope_pairs_similar_dict.values())

# pd.concat raises on an empty list, so fall back to an empty frame in that case
epitope_pairs_similar_df = pd.concat(similar_pair_frames) if similar_pair_frames else pd.DataFrame()

print("Number of epitope pairs with overlap >%.2f overlap: %i" %(epitope_cutoff, len(epitope_pairs_similar_df)))
print("Number of unique epitope IDs:", len(epitope_ids))

# save dataframe
epitope_pairs_similar_df.to_csv(os.path.dirname(os.getcwd()) + "/data/IEDB/Epitope_pairs_similar.csv", index=False)

Number of epitope pairs with overlap >0.75 overlap: 1401
Number of unique epitope IDs: 527


# Annotation of antibody information

## Retrieve PDB IDs of IEDB entries

In [7]:
# get epitope-PDB ID mappings from IEDB assay dataframe
assays["Epitope_ID"] = assays["Epitope - IEDB IRI"].apply(get_IEDB_ID)

# Select, de-duplicate and rename in one chain so that a fresh dataframe is
# produced instead of modifying a filtered slice of `assays` in place
# (which can trigger pandas' SettingWithCopyWarning).
epitope_id_pdb = (
    assays.loc[assays["Epitope_ID"].isin(epitope_ids), ["Epitope_ID", "Complex - PDB ID"]]
    .drop_duplicates()
    .rename(columns={"Complex - PDB ID": "PDB"})
)

# rows are unique (Epitope_ID, PDB) combinations; PDB IDs may repeat across epitopes
print("Number of unique epitope-structure mappings:", len(epitope_id_pdb["PDB"]))
print("Number of unique PDB structures:", len(set(epitope_id_pdb["PDB"])))

Number of unique epitope-structure mappings: 675
Number of unique PDB structures: 622


## Filter SAbDab entries

### Separate human and non-human antibodies

In [8]:
# an antibody counts as human only if both chains are annotated as "homo sapiens"
is_human_heavy = sabdab["heavy_species"] == "homo sapiens"
is_human_light = sabdab["light_species"] == "homo sapiens"
sabdab_human = sabdab[is_human_heavy & is_human_light]

# sanity check: display the species combinations that remain after filtering
sabdab_human[["heavy_species", "light_species"]].value_counts()

heavy_species  light_species
homo sapiens   homo sapiens     6871
Name: count, dtype: int64

In [9]:
# compare the size of the full SAbDab table with the human-only subset
n_all_structures = len(sabdab)
n_human_structures = len(sabdab_human)
print("Number of all PDB antibody structures:", n_all_structures)
print("Number of human PDB antibody structures:", n_human_structures)

Number of all PDB antibody structures: 16105
Number of human PDB antibody structures: 6871


### Filter on antibody information

In [10]:
# apply the project's SAbDab entry filter (helper from utils) to the human subset
sabdab_human_filtered = filter_SAbDab_entries(sabdab_human)

n_human_filtered = len(sabdab_human_filtered)
print("Number of human PDB antibody structures:", n_human_filtered)

Number of human PDB antibody structures: 2238


In [11]:
# second filtering step on the already filtered human entries (helper from utils)
sabdab_human_filtered_by_numbers = filter_SAbDab_entries_by_number(sabdab_human_filtered)

n_human_filtered_by_numbers = len(sabdab_human_filtered_by_numbers)
print("Number of human SAbDab entries left:", n_human_filtered_by_numbers)

Number of human SAbDab entries left: 1433


In [12]:
# the IEDB PDB IDs are lower-cased so that they match the IDs in the SAbDab summary file
pdb_list = epitope_id_pdb["PDB"].str.lower()
iedb_pdb_ids = set(pdb_list)

# keep only the SAbDab rows whose PDB ID also occurs in the IEDB mappings
is_in_iedb = sabdab_human_filtered_by_numbers["pdb"].isin(iedb_pdb_ids)
sabdab_human_iedb = sabdab_human_filtered_by_numbers[is_in_iedb]

matched_pdb_ids = set(sabdab_human_iedb["pdb"])
print("Number of IEDB epitopes with available human PDB antibody structures in SAbDab:", len(matched_pdb_ids))

Number of IEDB epitopes with available human PDB antibody structures in SAbDab: 168


## Check which SAbDab entries describe same antibody

In [13]:
# A PDB ID with more than two rows indicates several antibodies in one structure,
# in which case the relevant antibody cannot be determined.
sabdab_human_iedb_two_entries = filter_for_antibodies_with_two_entries(sabdab_human_iedb)

# Two rows of the same PDB ID with identical antibody class most likely describe the
# same antibody, differing only in chain assignment between PDB and authors.
sabdab_human_iedb_same_class = filter_for_same_class_antibodies(sabdab_human_iedb_two_entries)

same_class_pdbs = sabdab_human_iedb_same_class["pdb"]
print("\n\nNumber of PDBs left:", same_class_pdbs.nunique(dropna=False))

# list the remaining PDB IDs in sorted order on a single line
print("Checking: ", end="")
for pdb in same_class_pdbs.sort_values().unique():
    print(pdb, end=" ")

# The 36 PDB IDs with two rows but identical antibody classes were checked manually on PDB;
# 5G64 is dropped because it contains two antibodies in its structure.
print("\n\nDropping: 5g64")
is_not_5g64 = sabdab_human_iedb_same_class["pdb"].ne("5g64")
sabdab_human_iedb_same_class = sabdab_human_iedb_same_class[is_not_5g64]
print("Number of PDBs with two entries:", sabdab_human_iedb_same_class["pdb"].nunique(dropna=False))

Dropping: 7s5q 7s5r 8dxu 7nx7 7nxa 6de7 6xc7 7chf 7zfc 7m6d 7r8l 6mco 7che 7nx8 6uyg 6mdt 5t3z 7nx9 7pqz 7q0h 7k9z 7chc 7fcp 7or9 7r6w 5t3x 7s8h 5fyl 7lm8 7nx6 7q0g 6xc3 7zf3 7nxb 

Number of PDBs left: 36
Checking: 1yyl 1yym 2i5y 2i60 5c7x 5d72 5g64 5if0 5uea 5umn 6ayz 6az2 6azz 6b0h 6mft 6xc2 6xc4 6yla 6z2m 7c01 7cho 7e5y 7e7x 7e7y 7klg 7klh 7kmg 7l2c 7m3i 7mmo 7msq 7pqy 7rxj 7s0x 7xsc 7zf9 

Dropping: 5g64
Number of PDBs with two entries: 35


In [14]:
# The final dataframe keeps a PDB ID if it either
#   - occurs in exactly one row, or
#   - occurs in two rows that describe the same antibody with different chain names.
pdb_counts = sabdab_human_iedb["pdb"].value_counts()
single_entry_pdbs = pdb_counts[pdb_counts == 1].index

has_single_entry = sabdab_human_iedb["pdb"].isin(single_entry_pdbs)
is_same_antibody_pair = sabdab_human_iedb["pdb"].isin(sabdab_human_iedb_same_class["pdb"])
sabdab_human_iedb_final = sabdab_human_iedb[has_single_entry | is_same_antibody_pair]

print("Number of final human SAbDab entries:", len(sabdab_human_iedb_final))
print("Number of unique human PDB entries in SAbDab:", sabdab_human_iedb_final["pdb"].nunique(dropna=False))

Number of final human SAbDab entries: 168
Number of unique human PDB entries in SAbDab: 133


## Create Epitope ID PDB mappings

In [15]:
# the IEDB mappings carry upper-case PDB IDs, so upper-case the SAbDab IDs before matching
final_pdb_ids_upper = sabdab_human_iedb_final["pdb"].str.upper()
has_final_structure = epitope_id_pdb["PDB"].isin(final_pdb_ids_upper)
epitope_id_pdb_filtered_human = epitope_id_pdb[has_final_structure]
print("Number of human Epitope ID PDB mappings:", len(epitope_id_pdb_filtered_human))

# rows whose Epitope_ID already appeared in an earlier row
repeated_epitope_rows = epitope_id_pdb_filtered_human[epitope_id_pdb_filtered_human["Epitope_ID"].duplicated()]
print("Number of human Epitope IDs with multiple PDB mappings:", len(repeated_epitope_rows))

# keep only the first PDB mapping of every epitope ID
epitope_id_pdb_unique_human = epitope_id_pdb_filtered_human.drop_duplicates(subset=["Epitope_ID"], keep="first")
print("Number of unique human Epitope ID PDB mappings:", len(epitope_id_pdb_unique_human))

Number of human Epitope ID PDB mappings: 137
Number of human Epitope IDs with multiple PDB mappings: 19
Number of unique human Epitope ID PDB mappings: 118


## Save data

In [16]:
# Write the epitope-PDB mapping and the final SAbDab selection to the data folder.
# DataFrame.to_csv does not create missing directories, so make sure the output
# folders exist first (relevant on a fresh checkout, where they may be absent).
antibody_pairs_dir = os.path.dirname(os.getcwd()) + "/data/antibody_pairs"
sabdab_dir = os.path.dirname(os.getcwd()) + "/data/SAbDab"
for output_dir in (antibody_pairs_dir, sabdab_dir):
    os.makedirs(output_dir, exist_ok=True)

epitope_id_pdb_unique_human.to_csv(antibody_pairs_dir + "/Epitope_PDB_ID.csv", index=False)
sabdab_human_iedb_final.to_csv(sabdab_dir + "/Summary_antibody_pair_structures_human.csv", index=False)

## Antibody sequence retrieval

### Automatic

```
Input: Summary_antibody_pair_structures_human.csv 
Script: antibody_sequence_retrieval_from_pdb.py 
Output: Antibody_pairs_amino_acid_heavy_chain.FASTA, Antibody_pairs_amino_acid_light_chain.FASTA
```

### Manually

https://www.rcsb.org/fasta/entry/[PDB]/display

In [17]:
# collect the PDB IDs found in the heavy and light chain FASTA files
heavy_chain_fasta_path = os.path.dirname(os.getcwd()) + "/data/FASTA/Antibody_pairs_amino_acid_heavy_chain.fasta"
light_chain_fasta_path = os.path.dirname(os.getcwd()) + "/data/FASTA/Antibody_pairs_amino_acid_light_chain.fasta"

pdb_ids_list_heavy_human = extract_pdb_ids_from_fasta(heavy_chain_fasta_path)
pdb_ids_list_light_human = extract_pdb_ids_from_fasta(light_chain_fasta_path)

In [18]:
# lower-case the FASTA PDB IDs so that they match the "pdb" column of the SAbDab selection
pdb_ids_list_heavy_human = pd.Series(pdb_ids_list_heavy_human).str.lower()

# SAbDab rows whose PDB ID has no heavy chain sequence in the FASTA file
sabdab_no_sequence_heavy = sabdab_human_iedb_final[~sabdab_human_iedb_final["pdb"].isin(pdb_ids_list_heavy_human)]["pdb"]

# Count and list every PDB ID only once: the final SAbDab selection can hold two rows
# per PDB ID, and the light chain check is de-duplicated in the same way.
print("Number of heavy chains without retrieved FASTA sequence:", len(sabdab_no_sequence_heavy.unique()))
for i in sabdab_no_sequence_heavy.str.upper().unique():
    print(i)

Number of heavy chains without retrieved FASTA sequence: 14
7S5P
7UL0
7UL1
7T72
8BH5
7WBZ
8CWV
7ZF8
7U2E
7U2D
7RXI
7SBU
7SD5
7FCQ


In [19]:
# lower-case the FASTA PDB IDs so that they match the "pdb" column of the SAbDab selection
pdb_ids_list_light_human = pd.Series(pdb_ids_list_light_human).str.lower()

# PDB IDs of the SAbDab selection that have no light chain sequence in the FASTA file
has_light_sequence = sabdab_human_iedb_final["pdb"].isin(pdb_ids_list_light_human)
sabdab_no_sequence_light = sabdab_human_iedb_final.loc[~has_light_sequence, "pdb"]

# report every PDB ID once, in upper case
n_missing_light = len(sabdab_no_sequence_light.unique())
print("Number of light chains without retrieved FASTA sequence:", n_missing_light)
missing_light_pdbs = sabdab_no_sequence_light.str.upper().unique()
for i in missing_light_pdbs:
    print(i)

Number of light chains without retrieved FASTA sequence: 23
7XSC
7XS8
7S5P
7UL0
7UL1
7T72
8BH5
7WBZ
8CWV
7S0X
7PQY
7ZF8
7MSQ
7ZF9
7RXJ
7F7E
7U2E
7U2D
7RXI
7SBU
7PS1
7SD5
7FCQ
