This notebook cleans up the crosslink data for each RBD-nanobody pair by first ensuring that the residue indices are numbered consistently, and then extracting the crosslink set for each RBD-nanobody pair in a CSV format suitable for integrative modeling. Note:

- RBD residues are reported according to the sequence of the full CoV2 spike protein (the first residue of the RBD sequence is actually residue 319 in the sequence of the entire spike). Each crosslinked residue is located in the RBD sequence given (i.e. treating the first residue in the sequence as position 1) and then converted back to full-spike numbering, which is the numbering written to the output files. Since we do rigid docking, any residues not within the structurally covered range of the available RBD structure (333-526 for the structure [6M0J](https://www.rcsb.org/structure/6M0J)) will be ignored.
<br>

- Nanobody sequences used in the crosslinking experiment have an additional N-terminal GS "tag" and a C-terminal LEHHHHHH tag, which are not present in the core nanobody sequences used to obtain comparative models. So, crosslinked residues must be renumbered accordingly.

First import a few necessary modules and set filenames, output locations, etc. 

In [1]:
import os
import numpy as np
import pandas as pd
from collections import namedtuple, OrderedDict
from Bio import SeqIO

# Input files: FASTA with the receptor and nanobody sequences, and the
# spreadsheet of curated crosslinks (one sheet per receptor-ligand pair).
FASTA_FN = "../spike_nanobody.fasta.txt"
XL_FN = "crosslinks_24Jun2021.xlsx"
# Directory that receives one formatted CSV file per ligand.
OUTDIR = "xl"

# FASTA record ids of the receptor and of the ligands (nanobodies).
RECEPTOR = "rbd"
LIGANDS = ["rbd-9", "rbd-11", "rbd-15", "s1-1", "s1-23", "s1-46"]
# Subtracted from a 1-based position in the receptor sequence, i.e. position 1
# of the RBD sequence maps to residue 319.
RECEPTOR_OFFSET = -318
# First and last receptor residue covered by the available structure.
RECEPTOR_RANGE = (333, 526)

# Map each FASTA record id to its sequence as a plain string. str(seq) is the
# public way to get the sequence text; the private ``_data`` attribute is
# ``bytes`` in recent Biopython releases and breaks string concatenation/search.
FASTA = {r.id: str(r.seq) for r in SeqIO.parse(FASTA_FN, format="fasta")}

Raw crosslink data are kept in an input spreadsheet. While all of the links are manually curated, some may be undesirable to include in the final modeling. Any crosslink that has a value of 0 in the "included" column will be excluded while generating the formatted CSV files.

In [2]:
def _get_resid(peptide_seq, seq, offset=0):
    xl_loc = peptide_seq.find("*") - 1
    if xl_loc < 0:
        return None
    peptide_seq_cleaned = "".join([s for s in peptide_seq if s != "*"])
    out = 1 + seq.find(peptide_seq_cleaned) + xl_loc
    return out - offset

For good measure, the function below performs an additional check that detects duplicate crosslinks, reports them, and keeps a single copy of each. No linker type is recorded, since DSS is the only linker used.

In [3]:
def _trim_crosslinks(xls):
    d = OrderedDict()
    for xl in xls:
        p1, r1, p2, r2 = xl
        key = (p1, r1, p2, r2)
        if key not in d:
            d[key] = 1
        else:
            d[key] += 1
    
    xls_out = []
    for k, v in d.items():
        if v > 1:
            xl_str = "(%s, %d, %s, %d)" % k
            print("Duplicate found for XL %s, num_copies = %d" % (xl_str, v))
        xls_out.append(k)
    return xls_out 

In [5]:
def process_XL_set(ligand, outfn):
    """Extract the crosslinks between the receptor and one ligand to a CSV file.

    Reads the sheet named "<RECEPTOR>_<ligand>" from XL_FN, recomputes the
    crosslinked residue of each row from the peptide strings, drops rows that
    cannot be used, removes duplicates, writes the result to ``outfn`` and
    displays it.

    A row is dropped when:
      * the crosslinked residue cannot be determined for either peptide,
      * the ligand crosslink falls on the N-terminal "GS" tag,
      * the receptor residue lies outside RECEPTOR_RANGE (both ends inclusive),
      * its "included" column is falsy.

    A message is printed whenever a recomputed residue number differs from
    the one given in the spreadsheet ("residue1" / "residue2").

    Args:
        ligand: key of the ligand sequence in FASTA; also used in the sheet
            name and in the output rows.
        outfn: path of the CSV file to write.
    """
    receptor_seq = FASTA[RECEPTOR]
    ligand_seq = FASTA[ligand]
    df = pd.read_excel(XL_FN, sheet_name="%s_%s" % (RECEPTOR, ligand))
    receptor_first, receptor_last = RECEPTOR_RANGE

    xls = []
    # row numbers start at 2 -- presumably the spreadsheet row, with the
    # header on row 1 (TODO confirm)
    for row_num, (_, this_df) in enumerate(df.iterrows(), start=2):
        # ------------------------------------------
        # LIGAND (i.e. NANOBODY) CROSSLINKED RESIDUE
        # ------------------------------------------
        ligand_pep_seq = this_df["peptide1"]
        ligand_res = this_df["residue1"]
        # add N-terminal "GS" tag
        ligand_res_computed = _get_resid(ligand_pep_seq, "GS" + ligand_seq)
        # ignore if not found
        if ligand_res_computed is None:
            continue
        # ignore if either residue on the N-terminal tag is crosslinked
        if ligand_res_computed <= 2:
            continue
        # check consistency with given residue number by collaborator
        if ligand_res != ligand_res_computed:
            print("Row %d. Ligand XL residue ID mismatch. Given: %d, computed from peptide seq: %d" %
                  (row_num, ligand_res, ligand_res_computed))

        # when all checks have passed, subtract the added tag
        ligand_res_computed -= 2

        # ---------------------------------------
        # RECEPTOR (i.e. RBD) CROSSLINKED RESIDUE
        # ---------------------------------------
        receptor_pep_seq = this_df["peptide2"]
        receptor_res = this_df["residue2"]
        receptor_res_computed = _get_resid(receptor_pep_seq, receptor_seq, RECEPTOR_OFFSET)
        # ignore if not found
        if receptor_res_computed is None:
            continue
        # ignore if outside structurally available receptor sequence; the
        # last residue of the range is itself covered, so the upper bound
        # must be inclusive (range(*RECEPTOR_RANGE) would drop it)
        if not receptor_first <= receptor_res_computed <= receptor_last:
            continue
        # check consistency with given residue number by collaborator
        if receptor_res != receptor_res_computed:
            print("Row %d. Receptor XL residue ID mismatch. Given: %d, computed from peptide seq: %d" %
                  (row_num, receptor_res, receptor_res_computed))

        # if this crosslink has been explicitly excluded by collaborator, remove it
        if not this_df["included"]:
            continue

        # add the crosslink
        xls.append((RECEPTOR, receptor_res_computed, ligand, ligand_res_computed))

    xls_trimmed = _trim_crosslinks(xls)
    out_df = pd.DataFrame(xls_trimmed, columns=["receptor", "receptor_residue", "ligand", "ligand_residue"])
    out_df.to_csv(outfn, index=False)
    display(out_df)

In [6]:
# Write one crosslink CSV per ligand into OUTDIR, creating the directory first
# if it does not exist yet.
os.makedirs(OUTDIR, exist_ok=True)
for ligand_name in LIGANDS:
    print("Processing XLs for %s" % ligand_name)
    process_XL_set(ligand_name, os.path.join(OUTDIR, "%s.csv" % ligand_name))
    print("\n\n")

Processing XLs for rbd-9


Unnamed: 0,receptor,receptor_residue,ligand,ligand_residue
0,rbd,386,rbd-9,67
1,rbd,444,rbd-9,112
2,rbd,444,rbd-9,45
3,rbd,444,rbd-9,67
4,rbd,458,rbd-9,78





Processing XLs for rbd-11


Unnamed: 0,receptor,receptor_residue,ligand,ligand_residue
0,rbd,462,rbd-11,45
1,rbd,458,rbd-11,45
2,rbd,458,rbd-11,66
3,rbd,386,rbd-11,88
4,rbd,417,rbd-11,88
5,rbd,386,rbd-11,66
6,rbd,444,rbd-11,66





Processing XLs for rbd-15


Unnamed: 0,receptor,receptor_residue,ligand,ligand_residue
0,rbd,458,rbd-15,45
1,rbd,417,rbd-15,45
2,rbd,444,rbd-15,78
3,rbd,458,rbd-15,78
4,rbd,386,rbd-15,78
5,rbd,462,rbd-15,45
6,rbd,356,rbd-15,78





Processing XLs for s1-1
Duplicate found for XL (rbd, 417, s1-1, 78), num_copies = 2


Unnamed: 0,receptor,receptor_residue,ligand,ligand_residue
0,rbd,417,s1-1,78
1,rbd,458,s1-1,45
2,rbd,462,s1-1,45
3,rbd,417,s1-1,45
4,rbd,386,s1-1,78
5,rbd,458,s1-1,67
6,rbd,444,s1-1,67
7,rbd,444,s1-1,78
8,rbd,444,s1-1,89
9,rbd,458,s1-1,78





Processing XLs for s1-23
Duplicate found for XL (rbd, 417, s1-23, 89), num_copies = 2


Unnamed: 0,receptor,receptor_residue,ligand,ligand_residue
0,rbd,444,s1-23,67
1,rbd,462,s1-23,67
2,rbd,458,s1-23,45
3,rbd,444,s1-23,45
4,rbd,462,s1-23,45
5,rbd,417,s1-23,89
6,rbd,444,s1-23,89





Processing XLs for s1-46


Unnamed: 0,receptor,receptor_residue,ligand,ligand_residue
0,rbd,458,s1-46,45
1,rbd,458,s1-46,78
2,rbd,386,s1-46,67
3,rbd,458,s1-46,67





