In [4]:
from Bio.PDB import PDBParser, NeighborSearch
from Bio.PDB.Polypeptide import is_aa
import pandas as pd

In [5]:
def find_contacts(pdb_file, chain1_id="A", chain2_id="B", distance_cutoff=5.0):
    """Collect the residues of two chains that have atoms within a cutoff of each other.

    The first model of the structure in ``pdb_file`` is used. Only residues
    for which ``is_aa`` is true contribute atoms. Every atom of chain 1 is
    queried against a ``NeighborSearch`` index built from the atoms of chain 2.

    Parameters
    ----------
    pdb_file : path of the PDB file handed to ``PDBParser.get_structure``.
    chain1_id, chain2_id : chain identifiers looked up in the first model.
    distance_cutoff : search radius passed to ``NeighborSearch.search``.

    Returns
    -------
    tuple of two lists
        ``(chain1_contacts, chain2_contacts)``. Each list holds unique
        ``(residue_name, residue_number)`` tuples ordered by residue number.
        The two lists are independent: the i-th entries are not a contact pair.
    """
    model = PDBParser(QUIET=True).get_structure("complex", pdb_file)[0]

    def amino_acid_atoms(chain):
        # Flatten a chain into the atoms of the residues that pass is_aa.
        return [atom for residue in chain if is_aa(residue) for atom in residue]

    def label(residue):
        # (three-letter name, sequence number) -- the second field of the residue id.
        return (residue.get_resname(), residue.get_id()[1])

    def position(contact):
        return contact[1]

    query_atoms = amino_acid_atoms(model[chain1_id])
    partner_atoms = amino_acid_atoms(model[chain2_id])
    partner_index = NeighborSearch(partner_atoms)

    hits_chain1 = set()
    hits_chain2 = set()

    for query_atom in query_atoms:
        query_residue = query_atom.get_parent()
        for partner_atom in partner_index.search(query_atom.coord, distance_cutoff):
            partner_residue = partner_atom.get_parent()
            # Skip a residue matched against itself (possible when both ids name the same chain).
            if query_residue != partner_residue:
                hits_chain1.add(label(query_residue))
                hits_chain2.add(label(partner_residue))

    return sorted(hits_chain1, key=position), sorted(hits_chain2, key=position)


In [6]:
import pandas as pd

def format_contacts_as_dataframe(chain1_contacts, chain2_contacts, chain1_id="A", chain2_id="B"):
    """Lay two contact lists side by side in a single DataFrame.

    Each input is an iterable of ``(residue_name, position)`` tuples. Both are
    sorted by position independently, and the shorter one is padded with
    empty strings so that the columns have equal length.

    Note that a row therefore does NOT describe a chain1-chain2 contact pair;
    it only places the i-th residue of each sorted list next to each other.

    Returns a DataFrame with the columns ``"Chain <id> Residue"`` and
    ``"Chain <id> Position"`` for chain 1 followed by chain 2.
    """
    def by_position(contact):
        return contact[1]

    per_chain = [
        (chain1_id, sorted(chain1_contacts, key=by_position)),
        (chain2_id, sorted(chain2_contacts, key=by_position)),
    ]
    n_rows = max(len(entries) for _, entries in per_chain)
    blank = ("", "")

    columns = {}
    for chain_id, entries in per_chain:
        # Pad a fresh list; the caller's inputs are never modified.
        padded = entries + [blank] * (n_rows - len(entries))
        columns[f"Chain {chain_id} Residue"] = [entry[0] for entry in padded]
        columns[f"Chain {chain_id} Position"] = [entry[1] for entry in padded]

    return pd.DataFrame(columns)

In [7]:
# Path of the PDB complex to analyse -- replace it with the location of your own file.
pdb_path = "/Users/jriya/Desktop/vldlrex2to14_fullreelin_4ca.pdb"

# Residues of chains A and B found in contact at a distance cutoff of 3.5.
chainA_res, chainB_res = find_contacts(
    pdb_path,
    chain1_id="A",
    chain2_id="B",
    distance_cutoff=3.5,
)

# Side-by-side table of the two contact lists; the bare name renders it as the cell output.
df_contacts = format_contacts_as_dataframe(
    chainA_res,
    chainB_res,
    chain1_id="A",
    chain2_id="B",
)
df_contacts

Unnamed: 0,Chain A Residue,Chain A Position,Chain B Residue,Chain B Position
0,ASP,27,GLU,29
1,GLU,28,THR,57
2,GLN,55,ARG,139
3,TRP,61,HIS,931
4,ASP,68,LEU,937
...,...,...,...,...
56,THR,553,,
57,LEU,554,,
58,ASN,556,,
59,ASN,557,,


In [8]:
# Step 1: Exon boundaries as (first, last) positions in the full sequence, both inclusive
exon_coords = {
    1: (1, 28),
    2: (29, 68),
    3: (69, 109),
    4: (110, 150),
    5: (151, 274),
    6: (275, 315),
    7: (316, 356),
    8: (357, 396),
    9: (397, 438),
    10: (439, 495),
    11: (496, 568),
    12: (569, 608),
    13: (609, 654),
    14: (655, 702),
    15: (703, 751),
    16: (752, 779),
    17: (780, 806),
    18: (807, 862),
    19: (863, 873),
}

# Step 2: Exons present in the spliced construct (2 through 14)
selected_exons = list(range(2, 15))

# Step 3: Walk the selected exons in order and number their residues consecutively from 1,
# recording for every PDB position its full-sequence position, spliced position and exon.
# NOTE(review): this assumes PDB residue numbering starts at 1 on the first residue of the
# first selected exon -- confirm against the structure file.
pdb_to_full, pdb_to_spliced, pdb_to_exon = {}, {}, {}
spliced_pos = pdb_pos = 1

for exon in selected_exons:
    start, end = exon_coords[exon]
    for full_pos in range(start, end + 1):
        pdb_to_full[pdb_pos] = full_pos
        pdb_to_spliced[pdb_pos] = spliced_pos
        pdb_to_exon[pdb_pos] = exon
        spliced_pos, pdb_pos = spliced_pos + 1, pdb_pos + 1

# Step 4: Identify the first chain's position / residue columns dynamically
# (with the default column layout these are "Chain A Position" and "Chain A Residue").
chain_cols = [col for col in df_contacts.columns if "Chain" in col and "Position" in col]
residue_cols = [col for col in df_contacts.columns if "Chain" in col and "Residue" in col]
chain_to_map = chain_cols[0]  # Chain A Position
residue_col = residue_cols[0]  # Chain A Residue
other_cols = [col for col in df_contacts.columns if col not in [chain_to_map, residue_col]]

# Step 5: Clean and map
# Coerce the positions to numbers rather than testing str(x).isdigit(): padding cells
# ("" or NaN) become NaN and are dropped, while positions stored as floats (e.g. 27.0 in
# a column that also contains NaN) are still recognised instead of being discarded.
position_numeric = pd.to_numeric(df_contacts[chain_to_map], errors="coerce")
has_position = position_numeric.notna() & (position_numeric % 1 == 0)

# Work on an explicit copy of the filtered rows so the column assignments below write to
# an independent frame (avoids pandas' SettingWithCopyWarning on a filtered slice).
df_clean = df_contacts.loc[has_position].copy()
df_clean[chain_to_map] = position_numeric[has_position].astype(int)

df_clean["Full Sequence Position"] = df_clean[chain_to_map].map(pdb_to_full)
df_clean["Spliced Position"] = df_clean[chain_to_map].map(pdb_to_spliced)
df_clean["Exon"] = df_clean[chain_to_map].map(pdb_to_exon)

# Step 6: Final cleaned and sorted output
# Positions absent from the mapping produce NaN in all three mapped columns (the three
# dictionaries share the same keys). Drop those rows, then restore integer dtype, which
# pandas turns into float as soon as a column holds a NaN.
mapped_cols = ["Full Sequence Position", "Spliced Position", "Exon"]
df_contacts_mapped = (
    df_clean
    .dropna(subset=["Spliced Position"])
    .astype({col: int for col in mapped_cols})
    .sort_values("Spliced Position")
)

# Reorder columns to show main mapping info first
# NOTE(review): the remaining columns (the other chain) are carried along by row only.
# format_contacts_as_dataframe fills each chain's columns from an independently sorted,
# padded list, so a row is not a contact pair, and rows dropped above also drop whatever
# the other chain's columns held on those rows.
output_cols = [residue_col, chain_to_map, "Full Sequence Position", "Spliced Position", "Exon"] + other_cols
df_contacts_mapped = df_contacts_mapped[output_cols]


In [12]:
# Render the exon-mapped contact table as this cell's output.
df_contacts_mapped

Unnamed: 0,Chain A Residue,Chain A Position,Full Sequence Position,Spliced Position,Exon,Chain B Residue,Chain B Position
0,ASP,27,55,27,2,GLU,29
1,GLU,28,56,28,2,THR,57
2,GLN,55,83,55,3,ARG,139
3,TRP,61,89,61,3,HIS,931
4,ASP,68,96,68,3,LEU,937
...,...,...,...,...,...,...,...
56,THR,553,581,553,12,,
57,LEU,554,582,554,12,,
58,ASN,556,584,556,12,,
59,ASN,557,585,557,12,,


In [14]:
# Temporarily raise pandas' row-display limit so the first 61 rows are rendered without
# truncation; the limit (100) only needs to be at least the number of rows requested.
with pd.option_context("display.max_rows", 100):
    display(df_contacts_mapped.iloc[:61])

Unnamed: 0,Chain A Residue,Chain A Position,Full Sequence Position,Spliced Position,Exon,Chain B Residue,Chain B Position
0,ASP,27,55,27,2,GLU,29.0
1,GLU,28,56,28,2,THR,57.0
2,GLN,55,83,55,3,ARG,139.0
3,TRP,61,89,61,3,HIS,931.0
4,ASP,68,96,68,3,LEU,937.0
5,GLU,70,98,70,3,GLU,941.0
6,ILE,87,115,87,4,ARG,983.0
7,GLN,98,126,98,4,TYR,990.0
8,TRP,104,132,104,4,ASP,1122.0
9,ASP,107,135,107,4,GLU,1126.0
