In [1]:
from Bio.PDB import PDBParser, NeighborSearch
from Bio.PDB.Polypeptide import is_aa
import pandas as pd

In [2]:
def find_contacts(pdb_file, chain1_id="A", chain2_id="B", distance_cutoff=5.0):
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("complex", pdb_file)
    model = structure[0]

    chain1_atoms = [atom for res in model[chain1_id] if is_aa(res) for atom in res]
    chain2_atoms = [atom for res in model[chain2_id] if is_aa(res) for atom in res]

    ns = NeighborSearch(chain2_atoms)

    chain1_contacts = set()
    chain2_contacts = set()

    for atom in chain1_atoms:
        neighbors = ns.search(atom.coord, distance_cutoff)
        for neighbor in neighbors:
            res1 = atom.get_parent()
            res2 = neighbor.get_parent()
            if res1 != res2:
                chain1_contacts.add((res1.get_resname(), res1.get_id()[1]))
                chain2_contacts.add((res2.get_resname(), res2.get_id()[1]))

    return sorted(chain1_contacts, key=lambda x: x[1]), sorted(chain2_contacts, key=lambda x: x[1])


In [3]:
import pandas as pd

def format_contacts_as_dataframe(chain1_contacts, chain2_contacts, chain1_id="A", chain2_id="B"):
    # sets to sorted lists
    sorted_a = sorted(chain1_contacts, key=lambda x: x[1])
    sorted_b = sorted(chain2_contacts, key=lambda x: x[1])

    max_len = max(len(sorted_a), len(sorted_b))
    sorted_a += [("", "")] * (max_len - len(sorted_a))
    sorted_b += [("", "")] * (max_len - len(sorted_b))

    # merge into a dataframe
    df = pd.DataFrame({
        f"Chain {chain1_id} Residue": [res[0] for res in sorted_a],
        f"Chain {chain1_id} Position": [res[1] for res in sorted_a],
        f"Chain {chain2_id} Residue": [res[0] for res in sorted_b],
        f"Chain {chain2_id} Position": [res[1] for res in sorted_b],
    })

    return df

In [4]:
# put the path to your pdb file here 
pdb_path = "/Users/jriya/Desktop/ldlrex2to13_fullreelin_4ca.pdb"
chainA_res, chainB_res = find_contacts(pdb_path, chain1_id="A", chain2_id="B", distance_cutoff=3.5)

df_contacts = format_contacts_as_dataframe(chainA_res, chainB_res, "A", "B")
df_contacts

Unnamed: 0,Chain A Residue,Chain A Position,Chain B Residue,Chain B Position
0,LYS,21,PRO,10
1,TRP,22,PHE,11
2,GLU,29,PHE,13
3,PRO,130,VAL,103
4,THR,139,VAL,108
...,...,...,...,...
57,ILE,611,,
58,ILE,612,,
59,ASN,613,,
60,GLU,614,,


In [5]:
# Step 1: Define exon coordinates
exon_coords = {
    1:  (1, 23),
    2:  (23, 64),
    3:  (64, 105),
    4:  (105, 232),
    5:  (232, 273),
    6:  (273, 314),
    7:  (314, 354),
    8:  (354, 396),
    9:  (396, 453),
    10: (453, 529),
    11: (529, 569),
    12: (569, 615),
    13: (616, 663),
    14: (663, 714),
    15: (714, 771),
    16: (771, 797),
    17: (797, 849),
    18: (850, 860)
}

# Step 2: Specify selected exons used in the spliced construct
selected_exons = [2,3,4,5,6,7,8,9,10,11,12,13]

# Step 3: Build mapping from PDB position → full position and exon
pdb_to_full = {}
pdb_to_spliced = {}
pdb_to_exon = {}
spliced_pos = 1
pdb_pos = 1

for exon in selected_exons:
    start, end = exon_coords[exon]
    for full_pos in range(start, end + 1):
        pdb_to_full[pdb_pos] = full_pos
        pdb_to_spliced[pdb_pos] = spliced_pos
        pdb_to_exon[pdb_pos] = exon
        spliced_pos += 1
        pdb_pos += 1

# Step 4: Identify Chain A columns dynamically
chain_cols = [col for col in df_contacts.columns if "Chain" in col and "Position" in col]
residue_cols = [col for col in df_contacts.columns if "Chain" in col and "Residue" in col]
chain_to_map = chain_cols[0]  # Chain A Position
residue_col = residue_cols[0]  # Chain A Residue
other_cols = [col for col in df_contacts.columns if col not in [chain_to_map, residue_col]]

# Step 5: Clean and map
df_clean = df_contacts.copy()
df_clean = df_clean[df_clean[chain_to_map].apply(lambda x: str(x).isdigit())]
df_clean[chain_to_map] = df_clean[chain_to_map].astype(int)

df_clean["Full Sequence Position"] = df_clean[chain_to_map].map(pdb_to_full)
df_clean["Spliced Position"] = df_clean[chain_to_map].map(pdb_to_spliced)
df_clean["Exon"] = df_clean[chain_to_map].map(pdb_to_exon)

# Step 6: Final cleaned and sorted output
df_contacts_mapped = df_clean.dropna(subset=["Spliced Position"])
df_contacts_mapped = df_contacts_mapped.sort_values("Spliced Position")

# Reorder columns to show main mapping info first
output_cols = [residue_col, chain_to_map, "Full Sequence Position", "Spliced Position", "Exon"] + other_cols
df_contacts_mapped = df_contacts_mapped[output_cols]


In [6]:
df_contacts_mapped

Unnamed: 0,Chain A Residue,Chain A Position,Full Sequence Position,Spliced Position,Exon,Chain B Residue,Chain B Position
0,LYS,21,43,21,2,PRO,10
1,TRP,22,44,22,2,PHE,11
2,GLU,29,51,29,2,PHE,13
3,PRO,130,150,130,4,VAL,103
4,THR,139,159,139,4,VAL,108
...,...,...,...,...,...,...,...
57,ILE,611,623,611,13,,
58,ILE,612,624,612,13,,
59,ASN,613,625,613,13,,
60,GLU,614,626,614,13,,


In [7]:
with pd.option_context('display.max_rows', 100):  # Set to a value >= 61
    display(df_contacts_mapped.head(61))

Unnamed: 0,Chain A Residue,Chain A Position,Full Sequence Position,Spliced Position,Exon,Chain B Residue,Chain B Position
0,LYS,21,43,21,2,PRO,10.0
1,TRP,22,44,22,2,PHE,11.0
2,GLU,29,51,29,2,PHE,13.0
3,PRO,130,150,130,4,VAL,103.0
4,THR,139,159,139,4,VAL,108.0
5,CYS,140,160,140,4,SER,696.0
6,ILE,141,161,141,4,THR,849.0
7,GLN,143,163,143,4,ASN,850.0
8,LEU,144,164,144,4,LEU,851.0
9,TRP,145,165,145,4,VAL,852.0
