In [1]:
# Add execute permissions to binaries
!chmod +x dssp
!chmod +x DAlphaBall.gcc

# Verify permissions
!ls -l dssp
!ls -l DAlphaBall.gcc

-rwxr-xr-x@ 1 satishgaurav  staff  877904 May  4 21:24 [31mdssp[m[m
-rwxr-xr-x@ 1 satishgaurav  staff  345824 May  4 21:24 [31mDAlphaBall.gcc[m[m


In [2]:
from IPython.display import display, Markdown

In [3]:
####################################
################ BioPython functions
####################################
### Import dependencies
import os
import math
import numpy as np
from collections import defaultdict
from scipy.spatial import cKDTree
from Bio import BiopythonWarning
from Bio.PDB import PDBParser, DSSP, Selection, Polypeptide, PDBIO, Select, Chain, Superimposer
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.PDB.Selection import unfold_entities
from Bio.PDB.Polypeptide import is_aa

# analyze sequence composition of design
def validate_design_sequence(sequence, num_clashes, advanced_settings):
    note_array = []

    # Check if protein contains clashes after relaxation
    if num_clashes > 0:
        note_array.append('Relaxed structure contains clashes.')

    # Check if the sequence contains disallowed amino acids
    if advanced_settings["omit_AAs"]:
        restricted_AAs = advanced_settings["omit_AAs"].split(',')
        for restricted_AA in restricted_AAs:
            if restricted_AA in sequence:
                note_array.append('Contains: '+restricted_AA+'!')

    # Analyze the protein
    analysis = ProteinAnalysis(sequence)

    # Calculate the reduced extinction coefficient per 1% solution
    extinction_coefficient_reduced = analysis.molar_extinction_coefficient()[0]
    molecular_weight = round(analysis.molecular_weight() / 1000, 2)
    extinction_coefficient_reduced_1 = round(extinction_coefficient_reduced / molecular_weight * 0.01, 2)

    # Check if the absorption is high enough
    if extinction_coefficient_reduced_1 <= 2:
        note_array.append(f'Absorption value is {extinction_coefficient_reduced_1}, consider adding tryptophane to design.')

    # Join the notes into a single string
    notes = ' '.join(note_array)

    return notes

# temporary function, calculate RMSD of input PDB and trajectory target
def target_pdb_rmsd(trajectory_pdb, starting_pdb, chain_ids_string):
    # Parse the PDB files
    parser = PDBParser(QUIET=True)
    structure_trajectory = parser.get_structure('trajectory', trajectory_pdb)
    structure_starting = parser.get_structure('starting', starting_pdb)
    
    # Extract chain A from trajectory_pdb
    chain_trajectory = structure_trajectory[0]['A']
    
    # Extract the specified chains from starting_pdb
    chain_ids = chain_ids_string.split(',')
    residues_starting = []
    for chain_id in chain_ids:
        chain_id = chain_id.strip()
        chain = structure_starting[0][chain_id]
        for residue in chain:
            if is_aa(residue, standard=True):
                residues_starting.append(residue)
    
    # Extract residues from chain A in trajectory_pdb
    residues_trajectory = [residue for residue in chain_trajectory if is_aa(residue, standard=True)]
    
    # Ensure that both structures have the same number of residues
    min_length = min(len(residues_starting), len(residues_trajectory))
    residues_starting = residues_starting[:min_length]
    residues_trajectory = residues_trajectory[:min_length]
    
    # Collect CA atoms from the two sets of residues
    atoms_starting = [residue['CA'] for residue in residues_starting if 'CA' in residue]
    atoms_trajectory = [residue['CA'] for residue in residues_trajectory if 'CA' in residue]
    
    # Calculate RMSD using structural alignment
    sup = Superimposer()
    sup.set_atoms(atoms_starting, atoms_trajectory)
    rmsd = sup.rms
    
    return round(rmsd, 2)

# detect C alpha clashes for deformed trajectories
def calculate_clash_score(pdb_file, threshold=2.4, only_ca=False):
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('protein', pdb_file)

    atoms = []
    atom_info = []  # Detailed atom info for debugging and processing

    for model in structure:
        for chain in model:
            for residue in chain:
                for atom in residue:
                    if atom.element == 'H':  # Skip hydrogen atoms
                        continue
                    if only_ca and atom.get_name() != 'CA':
                        continue
                    atoms.append(atom.coord)
                    atom_info.append((chain.id, residue.id[1], atom.get_name(), atom.coord))

    tree = cKDTree(atoms)
    pairs = tree.query_pairs(threshold)

    valid_pairs = set()
    for (i, j) in pairs:
        chain_i, res_i, name_i, coord_i = atom_info[i]
        chain_j, res_j, name_j, coord_j = atom_info[j]

        # Exclude clashes within the same residue
        if chain_i == chain_j and res_i == res_j:
            continue

        # Exclude directly sequential residues in the same chain for all atoms
        if chain_i == chain_j and abs(res_i - res_j) == 1:
            continue

        # If calculating sidechain clashes, only consider clashes between different chains
        if not only_ca and chain_i == chain_j:
            continue

        valid_pairs.add((i, j))

    return len(valid_pairs)

three_to_one_map = {
    'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F',
    'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L',
    'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R',
    'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y'
}

# identify interacting residues at the binder interface
def hotspot_residues(trajectory_pdb, binder_chain="B", atom_distance_cutoff=4.0):
    # Parse the PDB file
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("complex", trajectory_pdb)

    # Get the specified chain
    binder_atoms = Selection.unfold_entities(structure[0][binder_chain], 'A')
    binder_coords = np.array([atom.coord for atom in binder_atoms])

    # Get atoms and coords for the target chain
    target_atoms = Selection.unfold_entities(structure[0]['A'], 'A')
    target_coords = np.array([atom.coord for atom in target_atoms])

    # Build KD trees for both chains
    binder_tree = cKDTree(binder_coords)
    target_tree = cKDTree(target_coords)

    # Prepare to collect interacting residues
    interacting_residues = {}

    # Query the tree for pairs of atoms within the distance cutoff
    pairs = binder_tree.query_ball_tree(target_tree, atom_distance_cutoff)

    # Process each binder atom's interactions
    for binder_idx, close_indices in enumerate(pairs):
        binder_residue = binder_atoms[binder_idx].get_parent()
        binder_resname = binder_residue.get_resname()

        # Convert three-letter code to single-letter code using the manual dictionary
        if binder_resname in three_to_one_map:
            aa_single_letter = three_to_one_map[binder_resname]
            for close_idx in close_indices:
                target_residue = target_atoms[close_idx].get_parent()
                interacting_residues[binder_residue.id[1]] = aa_single_letter

    return interacting_residues

# calculate secondary structure percentage of design
def calc_ss_percentage(pdb_file, advanced_settings, chain_id="B", atom_distance_cutoff=4.0):
    # Parse the structure
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('protein', pdb_file)
    model = structure[0]  # Consider only the first model in the structure

    # Calculate DSSP for the model
    dssp = DSSP(model, pdb_file, dssp='mkdssp')

    # Prepare to count residues
    ss_counts = defaultdict(int)
    ss_interface_counts = defaultdict(int)
    plddts_interface = []
    plddts_ss = []

    # Get chain and interacting residues once
    chain = model[chain_id]
    interacting_residues = set(hotspot_residues(pdb_file, chain_id, atom_distance_cutoff).keys())

    for residue in chain:
        residue_id = residue.id[1]
        if (chain_id, residue_id) in dssp:
            ss = dssp[(chain_id, residue_id)][2]  # Get the secondary structure
            ss_type = 'loop'
            if ss in ['H', 'G', 'I']:
                ss_type = 'helix'
            elif ss == 'E':
                ss_type = 'sheet'

            ss_counts[ss_type] += 1

            if ss_type != 'loop':
                # calculate secondary structure normalised pLDDT
                avg_plddt_ss = sum(atom.bfactor for atom in residue) / len(residue)
                plddts_ss.append(avg_plddt_ss)

            if residue_id in interacting_residues:
                ss_interface_counts[ss_type] += 1

                # calculate interface pLDDT
                avg_plddt_residue = sum(atom.bfactor for atom in residue) / len(residue)
                plddts_interface.append(avg_plddt_residue)

    # Calculate percentages
    total_residues = sum(ss_counts.values())
    total_interface_residues = sum(ss_interface_counts.values())

    percentages = calculate_percentages(total_residues, ss_counts['helix'], ss_counts['sheet'])
    interface_percentages = calculate_percentages(total_interface_residues, ss_interface_counts['helix'], ss_interface_counts['sheet'])

    i_plddt = round(sum(plddts_interface) / len(plddts_interface) / 100, 2) if plddts_interface else 0
    ss_plddt = round(sum(plddts_ss) / len(plddts_ss) / 100, 2) if plddts_ss else 0

    return (*percentages, *interface_percentages, i_plddt, ss_plddt)

def calculate_percentages(total, helix, sheet):
    helix_percentage = round((helix / total) * 100,2) if total > 0 else 0
    sheet_percentage = round((sheet / total) * 100,2) if total > 0 else 0
    loop_percentage = round(((total - helix - sheet) / total) * 100,2) if total > 0 else 0


    return helix_percentage, sheet_percentage, loop_percentage

In [4]:
advanced_settings = {
    "dssp_path": "/usr/local/bin/mkdssp",
    "omit_AAs": "C",
}
pdb_file = "./../target/8d9y.pdb"
calc_ss_percentage(pdb_file=pdb_file, advanced_settings=advanced_settings)

(6.85, 57.08, 36.07, 0.0, 85.71, 14.29, 0.36, 0.36)

In [5]:
import pandas as pd

AA_PROPERTIES = {
        "ALA": "nonpolar",
        "ARG": "positive",
        "ASN": "polar",
        "ASP": "negative",
        "CYS": "polar",
        "GLN": "polar",
        "GLU": "negative",
        "GLY": "nonpolar",
        "HIS": "positive",
        "ILE": "nonpolar",
        "LEU": "nonpolar",
        "LYS": "positive",
        "MET": "nonpolar",
        "PHE": "nonpolar",
        "PRO": "nonpolar",
        "SER": "polar",
        "THR": "polar",
        "TRP": "nonpolar",
        "TYR": "polar",
        "VAL": "nonpolar",
        "SEC": "polar"
    }

def create_structure_df(pdb_file, advanced_settings, chain_id=None, atom_distance_cutoff=4.0):
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('protein', pdb_file)
    model = structure[0]
    
    # Calculate DSSP for the model
    dssp = DSSP(model, pdb_file, dssp='mkdssp')
    
    # Prepare data for DataFrame
    data = []
    
    # Process either specific chain or all chains
    chains = [model[chain_id]] if chain_id else model.get_chains()
    
    for chain in chains:
        current_chain_id = chain.id
        # Get interacting residues if chain_id is specified
        interacting_residues = set()
        if current_chain_id:
            interacting_residues = set(hotspot_residues(pdb_file, current_chain_id, atom_distance_cutoff).keys())
        
        for residue in chain:
            residue_id = residue.id[1]
            res_name = residue.get_resname()
            
            if (current_chain_id, residue_id) in dssp:
                dssp_data = dssp[(current_chain_id, residue_id)]
                ss = dssp_data[2]
                acc = dssp_data[3]
                phi = dssp_data[4]
                psi = dssp_data[5]
                
                ss_type = 'loop'
                if ss in ['H', 'G', 'I']:
                    ss_type = 'helix'
                elif ss == 'E':
                    ss_type = 'sheet'
                    
                avg_plddt = sum(atom.bfactor for atom in residue) / len(residue) / 100
                
                data.append({
                    'chain_id': current_chain_id,
                    'residue_id': residue_id,
                    'residue_name': res_name,
                    'property': AA_PROPERTIES.get(res_name, 'unknown'),
                    'ss_type': ss_type,
                    'ss_code': ss,
                    'accessibility': acc,
                    'phi': phi,
                    'psi': psi,
                    'plddt': round(avg_plddt, 2),
                    'is_interface': residue_id in interacting_residues if chain_id else None
                })
    
    return pd.DataFrame(data)


# Example usage:
pdb_file = "./../target/8d9y.pdb"
df = create_structure_df(pdb_file, advanced_settings)
df.head()

Unnamed: 0,chain_id,residue_id,residue_name,property,ss_type,ss_code,accessibility,phi,psi,plddt,is_interface
0,A,1,GLU,negative,loop,-,0.680412,360.0,122.5,0.52,
1,A,2,ILE,nonpolar,loop,-,0.011834,-73.0,130.0,0.37,
2,A,3,VAL,nonpolar,loop,-,0.028169,-102.2,130.1,0.33,
3,A,4,LEU,nonpolar,sheet,E,0.0,-108.9,127.0,0.36,
4,A,5,THR,polar,sheet,E,0.091549,-111.7,104.5,0.34,


In [6]:
df.tail(10)

Unnamed: 0,chain_id,residue_id,residue_name,property,ss_type,ss_code,accessibility,phi,psi,plddt,is_interface
1985,L,56,ILE,nonpolar,sheet,E,0.189349,-113.3,129.5,0.52,
1986,L,57,ILE,nonpolar,sheet,E,0.568047,-121.4,113.8,0.51,
1987,L,58,CYS,polar,sheet,E,0.325926,-111.9,157.9,0.64,
1988,L,59,CYS,polar,sheet,E,0.237037,-149.7,-169.6,0.83,
1989,L,60,SER,polar,loop,S,0.730769,-124.4,-39.1,0.89,
1990,L,61,THR,polar,loop,S,0.690141,-74.7,176.3,0.85,
1991,L,62,ASP,negative,loop,T,0.730061,-83.3,134.4,0.84,
1992,L,63,ASN,polar,loop,T,0.43949,55.1,45.1,0.79,
1993,L,64,CYS,polar,loop,-,0.42963,-83.9,-31.4,0.9,
1994,L,65,ASN,polar,loop,-,0.210191,-85.3,360.0,0.68,


In [7]:
df.head(10)

Unnamed: 0,chain_id,residue_id,residue_name,property,ss_type,ss_code,accessibility,phi,psi,plddt,is_interface
0,A,1,GLU,negative,loop,-,0.680412,360.0,122.5,0.52,
1,A,2,ILE,nonpolar,loop,-,0.011834,-73.0,130.0,0.37,
2,A,3,VAL,nonpolar,loop,-,0.028169,-102.2,130.1,0.33,
3,A,4,LEU,nonpolar,sheet,E,0.0,-108.9,127.0,0.36,
4,A,5,THR,polar,sheet,E,0.091549,-111.7,104.5,0.34,
5,A,6,GLN,polar,sheet,E,0.040404,-88.1,138.9,0.33,
6,A,7,SER,polar,sheet,E,0.253846,-139.7,134.7,0.44,
7,A,8,PRO,nonpolar,loop,-,0.066176,-85.3,163.5,0.38,
8,A,9,SER,polar,loop,S,0.292308,-65.8,-24.2,0.38,
9,A,10,SER,polar,sheet,E,0.253846,-153.8,152.3,0.41,


In [8]:
df_target = df[df['chain_id'] == 'I']
df_target = df_target.reset_index(drop=True)
df_target.index = range(1, len(df_target) + 1)

In [9]:
df_target.to_csv('8d9y_chain_B_secondary_structure_information.csv', index=False)

In [10]:
df_target[df_target['property'] == 'nonpolar']

Unnamed: 0,chain_id,residue_id,residue_name,property,ss_type,ss_code,accessibility,phi,psi,plddt,is_interface
4,I,4,PHE,nonpolar,sheet,E,0.269036,-70.6,130.8,0.37,
7,I,7,PRO,nonpolar,loop,T,0.330882,-71.1,-46.6,0.39,
9,I,9,VAL,nonpolar,loop,T,0.43662,-64.8,103.2,0.46,
15,I,15,PRO,nonpolar,loop,P,0.544118,-55.9,127.2,0.78,
16,I,16,PRO,nonpolar,loop,T,0.948529,-53.1,145.0,0.83,
17,I,17,GLY,nonpolar,loop,T,0.607143,99.5,-43.3,0.81,
20,I,20,VAL,nonpolar,sheet,E,0.197183,-122.3,123.6,0.59,
26,I,26,TRP,nonpolar,sheet,E,0.220264,-159.6,172.0,0.45,
29,I,29,GLY,nonpolar,loop,T,0.380952,-75.7,-7.8,0.45,
31,I,31,GLY,nonpolar,helix,H,0.238095,-60.2,-21.0,0.5,


In [11]:
display(Markdown(df_target[df_target['property'] == 'nonpolar'].sort_values('accessibility', ascending=False).to_markdown()))

|    | chain_id   |   residue_id | residue_name   | property   | ss_type   | ss_code   |   accessibility |    phi |    psi |   plddt | is_interface   |
|---:|:-----------|-------------:|:---------------|:-----------|:----------|:----------|----------------:|-------:|-------:|--------:|:---------------|
| 32 | I          |           32 | GLY            | nonpolar   | helix     | H         |      1          |  -70.1 |  -35.6 |    0.52 |                |
| 16 | I          |           16 | PRO            | nonpolar   | loop      | T         |      0.948529   |  -53.1 |  145   |    0.83 |                |
| 68 | I          |           68 | PRO            | nonpolar   | loop      | -         |      0.735294   |  -70.5 |  360   |    0.65 |                |
| 17 | I          |           17 | GLY            | nonpolar   | loop      | T         |      0.607143   |   99.5 |  -43.3 |    0.81 |                |
| 44 | I          |           44 | ALA            | nonpolar   | loop      | S         |      0.566038   |  -86.7 |  -21.1 |    0.61 |                |
| 15 | I          |           15 | PRO            | nonpolar   | loop      | P         |      0.544118   |  -55.9 |  127.2 |    0.78 |                |
| 57 | I          |           57 | ILE            | nonpolar   | sheet     | E         |      0.538462   | -115.1 |  133.6 |    0.43 |                |
| 49 | I          |           49 | PRO            | nonpolar   | loop      | -         |      0.455882   |  -59.3 |  132.4 |    0.72 |                |
|  9 | I          |            9 | VAL            | nonpolar   | loop      | T         |      0.43662    |  -64.8 |  103.2 |    0.46 |                |
| 29 | I          |           29 | GLY            | nonpolar   | loop      | T         |      0.380952   |  -75.7 |   -7.8 |    0.45 |                |
| 67 | I          |           67 | PHE            | nonpolar   | loop      | -         |      0.365482   |  -75   |  132.8 |    0.54 |                |
|  7 | I          |            7 | PRO            | nonpolar   | loop      | T         |      0.330882   |  -71.1 |  -46.6 |    0.39 |                |
| 47 | I          |           47 | PRO            | nonpolar   | loop      | -         |      0.286765   |  -63.6 |  156   |    0.63 |                |
|  4 | I          |            4 | PHE            | nonpolar   | sheet     | E         |      0.269036   |  -70.6 |  130.8 |    0.37 |                |
| 35 | I          |           35 | GLY            | nonpolar   | loop      | -         |      0.25       |   88.9 | -162.5 |    0.44 |                |
| 31 | I          |           31 | GLY            | nonpolar   | helix     | H         |      0.238095   |  -60.2 |  -21   |    0.5  |                |
| 26 | I          |           26 | TRP            | nonpolar   | sheet     | E         |      0.220264   | -159.6 |  172   |    0.45 |                |
| 54 | I          |           54 | ILE            | nonpolar   | loop      | -         |      0.218935   |  -87.1 |  115.3 |    0.61 |                |
| 20 | I          |           20 | VAL            | nonpolar   | sheet     | E         |      0.197183   | -122.3 |  123.6 |    0.59 |                |
| 56 | I          |           56 | ILE            | nonpolar   | sheet     | E         |      0.171598   | -130.8 |  127.4 |    0.51 |                |
| 43 | I          |           43 | ALA            | nonpolar   | sheet     | E         |      0.0943396  | -170.2 |  151.5 |    0.63 |                |
| 41 | I          |           41 | GLY            | nonpolar   | sheet     | E         |      0.0833333  | -177.3 | -172.8 |    0.4  |                |
| 40 | I          |           40 | LEU            | nonpolar   | sheet     | E         |      0.0182927  | -134.8 |  138.1 |    0.4  |                |
| 38 | I          |           38 | VAL            | nonpolar   | sheet     | E         |      0.00704225 | -148.4 |  126.9 |    0.42 |                |

In [12]:
display(Markdown(df_target.sort_values('accessibility', ascending=False).to_markdown()))

|    | chain_id   |   residue_id | residue_name   | property   | ss_type   | ss_code   |   accessibility |    phi |    psi |   plddt | is_interface   |
|---:|:-----------|-------------:|:---------------|:-----------|:----------|:----------|----------------:|-------:|-------:|--------:|:---------------|
| 32 | I          |           32 | GLY            | nonpolar   | helix     | H         |      1          |  -70.1 |  -35.6 |    0.52 |                |
| 16 | I          |           16 | PRO            | nonpolar   | loop      | T         |      0.948529   |  -53.1 |  145   |    0.83 |                |
| 52 | I          |           52 | LYS            | positive   | loop      | T         |      0.917073   |   58   | -139.5 |    0.95 |                |
| 48 | I          |           48 | THR            | polar      | loop      | -         |      0.866197   | -106.1 |  121.8 |    0.65 |                |
| 45 | I          |           45 | THR            | polar      | loop      | S         |      0.802817   | -137.6 |  133.4 |    0.67 |                |
| 50 | I          |           50 | LYS            | positive   | loop      | -         |      0.770732   |  -55.8 |  -73.5 |    0.9  |                |
| 13 | I          |           13 | ARG            | positive   | sheet     | E         |      0.766129   |  -65.3 |  122.2 |    0.6  |                |
| 68 | I          |           68 | PRO            | nonpolar   | loop      | -         |      0.735294   |  -70.5 |  360   |    0.65 |                |
| 33 | I          |           33 | SER            | polar      | helix     | H         |      0.715385   | -107.4 |  -18.2 |    0.51 |                |
| 66 | I          |           66 | THR            | polar      | loop      | -         |      0.647887   |  -66.1 |  156.6 |    0.48 |                |
| 51 | I          |           51 | LYS            | positive   | loop      | S         |      0.634146   | -167.2 | -169.6 |    0.91 |                |
| 17 | I          |           17 | GLY            | nonpolar   | loop      | T         |      0.607143   |   99.5 |  -43.3 |    0.81 |                |
| 61 | I          |           61 | THR            | polar      | loop      | S         |      0.605634   | -127.5 |  163.7 |    0.44 |                |
| 62 | I          |           62 | ASP            | negative   | loop      | T         |      0.588957   |  -61.5 |  124   |    0.42 |                |
| 60 | I          |           60 | SER            | polar      | loop      | -         |      0.584615   | -147.9 |   40   |    0.52 |                |
| 19 | I          |           19 | GLU            | negative   | loop      | -         |      0.57732    | -125   |   18.3 |    0.68 |                |
| 44 | I          |           44 | ALA            | nonpolar   | loop      | S         |      0.566038   |  -86.7 |  -21.1 |    0.61 |                |
| 15 | I          |           15 | PRO            | nonpolar   | loop      | P         |      0.544118   |  -55.9 |  127.2 |    0.78 |                |
| 57 | I          |           57 | ILE            | nonpolar   | sheet     | E         |      0.538462   | -115.1 |  133.6 |    0.43 |                |
| 55 | I          |           55 | LYS            | positive   | sheet     | E         |      0.517073   | -117.8 |  142.6 |    0.55 |                |
| 12 | I          |           12 | GLU            | negative   | sheet     | E         |      0.469072   | -133.1 |  146.4 |    0.52 |                |
| 49 | I          |           49 | PRO            | nonpolar   | loop      | -         |      0.455882   |  -59.3 |  132.4 |    0.72 |                |
| 10 | I          |           10 | ARG            | positive   | sheet     | E         |      0.455645   | -148.7 |  157.8 |    0.55 |                |
|  1 | I          |            1 | ARG            | positive   | loop      | -         |      0.439516   |  360   |  162.4 |    0.62 |                |
| 27 | I          |           27 | THR            | polar      | sheet     | E         |      0.43662    | -100.5 |  127.9 |    0.51 |                |
|  9 | I          |            9 | VAL            | nonpolar   | loop      | T         |      0.43662    |  -64.8 |  103.2 |    0.46 |                |
| 53 | I          |           53 | ASP            | negative   | loop      | T         |      0.429448   |  -93   |   28   |    0.77 |                |
| 46 | I          |           46 | CYS            | polar      | loop      | -         |      0.4        |  -55.4 |  123.1 |    0.66 |                |
|  2 | I          |            2 | ARG            | positive   | sheet     | E         |      0.387097   | -116.5 |  138.8 |    0.43 |                |
| 29 | I          |           29 | GLY            | nonpolar   | loop      | T         |      0.380952   |  -75.7 |   -7.8 |    0.45 |                |
| 67 | I          |           67 | PHE            | nonpolar   | loop      | -         |      0.365482   |  -75   |  132.8 |    0.54 |                |
| 36 | I          |           36 | LYS            | positive   | sheet     | E         |      0.331707   |  -79.1 |  143.2 |    0.5  |                |
|  7 | I          |            7 | PRO            | nonpolar   | loop      | T         |      0.330882   |  -71.1 |  -46.6 |    0.39 |                |
| 24 | I          |           24 | LYS            | positive   | sheet     | E         |      0.326829   | -126.2 |  128.4 |    0.47 |                |
| 18 | I          |           18 | GLN            | polar      | loop      | -         |      0.318182   |  -76.7 |  113.7 |    0.72 |                |
| 63 | I          |           63 | ASN            | polar      | loop      | T         |      0.312102   |   55.9 |   43.5 |    0.36 |                |
| 58 | I          |           58 | CYS            | polar      | sheet     | E         |      0.296296   | -126.6 |  155.5 |    0.49 |                |
| 47 | I          |           47 | PRO            | nonpolar   | loop      | -         |      0.286765   |  -63.6 |  156   |    0.63 |                |
| 59 | I          |           59 | CYS            | polar      | sheet     | E         |      0.281481   | -154.8 | -179.2 |    0.43 |                |
|  4 | I          |            4 | PHE            | nonpolar   | sheet     | E         |      0.269036   |  -70.6 |  130.8 |    0.37 |                |
| 35 | I          |           35 | GLY            | nonpolar   | loop      | -         |      0.25       |   88.9 | -162.5 |    0.44 |                |
| 39 | I          |           39 | ASP            | negative   | sheet     | E         |      0.239264   | -117.2 |  142.6 |    0.49 |                |
| 31 | I          |           31 | GLY            | nonpolar   | helix     | H         |      0.238095   |  -60.2 |  -21   |    0.5  |                |
| 11 | I          |           11 | SER            | polar      | sheet     | E         |      0.230769   |  -71.6 |  130   |    0.35 |                |
| 64 | I          |           64 | CYS            | polar      | loop      | -         |      0.22963    |  -87   |  -17   |    0.38 |                |
| 26 | I          |           26 | TRP            | nonpolar   | sheet     | E         |      0.220264   | -159.6 |  172   |    0.45 |                |
| 54 | I          |           54 | ILE            | nonpolar   | loop      | -         |      0.218935   |  -87.1 |  115.3 |    0.61 |                |
| 20 | I          |           20 | VAL            | nonpolar   | sheet     | E         |      0.197183   | -122.3 |  123.6 |    0.59 |                |
| 22 | I          |           22 | TYR            | polar      | sheet     | E         |      0.184685   | -136.2 |  155.7 |    0.42 |                |
| 56 | I          |           56 | ILE            | nonpolar   | sheet     | E         |      0.171598   | -130.8 |  127.4 |    0.51 |                |
|  6 | I          |            6 | THR            | polar      | loop      | T         |      0.169014   |  -96.9 |  158.2 |    0.41 |                |
|  5 | I          |            5 | THR            | polar      | sheet     | E         |      0.147887   | -124.5 |  140.8 |    0.43 |                |
| 42 | I          |           42 | CYS            | polar      | sheet     | E         |      0.133333   |  -96.8 |  168.4 |    0.57 |                |
|  8 | I          |            8 | SER            | polar      | loop      | T         |      0.123077   |  -89.3 |  105.9 |    0.48 |                |
| 43 | I          |           43 | ALA            | nonpolar   | sheet     | E         |      0.0943396  | -170.2 |  151.5 |    0.63 |                |
| 37 | I          |           37 | ARG            | positive   | sheet     | E         |      0.0927419  |  -66.6 |  161.7 |    0.43 |                |
| 28 | I          |           28 | ASP            | negative   | loop      | -         |      0.0858896  |  -87.4 | -175.7 |    0.45 |                |
| 41 | I          |           41 | GLY            | nonpolar   | sheet     | E         |      0.0833333  | -177.3 | -172.8 |    0.4  |                |
| 23 | I          |           23 | THR            | polar      | sheet     | E         |      0.0774648  | -127.6 |  126.4 |    0.37 |                |
| 25 | I          |           25 | THR            | polar      | sheet     | E         |      0.0422535  | -123.4 |  149.2 |    0.45 |                |
| 34 | I          |           34 | ARG            | positive   | helix     | H         |      0.0403226  | -119.2 |  -12.3 |    0.47 |                |
| 30 | I          |           30 | HIS            | positive   | loop      | T         |      0.0380435  | -106.5 |    9.4 |    0.47 |                |
| 14 | I          |           14 | CYS            | polar      | loop      | P         |      0.0222222  |  -64.3 |  144.4 |    0.69 |                |
| 65 | I          |           65 | ASN            | polar      | loop      | -         |      0.0191083  |  -89.7 |   63.2 |    0.39 |                |
| 40 | I          |           40 | LEU            | nonpolar   | sheet     | E         |      0.0182927  | -134.8 |  138.1 |    0.4  |                |
| 38 | I          |           38 | VAL            | nonpolar   | sheet     | E         |      0.00704225 | -148.4 |  126.9 |    0.42 |                |
| 21 | I          |           21 | CYS            | polar      | sheet     | E         |      0          |  -85.1 |  148.2 |    0.45 |                |
|  3 | I          |            3 | CYS            | polar      | sheet     | E         |      0          | -140.7 |  154   |    0.4  |                |

In [13]:
display(Markdown(df_target.to_markdown()))

|    | chain_id   |   residue_id | residue_name   | property   | ss_type   | ss_code   |   accessibility |    phi |    psi |   plddt | is_interface   |
|---:|:-----------|-------------:|:---------------|:-----------|:----------|:----------|----------------:|-------:|-------:|--------:|:---------------|
|  1 | I          |            1 | ARG            | positive   | loop      | -         |      0.439516   |  360   |  162.4 |    0.62 |                |
|  2 | I          |            2 | ARG            | positive   | sheet     | E         |      0.387097   | -116.5 |  138.8 |    0.43 |                |
|  3 | I          |            3 | CYS            | polar      | sheet     | E         |      0          | -140.7 |  154   |    0.4  |                |
|  4 | I          |            4 | PHE            | nonpolar   | sheet     | E         |      0.269036   |  -70.6 |  130.8 |    0.37 |                |
|  5 | I          |            5 | THR            | polar      | sheet     | E         |      0.147887   | -124.5 |  140.8 |    0.43 |                |
|  6 | I          |            6 | THR            | polar      | loop      | T         |      0.169014   |  -96.9 |  158.2 |    0.41 |                |
|  7 | I          |            7 | PRO            | nonpolar   | loop      | T         |      0.330882   |  -71.1 |  -46.6 |    0.39 |                |
|  8 | I          |            8 | SER            | polar      | loop      | T         |      0.123077   |  -89.3 |  105.9 |    0.48 |                |
|  9 | I          |            9 | VAL            | nonpolar   | loop      | T         |      0.43662    |  -64.8 |  103.2 |    0.46 |                |
| 10 | I          |           10 | ARG            | positive   | sheet     | E         |      0.455645   | -148.7 |  157.8 |    0.55 |                |
| 11 | I          |           11 | SER            | polar      | sheet     | E         |      0.230769   |  -71.6 |  130   |    0.35 |                |
| 12 | I          |           12 | GLU            | negative   | sheet     | E         |      0.469072   | -133.1 |  146.4 |    0.52 |                |
| 13 | I          |           13 | ARG            | positive   | sheet     | E         |      0.766129   |  -65.3 |  122.2 |    0.6  |                |
| 14 | I          |           14 | CYS            | polar      | loop      | P         |      0.0222222  |  -64.3 |  144.4 |    0.69 |                |
| 15 | I          |           15 | PRO            | nonpolar   | loop      | P         |      0.544118   |  -55.9 |  127.2 |    0.78 |                |
| 16 | I          |           16 | PRO            | nonpolar   | loop      | T         |      0.948529   |  -53.1 |  145   |    0.83 |                |
| 17 | I          |           17 | GLY            | nonpolar   | loop      | T         |      0.607143   |   99.5 |  -43.3 |    0.81 |                |
| 18 | I          |           18 | GLN            | polar      | loop      | -         |      0.318182   |  -76.7 |  113.7 |    0.72 |                |
| 19 | I          |           19 | GLU            | negative   | loop      | -         |      0.57732    | -125   |   18.3 |    0.68 |                |
| 20 | I          |           20 | VAL            | nonpolar   | sheet     | E         |      0.197183   | -122.3 |  123.6 |    0.59 |                |
| 21 | I          |           21 | CYS            | polar      | sheet     | E         |      0          |  -85.1 |  148.2 |    0.45 |                |
| 22 | I          |           22 | TYR            | polar      | sheet     | E         |      0.184685   | -136.2 |  155.7 |    0.42 |                |
| 23 | I          |           23 | THR            | polar      | sheet     | E         |      0.0774648  | -127.6 |  126.4 |    0.37 |                |
| 24 | I          |           24 | LYS            | positive   | sheet     | E         |      0.326829   | -126.2 |  128.4 |    0.47 |                |
| 25 | I          |           25 | THR            | polar      | sheet     | E         |      0.0422535  | -123.4 |  149.2 |    0.45 |                |
| 26 | I          |           26 | TRP            | nonpolar   | sheet     | E         |      0.220264   | -159.6 |  172   |    0.45 |                |
| 27 | I          |           27 | THR            | polar      | sheet     | E         |      0.43662    | -100.5 |  127.9 |    0.51 |                |
| 28 | I          |           28 | ASP            | negative   | loop      | -         |      0.0858896  |  -87.4 | -175.7 |    0.45 |                |
| 29 | I          |           29 | GLY            | nonpolar   | loop      | T         |      0.380952   |  -75.7 |   -7.8 |    0.45 |                |
| 30 | I          |           30 | HIS            | positive   | loop      | T         |      0.0380435  | -106.5 |    9.4 |    0.47 |                |
| 31 | I          |           31 | GLY            | nonpolar   | helix     | H         |      0.238095   |  -60.2 |  -21   |    0.5  |                |
| 32 | I          |           32 | GLY            | nonpolar   | helix     | H         |      1          |  -70.1 |  -35.6 |    0.52 |                |
| 33 | I          |           33 | SER            | polar      | helix     | H         |      0.715385   | -107.4 |  -18.2 |    0.51 |                |
| 34 | I          |           34 | ARG            | positive   | helix     | H         |      0.0403226  | -119.2 |  -12.3 |    0.47 |                |
| 35 | I          |           35 | GLY            | nonpolar   | loop      | -         |      0.25       |   88.9 | -162.5 |    0.44 |                |
| 36 | I          |           36 | LYS            | positive   | sheet     | E         |      0.331707   |  -79.1 |  143.2 |    0.5  |                |
| 37 | I          |           37 | ARG            | positive   | sheet     | E         |      0.0927419  |  -66.6 |  161.7 |    0.43 |                |
| 38 | I          |           38 | VAL            | nonpolar   | sheet     | E         |      0.00704225 | -148.4 |  126.9 |    0.42 |                |
| 39 | I          |           39 | ASP            | negative   | sheet     | E         |      0.239264   | -117.2 |  142.6 |    0.49 |                |
| 40 | I          |           40 | LEU            | nonpolar   | sheet     | E         |      0.0182927  | -134.8 |  138.1 |    0.4  |                |
| 41 | I          |           41 | GLY            | nonpolar   | sheet     | E         |      0.0833333  | -177.3 | -172.8 |    0.4  |                |
| 42 | I          |           42 | CYS            | polar      | sheet     | E         |      0.133333   |  -96.8 |  168.4 |    0.57 |                |
| 43 | I          |           43 | ALA            | nonpolar   | sheet     | E         |      0.0943396  | -170.2 |  151.5 |    0.63 |                |
| 44 | I          |           44 | ALA            | nonpolar   | loop      | S         |      0.566038   |  -86.7 |  -21.1 |    0.61 |                |
| 45 | I          |           45 | THR            | polar      | loop      | S         |      0.802817   | -137.6 |  133.4 |    0.67 |                |
| 46 | I          |           46 | CYS            | polar      | loop      | -         |      0.4        |  -55.4 |  123.1 |    0.66 |                |
| 47 | I          |           47 | PRO            | nonpolar   | loop      | -         |      0.286765   |  -63.6 |  156   |    0.63 |                |
| 48 | I          |           48 | THR            | polar      | loop      | -         |      0.866197   | -106.1 |  121.8 |    0.65 |                |
| 49 | I          |           49 | PRO            | nonpolar   | loop      | -         |      0.455882   |  -59.3 |  132.4 |    0.72 |                |
| 50 | I          |           50 | LYS            | positive   | loop      | -         |      0.770732   |  -55.8 |  -73.5 |    0.9  |                |
| 51 | I          |           51 | LYS            | positive   | loop      | S         |      0.634146   | -167.2 | -169.6 |    0.91 |                |
| 52 | I          |           52 | LYS            | positive   | loop      | T         |      0.917073   |   58   | -139.5 |    0.95 |                |
| 53 | I          |           53 | ASP            | negative   | loop      | T         |      0.429448   |  -93   |   28   |    0.77 |                |
| 54 | I          |           54 | ILE            | nonpolar   | loop      | -         |      0.218935   |  -87.1 |  115.3 |    0.61 |                |
| 55 | I          |           55 | LYS            | positive   | sheet     | E         |      0.517073   | -117.8 |  142.6 |    0.55 |                |
| 56 | I          |           56 | ILE            | nonpolar   | sheet     | E         |      0.171598   | -130.8 |  127.4 |    0.51 |                |
| 57 | I          |           57 | ILE            | nonpolar   | sheet     | E         |      0.538462   | -115.1 |  133.6 |    0.43 |                |
| 58 | I          |           58 | CYS            | polar      | sheet     | E         |      0.296296   | -126.6 |  155.5 |    0.49 |                |
| 59 | I          |           59 | CYS            | polar      | sheet     | E         |      0.281481   | -154.8 | -179.2 |    0.43 |                |
| 60 | I          |           60 | SER            | polar      | loop      | -         |      0.584615   | -147.9 |   40   |    0.52 |                |
| 61 | I          |           61 | THR            | polar      | loop      | S         |      0.605634   | -127.5 |  163.7 |    0.44 |                |
| 62 | I          |           62 | ASP            | negative   | loop      | T         |      0.588957   |  -61.5 |  124   |    0.42 |                |
| 63 | I          |           63 | ASN            | polar      | loop      | T         |      0.312102   |   55.9 |   43.5 |    0.36 |                |
| 64 | I          |           64 | CYS            | polar      | loop      | -         |      0.22963    |  -87   |  -17   |    0.38 |                |
| 65 | I          |           65 | ASN            | polar      | loop      | -         |      0.0191083  |  -89.7 |   63.2 |    0.39 |                |
| 66 | I          |           66 | THR            | polar      | loop      | -         |      0.647887   |  -66.1 |  156.6 |    0.48 |                |
| 67 | I          |           67 | PHE            | nonpolar   | loop      | -         |      0.365482   |  -75   |  132.8 |    0.54 |                |
| 68 | I          |           68 | PRO            | nonpolar   | loop      | -         |      0.735294   |  -70.5 |  360   |    0.65 |                |

In [14]:
from Bio.PDB import PDBParser
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.Data import IUPACData
from Bio.SeqUtils import ProtParamData

# 3-letter to 1-letter code map
three_to_one = IUPACData.protein_letters_3to1

# Load PDB
pdb_file = "./../target/8d9y.pdb"
parser = PDBParser()
structure = parser.get_structure("8d9y", pdb_file)

# Extract sequence and map residues
residue_list = []
sequence = ""

for chain in structure[0]:  # First model
    if chain.id == 'F':
        for residue in chain:
            resname = residue.get_resname().strip()
            try:
                one_letter = three_to_one[resname.capitalize()]
                sequence += one_letter
                residue_list.append(residue)
            except KeyError:
                continue  # Skip non-standard residues

print("Sequence:", sequence)
print("Length:", len(sequence))

# Hydropathy analysis with window
window = 9
pa = ProteinAnalysis(sequence)
hydropathy_values = pa.protein_scale(param_dict=ProtParamData.kd, window=window)

# Align hydropathy values to the residue list
offset = window // 2  # Values are centered in the window

for i, residue in enumerate(residue_list):
    resname = residue.get_resname()
    resnum = residue.get_id()[1]
    if offset <= i < len(residue_list) - offset:
        hydro = hydropathy_values[i - offset]
        print(f"Residue {resname} {resnum:>3}: Hydropathy = {hydro:.2f}")
    else:
        print(f"Residue {resname} {resnum:>3}: Hydropathy = NA")


Sequence: EVQLVESGGGFVQPGGSLRLSCAASGFTFSNFDMHWVRQSPGKGLEWVSGLDHSGGAHYAGSVKGRFTISREDAKNSLDLQMNNLRVDDTAVYFCVRGTLYHYTSGSYYSDAFDIWGQGTLVTVSSASVKGPSVFPLAPTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPK
Length: 219
Residue GLU   1: Hydropathy = NA
Residue VAL   2: Hydropathy = NA
Residue GLN   3: Hydropathy = NA
Residue LEU   4: Hydropathy = NA
Residue VAL   5: Hydropathy = 0.01
Residue GLU   6: Hydropathy = 0.36
Residue SER   7: Hydropathy = 0.20
Residue GLY   8: Hydropathy = 1.06
Residue GLY   9: Hydropathy = 0.24
Residue GLY  10: Hydropathy = -0.40
Residue PHE  11: Hydropathy = -0.06
Residue VAL  12: Hydropathy = -0.01
Residue GLN  13: Hydropathy = -0.06
Residue PRO  14: Hydropathy = 0.41
Residue GLY  15: Hydropathy = -0.04
Residue GLY  16: Hydropathy = 0.07
Residue SER  17: Hydropathy = -0.49
Residue LEU  18: Hydropathy = 0.18
Residue ARG  19: Hydropathy = 0.56
Residue LEU  20: Hydropathy = 0.80
Residue SER  21: Hydropathy = 0.76
Residue CYS  22: Hydropat



In [15]:
import pandas as pd
from Bio.PDB import PDBParser
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.Data import IUPACData
from Bio.SeqUtils import ProtParamData

def calculate_hydropathy_df(pdb_file, chain_id='F', window=9):
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("structure", pdb_file)
    
    three_to_one = IUPACData.protein_letters_3to1

    # Extract sequence and residues
    sequence = ""
    residue_info = []

    for chain in structure[0]:  # first model
        if chain.id == chain_id:
            for residue in chain:
                resname = residue.get_resname().strip()
                try:
                    one_letter = three_to_one[resname.capitalize()]
                    sequence += one_letter
                    residue_info.append({
                        'residue_name': resname,
                        'residue_id': residue.get_id()[1],
                        'chain_id': chain.id,
                        'one_letter': one_letter
                    })
                except KeyError:
                    continue  # skip non-standard residues

    # Calculate hydropathy
    analysed_seq = ProteinAnalysis(sequence)
    hydro_values = analysed_seq.protein_scale(param_dict=ProtParamData.kd, window=window)
    offset = window // 2

    # Add hydropathy to residue info
    for i in range(len(residue_info)):
        if offset <= i < len(residue_info) - offset:
            residue_info[i]['hydropathy'] = round(hydro_values[i - offset], 3)
        else:
            residue_info[i]['hydropathy'] = None

    # Add index
    for i, r in enumerate(residue_info):
        r['index'] = i + 1

    return pd.DataFrame(residue_info)

#  Run this
df_hydro = calculate_hydropathy_df("./../target/8d9y.pdb", chain_id='I', window=9)
print(df_hydro.head())

  residue_name  residue_id chain_id one_letter  hydropathy  index
0          ARG           1        I          R         NaN      1
1          ARG           2        I          R         NaN      2
2          CYS           3        I          C         NaN      3
3          PHE           4        I          F         NaN      4
4          THR           5        I          T      -0.367      5


In [16]:
# Ensure consistent residue name format
df_hydro = df_hydro[['chain_id', 'residue_id', 'residue_name', 'hydropathy']]
df_hydro['residue_name'] = df_hydro['residue_name'].str.upper()

# Merge only hydropathy into df_target
df_merged = df_target.merge(
    df_hydro,
    on=['chain_id', 'residue_id', 'residue_name'],
    how='left'  # keep all rows from df_target
)

# Get list of columns
cols = df_merged.columns.tolist()
# Find the index of 'accessibility' column
access_idx = cols.index('accessibility')
# Remove 'hydropathy' from its current location
cols.remove('hydropathy')
# Insert 'hydropathy' right after 'accessibility'
cols.insert(access_idx + 1, 'hydropathy')
# Reorder dataframe
df_merged = df_merged[cols]

# df_merged = df_merged.sort_values(by=['accessibility', 'hydropathy'], ascending=[False, False]).reset_index(drop=True)

# Display result
from IPython.display import display, Markdown
display(Markdown(df_merged.to_markdown(index=False)))

| chain_id   |   residue_id | residue_name   | property   | ss_type   | ss_code   |   accessibility |   hydropathy |    phi |    psi |   plddt | is_interface   |
|:-----------|-------------:|:---------------|:-----------|:----------|:----------|----------------:|-------------:|-------:|-------:|--------:|:---------------|
| I          |            1 | ARG            | positive   | loop      | -         |      0.439516   |      nan     |  360   |  162.4 |    0.62 |                |
| I          |            2 | ARG            | positive   | sheet     | E         |      0.387097   |      nan     | -116.5 |  138.8 |    0.43 |                |
| I          |            3 | CYS            | polar      | sheet     | E         |      0          |      nan     | -140.7 |  154   |    0.4  |                |
| I          |            4 | PHE            | nonpolar   | sheet     | E         |      0.269036   |      nan     |  -70.6 |  130.8 |    0.37 |                |
| I          |            5 | THR            | polar      | sheet     | E         |      0.147887   |       -0.367 | -124.5 |  140.8 |    0.43 |                |
| I          |            6 | THR            | polar      | loop      | T         |      0.169014   |       -0.367 |  -96.9 |  158.2 |    0.41 |                |
| I          |            7 | PRO            | nonpolar   | loop      | T         |      0.330882   |        0.044 |  -71.1 |  -46.6 |    0.39 |                |
| I          |            8 | SER            | polar      | loop      | T         |      0.123077   |       -0.622 |  -89.3 |  105.9 |    0.48 |                |
| I          |            9 | VAL            | nonpolar   | loop      | T         |      0.43662    |       -1.433 |  -64.8 |  103.2 |    0.46 |                |
| I          |           10 | ARG            | positive   | sheet     | E         |      0.455645   |       -1.078 | -148.7 |  157.8 |    0.55 |                |
| I          |           11 | SER            | polar      | sheet     | E         |      0.230769   |       -1.178 |  -71.6 |  130   |    0.35 |                |
| I          |           12 | GLU            | negative   | sheet     | E         |      0.469072   |       -1.178 | -133.1 |  146.4 |    0.52 |                |
| I          |           13 | ARG            | positive   | sheet     | E         |      0.766129   |       -1.133 |  -65.3 |  122.2 |    0.6  |                |
| I          |           14 | CYS            | polar      | loop      | P         |      0.0222222  |       -1.989 |  -64.3 |  144.4 |    0.69 |                |
| I          |           15 | PRO            | nonpolar   | loop      | P         |      0.544118   |       -1.878 |  -55.9 |  127.2 |    0.78 |                |
| I          |           16 | PRO            | nonpolar   | loop      | T         |      0.948529   |       -1.322 |  -53.1 |  145   |    0.83 |                |
| I          |           17 | GLY            | nonpolar   | loop      | T         |      0.607143   |       -0.656 |   99.5 |  -43.3 |    0.81 |                |
| I          |           18 | GLN            | polar      | loop      | -         |      0.318182   |       -0.3   |  -76.7 |  113.7 |    0.72 |                |
| I          |           19 | GLU            | negative   | loop      | -         |      0.57732    |       -0.656 | -125   |   18.3 |    0.68 |                |
| I          |           20 | VAL            | nonpolar   | sheet     | E         |      0.197183   |       -0.911 | -122.3 |  123.6 |    0.59 |                |
| I          |           21 | CYS            | polar      | sheet     | E         |      0          |       -0.811 |  -85.1 |  148.2 |    0.45 |                |
| I          |           22 | TYR            | polar      | sheet     | E         |      0.184685   |       -0.867 | -136.2 |  155.7 |    0.42 |                |
| I          |           23 | THR            | polar      | sheet     | E         |      0.0774648  |       -0.556 | -127.6 |  126.4 |    0.37 |                |
| I          |           24 | LYS            | positive   | sheet     | E         |      0.326829   |       -0.556 | -126.2 |  128.4 |    0.47 |                |
| I          |           25 | THR            | polar      | sheet     | E         |      0.0422535  |       -1.067 | -123.4 |  149.2 |    0.45 |                |
| I          |           26 | TRP            | nonpolar   | sheet     | E         |      0.220264   |       -1.7   | -159.6 |  172   |    0.45 |                |
| I          |           27 | THR            | polar      | sheet     | E         |      0.43662    |       -1.6   | -100.5 |  127.9 |    0.51 |                |
| I          |           28 | ASP            | negative   | loop      | -         |      0.0858896  |       -1.567 |  -87.4 | -175.7 |    0.45 |                |
| I          |           29 | GLY            | nonpolar   | loop      | T         |      0.380952   |       -1.222 |  -75.7 |   -7.8 |    0.45 |                |
| I          |           30 | HIS            | positive   | loop      | T         |      0.0380435  |       -1.644 | -106.5 |    9.4 |    0.47 |                |
| I          |           31 | GLY            | nonpolar   | helix     | H         |      0.238095   |       -1.589 |  -60.2 |  -21   |    0.5  |                |
| I          |           32 | GLY            | nonpolar   | helix     | H         |      1          |       -1.944 |  -70.1 |  -35.6 |    0.52 |                |
| I          |           33 | SER            | polar      | helix     | H         |      0.715385   |       -2.056 | -107.4 |  -18.2 |    0.51 |                |
| I          |           34 | ARG            | positive   | helix     | H         |      0.0403226  |       -1.544 | -119.2 |  -12.3 |    0.47 |                |
| I          |           35 | GLY            | nonpolar   | loop      | -         |      0.25       |       -1.578 |   88.9 | -162.5 |    0.44 |                |
| I          |           36 | LYS            | positive   | sheet     | E         |      0.331707   |       -1.111 |  -79.1 |  143.2 |    0.5  |                |
| I          |           37 | ARG            | positive   | sheet     | E         |      0.0927419  |       -1.111 |  -66.6 |  161.7 |    0.43 |                |
| I          |           38 | VAL            | nonpolar   | sheet     | E         |      0.00704225 |       -0.744 | -148.4 |  126.9 |    0.42 |                |
| I          |           39 | ASP            | negative   | sheet     | E         |      0.239264   |       -0.044 | -117.2 |  142.6 |    0.49 |                |
| I          |           40 | LEU            | nonpolar   | sheet     | E         |      0.0182927  |        0.2   | -134.8 |  138.1 |    0.4  |                |
| I          |           41 | GLY            | nonpolar   | sheet     | E         |      0.0833333  |        0.556 | -177.3 | -172.8 |    0.4  |                |
| I          |           42 | CYS            | polar      | sheet     | E         |      0.133333   |        1.333 |  -96.8 |  168.4 |    0.57 |                |
| I          |           43 | ALA            | nonpolar   | sheet     | E         |      0.0943396  |        0.689 | -170.2 |  151.5 |    0.63 |                |
| I          |           44 | ALA            | nonpolar   | loop      | S         |      0.566038   |        1     |  -86.7 |  -21.1 |    0.61 |                |
| I          |           45 | THR            | polar      | loop      | S         |      0.802817   |        0.4   | -137.6 |  133.4 |    0.67 |                |
| I          |           46 | CYS            | polar      | loop      | -         |      0.4        |        0.011 |  -55.4 |  123.1 |    0.66 |                |
| I          |           47 | PRO            | nonpolar   | loop      | -         |      0.286765   |       -0.7   |  -63.6 |  156   |    0.63 |                |
| I          |           48 | THR            | polar      | loop      | -         |      0.866197   |       -1.333 | -106.1 |  121.8 |    0.65 |                |
| I          |           49 | PRO            | nonpolar   | loop      | -         |      0.455882   |       -1.922 |  -59.3 |  132.4 |    0.72 |                |
| I          |           50 | LYS            | positive   | loop      | -         |      0.770732   |       -1.344 |  -55.8 |  -73.5 |    0.9  |                |
| I          |           51 | LYS            | positive   | loop      | S         |      0.634146   |       -2.056 | -167.2 | -169.6 |    0.91 |                |
| I          |           52 | LYS            | positive   | loop      | T         |      0.917073   |       -1.378 |   58   | -139.5 |    0.95 |                |
| I          |           53 | ASP            | negative   | loop      | T         |      0.429448   |       -0.8   |  -93   |   28   |    0.77 |                |
| I          |           54 | ILE            | nonpolar   | loop      | -         |      0.218935   |       -0.344 |  -87.1 |  115.3 |    0.61 |                |
| I          |           55 | LYS            | positive   | sheet     | E         |      0.517073   |        0.367 | -117.8 |  142.6 |    0.55 |                |
| I          |           56 | ILE            | nonpolar   | sheet     | E         |      0.171598   |        0.711 | -130.8 |  127.4 |    0.51 |                |
| I          |           57 | ILE            | nonpolar   | sheet     | E         |      0.538462   |        1.067 | -115.1 |  133.6 |    0.43 |                |
| I          |           58 | CYS            | polar      | sheet     | E         |      0.296296   |        1.067 | -126.6 |  155.5 |    0.49 |                |
| I          |           59 | CYS            | polar      | sheet     | E         |      0.281481   |        0.178 | -154.8 | -179.2 |    0.43 |                |
| I          |           60 | SER            | polar      | loop      | -         |      0.584615   |        0.889 | -147.9 |   40   |    0.52 |                |
| I          |           61 | THR            | polar      | loop      | S         |      0.605634   |        0     | -127.5 |  163.7 |    0.44 |                |
| I          |           62 | ASP            | negative   | loop      | T         |      0.588957   |       -0.578 |  -61.5 |  124   |    0.42 |                |
| I          |           63 | ASN            | polar      | loop      | T         |      0.312102   |       -0.544 |   55.9 |   43.5 |    0.36 |                |
| I          |           64 | CYS            | polar      | loop      | -         |      0.22963    |       -1     |  -87   |  -17   |    0.38 |                |
| I          |           65 | ASN            | polar      | loop      | -         |      0.0191083  |      nan     |  -89.7 |   63.2 |    0.39 |                |
| I          |           66 | THR            | polar      | loop      | -         |      0.647887   |      nan     |  -66.1 |  156.6 |    0.48 |                |
| I          |           67 | PHE            | nonpolar   | loop      | -         |      0.365482   |      nan     |  -75   |  132.8 |    0.54 |                |
| I          |           68 | PRO            | nonpolar   | loop      | -         |      0.735294   |      nan     |  -70.5 |  360   |    0.65 |                |

In [17]:
df_filtered = df_merged[(df_merged['accessibility'] > 0.7)]
display(Markdown(df_filtered.to_markdown(index=False)))

| chain_id   |   residue_id | residue_name   | property   | ss_type   | ss_code   |   accessibility |   hydropathy |    phi |    psi |   plddt | is_interface   |
|:-----------|-------------:|:---------------|:-----------|:----------|:----------|----------------:|-------------:|-------:|-------:|--------:|:---------------|
| I          |           13 | ARG            | positive   | sheet     | E         |        0.766129 |       -1.133 |  -65.3 |  122.2 |    0.6  |                |
| I          |           16 | PRO            | nonpolar   | loop      | T         |        0.948529 |       -1.322 |  -53.1 |  145   |    0.83 |                |
| I          |           32 | GLY            | nonpolar   | helix     | H         |        1        |       -1.944 |  -70.1 |  -35.6 |    0.52 |                |
| I          |           33 | SER            | polar      | helix     | H         |        0.715385 |       -2.056 | -107.4 |  -18.2 |    0.51 |                |
| I          |           45 | THR            | polar      | loop      | S         |        0.802817 |        0.4   | -137.6 |  133.4 |    0.67 |                |
| I          |           48 | THR            | polar      | loop      | -         |        0.866197 |       -1.333 | -106.1 |  121.8 |    0.65 |                |
| I          |           50 | LYS            | positive   | loop      | -         |        0.770732 |       -1.344 |  -55.8 |  -73.5 |    0.9  |                |
| I          |           52 | LYS            | positive   | loop      | T         |        0.917073 |       -1.378 |   58   | -139.5 |    0.95 |                |
| I          |           68 | PRO            | nonpolar   | loop      | -         |        0.735294 |      nan     |  -70.5 |  360   |    0.65 |                |

In [18]:
df_filtered = df_merged[(df_merged['accessibility'] > 0.7) & (df_merged['hydropathy'] > 0.4)]
display(Markdown(df_filtered.to_markdown(index=False)))

| chain_id   | residue_id   | residue_name   | property   | ss_type   | ss_code   | accessibility   | hydropathy   | phi   | psi   | plddt   | is_interface   |
|------------|--------------|----------------|------------|-----------|-----------|-----------------|--------------|-------|-------|---------|----------------|

In [19]:
df_filtered = df_merged[(df_merged['accessibility'] > 0.6) & (df_merged['hydropathy'] > 0.3)]
display(Markdown(df_filtered.to_markdown(index=False)))

| chain_id   |   residue_id | residue_name   | property   | ss_type   | ss_code   |   accessibility |   hydropathy |    phi |   psi |   plddt | is_interface   |
|:-----------|-------------:|:---------------|:-----------|:----------|:----------|----------------:|-------------:|-------:|------:|--------:|:---------------|
| I          |           45 | THR            | polar      | loop      | S         |        0.802817 |          0.4 | -137.6 | 133.4 |    0.67 |                |

In [20]:
df_filtered = df_merged[(df_merged['accessibility'] > 0.5) & (df_merged['hydropathy'] > 0.3)]
display(Markdown(df_filtered.to_markdown(index=False)))

| chain_id   |   residue_id | residue_name   | property   | ss_type   | ss_code   |   accessibility |   hydropathy |    phi |   psi |   plddt | is_interface   |
|:-----------|-------------:|:---------------|:-----------|:----------|:----------|----------------:|-------------:|-------:|------:|--------:|:---------------|
| I          |           44 | ALA            | nonpolar   | loop      | S         |        0.566038 |        1     |  -86.7 | -21.1 |    0.61 |                |
| I          |           45 | THR            | polar      | loop      | S         |        0.802817 |        0.4   | -137.6 | 133.4 |    0.67 |                |
| I          |           55 | LYS            | positive   | sheet     | E         |        0.517073 |        0.367 | -117.8 | 142.6 |    0.55 |                |
| I          |           57 | ILE            | nonpolar   | sheet     | E         |        0.538462 |        1.067 | -115.1 | 133.6 |    0.43 |                |
| I          |           60 | SER            | polar      | loop      | -         |        0.584615 |        0.889 | -147.9 |  40   |    0.52 |                |

## Detecting Inter-Chain Contacts
To determine whether a particular chain (e.g., 'F') makes contact with other chains in a PDB structure, you can check for interatomic distances between residues in chain 'F' and residues in all other chains. If any atoms are closer than a distance threshold (commonly ~5 Å), it's considered a contact.

In [23]:
from Bio.PDB import PDBParser, NeighborSearch
from Bio.PDB.Polypeptide import is_aa

def get_chain_contacts(pdb_file, target_chain_id='F', distance_threshold=3.5):
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("structure", pdb_file)
    model = structure[0]  # Use the first model
    
    # Collect atoms from chain F and other chains
    target_atoms = []
    other_atoms = []
    
    for chain in model:
        for residue in chain:
            if not is_aa(residue):
                continue
            for atom in residue:
                if chain.id == target_chain_id:
                    target_atoms.append(atom)
                else:
                    other_atoms.append(atom)

    # Search for contacts using NeighborSearch
    ns = NeighborSearch(other_atoms)
    contacts = set()

    for atom in target_atoms:
        neighbors = ns.search(atom.coord, distance_threshold)
        for neighbor in neighbors:
            neighbor_chain = neighbor.get_parent().get_parent().id
            if neighbor_chain != target_chain_id:
                contacts.add(neighbor_chain)

    return list(contacts)

pdb_file = "./../target/8d9y.pdb"
contacting_chains = get_chain_contacts(pdb_file, target_chain_id='I')
print(f"Chain 'I' contacts: {contacting_chains}")

Chain 'I' contacts: ['B', 'A']


### Residue-Level Contact Map
identify residue-residue contacts between chain 'F' and other chains in a PDB file. This means listing which residue in chain 'F' is close to which residue in other chains, based on a distance cutoff (usually ~5.0 Å).

In [34]:
from Bio.PDB import PDBParser, NeighborSearch, is_aa

def get_chain_residue_contacts(pdb_file, target_chain_id='F', distance_threshold=5.0):
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("structure", pdb_file)
    model = structure[0]

    target_residues = [res for res in model[target_chain_id] if is_aa(res)]

    # Build list of all atoms (excluding the target chain) and map them to residues
    other_atoms = []
    atom_to_residue = {}
    for chain in model:
        if chain.id == target_chain_id:
            continue
        for residue in chain:
            if not is_aa(residue):
                continue
            for atom in residue:
                other_atoms.append(atom)
                atom_to_residue[atom] = residue

    # Build neighbor search index
    ns = NeighborSearch(other_atoms)

    # Store residue-residue contacts
    contacts = []

    for res_f in target_residues:
        for atom in res_f:
            neighbors = ns.search(atom.coord, distance_threshold)
            for neighbor in neighbors:
                res_other = atom_to_residue[neighbor]
                contact = {
                    f'{target_chain_id}_residue_name': res_f.get_resname(),
                    f'{target_chain_id}_residue_id': res_f.get_id()[1],
                    'contact_chain': res_other.get_parent().id,
                    'contact_residue_name': res_other.get_resname(),
                    'contact_residue_id': res_other.get_id()[1]
                }
                contacts.append(contact)

    # Remove duplicates
    contacts = [dict(t) for t in {tuple(d.items()) for d in contacts}]
    return contacts



contacts = get_chain_residue_contacts("./../target/8d9y.pdb", target_chain_id='I', distance_threshold=3)

import pandas as pd
df_contacts = pd.DataFrame(contacts)
df_contacts = df_contacts.sort_values(by="I_residue_id")
display(df_contacts)

Unnamed: 0,I_residue_name,I_residue_id,contact_chain,contact_residue_name,contact_residue_id
4,PRO,7,B,HIS,99
1,ARG,10,B,ASN,31
3,ASP,28,B,TYR,100
5,ASP,28,A,TYR,49
0,ARG,34,B,ASP,100
2,ARG,34,B,SER,100


In [35]:
f_residue_string = ",".join(df_contacts["I_residue_id"].apply(lambda x: f"I{x}").drop_duplicates())
print(f_residue_string)

I7,I10,I28,I34


In [None]:
# contacts = get_chain_residue_contacts("./../out/bindcraft/snake-venom-binder/2506192105/Accepted/8d9y_l105_s971412_mpnn2_model2.pdb", target_chain_id='B', distance_threshold=3)
# df_contacts_target = pd.DataFrame(contacts)
# display(df_contacts_target)

### Detect Disulfide Bonds in a Chain

In [36]:
from Bio.PDB import PDBParser, is_aa
from Bio.PDB.NeighborSearch import NeighborSearch

def find_disulfide_bonds(pdb_file, chain_id='F', distance_cutoff=2.2):
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("struct", pdb_file)
    model = structure[0]
    chain = model[chain_id]
    
    # Get all CYS SG atoms
    sg_atoms = []
    residues = []
    
    for residue in chain:
        if is_aa(residue) and residue.get_resname() == "CYS":
            if "SG" in residue:
                sg_atoms.append(residue["SG"])
                residues.append(residue)

    # Search for SG-SG pairs within cutoff
    disulfides = []
    for i in range(len(sg_atoms)):
        for j in range(i + 1, len(sg_atoms)):
            dist = sg_atoms[i] - sg_atoms[j]
            if dist <= distance_cutoff:
                disulfides.append((residues[i].get_id()[1], residues[j].get_id()[1]))

    return disulfides


pdb_file = "./../target/8d9y.pdb"
disulfide_bonds = find_disulfide_bonds(pdb_file, chain_id='I')
print("Disulfide bonds (residue ID pairs):", disulfide_bonds)


Disulfide bonds (residue ID pairs): [(3, 21), (14, 42), (46, 58), (59, 64)]
