In [31]:
from Bio import PDB
import os
import numpy as np
def get_pocket_residues(pocket_pdb):
    """
    Reads a pocket PDB file and extracts pocket residue positions.
    Returns a dictionary {Chain_ID: Set(Residue Positions)}
    """
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure("pocket", pocket_pdb)

    pocket_residues = {}

    for model in structure:
        for chain in model:
            chain_id = chain.id
            if chain_id not in pocket_residues:
                pocket_residues[chain_id] = set()

            for residue in chain:
                if residue.id[0] == " ":  # Exclude heteroatoms
                    pocket_residues[chain_id].add(residue.id[1])  # Store residue position

    return pocket_residues

def label_pocket_residues(protein_pdb, pocket_residues):
    """
    Reads a full protein PDB file and labels residues as pocket (1) or non-pocket (0).
    Returns a list of labeled residue features.
    """
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure("protein", protein_pdb)

    labeled_residues = []

    for model in structure:
        for chain in model:
            chain_id = chain.id
            for residue in chain:
                if residue.id[0] != " ":  # Ignore heteroatoms
                    continue

                res_name = residue.get_resname()
                res_id = residue.id[1]

                b_factor = np.mean([atom.bfactor for atom in residue])
                # Check if residue is in pocket list
                pocket_label = 1 if (chain_id in pocket_residues and res_id in pocket_residues[chain_id]) else 0
                filename = os.path.basename(protein_pdb)
                filename = filename.replace("_protein.pdb", "")
                labeled_residues.append([filename, chain_id, res_name, res_id, b_factor, pocket_label])

    return labeled_residues



In [46]:
from Bio.PDB.NeighborSearch import NeighborSearch
from Bio.PDB import PDBParser

from Bio.PDB.NeighborSearch import NeighborSearch
from Bio.PDB import PDBParser

def compute_contact_number(pdb_file, cutoff=5.0):
    """
    Computes the contact number for each residue based on nearby atoms within a distance cutoff.
    """

    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("protein", pdb_file)

    atoms = [atom for atom in structure.get_atoms()]
    ns = NeighborSearch(atoms)

    contact_numbers = {}
    for residue in structure.get_residues():
        res_id = f"{residue.id[1]}{residue.id[2].strip()}"  # Preserves insertion codes


        # Ensure the residue has a CA atom before accessing it
        ca_atom = residue["CA"] if "CA" in residue else None
        if ca_atom:
            contacts = ns.search(ca_atom.coord, cutoff)
            contact_numbers[res_id] = len(contacts) - 1

        else:
            contact_numbers[res_id] = 0  # Assign 0 if CA is missing

    return contact_numbers



In [60]:
pdb_path = "refined-set/186l/186l_protein.pdb"

contact_numbers = compute_contact_number(pdb_path, cutoff=10.0)

print("Residue Contact Numbers:", contact_numbers)


Residue Contact Numbers: {'1': 0, '2': 0, '3': 0, '4': 0, '5': 0, '6': 0, '7': 0, '8': 0, '9': 0, '10': 0, '11': 0, '12': 0, '13': 0, '14': 0, '15': 0, '16': 0, '17': 0, '18': 0, '19': 0, '20': 0, '21': 0, '22': 0, '23': 0, '24': 0, '25': 0, '26': 0, '27': 0, '28': 0, '29': 0, '30': 0, '31': 0, '32': 0, '33': 0, '34': 0, '35': 0, '36': 0, '37': 0, '38': 0, '39': 0, '40': 0, '41': 0, '42': 0, '43': 0, '44': 0, '45': 0, '46': 0, '47': 0, '48': 0, '49': 0, '50': 0, '51': 0, '52': 0, '53': 0, '54': 0, '55': 0, '56': 0, '57': 0, '58': 0, '59': 0, '60': 0, '61': 0, '62': 0, '63': 0, '64': 0, '65': 0, '66': 0, '67': 0, '68': 0, '69': 0, '70': 0, '71': 0, '72': 0, '73': 0, '74': 0, '75': 0, '76': 0, '77': 0, '78': 0, '79': 0, '80': 0, '81': 0, '82': 0, '83': 0, '84': 0, '85': 0, '86': 0, '87': 0, '88': 0, '89': 0, '90': 0, '91': 0, '92': 0, '93': 0, '94': 0, '95': 0, '96': 0, '97': 0, '98': 0, '99': 0, '100': 0, '101': 0, '102': 0, '103': 0, '104': 0, '105': 0, '106': 0, '107': 0, '108': 0, '1

In [32]:
pocket = get_pocket_residues('refined-set/186l/186l_pocket.pdb')
label_pocket_residues('refined-set/186l/186l_protein.pdb', pocket)

[['186l', 'A', 'MET', 1, 11.348947368421051, 0],
 ['186l', 'A', 'ASN', 2, 8.285714285714286, 0],
 ['186l', 'A', 'ILE', 3, 5.225789473684211, 0],
 ['186l', 'A', 'PHE', 4, 8.682500000000001, 0],
 ['186l', 'A', 'GLU', 5, 18.824, 0],
 ['186l', 'A', 'MET', 6, 5.645882352941176, 0],
 ['186l', 'A', 'LEU', 7, 5.729473684210526, 0],
 ['186l', 'A', 'ARG', 8, 14.051250000000001, 0],
 ['186l', 'A', 'ILE', 9, 6.598947368421053, 0],
 ['186l', 'A', 'ASP', 10, 9.479166666666666, 0],
 ['186l', 'A', 'GLU', 11, 10.4, 0],
 ['186l', 'A', 'GLY', 12, 10.024285714285712, 0],
 ['186l', 'A', 'LEU', 13, 9.607894736842105, 0],
 ['186l', 'A', 'ARG', 14, 16.830416666666668, 0],
 ['186l', 'A', 'LEU', 15, 9.407368421052633, 0],
 ['186l', 'A', 'LYS', 16, 22.625, 0],
 ['186l', 'A', 'ILE', 17, 8.146842105263158, 0],
 ['186l', 'A', 'TYR', 18, 12.66, 0],
 ['186l', 'A', 'LYS', 19, 12.678636363636365, 0],
 ['186l', 'A', 'ASP', 20, 12.022499999999999, 0],
 ['186l', 'A', 'THR', 21, 13.561428571428573, 0],
 ['186l', 'A', 'GLU'

In [62]:
# Hydrophobicity (1 = Hydrophobic, 0 = Hydrophilic)
hydrophobicity_dict = {
    "ALA": 1, "VAL": 1, "LEU": 1, "ILE": 1, "MET": 1, "PHE": 1, "TRP": 1, "PRO": 1,  # Hydrophobic
    "GLY": 0, "SER": 0, "THR": 0, "CYS": 0, "TYR": 0, "ASN": 0, "GLN": 0, "HIS": 0, "ASP": 0, "GLU": 0, "LYS": 0, "ARG": 0  # Hydrophilic
}

# Charge (-1 = Negative, 0 = Neutral, +1 = Positive)
charge_dict = {
    "ASP": -1, "GLU": -1,  # Negatively charged
    "HIS": 1, "LYS": 1, "ARG": 1,  # Positively charged
    "ALA": 0, "VAL": 0, "LEU": 0, "ILE": 0, "MET": 0, "PHE": 0, "TRP": 0, "PRO": 0,
    "GLY": 0, "SER": 0, "THR": 0, "CYS": 0, "TYR": 0, "ASN": 0, "GLN": 0  # Neutral
}

# Molecular weight (in Daltons)
molecular_weight_dict = {
    "ALA": 89.09, "ARG": 174.20, "ASN": 132.12, "ASP": 133.10, "CYS": 121.15, 
    "GLN": 146.15, "GLU": 147.13, "GLY": 75.07, "HIS": 155.15, "ILE": 131.17,
    "LEU": 131.17, "LYS": 146.19, "MET": 149.21, "PHE": 165.19, "PRO": 115.13,
    "SER": 105.09, "THR": 119.12, "TRP": 204.23, "TYR": 181.19, "VAL": 117.15
}

# Polarity (1 = Polar, 0 = Non-polar)
polarity_dict = {
    "ALA": 0, "VAL": 0, "LEU": 0, "ILE": 0, "MET": 0, "PHE": 0, "TRP": 0, "PRO": 0,  # Non-polar
    "GLY": 1, "SER": 1, "THR": 1, "CYS": 1, "TYR": 1, "ASN": 1, "GLN": 1, "HIS": 1, "ASP": 1, "GLU": 1, "LYS": 1, "ARG": 1  # Polar
}


In [63]:
from Bio import PDB

def extract_residue_properties(pdb_file):
    """
    Extracts hydrophobicity, charge, molecular weight, and polarity for each residue in a PDB file.
    Returns a dictionary {Residue Position: (Hydrophobicity, Charge, Molecular Weight, Polarity)}.
    """

    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure("protein", pdb_file)

    residue_properties = {}

    for model in structure:
        for chain in model:
            for residue in chain:
                if residue.id[0] != " ":  # Ignore heteroatoms
                    continue

                res_name = residue.get_resname()
                res_id = residue.id[1]  # Residue position

                # Get physicochemical properties from dictionaries
                hydrophobicity = hydrophobicity_dict.get(res_name, None)
                charge = charge_dict.get(res_name, None)
                molecular_weight = molecular_weight_dict.get(res_name, None)
                polarity = polarity_dict.get(res_name, None)

                if None in [hydrophobicity, charge, molecular_weight, polarity]:
                    print(f"Warning: Unknown residue {res_name} at position {res_id}")

                residue_properties[res_id] = (hydrophobicity, charge, molecular_weight, polarity)

    return residue_properties


In [64]:
pdb_path = "refined-set/186l/186l_protein.pdb"

residue_data = extract_residue_properties(pdb_path)

# Print results
for res_id, properties in residue_data.items():
    print(f"Residue {res_id}: Hydrophobicity={properties[0]}, Charge={properties[1]}, Molecular Weight={properties[2]}, Polarity={properties[3]}")


Residue 1: Hydrophobicity=1, Charge=0, Molecular Weight=149.21, Polarity=0
Residue 2: Hydrophobicity=0, Charge=0, Molecular Weight=132.12, Polarity=1
Residue 3: Hydrophobicity=1, Charge=0, Molecular Weight=131.17, Polarity=0
Residue 4: Hydrophobicity=1, Charge=0, Molecular Weight=165.19, Polarity=0
Residue 5: Hydrophobicity=0, Charge=-1, Molecular Weight=147.13, Polarity=1
Residue 6: Hydrophobicity=1, Charge=0, Molecular Weight=149.21, Polarity=0
Residue 7: Hydrophobicity=1, Charge=0, Molecular Weight=131.17, Polarity=0
Residue 8: Hydrophobicity=0, Charge=1, Molecular Weight=174.2, Polarity=1
Residue 9: Hydrophobicity=1, Charge=0, Molecular Weight=131.17, Polarity=0
Residue 10: Hydrophobicity=0, Charge=-1, Molecular Weight=133.1, Polarity=1
Residue 11: Hydrophobicity=0, Charge=-1, Molecular Weight=147.13, Polarity=1
Residue 12: Hydrophobicity=0, Charge=0, Molecular Weight=75.07, Polarity=1
Residue 13: Hydrophobicity=1, Charge=0, Molecular Weight=131.17, Polarity=0
Residue 14: Hydrophob