# TrimPDB
Use this notebook to isolate the extracellular domain (ECD) from any protein PDB file. Update the parameters below, then run the notebook top to bottom.


## 1. Configure inputs


In [None]:

from pathlib import Path

# --- Locations -------------------------------------------------------------
# Inputs are read from <root>/InputTargets; trimmed files go to a 'trimmed'
# sub-folder of that same directory, created on demand.
ROOT_DIR = Path(r'/mnt/e/Code/BindCraft').resolve()
INPUT_DIR = ROOT_DIR / 'InputTargets'
OUTPUT_DIR = INPUT_DIR / 'trimmed'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Target ----------------------------------------------------------------
# Stop immediately if the target file is not where it is expected.
TARGET_NAME = 'AF-P51679-F1-model_v6.pdb'
TARGET_PDB = INPUT_DIR / TARGET_NAME
if not TARGET_PDB.exists():
    raise FileNotFoundError(f"Target PDB {TARGET_PDB} not found")

print(f'Trimming target: {TARGET_PDB}')
print(f'Outputs will write to: {OUTPUT_DIR}')

# --- Segments --------------------------------------------------------------
# One tuple per output file: (chain id, first residue, last residue, label).
# Both residue bounds are inclusive.
SEGMENTS = [
    ('A', 21, 55, 'ECD_Nterm'),
    ('A', 42, 49, 'TMcap1'),
    ('A', 90, 97, 'TMcap2'),
    ('A', 100, 115, 'ECL1'),
    ('A', 120, 127, 'TMcap3'),
    ('A', 165, 215, 'ECL2_primary'),
    ('A', 170, 177, 'TMcap4'),
    ('A', 210, 217, 'TMcap5'),
    ('A', 260, 267, 'TMcap6'),
    ('A', 265, 285, 'ECL3'),
    ('A', 305, 312, 'TMcap7'),
]

print('Segments to extract:')
for segment in SEGMENTS:
    chain, start, end, label = segment
    print(f'  {label}: chain {chain} residues {start}-{end}')


In [None]:

from Bio.PDB import PDBParser, PDBIO

# Writer shared by every segment export below.
io = PDBIO()

# Parse the target once and keep a handle on model 0 of the structure.
parser = PDBParser(QUIET=True)
structure = parser.get_structure('target', str(TARGET_PDB))
model = structure[0]

class SegmentSelect:
    """Selector for ``PDBIO.save`` that keeps only a given set of residues.

    ``PDBIO.save`` calls ``accept_model``, ``accept_chain``,
    ``accept_residue`` and ``accept_atom`` on the selector it is given, so
    all four hooks must exist. Only the residue hook filters; the other
    three accept everything.

    Args:
        residues: Iterable of hashable residue objects to keep.
    """

    def __init__(self, residues):
        # A set gives constant-time membership tests while writing.
        self._residues = set(residues)

    def accept_model(self, model):
        """Accept every model."""
        return True

    def accept_chain(self, chain):
        """Accept every chain; filtering happens at residue level."""
        return True

    def accept_residue(self, residue):
        """Return True only for residues passed to the constructor."""
        return residue in self._residues

    def accept_atom(self, atom):
        """Accept every atom of an accepted residue."""
        return True

def extract_segment(chain_id, start, end, label):
    """Write residues ``start``-``end`` of one chain to its own PDB file.

    Reads the notebook-level ``model``, ``structure``, ``io``,
    ``TARGET_NAME`` and ``OUTPUT_DIR``. The output file is named
    ``<target stem>_<label>.pdb`` inside ``OUTPUT_DIR``. When no residue
    number falls in the range, a warning is printed and nothing is written.

    Args:
        chain_id: Chain identifier looked up in ``model``.
        start: First residue number to keep (inclusive).
        end: Last residue number to keep (inclusive).
        label: Suffix used in the output file name and in log messages.
    """
    chain = model[chain_id]
    residues = [res for res in chain if start <= res.id[1] <= end]
    if not residues:
        print(f'Warning: no residues found for {label} ({chain_id}{start}-{end})')
        return
    selector = SegmentSelect(residues)
    # Strip only the file extension. str.replace('.pdb', '') would also
    # delete a '.pdb' occurring in the middle of the name and would miss
    # other extensions entirely.
    out_name = f"{Path(TARGET_NAME).stem}_{label}.pdb"
    out_path = OUTPUT_DIR / out_name
    io.set_structure(structure)
    io.save(str(out_path), selector)
    print(f'Saved {label} to {out_path}')

# Write one PDB file per configured segment.
for segment in SEGMENTS:
    extract_segment(*segment)


In [None]:

from pathlib import Path

# Directory layout: targets are read from InputTargets/ and trimmed files are
# written to InputTargets/trimmed/, which is created if it does not exist.
ROOT_DIR = Path(r'/mnt/e/Code/BindCraft').resolve()
INPUT_DIR = ROOT_DIR.joinpath('InputTargets')
OUTPUT_DIR = INPUT_DIR.joinpath('trimmed')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Target structure; abort early when the file is absent.
TARGET_NAME = 'AF-P51679-F1-model_v6.pdb'
TARGET_PDB = INPUT_DIR.joinpath(TARGET_NAME)
if not TARGET_PDB.exists():
    raise FileNotFoundError(f"Target PDB {TARGET_PDB} not found")

# (chain id, first residue, last residue, label) with inclusive bounds.
SEGMENTS = [
    ('A', 21, 55, 'ECD_Nterm'),
    ('A', 42, 49, 'TMcap1'),
    ('A', 90, 97, 'TMcap2'),
    ('A', 100, 115, 'ECL1'),
    ('A', 120, 127, 'TMcap3'),
    ('A', 165, 215, 'ECL2_primary'),
    ('A', 170, 177, 'TMcap4'),
    ('A', 210, 217, 'TMcap5'),
    ('A', 260, 267, 'TMcap6'),
    ('A', 265, 285, 'ECL3'),
    ('A', 305, 312, 'TMcap7'),
]


In [None]:

from Bio.PDB import PDBParser, PDBIO

# One writer and one parsed structure are reused by all exports below.
io = PDBIO()
parser = PDBParser(QUIET=True)
structure = parser.get_structure('target', str(TARGET_PDB))
# Residues are selected from model 0 of the parsed structure.
model = structure[0]

class SegmentSelect:
    """Selector for ``PDBIO.save`` limited to one chain and a residue set.

    Models and atoms are always accepted. A chain is accepted when its
    ``id`` equals ``chain_id``, and a residue when it is one of ``residues``.
    """

    def __init__(self, residues, chain_id):
        self._chain_id = chain_id
        # Set membership keeps the per-residue check cheap.
        self._residues = set(residues)

    def accept_model(self, model):
        """Every model passes."""
        return True

    def accept_chain(self, chain):
        """Pass only the chain this selector was built for."""
        wanted = self._chain_id
        return chain.id == wanted

    def accept_residue(self, residue):
        """Pass only residues supplied at construction time."""
        kept = self._residues
        return residue in kept

    def accept_atom(self, atom):
        """Every atom of an accepted residue passes."""
        return True

def extract_segment(chain_id, start, end, label):
    """Write residues ``start``-``end`` of one chain to its own PDB file.

    Reads the notebook-level ``model``, ``structure``, ``io``,
    ``TARGET_NAME`` and ``OUTPUT_DIR``. The output file is named
    ``<target stem>_<label>.pdb`` inside ``OUTPUT_DIR``.

    Args:
        chain_id: Chain identifier looked up in ``model``.
        start: First residue number to keep (inclusive).
        end: Last residue number to keep (inclusive).
        label: Suffix used in the output file name and in log messages.

    Returns:
        The list of residues written, or ``None`` when no residue number
        falls in the range (a warning is printed and no file is written).
    """
    chain = model[chain_id]
    residues = [res for res in chain if start <= res.id[1] <= end]
    if not residues:
        print(f'Warning: no residues found for {label} ({chain_id}{start}-{end})')
        return None
    selector = SegmentSelect(residues, chain_id)
    # Strip only the file extension. str.replace('.pdb', '') would also
    # delete a '.pdb' occurring in the middle of the name and would miss
    # other extensions entirely.
    out_name = f"{Path(TARGET_NAME).stem}_{label}.pdb"
    out_path = OUTPUT_DIR / out_name
    io.set_structure(structure)
    io.save(str(out_path), selector)
    print(f'Saved {label}: {out_path}')
    return residues

# Export every segment and pool the residues of those that produced output.
all_residues = []
for entry in SEGMENTS:
    res = extract_segment(*entry)
    if not res:
        # Nothing matched this segment (None or empty); skip it.
        continue
    all_residues += res


In [None]:

# Collapse residues collected from overlapping segments: key each one by
# (chain id, residue id); on a repeated key the later occurrence wins.
unique = {
    (res.get_parent().id, res.id): res
    for res in all_residues
}
selected = [*unique.values()]

class MultiSegmentSelect:
    """Selector for ``PDBIO.save`` covering residues from several segments.

    A chain is accepted when at least one supplied residue has it as its
    parent; a residue is accepted when it was supplied. Models and atoms
    are always accepted.
    """

    def __init__(self, residues):
        self._residues = set(residues)
        # Gather the ids of the chains that own the supplied residues.
        owner_ids = set()
        for res in residues:
            owner_ids.add(res.get_parent().id)
        self._chains = owner_ids

    def accept_model(self, model):
        """Every model passes."""
        return True

    def accept_chain(self, chain):
        """Pass chains that own at least one kept residue."""
        known = self._chains
        return chain.id in known

    def accept_residue(self, residue):
        """Pass only residues supplied at construction time."""
        kept = self._residues
        return residue in kept

    def accept_atom(self, atom):
        """Every atom of an accepted residue passes."""
        return True

# Write every selected residue into a single combined PDB.
# The file name is derived from TARGET_NAME so that switching targets never
# produces a bundle labelled with a different protein's accession.
bundle_path = OUTPUT_DIR / f"{Path(TARGET_NAME).stem}_trimmed_bundle.pdb"
io.set_structure(structure)
io.save(str(bundle_path), MultiSegmentSelect(selected))
print(f'Saved combined bundle: {bundle_path} (residues={len(selected)})')


In [None]:
def contiguous_ranges(res_nums):
    """Collapse residue numbers into inclusive ``(start, end)`` runs.

    Args:
        res_nums: Iterable of integer residue numbers in any order.
            Duplicates are ignored. ``None`` or an empty iterable is
            treated as "no residues".

    Returns:
        A list of ``(start, end)`` tuples in ascending order, each covering
        one run of consecutive integers. Returns ``[]`` for empty input.
    """
    # De-duplicate before sorting: a repeated number fails the ``prev + 1``
    # test below and would split one run into overlapping ranges.
    ordered = sorted(set(res_nums or ()))
    if not ordered:
        return []
    ranges = []
    start = prev = ordered[0]
    for num in ordered[1:]:
        if num != prev + 1:
            # Gap found: close the current run and open a new one.
            ranges.append((start, prev))
            start = num
        prev = num
    ranges.append((start, prev))
    return ranges

# Optionally derive ECD_RANGES from an OPM-oriented copy of the structure:
# residues whose CA z-coordinate lies above the membrane slab are collected
# and grouped into contiguous ranges per chain.
# NOTE(review): treating z > slab_max as "extracellular" assumes the OPM frame
# places the extracellular side at +z — confirm for this target.
if OPM_PATH and OPM_PATH.exists():
    parser = PDBParser(QUIET=True)
    opm_structure = parser.get_structure('OPM', str(OPM_PATH))
    slab_min, slab_max = -MEMBRANE_HALF_THICKNESS, MEMBRANE_HALF_THICKNESS
    extracellular = []
    # Loop names carry an ``opm_`` prefix so they do not overwrite the
    # notebook-level ``model`` that extract_segment() reads.
    for opm_model in opm_structure:
        for opm_chain in opm_model:
            for residue in opm_chain:
                # Skip residues whose id[0] flag is not a blank.
                if residue.id[0] != ' ':
                    continue
                # The z test needs a CA atom; residues without one are skipped.
                if 'CA' not in residue:
                    continue
                z = residue['CA'].coord[2]
                if z > slab_max:
                    extracellular.append((opm_chain.id, residue.id[1]))
    if extracellular:
        updated = []
        for target_chain in protein_residues.keys():
            # A set drops numbers repeated across models or insertion codes,
            # which would otherwise yield overlapping ranges.
            res_nums = {num for cid, num in extracellular if cid == target_chain}
            for start, end in contiguous_ranges(res_nums):
                updated.append((target_chain, start, end))
        if updated:
            ECD_RANGES = updated
            print('ECD_RANGES inferred from OPM:', ECD_RANGES)
        else:
            print('No extracellular residues detected above membrane slab; using defaults.')
    else:
        print('OPM file read but no residues above membrane slab.')
else:
    # NOTE: this branch is also reached when OPM_PATH is set but the file
    # does not exist on disk.
    print('Skipping OPM inference (no OPM_PDB_NAME provided).')


## 4. Confirm extracellular ranges


In [None]:
# Show the ranges the trimming step below will use. These are either the
# OPM-inferred values or whatever ECD_RANGES held before the inference cell.
print('Final ECD_RANGES:', ECD_RANGES)

## 5. Trim the PDB and save


In [None]:
from Bio.PDB import Select

class RangeSelect(Select):
    """Residue filter for ``PDBIO.save`` driven by per-chain number ranges.

    ``keep_ranges`` is an iterable of ``(chain_id, start, end)`` tuples with
    inclusive bounds. A residue is kept when its ``id[0]`` flag is a blank
    and its number falls inside a range declared for its parent chain.
    """

    def __init__(self, keep_ranges):
        self.keep_ranges = keep_ranges

    def accept_residue(self, residue):
        """Return 1 to keep ``residue`` and 0 to drop it."""
        # Residues with a non-blank id[0] flag are never kept.
        if residue.id[0] != ' ':
            return 0
        chain_id = residue.parent.id
        resseq = residue.id[1]
        in_any_range = any(
            chain_id == cid and start <= resseq <= end
            for cid, start, end in self.keep_ranges
        )
        return 1 if in_any_range else 0

# Re-parse the source structure and write only residues inside ECD_RANGES.
parser = PDBParser(QUIET=True)
structure = parser.get_structure(STRUCTURE_ID, str(PDB_PATH))
selector = RangeSelect(ECD_RANGES)

io = PDBIO()
io.set_structure(structure)
io.save(str(OUT_PATH), select=selector)
print('Extracellular domain written to', OUT_PATH)


## 6. Sanity check


In [None]:
# Read the trimmed file back and count residues whose id[0] flag is a blank.
structure_trimmed = parser.get_structure('ECD', str(OUT_PATH))
residue_count = sum(
    residue.id[0] == ' '
    for residue in structure_trimmed.get_residues()
)
print('Retained residues:', residue_count)
