# TrimPDB
Use this notebook to isolate the extracellular domain (ECD) from any protein PDB file. Update the parameters below, then run the notebook top to bottom.


## 1. Configure inputs


In [None]:
# --- User Parameters ---
# Edit the values in this cell, then run the notebook top to bottom.
from pathlib import Path

# Root of the BindCraft checkout, resolved to an absolute path.
BINDCRAFT_ROOT = Path(r'/mnt/e/Code/BindCraft').resolve()
# Folder that holds the input PDB (and the optional OPM file).
INPUT_FOLDER = Path('/mnt/e/Code/BindCraft/InputTargets')

# File names, relative to INPUT_FOLDER.
PDB_NAME = '7AX1.pdb'
OUTPUT_NAME = 'CCR4_extracellular.pdb'

# Optional: OPM-aligned file (if available) for membrane orientation
OPM_PDB_NAME = None  # e.g., 'your_input_opm.pdb'
MEMBRANE_HALF_THICKNESS = 15.0  # Angstroms

# ECD range: one (chain_id, first_residue, last_residue) tuple per segment.
Setup_ECD_Range = [('A', 26, 315)]
# Work on a copy so the configured defaults above stay untouched.
ECD_RANGES = list(Setup_ECD_Range)
STRUCTURE_ID = PDB_NAME
CHAIN_IDS = [chain_id for chain_id, _first, _last in ECD_RANGES]



In [None]:
from Bio.PDB import PDBParser, PDBIO

# Build the working paths from the user parameters.
PDB_PATH = INPUT_FOLDER / PDB_NAME
OUT_PATH = INPUT_FOLDER / OUTPUT_NAME
# OPM_PATH stays None when no OPM file name was configured.
OPM_PATH = INPUT_FOLDER / OPM_PDB_NAME if OPM_PDB_NAME else None

# Fail fast: every later cell reads the input structure.
if not PDB_PATH.exists():
    raise FileNotFoundError(f'Cannot find {PDB_PATH}. Place the PDB in InputTargets.')

# BINDCRAFT_ROOT is intentionally not reassigned here, so the value chosen in
# the user-parameter cell is the one that stays in effect.
print('Input folder :', INPUT_FOLDER)
print('Input PDB    :', PDB_PATH)
print('Output PDB   :', OUT_PATH)
print('Initial ECD ranges:', ECD_RANGES)

# Report the three OPM cases separately so a mistyped or misplaced OPM file
# is not mistaken for "no OPM file configured".
if OPM_PATH is None:
    print('OPM file     : None (set OPM_PDB_NAME if available)')
elif OPM_PATH.exists():
    print('OPM file     :', OPM_PATH)
else:
    print('OPM file     : NOT FOUND at', OPM_PATH, '(check OPM_PDB_NAME)')



## 2. Collect protein residues from ATOM records


In [None]:
from collections import defaultdict

# Map each chain ID to the set of residue numbers found in its ATOM records.
protein_residues = defaultdict(set)
with open(PDB_PATH) as pdb_file:
    atom_records = (record for record in pdb_file if record.startswith('ATOM'))
    for record in atom_records:
        # Fixed-width fields: chain ID at index 21, residue number at 22-25.
        chain_key = record[21].strip() or ' '  # blank chain fallback
        number = int(record[22:26])
        protein_residues[chain_key].add(number)

# Nothing to trim if the file carried no ATOM records at all.
if not protein_residues:
    raise ValueError('No ATOM records found; cannot identify protein residues.')

print('Protein residue counts:')
for chain_key, numbers in protein_residues.items():
    lowest, highest = min(numbers), max(numbers)
    print(f'  Chain {chain_key}: {len(numbers)} residues (min {lowest} max {highest})')



In [None]:
# Summarise every chain as (lowest residue number, highest residue number,
# number of distinct residue numbers).
segments = {}
for chain_key, numbers in protein_residues.items():
    segments[chain_key] = (min(numbers), max(numbers), len(numbers))

for chain_key, summary in segments.items():
    first, last, total = summary
    print(f'Chain {chain_key}: residues {first}-{last} (count {total})')


## 3. Optional: infer ECD using OPM orientation
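If available, supply an OPM-aligned PDB to refine the extracellular ranges automatically. A residue is treated as extracellular when its Cα atom has a z coordinate greater than `MEMBRANE_HALF_THICKNESS`; this assumes the OPM file places the membrane centre at z = 0 with the extracellular side at positive z, so check the orientation of your file before relying on the inferred ranges.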


In [None]:
def contiguous_ranges(res_nums):
    """Collapse residue numbers into inclusive runs of consecutive integers.

    Args:
        res_nums: Iterable of integer residue numbers, in any order.
            Repeated values are counted once.

    Returns:
        A list of ``(start, end)`` tuples in ascending order, one per run of
        consecutive numbers. Empty input yields an empty list.
    """
    # De-duplicate before scanning: a repeated number would otherwise fail
    # the "prev + 1" test, end the current run early and produce overlapping
    # ranges such as [(1, 1), (1, 2)] for the input [1, 1, 2].
    ordered = sorted(set(res_nums))
    if not ordered:
        return []
    ranges = []
    start = prev = ordered[0]
    for num in ordered[1:]:
        if num != prev + 1:
            # Gap found: close the current run and open a new one.
            ranges.append((start, prev))
            start = num
        prev = num
    # Close the final run, which the loop never terminates on its own.
    ranges.append((start, prev))
    return ranges

# Optionally replace ECD_RANGES with ranges inferred from an OPM-aligned file.
# A residue counts as extracellular when its CA atom lies above the membrane
# slab, i.e. z > MEMBRANE_HALF_THICKNESS.
# NOTE(review): this assumes the OPM file puts the extracellular side at
# positive z - confirm for the target before trusting the inferred ranges.
if OPM_PATH is None:
    print('Skipping OPM inference (no OPM_PDB_NAME provided).')
elif not OPM_PATH.exists():
    # A name was configured but the file is missing; say so explicitly
    # instead of claiming that no name was provided.
    print(f'Skipping OPM inference (OPM file not found: {OPM_PATH}).')
else:
    parser = PDBParser(QUIET=True)
    opm_structure = parser.get_structure('OPM', str(OPM_PATH))
    # Only the upper bound is used below; slab_min is kept for reference.
    slab_min, slab_max = -MEMBRANE_HALF_THICKNESS, MEMBRANE_HALF_THICKNESS
    # A set, so a (chain, residue number) pair seen in several models or
    # under several insertion codes is recorded only once.
    extracellular = set()
    for model in opm_structure:
        for chain in model:
            for residue in chain:
                # Skip hetero/water residues and residues without a CA atom.
                if residue.id[0] != ' ' or 'CA' not in residue:
                    continue
                if residue['CA'].coord[2] > slab_max:
                    extracellular.add((chain.id, residue.id[1]))
    if extracellular:
        updated = []
        # Keep only chains that also exist in the input PDB, in its order.
        for chain in protein_residues:
            res_nums = {res for cid, res in extracellular if cid == chain}
            for start, end in contiguous_ranges(res_nums):
                updated.append((chain, start, end))
        if updated:
            # NOTE(review): CHAIN_IDS was derived from the default ranges and
            # is not refreshed here - confirm whether anything relies on it.
            ECD_RANGES = updated
            print('ECD_RANGES inferred from OPM:', ECD_RANGES)
        else:
            # Residues were found above the slab, but on chain IDs that the
            # input PDB does not contain.
            print('Residues above membrane slab match no chain in the input PDB; using defaults.')
    else:
        print('OPM file read but no residues above membrane slab.')


## 4. Confirm extracellular ranges


In [None]:
# Show the ranges the trimming step will keep: the configured defaults, or
# the OPM-inferred ranges if the previous step replaced them.
print('Final ECD_RANGES:', ECD_RANGES)

## 5. Trim the PDB and save


In [None]:
from Bio.PDB import Select

class RangeSelect(Select):
    """Selector that keeps only standard residues inside given ranges.

    ``keep_ranges`` is an iterable of ``(chain_id, start, end)`` tuples; both
    residue-number bounds are inclusive.
    """

    def __init__(self, keep_ranges):
        self.keep_ranges = keep_ranges

    def accept_residue(self, residue):
        """Return 1 to keep ``residue`` in the output, 0 to drop it."""
        # A non-blank hetero flag marks hetero atoms and waters; drop them.
        if residue.id[0] != ' ':
            return 0
        owner_chain = residue.parent.id
        number = residue.id[1]
        inside = any(
            owner_chain == cid and low <= number <= high
            for cid, low, high in self.keep_ranges
        )
        return 1 if inside else 0

# Parse the input structure, then write back only the residues accepted by
# RangeSelect for the current ECD_RANGES.
# `parser` is reused by the sanity-check cell below.
parser = PDBParser(QUIET=True)
structure = parser.get_structure(STRUCTURE_ID, str(PDB_PATH))
selector = RangeSelect(ECD_RANGES)
io = PDBIO()
io.set_structure(structure)
# The selector filters residues as the file is written to OUT_PATH.
io.save(str(OUT_PATH), selector)
print('Extracellular domain written to', OUT_PATH)


## 6. Sanity check


In [None]:
# Re-read the trimmed file and count its standard (blank hetero flag) residues.
structure_trimmed = parser.get_structure('ECD', str(OUT_PATH))
residue_count = sum(
    entry.id[0] == ' ' for entry in structure_trimmed.get_residues()
)
print('Retained residues:', residue_count)
