In [1]:
import pathlib

from Bio import PDB
from loguru import logger
from openmm.app import PDBxFile
from pdbfixer import PDBFixer
from tqdm.auto import tqdm

from loopbuilder.segment import Segment, SegmentModel

In [2]:
# Repository root, taken as the parent of the current working directory
# (the notebook is expected to be started from a direct subfolder of the repo).
PROJECT_ROOT = pathlib.Path.cwd().resolve().parent
# PDB entry IDs that the download cell below fetches.
TEST_PDB_ID_LIST = [
    "3idp",
    "6X18",
    "8RX0",
]

In [3]:
# Download the test structures into <PROJECT_ROOT>/data via Biopython's PDBList.
# The captured output of this cell shows the files stored as lowercase `<id>.cif`
# and entries that are already on disk being reported as "Structure exists".
pdb_list = PDB.PDBList(server="https://files.wwpdb.org")
for id_ in TEST_PDB_ID_LIST:
    pdb_list.retrieve_pdb_file(pdb_code=id_, pdir=PROJECT_ROOT / "data")

Structure exists: 'C:\Users\janoj\GitHub\LoopBuilder\data\3idp.cif' 
Structure exists: 'C:\Users\janoj\GitHub\LoopBuilder\data\6x18.cif' 
Structure exists: 'C:\Users\janoj\GitHub\LoopBuilder\data\8rx0.cif' 




## PDBFixer

Run the loop-building process several times to generate an ensemble of structures. Each run uses a different random seed; the seed cannot be controlled, so individual runs are not reproducible.

In [1]:
import pathlib

from loopbuilder.build import PDBFixerBuilder
from loopbuilder.score import MolProbityScorer

# Repository root: parent of the directory the notebook kernel was started in.
PROJECT_ROOT = pathlib.Path.cwd().resolve().parent

In [None]:
# Configure a PDBFixer-based builder for 3idp with one MolProbity scorer and one filter,
# then ask it for three models.
builder = PDBFixerBuilder(
    structure_file=PROJECT_ROOT / "data/3idp.cif",
    output_directory=PROJECT_ROOT / "sandbox/PDBFixer",
    scorers=[MolProbityScorer(docker_image="francecosta/molprobity:v0.0.1")],
    # The filter passes when the "ramachandran_outliers" score is <= 0.15. The fallback
    # value 0.3 is above that threshold, so an object lacking this score does not pass.
    filters=[lambda x: x.scores.get("ramachandran_outliers", 0.3) <= 0.15],
    # NOTE: That's a pretty low bar but let's just use this to generate a few results
    #    A stricter filter would be: filters=[lambda x: x.scores.get("molprobity_score", 1) <= 1],
    working_directory=PROJECT_ROOT / "sandbox/PDBFixer/tmp",
)
# The captured log of this call reports "Building n=3 models ... (max_tries=30)".
builder.build(n=3)

[32m2025-05-22 18:54:59.828[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m143[0m - [1mBuilding n=3 models for C:\Users\janoj\GitHub\LoopBuilder\data\3idp.cif (max_tries=30)[0m
[32m2025-05-22 18:54:59.835[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m145[0m - [1mSaving output to C:\Users\janoj\GitHub\LoopBuilder\sandbox\PDBFixer[0m
[32m2025-05-22 18:54:59.837[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m146[0m - [1mUsing working directory C:\Users\janoj\GitHub\LoopBuilder\sandbox\PDBFixer\tmp[0m
[32m2025-05-22 18:54:59.839[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m151[0m - [1mUsing 1 scorer(s):
  MolProbityScorer(cleanup=True, executable=molprobity.molprobity, docker_image=francecosta/molprobity:v0.0.1, )[0m
[32m2025-05-22 18:54:59.841[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m156[0m - [1mUsing 1 filter(s):
  <function <lambda> at 

[32m2025-05-22 18:55:00.696[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m166[0m - [1mFound 3 segments[0m


Building segments:   0%|          | 0/3 [00:00<?, ?segment/s]

[32m2025-05-22 18:55:00.757[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m174[0m - [1mBuilding models for segment loop_1[0m
[32m2025-05-22 18:56:05.680[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m204[0m - [1mScored trial model 1 for segment loop_1: {'ramachandran_outliers': 0.3571, 'rotamer_outliers': 0.3571, 'cbeta_deviations': 14.0, 'clashscore': 0.0, 'rms_bonds': 0.0293, 'rms_angles': 16.58, 'molprobity_score': 2.69}[0m
[32m2025-05-22 18:56:05.753[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m208[0m - [1mTrial model 1 for segment loop_1 failed filter <function <lambda> at 0x0000029833025090>[0m


## Scratch

### Time PDBFixer

In [13]:
import time

from pdbfixer import PDBFixer

# Rough timing of the individual PDBFixer stages on 3idp.
# time.perf_counter() is used for the checkpoints: it is a monotonic, high-resolution
# clock meant for measuring short durations, so the intervals cannot be distorted by
# system clock adjustments (which can affect time.time()).
go = time.perf_counter()
fixer = PDBFixer(str(PROJECT_ROOT / "data/3idp.cif"))
ckp1 = time.perf_counter()
print(f"Reading took  {ckp1 - go:.4f} seconds")

fixer.findMissingResidues()
ckp2 = time.perf_counter()
print(f"Finding missing residues took  {ckp2 - ckp1:.4f} seconds")

# "Fixing" covers both locating and adding the missing atoms.
fixer.findMissingAtoms()
fixer.addMissingAtoms()
ckp3 = time.perf_counter()
print(f"Fixing took  {ckp3 - ckp2:.4f} seconds")

Reading took  1.2461 seconds
Finding missing residues took  0.0040 seconds
Fixing took  31.7846 seconds


### Extract loops from full CIF

In [1]:
import pathlib

from loopbuilder.convert import extract_segment_from_mmcif, join_segments

# Repository root: parent of the directory the notebook kernel was started in.
PROJECT_ROOT = pathlib.Path.cwd().resolve().parent

In [None]:
# Extract a segment of chain A from one generated model into a separate "looponly" mmCIF
# (presumably: input path first, output path second — confirm against loopbuilder.convert).
# NOTE(review): `residue_indices` is a set holding exactly the two values 449 and 453, not
# the range 449-453. If the whole stretch was intended, use set(range(449, 454)) — confirm.
extract_segment_from_mmcif(
    PROJECT_ROOT / "sandbox/PDBFixer/3idp_loop_0_1.cif",
    PROJECT_ROOT / "sandbox/PDBFixer/3idp_loop_0_1_looponly.cif",
    residue_indices={449, 453},
    chain_id="A",
)

In [3]:
# Join segment files into one mmCIF (presumably: list of input files, then the output
# path — confirm against loopbuilder.convert.join_segments).
# NOTE(review): the same "looponly" file is listed twice, so this looks like a quick
# smoke test of the function rather than a meaningful join.
join_segments(
    [
        PROJECT_ROOT / "sandbox/PDBFixer/3idp_loop_0_1_looponly.cif",
        PROJECT_ROOT / "sandbox/PDBFixer/3idp_loop_0_1_looponly.cif",
    ],
    PROJECT_ROOT / "sandbox/PDBFixer/3idp_loop_0_1_looponly_joined.cif",
)

### Find missing segments with Biopython (raw)

In [None]:
def find_inner_missing_segments(structure):
    """Find gaps in the residue numbering inside each chain of a structure.

    Only residues whose hetero flag (``res.id[0]``) is ``" "`` are considered. A gap is
    a run of consecutive residue numbers that lies strictly between the lowest and the
    highest observed residue number of a chain and for which no residue is present.
    Residues missing before the first or after the last observed residue are not
    reported.

    The observed numbers are handled as a set and the bounds are their minimum and
    maximum, so the result does not depend on the order in which a chain stores its
    residues.

    Args:
        structure: Iterable of models, each an iterable of chains (e.g. a Biopython
            ``Structure``). A chain must expose ``id`` and ``full_id`` and yield
            residues whose ``id`` holds the hetero flag at index 0 and the integer
            residue number at index 1.

    Returns:
        list[tuple]: One ``(chain_id, first_missing, last_missing)`` tuple per gap, with
        inclusive bounds, in model/chain iteration order and ascending residue number
        within a chain. Every model is scanned, so a multi-model structure yields the
        gaps of each model in turn.
    """
    missing_segments = []

    for model in structure:
        for chain in model:
            # Chains are mixed up! Why is this?
            # (They are printed in the structure's own iteration order, which need not
            # be alphabetical.)
            print(chain.full_id)

            present_res_nums = {res.id[1] for res in chain if res.id[0] == " "}

            # Fewer than two distinct residue numbers cannot enclose a gap.
            if len(present_res_nums) <= 1:
                continue

            # Use min/max instead of the first/last stored residue so that chains whose
            # residues are not stored in ascending order are still scanned completely.
            first_res_num = min(present_res_nums)
            last_res_num = max(present_res_nums)

            missing_start = None

            # Walk up to and including the last observed residue: being present, it
            # always closes a gap that is still open, so no post-loop handling is needed.
            for i in range(first_res_num + 1, last_res_num + 1):
                if i not in present_res_nums:
                    if missing_start is None:
                        missing_start = i
                elif missing_start is not None:
                    missing_segments.append((chain.id, missing_start, i - 1))
                    missing_start = None

    return missing_segments


# Parse 3idp with Biopython and report every inner gap found in its chains.
structure_path = PROJECT_ROOT / "data/3idp.cif"
parser = PDB.MMCIFParser()
structure = parser.get_structure("3idp", structure_path)
missing_segments = find_inner_missing_segments(structure)

print(f"Inner missing segments in structure from {structure_path}:")
if not missing_segments:
    print("No inner missing segments found.")
else:
    # Segment bounds are inclusive, hence the +1 when counting residues.
    for chain_id, start, end in missing_segments:
        print(f"Chain {chain_id}: Missing residues from {start} to {end} (total of {end - start + 1} residues)")

('3idp', 0, 'B')
('3idp', 0, 'A')
Inner missing segments in structure from C:\Users\janoj\GitHub\LoopBuilder\data\3idp.cif:
Chain B: Missing residues from 598 to 613 (total of 16 residues)
Chain A: Missing residues from 600 to 614 (total of 15 residues)
Chain A: Missing residues from 629 to 630 (total of 2 residues)


