## Data retrieval

In [None]:
import pathlib

from Bio import PDB

In [2]:
# Parent of the (resolved) current working directory; data/ paths are built from it.
# NOTE(review): presumably the repository root when the notebook runs from a subfolder - confirm.
PROJECT_ROOT = pathlib.Path().resolve().parent
# PDB IDs of the test structures to download. The IDs are given in mixed case; the
# recorded download output shows lower-case file names (e.g. 6x18.cif).
TEST_PDB_ID_LIST = ["3idp", "6X18", "8RX0"]

In [3]:
# Download the test structures into <PROJECT_ROOT>/data.
# The mmCIF format is requested explicitly (instead of relying on the library default)
# because later cells open the downloaded files by their ".cif" names.
pdb_list = PDB.PDBList(server="https://files.wwpdb.org")
for id_ in TEST_PDB_ID_LIST:
    pdb_list.retrieve_pdb_file(pdb_code=id_, pdir=PROJECT_ROOT / "data", file_format="mmCif")

Structure exists: 'C:\Users\janoj\GitHub\LoopBuilder\data\3idp.cif' 
Structure exists: 'C:\Users\janoj\GitHub\LoopBuilder\data\6x18.cif' 
Structure exists: 'C:\Users\janoj\GitHub\LoopBuilder\data\8rx0.cif' 




## PDBFixer

Run the loop-building process multiple times to generate an ensemble of structures. Each run uses a different random seed, which cannot be controlled.

In [None]:
import pathlib

from loopbuilder.build import PDBFixerBuilder
from loopbuilder.score import MolProbityScorer

# Parent of the (resolved) current working directory; the data/ and sandbox/ paths
# used in this section are built from it.
PROJECT_ROOT = pathlib.Path().resolve().parent

In [2]:
# Build loop models for 3idp with PDBFixer and score every trial model with MolProbity
# (run through the given Docker image).
builder = PDBFixerBuilder(
    structure_file=PROJECT_ROOT / "data/3idp.cif",
    output_directory=PROJECT_ROOT / "sandbox/PDBFixer",
    scorers=[MolProbityScorer(docker_image="francecosta/molprobity:v0.0.1")],
    # Filter on the "ramachandran_outliers" score: the predicate is true for values <= 0.3.
    # A model without that score falls back to the default 0.3 and therefore also passes.
    filters=[lambda x: x.scores.get("ramachandran_outliers", 0.3) <= 0.3],
    # NOTE: That is a pretty low bar, but it is good enough to generate a few results here.
    #    A stricter filter would be: filters=[lambda x: x.scores.get("molprobity_score", 1) <= 1],
    working_directory=PROJECT_ROOT / "sandbox/PDBFixer/tmp",
)
# Request n=3 models; the recorded log reports max_tries=30 for this call.
# NOTE(review): n presumably applies per missing segment - confirm against PDBFixerBuilder.build.
builder.build(n=3)

[32m2025-05-22 19:16:09.815[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m143[0m - [1mBuilding n=3 models for C:\Users\janoj\GitHub\LoopBuilder\data\3idp.cif (max_tries=30)[0m
[32m2025-05-22 19:16:09.819[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m145[0m - [1mSaving output to C:\Users\janoj\GitHub\LoopBuilder\sandbox\PDBFixer[0m
[32m2025-05-22 19:16:09.823[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m146[0m - [1mUsing working directory C:\Users\janoj\GitHub\LoopBuilder\sandbox\PDBFixer\tmp[0m
[32m2025-05-22 19:16:09.826[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m151[0m - [1mUsing 1 scorer(s):
  MolProbityScorer(cleanup=True, executable=molprobity.molprobity, docker_image=francecosta/molprobity:v0.0.1, )[0m
[32m2025-05-22 19:16:09.828[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m156[0m - [1mUsing 1 filter(s):
  <function <lambda> at 

[32m2025-05-22 19:16:10.970[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m166[0m - [1mFound 3 segments[0m


Building segments:   0%|          | 0/3 [00:00<?, ?segment/s]

[32m2025-05-22 19:16:11.066[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m174[0m - [1mBuilding models for segment loop_1[0m
[32m2025-05-22 19:17:34.552[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m204[0m - [1mScored trial model 1 for segment loop_1: {'ramachandran_outliers': 0.5714, 'rotamer_outliers': 0.2857, 'cbeta_deviations': 15.0, 'clashscore': 0.0, 'rms_bonds': 0.0371, 'rms_angles': 21.48, 'molprobity_score': 2.64}[0m
[32m2025-05-22 19:17:34.560[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m208[0m - [1mTrial model 1 for segment loop_1 failed filter <function <lambda> at 0x00000253350F6DD0>[0m
[32m2025-05-22 19:19:03.560[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m204[0m - [1mScored trial model 2 for segment loop_1: {'ramachandran_outliers': 0.3571, 'rotamer_outliers': 0.3571, 'cbeta_deviations': 15.0, 'clashscore': 0.0, 'rms_bonds': 0.0269, 'rms_angles': 18.04

[Segment(identifier='loop_1', chain_index=0, chain_name='A', residue_start_index=149, residue_start_seqid=598, residue_index_offset=449, residue_names=['ALA', 'THR', 'GLU', 'LYS', 'SER', 'ARG', 'TRP', 'SER', 'GLY', 'SER', 'HIS', 'GLN', 'PHE', 'GLU', 'GLN', 'LEU'], parent_structure_file=WindowsPath('C:/Users/janoj/GitHub/LoopBuilder/data/3idp.cif'), models=[SegmentModel(identifier='loop_1', structure_file=WindowsPath('C:/Users/janoj/GitHub/LoopBuilder/sandbox/PDBFixer/3idp_loop_1_1.cif'), scores={'ramachandran_outliers': 0.2857, 'rotamer_outliers': 0.3571, 'cbeta_deviations': 14.0, 'clashscore': 0.0, 'rms_bonds': 0.0362, 'rms_angles': 19.87, 'molprobity_score': 2.72}, index=1)]),
 Segment(identifier='loop_2', chain_index=1, chain_name='B', residue_start_index=151, residue_start_seqid=600, residue_index_offset=449, residue_names=['GLU', 'LYS', 'SER', 'ARG', 'TRP', 'SER', 'GLY', 'SER', 'HIS', 'GLN', 'PHE', 'GLU', 'GLN', 'LEU', 'SER'], parent_structure_file=WindowsPath('C:/Users/janoj/GitH

In [None]:
# Same build as for 3idp, now for 6x18 (the recorded progress bar shows 4 segments).
# Results go to a structure-specific subdirectory of sandbox/PDBFixer.
builder = PDBFixerBuilder(
    structure_file=PROJECT_ROOT / "data/6x18.cif",
    output_directory=PROJECT_ROOT / "sandbox/PDBFixer/6x18",
    scorers=[MolProbityScorer(docker_image="francecosta/molprobity:v0.0.1")],
    # The predicate is true for "ramachandran_outliers" <= 0.3; a missing score defaults
    # to 0.3 and therefore also passes.
    filters=[lambda x: x.scores.get("ramachandran_outliers", 0.3) <= 0.3],
    working_directory=PROJECT_ROOT / "sandbox/PDBFixer/tmp",
)
# Request n=3 models; the recorded log reports max_tries=30 for this call.
builder.build(n=3)

[32m2025-05-22 19:42:10.783[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m143[0m - [1mBuilding n=3 models for C:\Users\janoj\GitHub\LoopBuilder\data\6x18.cif (max_tries=30)[0m
[32m2025-05-22 19:42:10.786[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m145[0m - [1mSaving output to C:\Users\janoj\GitHub\LoopBuilder\sandbox\PDBFixer\6x18[0m
[32m2025-05-22 19:42:10.790[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m146[0m - [1mUsing working directory C:\Users\janoj\GitHub\LoopBuilder\sandbox\PDBFixer\tmp[0m
[32m2025-05-22 19:42:10.792[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m151[0m - [1mUsing 1 scorer(s):
  MolProbityScorer(cleanup=True, executable=molprobity.molprobity, docker_image=francecosta/molprobity:v0.0.1, )[0m
[32m2025-05-22 19:42:10.795[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m156[0m - [1mUsing 1 filter(s):
  <function <lambda

Building segments:   0%|          | 0/4 [00:00<?, ?segment/s]

[32m2025-05-22 19:42:12.636[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m174[0m - [1mBuilding models for segment loop_1[0m
[32m2025-05-22 19:42:50.573[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m204[0m - [1mScored trial model 1 for segment loop_1: {'ramachandran_outliers': 0.6, 'rotamer_outliers': 0.4, 'cbeta_deviations': 17.0, 'clashscore': 0.0, 'rms_bonds': 0.0356, 'rms_angles': 18.91, 'molprobity_score': 2.79}[0m
[32m2025-05-22 19:42:50.576[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m208[0m - [1mTrial model 1 for segment loop_1 failed filter <function <lambda> at 0x0000025347667D90>[0m
[32m2025-05-22 19:43:27.088[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m204[0m - [1mScored trial model 2 for segment loop_1: {'ramachandran_outliers': 0.6, 'rotamer_outliers': 0.4667, 'cbeta_deviations': 16.0, 'clashscore': 0.0, 'rms_bonds': 0.0323, 'rms_angles': 20.06, 'molpro

[Segment(identifier='loop_1', chain_index=0, chain_name='A', residue_start_index=54, residue_start_seqid=65, residue_index_offset=11, residue_names=['VAL', 'ASN', 'GLY', 'PHE', 'ASN', 'GLY', 'GLU', 'GLY', 'GLY', 'GLU', 'GLU', 'ASP', 'PRO', 'GLN', 'ALA', 'ALA', 'ARG', 'SER', 'ASN', 'SER', 'ASP', 'GLY'], parent_structure_file=WindowsPath('C:/Users/janoj/GitHub/LoopBuilder/data/6x18.cif'), models=[SegmentModel(identifier='loop_1', structure_file=WindowsPath('C:/Users/janoj/GitHub/LoopBuilder/sandbox/PDBFixer/6x18/6x18_loop_1_1.cif'), scores={'ramachandran_outliers': 0.2, 'rotamer_outliers': 0.4667, 'cbeta_deviations': 17.0, 'clashscore': 0.0, 'rms_bonds': 0.0289, 'rms_angles': 18.28, 'molprobity_score': 2.77}, index=1), SegmentModel(identifier='loop_1', structure_file=WindowsPath('C:/Users/janoj/GitHub/LoopBuilder/sandbox/PDBFixer/6x18/6x18_loop_1_2.cif'), scores={'ramachandran_outliers': 0.3, 'rotamer_outliers': 0.4667, 'cbeta_deviations': 17.0, 'clashscore': 0.0, 'rms_bonds': 0.0427, 'r

In [None]:
# NOTE: This structure has many missing segments (the recorded progress bar shows 9),
# so beware that this cell can take a while. The recorded run was interrupted manually.
builder = PDBFixerBuilder(
    structure_file=PROJECT_ROOT / "data/8rx0.cif",
    output_directory=PROJECT_ROOT / "sandbox/PDBFixer/8rx0",
    scorers=[MolProbityScorer(docker_image="francecosta/molprobity:v0.0.1")],
    # The predicate is true for "ramachandran_outliers" <= 0.3; a missing score defaults
    # to 0.3 and therefore also passes.
    filters=[lambda x: x.scores.get("ramachandran_outliers", 0.3) <= 0.3],
    working_directory=PROJECT_ROOT / "sandbox/PDBFixer/tmp",
)
# Request n=3 models; the recorded log reports max_tries=30 for this call.
builder.build(n=3)

[32m2025-05-22 20:49:46.367[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m143[0m - [1mBuilding n=3 models for C:\Users\janoj\GitHub\LoopBuilder\data\8rx0.cif (max_tries=30)[0m
[32m2025-05-22 20:49:46.369[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m145[0m - [1mSaving output to C:\Users\janoj\GitHub\LoopBuilder\sandbox\PDBFixer\8rx0[0m
[32m2025-05-22 20:49:46.371[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m146[0m - [1mUsing working directory C:\Users\janoj\GitHub\LoopBuilder\sandbox\PDBFixer\tmp[0m
[32m2025-05-22 20:49:46.372[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m151[0m - [1mUsing 1 scorer(s):
  MolProbityScorer(cleanup=True, executable=molprobity.molprobity, docker_image=francecosta/molprobity:v0.0.1, )[0m
[32m2025-05-22 20:49:46.374[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m156[0m - [1mUsing 1 filter(s):
  <function <lambda

Building segments:   0%|          | 0/9 [00:00<?, ?segment/s]

[32m2025-05-22 20:49:48.176[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m174[0m - [1mBuilding models for segment loop_1[0m
[32m2025-05-22 20:52:13.687[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m204[0m - [1mScored trial model 1 for segment loop_1: {'ramachandran_outliers': 0.6667, 'rotamer_outliers': 0.4286, 'cbeta_deviations': 8.0, 'clashscore': 0.0, 'rms_bonds': 0.0248, 'rms_angles': 17.49, 'molprobity_score': 2.79}[0m
[32m2025-05-22 20:52:13.705[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m208[0m - [1mTrial model 1 for segment loop_1 failed filter <function <lambda> at 0x00000253472783A0>[0m
[32m2025-05-22 20:54:27.457[0m | [1mINFO    [0m | [36mloopbuilder.build[0m:[36mbuild[0m:[36m204[0m - [1mScored trial model 2 for segment loop_1: {'ramachandran_outliers': 0.5, 'rotamer_outliers': 0.4286, 'cbeta_deviations': 8.0, 'clashscore': 0.0, 'rms_bonds': 0.0243, 'rms_angles': 18.39, 'mo

KeyboardInterrupt: 

## Scratch

### Time PDBFixer

In [13]:
import time

from pdbfixer import PDBFixer

# Time the individual PDBFixer stages on 3idp.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring short intervals, whereas the wall
# clock can be coarse and may jump when the system time is adjusted.
go = time.perf_counter()
fixer = PDBFixer(str(PROJECT_ROOT / "data/3idp.cif"))
ckp1 = time.perf_counter()
print(f"Reading took  {ckp1 - go:.4f} seconds")

fixer.findMissingResidues()
ckp2 = time.perf_counter()
print(f"Finding missing residues took  {ckp2 - ckp1:.4f} seconds")

# "Fixing" covers both locating and adding the missing atoms.
fixer.findMissingAtoms()
fixer.addMissingAtoms()
ckp3 = time.perf_counter()
print(f"Fixing took  {ckp3 - ckp2:.4f} seconds")

Reading took  1.2461 seconds
Finding missing residues took  0.0040 seconds
Fixing took  31.7846 seconds


### Extract loops from full CIF

In [1]:
import pathlib

from loopbuilder.convert import extract_segment_from_mmcif, join_segments

# Parent of the (resolved) current working directory; the sandbox/ paths used in the
# following cells are built from it.
PROJECT_ROOT = pathlib.Path().resolve().parent

In [None]:
# Extract part of chain "A" from a built 3idp model into a separate "_looponly" file.
# NOTE(review): the first path is presumably the input and the second the output, and
# residue_indices is passed as a set of two values (449 and 453), not as a range -
# confirm both against extract_segment_from_mmcif.
extract_segment_from_mmcif(
    PROJECT_ROOT / "sandbox/PDBFixer/3idp_loop_0_1.cif",
    PROJECT_ROOT / "sandbox/PDBFixer/3idp_loop_0_1_looponly.cif",
    residue_indices={449, 453},
    chain_id="A",
)

In [3]:
# Join the loop-only file with itself (the same path is passed twice) and write the
# result next to it with a "_joined" suffix.
looponly_file = PROJECT_ROOT / "sandbox/PDBFixer/3idp_loop_0_1_looponly.cif"
join_segments(
    [looponly_file, looponly_file],
    PROJECT_ROOT / "sandbox/PDBFixer/3idp_loop_0_1_looponly_joined.cif",
)

### Find missing segments with Biopython (raw)

In [None]:
def find_inner_missing_segments(structure):
    """Find gaps in the residue numbering inside each chain of a structure.

    Only residues whose ``id[0]`` equals ``" "`` are considered (in Biopython
    this is the blank hetero flag of standard residues). A gap is a run of
    consecutive residue numbers that lies strictly between the smallest and
    the largest residue number present in a chain but is not present itself.
    Residues missing before the first or after the last observed residue are
    therefore not reported.

    The gap search runs over the sorted, de-duplicated residue numbers, so the
    result does not depend on the order in which residues are stored in the
    chain.

    Args:
        structure: Iterable of models, each an iterable of chains, each an
            iterable of residues (e.g. a Biopython ``Structure``). Chains must
            expose ``id`` and ``full_id``; residues must expose ``id`` as a
            tuple whose first two items are the hetero flag and the residue
            number.

    Returns:
        A list of ``(chain_id, first_missing, last_missing)`` tuples with
        inclusive bounds, in model/chain iteration order and in ascending
        residue number within a chain. Chains with fewer than two distinct
        residue numbers contribute nothing.
    """
    missing_segments = []

    for model in structure:
        for chain in model:
            # Debug output of the visiting order.
            # TODO: find out why chains are not visited in alphabetical order
            # (the recorded run printed chain B before chain A).
            print(chain.full_id)

            # Sorted unique numbers: robust against out-of-order residues and
            # against repeated numbers (e.g. residues differing only in
            # further id fields).
            present_res_nums = sorted({res.id[1] for res in chain if res.id[0] == " "})

            # Every jump larger than one between neighbouring numbers is an inner gap.
            for previous_num, current_num in zip(present_res_nums, present_res_nums[1:]):
                if current_num - previous_num > 1:
                    missing_segments.append((chain.id, previous_num + 1, current_num - 1))

    return missing_segments


# Parse the 3idp mmCIF file with Biopython and report the inner gaps of each chain.
structure_path = PROJECT_ROOT / "data/3idp.cif"
structure = PDB.MMCIFParser().get_structure("3idp", structure_path)
missing_segments = find_inner_missing_segments(structure)

print(f"Inner missing segments in structure from {structure_path}:")
if not missing_segments:
    print("No inner missing segments found.")
else:
    # One line per gap; bounds are inclusive, hence the "+ 1" in the residue count.
    for chain_id, start, end in missing_segments:
        print(f"Chain {chain_id}: Missing residues from {start} to {end} (total of {end - start + 1} residues)")

('3idp', 0, 'B')
('3idp', 0, 'A')
Inner missing segments in structure from C:\Users\janoj\GitHub\LoopBuilder\data\3idp.cif:
Chain B: Missing residues from 598 to 613 (total of 16 residues)
Chain A: Missing residues from 600 to 614 (total of 15 residues)
Chain A: Missing residues from 629 to 630 (total of 2 residues)




### MDTraj

In [5]:
import mdtraj

# Load the 3idp loop_2 file with MDTraj. The path is built from PROJECT_ROOT rather
# than a machine-specific absolute path so the cell also runs on other checkouts.
mdtraj.load(str(PROJECT_ROOT / "sandbox/PDBFixer/3idp/3idp_loop_2.cif"))

<mdtraj.Trajectory with 3 frames, 127 atoms, 15 residues, and unitcells at 0x1fc72d2bf40>