In [None]:
# Cell 1) Shared settings (edit this cell before running anything else)
from pathlib import Path
import os

# (Required) Location of the pepbind03 pipeline script. A relative path is
# resolved against the current working directory.
PIPELINE_PY = Path("peptide_binding_mvp/notebooks/pepbind03.py").resolve()

# (Optional) Base working folder (BASE_DIR of pepbind03.py). The
# PEPBIND_BASE_DIR environment variable takes precedence when it is set.
BASE_DIR = Path(os.getenv("PEPBIND_BASE_DIR", "~/work/pipeline")).expanduser()

# (Optional) Leave empty for a fresh run; to resume, put the name of an
# existing workspace folder here.
RUN_ID = ""  # e.g. "PDP_20251223_205338"

# OpenMM parameters (adjust as needed)
MD_TIME_PS = 100.0
TIMESTEP_FS = 2.0
RESTRAINT_K = 1.0

In [None]:
# Cell 2) Load the pipeline module from its file path
import importlib.util
import sys

# Explicit raise instead of assert: asserts are stripped when Python runs with -O.
if not PIPELINE_PY.exists():
    raise FileNotFoundError(f"PIPELINE_PY not found: {PIPELINE_PY}")

# Pin BASE_DIR through the environment before pepbind03.py is executed.
# NOTE(review): presumably pepbind03.py reads PEPBIND_BASE_DIR at import time - confirm.
os.environ["PEPBIND_BASE_DIR"] = str(BASE_DIR)

spec = importlib.util.spec_from_file_location("pepbind03", str(PIPELINE_PY))
# spec_from_file_location can return None, and a spec may carry no loader;
# fail with a clear message instead of an AttributeError on None.
if spec is None or spec.loader is None:
    raise ImportError(f"Cannot create an import spec for: {PIPELINE_PY}")

pep = importlib.util.module_from_spec(spec)
# Register the module before executing it, as the importlib "import a source
# file directly" recipe does, so code that looks the module up by name in
# sys.modules can find it.
sys.modules[spec.name] = pep
try:
    spec.loader.exec_module(pep)
except BaseException:
    # Do not leave a half-initialised module registered after a failed load.
    sys.modules.pop(spec.name, None)
    raise

print("Loaded pipeline:", PIPELINE_PY)
print("BASE_DIR:", pep.BASE_DIR)
print("OpenMM available:", getattr(pep, "_OPENMM_AVAILABLE", None))
print("DEVICE:", getattr(pep, "DEVICE", None))

In [None]:
# Cell 3) 워크스페이스 생성/로드

def _make_folders_from_existing(root: Path):
    """Return the workspace folder map rooted at ``root``, creating missing folders.

    The layout is meant to match the one produced by ``init_workspace``
    (defined in the pipeline module, not visible here), so keep the two in sync.

    Args:
        root: Workspace root directory.

    Returns:
        A dict mapping the names ``root``, ``fasta``, ``pdb``,
        ``colabfold_out``, ``results``, ``vina``, ``plip``, ``prodigy`` and
        ``temp`` to their ``Path``. Every path exists as a directory on return.
    """
    pdb_dir = root / "pdb"
    results_dir = root / "results"
    layout = {
        "root": root,
        "fasta": root / "fasta",
        "pdb": pdb_dir,
        "colabfold_out": pdb_dir / "colabfold_output",
        "results": results_dir,
        "vina": results_dir / "vina",
        "plip": results_dir / "plip",
        "prodigy": results_dir / "prodigy",
        "temp": root / "temp",
    }
    # exist_ok makes this safe to call on a workspace that is already populated.
    for directory in layout.values():
        directory.mkdir(parents=True, exist_ok=True)
    return layout

if not RUN_ID:
    # Fresh run: let the pipeline create the workspace and remember its name.
    folders = pep.init_workspace()
    ws_root = folders["root"]
    RUN_ID = ws_root.name
    print("New workspace:", ws_root)
else:
    # Resume: reuse the named workspace under BASE_DIR and rebuild the folder map.
    ws_root = BASE_DIR / RUN_ID
    assert ws_root.exists(), f"RUN_ID workspace not found: {ws_root}"
    folders = _make_folders_from_existing(ws_root)
    print("Resume workspace:", ws_root)

# Show the resolved folder for every workspace entry.
for k, v in folders.items():
    print(f"{k:12s} -> {v}")

In [None]:
# Cell 4) 유틸: peptides / rank1_pdbs / refined_pdbs 로드
from pathlib import Path


def load_peptides_from_fasta(fasta_path: Path):
    """Read peptide sequences from a FASTA file.

    Every non-blank line that does not start with ``>`` (after stripping
    surrounding whitespace) is returned as one peptide, in file order.
    Sequences wrapped over several lines are therefore NOT joined.

    Args:
        fasta_path: Path of the FASTA file to read.

    Returns:
        A list of sequence strings; empty when the file does not exist.
    """
    if not fasta_path.exists():
        return []
    # Undecodable bytes are dropped rather than raising.
    raw_text = fasta_path.read_text(encoding="utf-8", errors="ignore")
    stripped_lines = (line.strip() for line in raw_text.splitlines())
    return [seq for seq in stripped_lines if seq and not seq.startswith(">")]


def find_rank1_pdbs(colabfold_out: Path):
    """Collect the rank-001 PDB files found directly inside ``colabfold_out``.

    Args:
        colabfold_out: Directory to search (not searched recursively).

    Returns:
        A sorted list of unique paths whose file name matches any of the
        rank-001 glob patterns below.
    """
    patterns = ("*_unrelaxed_rank_001_*.pdb", "*_rank_001_*.pdb", "*rank_001*.pdb")
    # The patterns overlap, so gather into a set to drop duplicate hits.
    matches = {hit for pattern in patterns for hit in colabfold_out.glob(pattern)}
    return sorted(matches)


def find_refined_pdbs(refined_dir: Path, originals=None):
    """Collect refined PDB files from ``refined_dir``, falling back to ``originals``.

    Args:
        refined_dir: Directory to search (not searched recursively).
        originals: Value returned when no refined file is found; ``None`` or
            an empty value yields an empty list.

    Returns:
        A sorted list of unique paths matching the refined/relaxed glob
        patterns below, or ``originals`` (or ``[]``) when nothing matches.
    """
    patterns = ("*_openmm_refined.pdb", "*_relax.pdb", "*_openmm_refined_relax.pdb")
    # The patterns overlap, so gather into a set before sorting.
    refined = sorted({hit for pattern in patterns for hit in refined_dir.glob(pattern)})
    if not refined:
        return originals or []
    return refined


# Locations inside the workspace that this cell reads from.
peptides_fa = folders["fasta"] / "peptides.fasta"
refined_dir = folders["pdb"] / "refined"

# Load whatever earlier runs left on disk; each list is empty when nothing is
# there, except that the refined list falls back to the rank-1 PDBs.
peptides = load_peptides_from_fasta(peptides_fa)
rank1_pdbs = find_rank1_pdbs(folders["colabfold_out"])
refined_rank1_pdbs = find_refined_pdbs(refined_dir, rank1_pdbs)

# Summarise what was found and where it was looked for.
print("peptides:", len(peptides), "from", peptides_fa)
print("rank1_pdbs:", len(rank1_pdbs), "from", folders["colabfold_out"])
print("refined_rank1_pdbs:", len(refined_rank1_pdbs), "from", refined_dir)

In [None]:
# Cell 5) Run STEP 2 (only when new peptides have to be generated)
# - Skipped when peptides.fasta already holds at least one sequence

target_fa = folders["fasta"] / "target_protein.fasta"
if not target_fa.exists():
    pep.write_target_fasta(folders["fasta"], pep.TARGET_SEQUENCE)

# Decide from the file itself rather than from the in-memory `peptides` list,
# which can be stale when cells are run out of order. A missing file simply
# yields an empty list here.
if load_peptides_from_fasta(peptides_fa):
    print("peptides.fasta already exists -> skip STEP2")
else:
    tokenizer, model = pep.load_esm_mlm()
    peptides = pep.generate_peptides_with_mlm(
        tokenizer=tokenizer,
        model=model,
        target_sequence=pep.TARGET_SEQUENCE,
        num_peptides=pep.NUM_PEPTIDES,
        peptide_len=pep.PEPTIDE_LENGTH,
        top_k=pep.PEPMLM_TOP_K,
        temperature=pep.PEPMLM_TEMPERATURE,
    )
    pep.write_peptide_fasta(folders["fasta"], peptides)
    print("Wrote peptides.fasta ->", peptides_fa)

# Best-effort cleanup: a failure here must not stop the notebook, but it is
# reported instead of being silently discarded.
try:
    pep.clear_gpu_memory()
except Exception as exc:
    print("WARNING: clear_gpu_memory failed:", exc)

# Refresh the in-memory list from what is actually on disk.
peptides = load_peptides_from_fasta(peptides_fa)
print("peptides:", len(peptides))

In [None]:
# Cell 6) Run STEP 3 (peptides.fasta -> batch CSV -> ColabFold)

# Fall back to the FASTA on disk when no peptides are loaded in memory.
peptides = peptides or load_peptides_from_fasta(peptides_fa)
assert peptides, "peptides is empty. Run STEP2 first."

# Write the batch CSV pairing the target sequence with each peptide.
csv_path = pep.prepare_colabfold_batch_csv(
    temp_dir=folders["temp"],
    target_sequence=pep.TARGET_SEQUENCE,
    peptides=peptides,
)
print("batch csv:", csv_path)

pep.run_colabfold_batch_with_progress(
    csv_path=csv_path,
    out_dir=folders["colabfold_out"],
    total_complexes=len(peptides),
)

# Refresh the rank-1 PDB list from the ColabFold output folder.
rank1_pdbs = find_rank1_pdbs(folders["colabfold_out"])
print("rank1_pdbs:", len(rank1_pdbs))
if rank1_pdbs:
    print("example:", rank1_pdbs[0].name)

In [None]:
# Cell 7) Run STEP 3b (rank1 pdb -> refined pdb)
# Re-scan the ColabFold output folder when no rank-1 PDBs are loaded in memory.
rank1_pdbs = rank1_pdbs or find_rank1_pdbs(folders["colabfold_out"])
assert rank1_pdbs, "rank1_pdbs is empty. Run STEP3 first."

# Refine using the OpenMM settings chosen in Cell 1.
refined_rank1_pdbs = pep.refine_structures_with_openmm_and_relax(
    rank1_pdbs=rank1_pdbs,
    pdb_root_dir=folders["pdb"],
    md_time_ps=MD_TIME_PS,
    timestep_fs=TIMESTEP_FS,
    restraint_k=RESTRAINT_K,
)

print("refined PDB count:", len(refined_rank1_pdbs))
if refined_rank1_pdbs:
    print("example:", refined_rank1_pdbs[0].name)

In [None]:
# Cell 8) Run STEP 4-6 (on the refined PDBs)
# When nothing is loaded in memory, look on disk: refined files first, and the
# raw rank-1 PDBs if no refined file exists.
refined_rank1_pdbs = refined_rank1_pdbs or find_refined_pdbs(
    folders["pdb"] / "refined",
    find_rank1_pdbs(folders["colabfold_out"]),
)
assert refined_rank1_pdbs, "refined_rank1_pdbs is empty. Run STEP3b first."

# Each tool gets the same PDB list and writes into its own results folder.
pep.run_vina_on_rank1(refined_rank1_pdbs, folders["vina"])
print("Vina done:", folders["vina"])

pep.run_plip_on_rank1(refined_rank1_pdbs, folders["plip"])
print("PLIP done:", folders["plip"])

pep.run_prodigy_on_rank1(refined_rank1_pdbs, folders["prodigy"])
print("PRODIGY done:", folders["prodigy"])

In [None]:
# Cell 9) Run STEP 7 (zip the PDBs and build the final table)

# Fall back to what is on disk when earlier cells were not run in this
# session, mirroring the reload done at the top of the STEP 3 cell.
if not peptides:
    peptides = load_peptides_from_fasta(peptides_fa)
if not refined_rank1_pdbs:
    refined_rank1_pdbs = find_refined_pdbs(
        folders["pdb"] / "refined",
        find_rank1_pdbs(folders["colabfold_out"]),
    )
# Same precondition as the STEP 4-6 cell: stop with a clear message rather
# than zipping and tabulating an empty list of structures.
if not refined_rank1_pdbs:
    raise RuntimeError("refined_rank1_pdbs is empty. Run STEP3b first.")

# (Optional) zip the refined PDBs
pdb_zip = pep.zip_rank1_pdbs(refined_rank1_pdbs, folders["results"])
print("PDB zip:", pdb_zip)

# Timing information is not tracked in this notebook, so None is passed for
# the start/end time and the per-step timings.
final_xlsx = pep.build_and_save_final_table(
    folders=folders,
    peptides=peptides,
    rank1_pdbs=refined_rank1_pdbs,
    start_time=None,
    end_time=None,
    step_timings=None,
)
print("Final Excel:", final_xlsx)
print("Workspace:", folders["root"])

In [None]:
# Cell 10) Use this cell to run the whole pipeline sequentially
# - Starts from a new workspace when RUN_ID is empty
# - STEP2 is skipped when peptides.fasta already holds sequences. Whether the
#   later steps reuse existing outputs depends on the pepbind03 functions
#   themselves (not visible in this notebook).

# STEP2
# Write the target FASTA when it is missing, as the stepwise STEP 2 cell does.
target_fa = folders["fasta"] / "target_protein.fasta"
if not target_fa.exists():
    pep.write_target_fasta(folders["fasta"], pep.TARGET_SEQUENCE)

# A missing peptides.fasta yields an empty list, so no separate exists() check
# is needed.
if not load_peptides_from_fasta(peptides_fa):
    tokenizer, model = pep.load_esm_mlm()
    peptides = pep.generate_peptides_with_mlm(
        tokenizer=tokenizer,
        model=model,
        target_sequence=pep.TARGET_SEQUENCE,
        num_peptides=pep.NUM_PEPTIDES,
        peptide_len=pep.PEPTIDE_LENGTH,
        top_k=pep.PEPMLM_TOP_K,
        temperature=pep.PEPMLM_TEMPERATURE,
    )
    pep.write_peptide_fasta(folders["fasta"], peptides)

# Best-effort GPU cleanup before the heavy steps, as in the stepwise STEP 2
# cell; a failure is reported but does not stop the run.
try:
    pep.clear_gpu_memory()
except Exception as exc:
    print("WARNING: clear_gpu_memory failed:", exc)

peptides = load_peptides_from_fasta(peptides_fa)
if not peptides:
    raise RuntimeError("peptides is empty after STEP2; check peptides.fasta.")

# STEP3
csv_path = pep.prepare_colabfold_batch_csv(folders["temp"], pep.TARGET_SEQUENCE, peptides)
pep.run_colabfold_batch_with_progress(csv_path, folders["colabfold_out"], len(peptides))
rank1_pdbs = find_rank1_pdbs(folders["colabfold_out"])
if not rank1_pdbs:
    raise RuntimeError("rank1_pdbs is empty after STEP3; ColabFold produced no rank-1 PDB.")

# STEP3b
refined_rank1_pdbs = pep.refine_structures_with_openmm_and_relax(
    rank1_pdbs,
    folders["pdb"],
    md_time_ps=MD_TIME_PS,
    timestep_fs=TIMESTEP_FS,
    restraint_k=RESTRAINT_K,
)
if not refined_rank1_pdbs:
    raise RuntimeError("refined_rank1_pdbs is empty after STEP3b.")

# STEP4-6
pep.run_vina_on_rank1(refined_rank1_pdbs, folders["vina"])
pep.run_plip_on_rank1(refined_rank1_pdbs, folders["plip"])
pep.run_prodigy_on_rank1(refined_rank1_pdbs, folders["prodigy"])

# STEP7
pdb_zip = pep.zip_rank1_pdbs(refined_rank1_pdbs, folders["results"])
final_xlsx = pep.build_and_save_final_table(folders, peptides, refined_rank1_pdbs, None, None, None)

print("PDB zip:", pdb_zip)
print("Final Excel:", final_xlsx)
print("Workspace:", folders["root"])