In [None]:
import os
import yaml
import pandas as pd
from collections import defaultdict
import subprocess
from tqdm import tqdm
from bioblocks.io import read_model, write_model

# ===================== Load YAML configuration =====================
# Reads config.yaml from the current working directory.
#   * "common" section: optional; the defaults below are used for missing keys.
#   * "af3_results_ppi" section: must provide every path key read below,
#     otherwise a KeyError names the missing key.
with open("config.yaml", "r", encoding="utf-8") as f:
    # safe_load returns None for an empty document; fall back to an empty
    # mapping so the lookups below fail with a KeyError on the missing key
    # instead of an AttributeError on None.
    config = yaml.safe_load(f) or {}

# `or {}` also covers a section that is present but has a null value.
common_params = config.get("common") or {}
af3_params = config.get("af3_results_ppi") or {}

CHAIN_TARGET = common_params.get("CHAIN_TARGET", "A")
CHAIN_QUERY = common_params.get("CHAIN_QUERY", "H,L")
NO_PYMOL = common_params.get("NO_PYMOL", True)

CIF_FOLDER = af3_params["CIF_FOLDER"]
PDB_FOLDER = af3_params["PDB_FOLDER"]
RMSD_FILE = af3_params["RMSD_FILE"]
PPI_FOLDER = af3_params["PPI_FOLDER"]
PPI_SUMMARY = af3_params["PPI_SUMMARY"]
MAPPING_FILE = af3_params["MAPPING_FILE"]
OUTPUT_MAPPING = af3_params["OUTPUT_MAPPING"]

# Output directories are created up front so later cells can write into them.
os.makedirs(PDB_FOLDER, exist_ok=True)
os.makedirs(PPI_FOLDER, exist_ok=True)

# For output *files*, create the parent directory only when the path has one:
# os.path.dirname("summary.csv") is "", and os.makedirs("") raises
# FileNotFoundError.
for _out_file in (PPI_SUMMARY, OUTPUT_MAPPING):
    _parent_dir = os.path.dirname(_out_file)
    if _parent_dir:
        os.makedirs(_parent_dir, exist_ok=True)

In [None]:
# ===================== 1️⃣ CIF -> PDB (RMSD <= 60) =====================
# Converts each ".cif" file in CIF_FOLDER whose name is listed in RMSD_FILE
# with an RMSD at or below the cutoff, writing a ".pdb" file with the same
# base name into PDB_FOLDER.
# NOTE(review): the "file" column is compared against bare directory entries,
# so it is assumed to hold file names without a directory part — confirm.
RMSD_CUTOFF = 60  # inclusive upper bound applied to the "RMSD" column

rmsd_df = pd.read_csv(RMSD_FILE)
valid_files = set(rmsd_df.loc[rmsd_df["RMSD"] <= RMSD_CUTOFF, "file"].tolist())

# Sorted so the conversion order (and the log below) is the same on every run;
# os.listdir returns entries in arbitrary order.
for cif_file in sorted(os.listdir(CIF_FOLDER)):
    if not cif_file.endswith(".cif") or cif_file not in valid_files:
        continue
    cif_path = os.path.join(CIF_FOLDER, cif_file)
    # Swap only the trailing extension. str.replace would rewrite every
    # ".cif" occurrence inside the name, not just the suffix.
    pdb_name = cif_file[: -len(".cif")] + ".pdb"
    pdb_path = os.path.join(PDB_FOLDER, pdb_name)
    model = read_model(cif_path)
    write_model(model, pdb_path)
    print(f"✅ Converted: {cif_file}")

In [None]:
# ===================== 2️⃣ Compute PPI =====================
# Runs the external `ppi.analyse` command once for every ".pdb" file in
# PDB_FOLDER, passing the PDB path and PPI_FOLDER as positional arguments
# followed by the chain options from the configuration.
pdb_files = sorted(f for f in os.listdir(PDB_FOLDER) if f.endswith(".pdb"))
print(f"将处理 {len(pdb_files)} 个 PDB 文件...\n")

# Options shared by every invocation. The optional flag is added explicitly
# rather than inserting "" and filtering by truthiness afterwards: that filter
# would also drop an empty chain value and shift the option/value pairs.
# str() guards against YAML scalars that were parsed as non-strings, which
# subprocess would reject.
ppi_options = ["--no-pymol"] if NO_PYMOL else []
ppi_options += [
    "--chain-target", str(CHAIN_TARGET),
    "--chain-query", str(CHAIN_QUERY),
]

for pdb_file in tqdm(pdb_files):
    pdb_path = os.path.join(PDB_FOLDER, pdb_file)
    cmd = ["ppi.analyse", pdb_path, PPI_FOLDER, *ppi_options]
    # check=True: a non-zero exit status raises CalledProcessError and stops
    # the batch at the failing file.
    subprocess.run(cmd, check=True)
    # tqdm.write keeps the message from being interleaved with the progress bar.
    tqdm.write(f"✅ 已完成: {pdb_file}")


In [None]:
# ===================== 3️⃣ Summarise AF3 binding sites =====================
# Collects, per model, the set of residue numbers found in the "resiA" column
# of every "*_interactions.csv" file in PPI_FOLDER for rows whose "chainA"
# equals CHAIN_TARGET, then writes one row per model to PPI_SUMMARY.
SUMMARY_COLUMNS = ["model_name", "antigen_binding_sites_AF3"]
REQUIRED_COLUMNS = {"chainA", "resiA"}


def _residue_numbers(resi):
    """Return the residue numbers held in a "resiA" column as a list of ints.

    Values that parse as numbers are converted directly. This matters when the
    column was read as float (pandas does that as soon as one cell is empty):
    45.0 must become 45, whereas stripping non-digits from "45.0" yields 450.
    Remaining text values fall back to digit stripping (e.g. "100A" -> 100);
    values without any digit are skipped instead of raising in int("").
    """
    resi = resi.dropna()
    numeric = pd.to_numeric(resi, errors="coerce")
    is_numeric = numeric.notna()
    numbers = numeric[is_numeric].astype(int).tolist()

    digits_only = resi[~is_numeric].astype(str).str.replace(r"\D", "", regex=True)
    digits_only = digits_only[digits_only != ""]
    numbers.extend(digits_only.astype(int).tolist())
    return numbers


model_to_residues = defaultdict(set)

for csv_file in sorted(os.listdir(PPI_FOLDER)):
    if not csv_file.endswith("_interactions.csv"):
        continue
    csv_path = os.path.join(PPI_FOLDER, csv_file)
    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        # Best effort: an unreadable file is reported and skipped.
        print(f"⚠️ 无法读取 {csv_file}: {e}")
        continue

    # Same best-effort policy for a readable file that lacks the columns used
    # below, which would otherwise abort the whole loop with a KeyError.
    missing_columns = REQUIRED_COLUMNS - set(df.columns)
    if missing_columns:
        print(f"⚠️ 跳过 {csv_file}: 缺少列 {sorted(missing_columns)}")
        continue

    df_antigen = df[df["chainA"] == CHAIN_TARGET]
    if df_antigen.empty:
        continue

    if "model_name" in df_antigen.columns:
        model_name = str(df_antigen["model_name"].iloc[0]).strip()
    else:
        # NOTE(review): the fallback keeps only the text before the first
        # underscore of the file name, so files sharing that prefix are merged
        # into one model — confirm this is intended.
        model_name = os.path.basename(csv_file).split("_")[0]

    model_to_residues[model_name].update(_residue_numbers(df_antigen["resiA"]))

summary_data = [
    {"model_name": name, "antigen_binding_sites_AF3": ",".join(map(str, sorted(sites)))}
    for name, sites in sorted(model_to_residues.items())
]
# Explicit columns so the CSV still gets its header row when no model matched.
df_summary = pd.DataFrame(summary_data, columns=SUMMARY_COLUMNS)
df_summary.to_csv(PPI_SUMMARY, index=False)
print(f"✅ 输出文件: {PPI_SUMMARY}")