In [1]:
import os
import shlex
import subprocess
from collections import defaultdict

import pandas as pd
import yaml

# ===================== Load YAML configuration =====================
# Explicit UTF-8 so the file is decoded the same way regardless of the
# platform's default locale encoding.
with open("config.yaml", "r", encoding="utf-8") as f:
    # safe_load returns None for an empty document; fall back to an empty dict
    # so the .get() calls below do not fail with AttributeError.
    config = yaml.safe_load(f) or {}

# A section that is present but empty also loads as None, hence the `or {}`.
common_params = config.get("common") or {}
ppi_params = config.get("gd_results_ppi") or {}

# Required settings: a missing key raises KeyError naming the absent entry.
PDB_FILE = ppi_params["PDB_FILE"]
PPI_DIR = ppi_params["PPI_DIR"]
OUTPUT_SUMMARY = ppi_params["OUTPUT_SUMMARY"]

# Optional settings with their defaults.
CHAIN_TARGET = ppi_params.get("CHAIN_TARGET", "A")
CHAIN_QUERY = ppi_params.get("CHAIN_QUERY", "H,L")
NO_PYMOL = ppi_params.get("NO_PYMOL", True)

# ===================== Run the PPI analysis =====================
# Build the command as an argument list rather than a single shell string:
# values containing spaces or shell metacharacters are then passed to
# ppi.analyse verbatim instead of being split or interpreted by a shell.
# NOTE(review): without a shell, glob patterns or "~" in PDB_FILE are no longer
# expanded before reaching ppi.analyse — confirm the config uses plain paths.
ppi_args = ["ppi.analyse", str(PDB_FILE), str(PPI_DIR)]
if NO_PYMOL:
    ppi_args.append("--no-pymol")
if CHAIN_TARGET:
    ppi_args += ["--chain-target", str(CHAIN_TARGET)]
if CHAIN_QUERY:
    ppi_args += ["--chain-query", str(CHAIN_QUERY)]

# Shell-quoted, printable form of the command, kept under the original name.
ppi_cmd = shlex.join(ppi_args)

# check=True raises CalledProcessError on a non-zero exit status, so a failed
# analysis stops the notebook here instead of letting later cells run on
# whatever files already exist in PPI_DIR.
ppi_result = subprocess.run(ppi_args, check=True)

# Display the exit status as the cell's output value.
ppi_result.returncode

MDAnalysis.topology.tables has been moved to MDAnalysis.guesser.tables. This import point will be removed in MDAnalysis version 3.0.0
Processing PDB files:   0%|          | 0/1 [00:00<?, ?it/s]


Found 1 PDB file(s)

Will process 1 file(s)
Running 1 PDB files with 1 processes
Processing /home/yuyang/lb_yaml/data/IL23/5njd.pdb
Available chains in /home/yuyang/lb_yaml/data/IL23/5njd.pdb: ['L', 'A', 'B', 'H', 'Y']
Analysing ['A'] vs ['H', 'L']
[('segid A', 'segid H or segid L')]


[22:42:51] Explicit valence for atom # 86 N, 4, is greater than permitted
[22:42:52] Explicit valence for atom # 616 N, 4, is greater than permitted


Successfully processed segid A vs segid H or segid L

Outputs saved to: /home/yuyang/lb_yaml/data/IL23/ppi_csv_gd


Processing PDB files: 100%|██████████| 1/1 [00:05<00:00,  5.53s/it]


0

In [2]:
def _parse_residue_numbers(resi: pd.Series) -> list:
    """Return the integer residue numbers found in a ``resiA`` column slice.

    Each non-null value is converted to text and its first run of digits is
    taken as the residue number. Values containing no digits are dropped.

    Taking the first digit run (rather than deleting every non-digit character)
    matters when pandas has read the column as float: "52.0" must yield 52,
    not 520. Any sign is ignored.
    """
    digits = (
        resi.dropna()
        .astype(str)
        .str.extract(r"(\d+)", expand=False)
        .dropna()
    )
    return digits.astype(int).tolist()


# Column order of the summary file; passed explicitly so the header row is
# written even when no interactions were found.
SUMMARY_COLUMNS = ["model_name", "antigen_binding_sites_gd"]

# CHAIN_TARGET may name several chains separated by commas (the same format as
# CHAIN_QUERY's default "H,L"); match against each of them.
target_chains = {
    chain.strip() for chain in str(CHAIN_TARGET or "").split(",") if chain.strip()
}

# model name -> set of target-chain residue numbers, merged across all files.
model_to_residues = defaultdict(set)

for csv_file in sorted(os.listdir(PPI_DIR)):
    if not csv_file.endswith("_interactions.csv"):
        continue
    csv_path = os.path.join(PPI_DIR, csv_file)
    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the rest.
        print(f"⚠️ 无法读取 {csv_file}: {e}")
        continue

    required_cols = {"model_name", "chainA", "resiA", "chainB"}
    if not required_cols.issubset(df.columns):
        print(f"⚠️ {csv_file} 缺少必要列，跳过")
        continue

    # Keep only rows whose chainA is one of the target chains.
    df_antigen = df[df["chainA"].astype(str).str.strip().isin(target_chains)]
    if df_antigen.empty:
        continue

    # Attribute residues to the model named on their own row, so a file that
    # holds several models does not credit everything to the first one.
    model_names = df_antigen["model_name"].astype(str).str.strip()
    for model_name, resi in df_antigen["resiA"].groupby(model_names):
        model_to_residues[model_name].update(_parse_residue_numbers(resi))

summary_data = [
    {"model_name": name, "antigen_binding_sites_gd": ",".join(map(str, sorted(sites)))}
    for name, sites in sorted(model_to_residues.items())
]

if not summary_data:
    print("⚠️ 未找到任何抗原结合位点，输出文件仅包含表头")

df_summary = pd.DataFrame(summary_data, columns=SUMMARY_COLUMNS)
df_summary.to_csv(OUTPUT_SUMMARY, index=False)

print(f"✅ 输出文件: {OUTPUT_SUMMARY}")

✅ 输出文件: /home/yuyang/lb_yaml/data/IL23/ppi_summary_gd.csv
