In [2]:
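# Prepare the ground-truth (gd) PDB file so that ppi.analyse can process it:
# keep antigen chain A plus one antibody chain (H or L) relabelled as chain B.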

import MDAnalysis as mda
import os

def _write_relabelled_pair(universe, antigen_chain, antibody_chain, output_path, new_chain="B"):
    """Write the antigen chain plus one antibody chain, renaming the antibody chain.

    Args:
        universe: MDAnalysis Universe holding the full structure.
        antigen_chain: chain ID of the antigen, written unchanged.
        antibody_chain: chain ID of the antibody chain to keep.
        output_path: destination PDB path.
        new_chain: chain ID given to the antibody chain in the output.
    """
    pair = universe.select_atoms(
        f"chainID {antigen_chain} or chainID {antibody_chain}"
    )
    # Merge builds a separate Universe from the selection, so the relabelling
    # below does not touch `universe`, which is reused for the other chain.
    pair_universe = mda.Merge(pair)
    # Relabel the whole chain with one group-level assignment instead of
    # looping over every Atom object in Python.
    pair_universe.select_atoms(f"chainID {antibody_chain}").chainIDs = new_chain
    pair_universe.atoms.write(output_path)


def process_groundtruth_single(input_pdb, output_dir):
    """
    Process a single ground-truth PDB file.

    Only chains A (antigen), H and L (antibody) are considered; every other
    chain is discarded. For each antibody chain that is present, one file is
    written to ``output_dir``:
        - ``<name>_H_to_B_no_L.pdb``: chain A plus chain H renamed to B (L removed);
        - ``<name>_L_to_B_no_H.pdb``: chain A plus chain L renamed to B (H removed).

    Processing is best-effort: any error is reported with ``print`` and not
    re-raised. Nothing is written when the antigen chain A is missing, because
    a file containing only the relabelled antibody chain has no A/B interface
    to analyse.

    Args:
        input_pdb: path of the PDB file to process.
        output_dir: directory for the generated files (created if missing).

    Returns:
        None.
    """
    base_name = os.path.splitext(os.path.basename(input_pdb))[0]
    os.makedirs(output_dir, exist_ok=True)

    try:
        # Load the PDB file
        u = mda.Universe(input_pdb)
        all_chains = sorted(set(u.atoms.chainIDs))
        print(f"处理文件: {os.path.basename(input_pdb)}")
        print(f"  原始链: {all_chains}")

        # Keep only A/H/L (already in sorted order)
        keep_chains = [c for c in ['A', 'H', 'L'] if c in all_chains]
        if not keep_chains:
            print("  ⚠️ 没有检测到 A/H/L 链，停止处理。")
            return
        print(f"  保留链: {keep_chains}")

        # Without the antigen chain the outputs would contain chain B only.
        if 'A' not in keep_chains:
            print("  ⚠️ 没有检测到抗原链 A，无法计算相互作用，停止处理。")
            return

        if 'H' not in keep_chains and 'L' not in keep_chains:
            print("  ⚠️ 没有检测到 H/L 链，未生成任何文件。")
            return

        # (kept antibody chain, dropped antibody chain) for the two outputs
        for antibody_chain, dropped_chain in (('H', 'L'), ('L', 'H')):
            if antibody_chain not in keep_chains:
                continue
            output_path = os.path.join(
                output_dir,
                f"{base_name}_{antibody_chain}_to_B_no_{dropped_chain}.pdb",
            )
            _write_relabelled_pair(u, 'A', antibody_chain, output_path)
            print(f"  ✅ 保存文件: {output_path}")

    except Exception as e:
        print(f"❌ 处理 {input_pdb} 时出错: {e}")


if __name__ == "__main__":
    # Ground-truth complex 5njd: where the processed files go, and the source PDB.
    output_dir = "/home/yuyang/lb/data/IL23/processed_groundtruth"
    input_pdb = "/home/yuyang/lb/data/IL23/5njd.pdb"

    process_groundtruth_single(input_pdb=input_pdb, output_dir=output_dir)


处理文件: 5njd.pdb
  原始链: ['A', 'B', 'H', 'L', 'Y']
  保留链: ['A', 'H', 'L']
  ✅ 保存文件: /home/yuyang/lb/data/IL23/processed_groundtruth/5njd_H_to_B_no_L.pdb




  ✅ 保存文件: /home/yuyang/lb/data/IL23/processed_groundtruth/5njd_L_to_B_no_H.pdb


In [3]:
# Run the ppi.analyse command-line tool on the processed ground-truth PDB files
# to compute the interactions between antigen chain A and the antibody H/L
# chain (relabelled as chain B by the preparation step above).
# First argument: directory of input PDB files; second: directory for the outputs.
# NOTE(review): --no-pymol presumably disables PyMOL output — confirm in the tool's help.

!ppi.analyse /home/yuyang/lb/data/IL23/processed_groundtruth /home/yuyang/lb/data/IL23/ppi_csv_gd --no-pymol

MDAnalysis.topology.tables has been moved to MDAnalysis.guesser.tables. This import point will be removed in MDAnalysis version 3.0.0

Found 2 PDB file(s)

Will process 2 file(s)
Running 2 PDB files with 1 threads
Processing PDB files: 100%|███████████████████████| 2/2 [00:10<00:00,  5.11s/it]

Outputs saved to: /home/yuyang/lb/data/IL23/ppi_csv_gd


In [5]:
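# Summarise the antigen (chain A) binding-site residues of the ground-truth (gd)
# complex from the ppi.analyse CSV output.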

import os
import pandas as pd
from collections import defaultdict

def extract_residue_numbers(resi_column):
    """Convert a ``resiA`` column into a list of integer residue numbers.

    Missing values are dropped. Cells that are already numeric (ints, floats
    such as ``123.0``, numeric strings such as ``"-5"``) are parsed as numbers,
    so a decimal point is never glued into the digits (``123.0`` must not
    become ``1230``) and a leading minus sign is kept. Any other cell falls
    back to the digits it contains; cells without any digit are skipped.

    Args:
        resi_column: pandas Series read from the ``resiA`` column.

    Returns:
        list[int]: one residue number per usable cell, in column order.
    """
    values = resi_column.dropna()
    numeric = pd.to_numeric(values, errors="coerce")
    # Fallback for non-numeric labels: strip every non-digit character.
    digits_only = values.astype(str).str.replace(r"\D", "", regex=True)
    fallback = pd.to_numeric(digits_only, errors="coerce")  # "" -> NaN
    return numeric.fillna(fallback).dropna().astype(int).tolist()


def summarize_binding_sites(ppi_dir, model_name, output_file,
                            suffixes=("H_to_B_no_L", "L_to_B_no_H")):
    """Collect the antigen (chain A) binding-site residues of one model.

    Reads ``<model_name>_<suffix>_interactions.csv`` from ``ppi_dir`` for each
    suffix, takes the union of the residue numbers found in the ``resiA``
    column, and writes a one-row summary CSV to ``output_file``. Files that are
    missing, unreadable, or lack a ``resiA`` column are reported and skipped.

    Args:
        ppi_dir: directory containing the interaction CSV files.
        model_name: model identifier used as the file-name prefix.
        output_file: path of the summary CSV to write.
        suffixes: file-name suffixes to look for.

    Returns:
        tuple: (set of residue numbers, one-row summary DataFrame).
    """
    binding_residues = set()

    # ====== Visit every file that follows the naming scheme ======
    for suffix in suffixes:
        csv_path = os.path.join(ppi_dir, f"{model_name}_{suffix}_interactions.csv")
        if not os.path.exists(csv_path):
            print(f"⚠️ 文件不存在: {csv_path}")
            continue

        try:
            df = pd.read_csv(csv_path)
        except Exception as e:
            print(f"⚠️ 无法读取 {csv_path}: {e}")
            continue

        if "resiA" not in df.columns:
            print(f"⚠️ {csv_path} 缺少 resiA 列，跳过")
            continue

        # Antigen residue numbers (chain A)
        residues = extract_residue_numbers(df["resiA"])

        print(f"✅ 从 {os.path.basename(csv_path)} 提取到 {len(residues)} 个残基位点")
        binding_residues.update(residues)

    # ====== Build the summary table ======
    summary_df = pd.DataFrame([{
        "model_name": model_name,
        "antigen_binding_sites_gd": ",".join(map(str, sorted(binding_residues)))
    }])

    # Save the result
    summary_df.to_csv(output_file, index=False)

    print("=" * 60)
    print(f"✅ 已汇总 groundtruth 模型: {model_name}")
    print(f"✅ 总结合位点数: {len(binding_residues)}")
    print(f"✅ 输出文件: {output_file}")

    return binding_residues, summary_df


if __name__ == "__main__":
    # ====== Paths ======
    ppi_dir = "/home/yuyang/lb/data/IL23/ppi_csv_gd"
    output_file = "/home/yuyang/lb/data/IL23/ppi_summary_gd.csv"

    # ====== Single ground-truth model ======
    model_name = "5njd"

    # Kept as notebook globals so later cells can still use them.
    binding_residues, summary_df = summarize_binding_sites(ppi_dir, model_name, output_file)


✅ 从 5njd_H_to_B_no_L_interactions.csv 提取到 28 个残基位点
✅ 从 5njd_L_to_B_no_H_interactions.csv 提取到 18 个残基位点
✅ 已汇总 groundtruth 模型: 5njd
✅ 总结合位点数: 15
✅ 输出文件: /home/yuyang/lb/data/IL23/ppi_summary_gd.csv
