In [10]:
#根据rmsd的结果选择<=30的抗体将CIF转换为PDB文件

from bioblocks.io import read_model, write_model
import os
import pandas as pd

# 文件夹路径
cif_folder = "/home/yuyang/lb/data/IL23/cif"
pdb_folder = "/home/yuyang/lb/data/IL23/cif_pdb"
os.makedirs(pdb_folder, exist_ok=True)

# RMSD 文件
rmsd_file = "/home/yuyang/lb/data/IL23/VHVL_RMSD_results.csv"
rmsd_df = pd.read_csv(rmsd_file)

# 只保留 RMSD <= 30 的文件
valid_files = set(rmsd_df[rmsd_df["RMSD"] <= 30]["file"].tolist())

# 遍历 CIF 文件
for cif_file in os.listdir(cif_folder):
    if cif_file.endswith(".cif") and cif_file in valid_files:
        cif_path = os.path.join(cif_folder, cif_file)
        pdb_path = os.path.join(pdb_folder, cif_file.replace(".cif", ".pdb"))
        model = read_model(cif_path)
        write_model(model, pdb_path)
        print(f"✅ Converted: {cif_file}")

print("✅ CIF -> PDB conversion done for RMSD <= 30")


✅ Converted: MJ00D_94.cif
✅ Converted: MJ00D_53.cif
✅ Converted: MJ00D_317.cif
✅ Converted: MJ00D_28.cif
✅ Converted: MJ00D_226.cif
✅ Converted: MJ00D_152.cif
✅ Converted: MJ00D_111.cif
✅ Converted: MJ00D_200.cif
✅ Converted: MJ00D_137.cif
✅ Converted: MJ00D_154.cif
✅ Converted: MJ00D_131.cif
✅ Converted: MJ00D_114.cif
✅ Converted: MJ00D_133.cif
✅ Converted: MJ00D_100.cif
✅ Converted: MJ00D_252.cif
✅ Converted: MJ00D_55.cif
✅ Converted: MJ00D_316.cif
✅ Converted: MJ00D_285.cif
✅ Converted: MJ00D_279.cif
✅ Converted: MJ00D_173.cif
✅ Converted: MJ00D_389.cif
✅ Converted: MJ00D_202.cif
✅ Converted: MJ00D_147.cif
✅ Converted: MJ00D_277.cif
✅ Converted: MJ00D_167.cif
✅ Converted: MJ00D_361.cif
✅ Converted: MJ00D_264.cif
✅ Converted: MJ00D_122.cif
✅ Converted: MJ00D_371.cif
✅ Converted: MJ00D_124.cif
✅ Converted: MJ00D_295.cif
✅ Converted: MJ00D_240.cif
✅ Converted: MJ00D_350.cif
✅ Converted: MJ00D_359.cif
✅ Converted: MJ00D_40.cif
✅ Converted: MJ00D_291.cif
✅ Converted: MJ00D_210.cif
✅ Conv

In [26]:
#将AF3的pdb文件处理成双链，为了后续ppi计算

import MDAnalysis as mda
import os
import glob

def process_single_antibody_pdb(input_pdb, output_dir):
    """
    处理单个抗体PDB文件
    """
    # 获取基础文件名（不含扩展名）
    base_name = os.path.splitext(os.path.basename(input_pdb))[0]
    
    try:
        # 加载PDB文件
        u = mda.Universe(input_pdb)
        
        # 检查存在的链
        chains = sorted(set(u.atoms.chainIDs))
        print(f"处理文件: {os.path.basename(input_pdb)}")
        print(f"  存在的链: {chains}")
        
        # 检查是否包含所需的链
        if not all(chain in chains for chain in ['A', 'H', 'L']):
            print(f"  警告: 文件缺少A、H、L链中的某些链，跳过处理")
            return False
        
        # 方案1: H链重命名为B，删除L链
        selection1 = u.select_atoms("chainID A or chainID H")
        u1 = mda.Merge(selection1)
        
        # 重命名H链为B
        for atom in u1.atoms:
            if atom.chainID == 'H':
                atom.chainID = 'B'
        
        # 保存文件
        output1 = os.path.join(output_dir, f"{base_name}_H_to_B_no_L.pdb")
        u1.atoms.write(output1)
        print(f"  已保存: {os.path.basename(output1)}")
        
        # 方案2: L链重命名为B，删除H链
        selection2 = u.select_atoms("chainID A or chainID L")
        u2 = mda.Merge(selection2)
        
        # 重命名L链为B
        for atom in u2.atoms:
            if atom.chainID == 'L':
                atom.chainID = 'B'
        
        # 保存文件
        output2 = os.path.join(output_dir, f"{base_name}_L_to_B_no_H.pdb")
        u2.atoms.write(output2)
        print(f"  已保存: {os.path.basename(output2)}")
        
        return True
        
    except Exception as e:
        print(f"  错误处理文件 {input_pdb}: {str(e)}")
        return False

def batch_process_antibody_pdb(input_dir, output_dir=None):
    """
    批量处理文件夹中的所有PDB文件
    
    参数:
        input_dir: 输入文件夹路径
        output_dir: 输出文件夹路径，如果为None则创建在输入目录下的processed文件夹
    """
    if output_dir is None:
        output_dir = os.path.join(input_dir, "processed")
    
    # 创建输出目录
    os.makedirs(output_dir, exist_ok=True)
    
    # 查找所有的PDB文件
    pdb_files = glob.glob(os.path.join(input_dir, "*.pdb"))
    
    if not pdb_files:
        print(f"在目录 {input_dir} 中没有找到PDB文件")
        return
    
    print(f"找到 {len(pdb_files)} 个PDB文件")
    print(f"输出目录: {output_dir}")
    print("=" * 60)
    
    success_count = 0
    failed_count = 0
    
    # 处理每个文件
    for pdb_file in pdb_files:
        if process_single_antibody_pdb(pdb_file, output_dir):
            success_count += 1
        else:
            failed_count += 1
        print()  # 空行分隔
    
    print("=" * 60)
    print(f"处理完成!")
    print(f"成功: {success_count} 个文件")
    print(f"失败: {failed_count} 个文件")

# 使用方法
if __name__ == "__main__":
    # 设置输入文件夹路径
    input_directory = "/home/yuyang/lb/data/IL23/cif_pdb"
    
    # 设置输出文件夹路径（可选，默认为输入目录下的processed文件夹）
    output_directory = "/home/yuyang/lb/data/IL23/processed_cif_pdb"
    
    if os.path.exists(input_directory):
        batch_process_antibody_pdb(input_directory, output_directory)
    else:
        print(f"错误: 目录 {input_directory} 不存在")

找到 73 个PDB文件
输出目录: /home/yuyang/lb/data/IL23/processed_cif_pdb
处理文件: MJ00D_374.pdb
  存在的链: ['A', 'H', 'L']
  已保存: MJ00D_374_H_to_B_no_L.pdb




  已保存: MJ00D_374_L_to_B_no_H.pdb

处理文件: MJ00D_164.pdb
  存在的链: ['A', 'H', 'L']
  已保存: MJ00D_164_H_to_B_no_L.pdb
  已保存: MJ00D_164_L_to_B_no_H.pdb

处理文件: MJ00D_64.pdb
  存在的链: ['A', 'H', 'L']
  已保存: MJ00D_64_H_to_B_no_L.pdb
  已保存: MJ00D_64_L_to_B_no_H.pdb

处理文件: MJ00D_28.pdb
  存在的链: ['A', 'H', 'L']
  已保存: MJ00D_28_H_to_B_no_L.pdb
  已保存: MJ00D_28_L_to_B_no_H.pdb

处理文件: MJ00D_297.pdb
  存在的链: ['A', 'H', 'L']
  已保存: MJ00D_297_H_to_B_no_L.pdb
  已保存: MJ00D_297_L_to_B_no_H.pdb

处理文件: MJ00D_247.pdb
  存在的链: ['A', 'H', 'L']
  已保存: MJ00D_247_H_to_B_no_L.pdb
  已保存: MJ00D_247_L_to_B_no_H.pdb

处理文件: MJ00D_52.pdb
  存在的链: ['A', 'H', 'L']
  已保存: MJ00D_52_H_to_B_no_L.pdb
  已保存: MJ00D_52_L_to_B_no_H.pdb

处理文件: MJ00D_387.pdb
  存在的链: ['A', 'H', 'L']
  已保存: MJ00D_387_H_to_B_no_L.pdb
  已保存: MJ00D_387_L_to_B_no_H.pdb

处理文件: MJ00D_352.pdb
  存在的链: ['A', 'H', 'L']
  已保存: MJ00D_352_H_to_B_no_L.pdb
  已保存: MJ00D_352_L_to_B_no_H.pdb

处理文件: MJ00D_210.pdb
  存在的链: ['A', 'H', 'L']
  已保存: MJ00D_210_H_to_B_no_L.pdb
  已保存: MJ0

In [28]:
#利用计算ppi计算H/L和A的interaction

import os
from tqdm import tqdm
import subprocess

# 定义文件路径
processed_cif_pdb_folder = "/home/yuyang/lb/data/IL23/processed_cif_pdb"
ppi_csv_folder = "/home/yuyang/lb/data/IL23/ppi_csv"

# 获取已经处理的 PDB 文件
processed_files = set()
for pdb_file in os.listdir(processed_cif_pdb_folder):
    if pdb_file.endswith("_processed.pdb"):
        processed_files.add(pdb_file)

# 获取所有需要处理的 CIF 文件
cif_files = [f for f in os.listdir(processed_cif_pdb_folder) if f.endswith(".pdb")]

# 过滤已经处理过的 CIF 文件
files_to_process = [
    cif_file for cif_file in cif_files if cif_file.replace(".pdb", "_processed.pdb") not in processed_files
]

# 显示需要处理的文件数量
print(f"将处理 {len(files_to_process)} 个文件...")

# 遍历需要处理的文件并执行 PPI 分析
for cif_file in tqdm(files_to_process):
    cif_path = os.path.join(processed_cif_pdb_folder, cif_file)
    processed_pdb_path = os.path.join(processed_cif_pdb_folder, cif_file.replace(".pdb", "_processed.pdb"))
    
    # 执行 ppi.analyse 命令
    try:
        # 调用 PPI 分析命令，这里使用 subprocess 来运行外部命令
        subprocess.run([
            "ppi.analyse", 
            #processed_cif_pdb_folder,
            cif_path, 
            ppi_csv_folder, 
            "--no-pymol"
        ], check=True)
        
        print(f"✅ 已处理文件: {cif_file}")
        
    except subprocess.CalledProcessError as e:
        print(f"❌ 处理文件 {cif_file} 失败: {e}")


将处理 146 个文件...


MDAnalysis.topology.tables has been moved to MDAnalysis.guesser.tables. This import point will be removed in MDAnalysis version 3.0.0
Processing PDB files:   0%|          | 0/1 [00:00<?, ?it/s]


Found 1 PDB file(s)

Will process 1 file(s)
Running 1 PDB files with 1 threads


  0%|          | 0/146 [00:03<?, ?it/s]


KeyboardInterrupt: 

In [24]:
files_to_process


['MJ00D_332_L_to_B_no_H.pdb',
 'MJ00D_55_H_to_B_no_L.pdb',
 'MJ00D_277_H_to_B_no_L.pdb',
 'MJ00D_194_L_to_B_no_H.pdb',
 'MJ00D_28_L_to_B_no_H.pdb',
 'MJ00D_167_L_to_B_no_H.pdb',
 'MJ00D_269_L_to_B_no_H.pdb',
 'MJ00D_264_H_to_B_no_L.pdb',
 'MJ00D_194_H_to_B_no_L.pdb',
 'MJ00D_391_L_to_B_no_H.pdb',
 'MJ00D_100_H_to_B_no_L.pdb',
 'MJ00D_297_H_to_B_no_L.pdb',
 'MJ00D_78_H_to_B_no_L.pdb',
 'MJ00D_52_L_to_B_no_H.pdb',
 'MJ00D_28_H_to_B_no_L.pdb',
 'MJ00D_145_H_to_B_no_L.pdb',
 'MJ00D_124_H_to_B_no_L.pdb',
 'MJ00D_188_H_to_B_no_L.pdb',
 'MJ00D_122_L_to_B_no_H.pdb',
 'MJ00D_122_H_to_B_no_L.pdb',
 'MJ00D_142_L_to_B_no_H.pdb',
 'MJ00D_164_H_to_B_no_L.pdb',
 'MJ00D_202_L_to_B_no_H.pdb',
 'MJ00D_40_L_to_B_no_H.pdb',
 'MJ00D_374_H_to_B_no_L.pdb',
 'MJ00D_147_H_to_B_no_L.pdb',
 'MJ00D_317_L_to_B_no_H.pdb',
 'MJ00D_285_L_to_B_no_H.pdb',
 'MJ00D_247_H_to_B_no_L.pdb',
 'MJ00D_43_L_to_B_no_H.pdb',
 'MJ00D_390_L_to_B_no_H.pdb',
 'MJ00D_319_H_to_B_no_L.pdb',
 'MJ00D_134_H_to_B_no_L.pdb',
 'MJ00D_291_L_to_

In [29]:
#汇总AF3的ppi结果

import os
import pandas as pd
from collections import defaultdict

# ====== 配置路径 ======
ppi_dir = "/home/yuyang/lb/data/IL23/ppi_csv"
output_file = "/home/yuyang/lb/data/IL23/ppi_summary_af3.csv"

# ====== 存储模型对应的残基集合 ======
model_to_residues = defaultdict(set)

# ====== 遍历所有 *_interactions.csv 文件 ======
for csv_file in sorted(os.listdir(ppi_dir)):
    if not csv_file.endswith("_interactions.csv"):
        continue

    file_path = os.path.join(ppi_dir, csv_file)

    # 文件名示例：MJ00D_55_H_to_B_no_L_interactions.csv
    parts = csv_file.split("_")
    if len(parts) < 2:
        continue

    # 提取模型名（前两部分）
    model_name = parts[0] + "_" + parts[1]

    # 读取 CSV
    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        print(f"⚠️ 无法读取 {csv_file}: {e}")
        continue

    if "resiA" not in df.columns:
        print(f"⚠️ {csv_file} 缺少 resiA 列，跳过")
        continue

    # 提取抗原链残基编号
    residues = (
        df["resiA"]
        .dropna()
        .astype(str)
        .str.replace(r"\D", "", regex=True)  # 移除非数字字符
        .astype(int)
        .tolist()
    )

    model_to_residues[model_name].update(residues)

# ====== 生成汇总表 ======
summary_data = []
for model, residues in sorted(model_to_residues.items()):
    summary_data.append({
        "model_name": model,
        "antigen_binding_sites_AF3": ",".join(map(str, sorted(residues)))
    })

summary_df = pd.DataFrame(summary_data)
summary_df.to_csv(output_file, index=False)

print(f"✅ 已处理 {len(summary_df)} 个模型")
print(f"✅ 输出文件: {output_file}")



✅ 已处理 64 个模型
✅ 输出文件: /home/yuyang/lb/data/IL23/ppi_summary_af3.csv


In [30]:
#将AF3的结合位点map到gd上

import pandas as pd
import os

# ====== 配置路径 ======
af3_summary_file = "/home/yuyang/lb/data/IL23/ppi_summary_af3.csv"
mapping_file = "/home/yuyang/lb/data/IL23/pdb_to_af3_mapping.csv"
output_file = "/home/yuyang/lb/data/IL23/ppi_summary_mapping.csv"

# ====== 读取文件 ======
af3_df = pd.read_csv(af3_summary_file)
mapping_df = pd.read_csv(mapping_file)

# 将 mapping 的列名改成 MJ00D_0 这种（去掉后缀 _for_msa.aln）
mapping_df.columns = [
    col.replace("_for_msa.aln", "") if col != "pdb_index" else col
    for col in mapping_df.columns
]

# ====== 建立映射函数 ======
def map_to_groundtruth(model_name, antigen_sites_str):
    """
    输入：
      model_name: 如 'MJ00D_55'
      antigen_sites_str: 如 '37,96,98'
    输出：
      对应的 pdb_index 列表（去重、排序）
    """
    if model_name not in mapping_df.columns:
        return ""
    try:
        antigen_sites = [int(x) for x in str(antigen_sites_str).split(",") if x.strip()]
    except ValueError:
        return ""

    # 对于每个 residue index，查找 mapping_df 中的行
    mapped = []
    for res in antigen_sites:
        row = mapping_df.loc[mapping_df[model_name] == res, "pdb_index"]
        if not row.empty:
            mapped.extend(row.tolist())

    # 去重 + 排序
    mapped = sorted(set(mapped))
    return ",".join(map(str, mapped))

# ====== 应用映射 ======
af3_df["antigen_binding_sites_groundtruth"] = af3_df.apply(
    lambda row: map_to_groundtruth(row["model_name"], row["antigen_binding_sites_AF3"]),
    axis=1
)

# ====== 保存输出 ======
af3_df.to_csv(output_file, index=False)
print(f"✅ 已完成映射，输出文件：{output_file}")


✅ 已完成映射，输出文件：/home/yuyang/lb/data/IL23/ppi_summary_mapping.csv
