In [11]:
import os
import yaml
from bioblocks.io import read_model
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
from Bio.Align.Applications import ClustalOmegaCommandline
from Bio import AlignIO
from Bio.PDB import PDBParser
import pandas as pd

# ===================== Load YAML parameters =====================
# All paths and the antigen chain ID come from config.yaml in the current
# working directory. YAML is UTF-8 by specification, so the encoding is pinned
# instead of depending on the platform default.
with open("config.yaml", "r", encoding="utf-8") as f:
    # safe_load returns None for an empty document; fall back to an empty dict
    # so the lookups below fail with a clear KeyError rather than AttributeError.
    config = yaml.safe_load(f) or {}

# A section that is present but empty parses to None, hence the `or {}`.
common_params = config.get("common") or {}
msa_params = config.get("msa_index_map") or {}

ANTIGEN_CHAIN = common_params.get("ANTIGEN_CHAIN", "A")

# Required keys: a missing key raises KeyError naming the missing parameter.
PDB_FILE = msa_params["PDB_FILE"]
CIF_DIR = msa_params["CIF_DIR"]

OUTPUT_FASTA = msa_params["OUTPUT_FASTA"]
OUTPUT_ALN = msa_params["OUTPUT_ALN"]
OUTPUT_MAPPING = msa_params["OUTPUT_MAPPING"]

# By default, use the first CIF file (in sorted filename order) found in CIF_DIR.
cif_files = sorted(name for name in os.listdir(CIF_DIR) if name.lower().endswith(".cif"))
if not cif_files:
    raise FileNotFoundError(f"No .cif files found in CIF_DIR: {CIF_DIR!r}")
AF3_CIF_FILE = os.path.join(CIF_DIR, cif_files[0])

# Make sure the output directories exist. dirname() is "" for a bare filename,
# and os.makedirs("") raises, so only create a directory when there is one.
for _output_path in (OUTPUT_FASTA, OUTPUT_ALN, OUTPUT_MAPPING):
    _output_dir = os.path.dirname(_output_path)
    if _output_dir:
        os.makedirs(_output_dir, exist_ok=True)

# Sequence of the antigen chain in the ground-truth (experimental) PDB model.
# NOTE(review): assumes `.sequence` is a plain one-letter string accepted by Seq() — confirm.
pdb_seq = read_model(PDB_FILE)[ANTIGEN_CHAIN].sequence

# Sequence of the same chain ID in the AF3 predicted CIF model.
af3_seq = read_model(AF3_CIF_FILE)[ANTIGEN_CHAIN].sequence

# Write both sequences to one FASTA file. The record ids "PDB_A" / "AF3_A" are
# what the mapping cell later uses to find each row of the alignment.
# SeqIO.write returns the number of records written, shown as the cell output.
SeqIO.write([
    SeqRecord(Seq(pdb_seq), id="PDB_A", description="Experimental validated antigen"),
    SeqRecord(Seq(af3_seq), id="AF3_A", description="AF3 predicted antigen")
], OUTPUT_FASTA, "fasta")


2

In [12]:
# Align the two antigen sequences with Clustal Omega and load the result.
input_fasta = OUTPUT_FASTA
output_aln = OUTPUT_ALN
clustalomega_cline = ClustalOmegaCommandline(
    infile=input_fasta,
    outfile=output_aln,
    # Pin the output format so it always matches the "fasta" parser used below.
    outfmt="fasta",
    verbose=True,
    auto=True,
    # clustalo refuses to overwrite an existing output file unless forced,
    # which would make this cell fail every time the notebook is re-run.
    force=True,
)
# Calling the command-line wrapper runs clustalo and returns its captured output.
stdout, stderr = clustalomega_cline()
alignment = AlignIO.read(output_aln, "fasta")

In [13]:
# Build the residue mapping table: PDB residue number <-> MSA column <-> AF3 residue number.
parser = PDBParser(QUIET=True)
structure = parser.get_structure("pdb", PDB_FILE)
chain = structure[0][ANTIGEN_CHAIN]  # first model, antigen chain

# 1. Collect PDB residue numbers, keeping only residues whose hetero flag is
#    blank (res.id[0] == " "), i.e. skipping HETATM/water entries.
# NOTE(review): res.id[1] drops the insertion code, so residues such as 100 and
# 100A would share one key in the dicts below — confirm the input has none.
pdb_residues = [res.id[1] for res in chain if res.id[0] == " "]

# Pull the two gapped sequences out of the alignment by their FASTA record ids.
pdb_seq_msa = None
af3_seq_msa = None
for record in alignment:
    if record.id.startswith("PDB_A"):
        pdb_seq_msa = str(record.seq)
    elif record.id.startswith("AF3_A"):
        af3_seq_msa = str(record.seq)

# Fail early with a clear message instead of a TypeError inside enumerate(None).
if pdb_seq_msa is None or af3_seq_msa is None:
    raise ValueError(
        "Alignment must contain records whose ids start with 'PDB_A' and 'AF3_A'; "
        f"found ids: {[record.id for record in alignment]}"
    )

# The mapping walks the ungapped PDB sequence and pdb_residues in lockstep, so
# the two must have the same length. Otherwise the loop either runs off the end
# of pdb_residues or silently assigns residue numbers to the wrong columns.
n_pdb_aligned = len(pdb_seq_msa) - pdb_seq_msa.count("-")
if n_pdb_aligned != len(pdb_residues):
    raise ValueError(
        f"Aligned PDB sequence has {n_pdb_aligned} residues but chain "
        f"{ANTIGEN_CHAIN!r} of {PDB_FILE!r} has {len(pdb_residues)} standard residues; "
        "the sequence and the residue numbering cannot be matched one-to-one."
    )

# 2. Build the three mappings.
# Mapping 1: PDB residue <-> MSA position
pdb_to_msa = {}  # PDB residue number -> MSA index (0-based)
msa_to_pdb = {}  # MSA index (0-based) -> PDB residue number

pdb_idx = 0  # index of the next unconsumed entry in pdb_residues
for msa_pos, aa in enumerate(pdb_seq_msa):
    if aa != "-":
        pdb_res = pdb_residues[pdb_idx]
        pdb_to_msa[pdb_res] = msa_pos
        msa_to_pdb[msa_pos] = pdb_res
        pdb_idx += 1

# Mapping 2: MSA position <-> AF3 residue
msa_to_af3 = {}  # MSA index (0-based) -> AF3 residue number
af3_to_msa = {}  # AF3 residue number -> MSA index (0-based)

# AF3 residues are numbered by counting non-gap characters starting at 1.
# NOTE(review): assumes the AF3 CIF numbers this chain consecutively from 1 — confirm.
af3_res_id = 1
for msa_pos, aa in enumerate(af3_seq_msa):
    if aa != "-":
        msa_to_af3[msa_pos] = af3_res_id
        af3_to_msa[af3_res_id] = msa_pos
        af3_res_id += 1

# Mapping 3: PDB residue <-> AF3 residue, joined through the shared MSA column.
pdb_to_af3 = {}  # PDB residue number -> AF3 residue number
af3_to_pdb = {}  # AF3 residue number -> PDB residue number

for pdb_res, msa_pos in pdb_to_msa.items():
    if msa_pos in msa_to_af3:
        af3_res = msa_to_af3[msa_pos]
        pdb_to_af3[pdb_res] = af3_res
        af3_to_pdb[af3_res] = pdb_res

# Assemble one row per PDB residue; af3_residue_id is None where the PDB
# residue is aligned against a gap in the AF3 sequence.
mapping_data = []
for pdb_res, msa_pos in pdb_to_msa.items():
    af3_res = pdb_to_af3.get(pdb_res)
    mapping_data.append({
        "pdb_residue_id": pdb_res,
        "msa_index": msa_pos + 1,  # reported 1-based, which is easier to read
        "af3_residue_id": af3_res
    })

# Naming the columns keeps the header even for an empty table. The nullable
# Int64 dtype keeps AF3 ids as integers when some are missing; the default
# would turn the column into floats ("23.0") in the CSV.
df = pd.DataFrame(
    mapping_data,
    columns=["pdb_residue_id", "msa_index", "af3_residue_id"],
).astype({"af3_residue_id": "Int64"})
df.to_csv(OUTPUT_MAPPING, index=False)

# Print a summary of the three mappings and a preview of the table.
print("=== 三个映射关系 ===")
print(f"1. PDB残基 ↔ MSA位置: {len(pdb_to_msa)} 个映射")
print(f"2. MSA位置 ↔ AF3残基: {len(msa_to_af3)} 个映射")
print(f"3. PDB残基 ↔ AF3残基: {len(pdb_to_af3)} 个映射")
print("\n=== 前10个映射示例 ===")
print(df.head(10))

=== 三个映射关系 ===
1. PDB残基 ↔ MSA位置: 298 个映射
2. MSA位置 ↔ AF3残基: 328 个映射
3. PDB残基 ↔ AF3残基: 298 个映射

=== 前10个映射示例 ===
   pdb_residue_id  msa_index  af3_residue_id
0              23         23              23
1              24         24              24
2              25         25              25
3              26         26              26
4              27         27              27
5              28         28              28
6              29         29              29
7              30         30              30
8              31         31              31
9              32         32              32
