In [None]:
# Extract the antigen chain sequence from the reference PDB and from each AF3 model


from bioblocks.io import read_model
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
from tqdm import tqdm

# ========== CONFIG ==========
PDB_FILE = "/home/yuyang/lb/data/IL23/5njd.pdb"       # ground-truth PDB structure
AF3_CIF_DIR = "/home/yuyang/lb/data/IL23/cif"        # directory holding the AF3 CIF models
OUTPUT_DIR = "/home/yuyang/lb/data/IL23/msa_fasta"   # directory receiving the FASTA files
os.makedirs(OUTPUT_DIR, exist_ok=True)

ANTIGEN_CHAIN = "A"  # assumes a single antigen chain with this ID in every structure

# Read the reference antigen sequence once; it is paired with every AF3 model.
# NOTE(review): `.sequence` is presumably a one-letter amino-acid string
# (it is passed straight to Bio.Seq.Seq) -- confirm against bioblocks.
pdb_model = read_model(PDB_FILE)
pdb_antigen_seq = pdb_model[ANTIGEN_CHAIN].sequence

# Collect the AF3 CIF files (extension matched case-insensitively), in sorted order.
cif_files = sorted(f for f in os.listdir(AF3_CIF_DIR) if f.lower().endswith(".cif"))

for cif_file in tqdm(cif_files, desc="Processing CIFs"):
    af3_path = os.path.join(AF3_CIF_DIR, cif_file)
    try:
        af3_model = read_model(af3_path)
        af3_antigen_seq = af3_model[ANTIGEN_CHAIN].sequence
    except Exception as e:
        # Best effort: report the unreadable model and carry on with the rest.
        print(f"[READ ERROR] {cif_file}: {e}")
        continue

    # Two-record FASTA: reference sequence first, AF3 sequence second.
    # The downstream mapping step relies on this record order.
    seq_records = [
        SeqRecord(Seq(pdb_antigen_seq), id="PDB_A", description="Experimental validated antigen"),
        SeqRecord(Seq(af3_antigen_seq), id=f"AF3_A_{cif_file}", description="AF3 predicted antigen"),
    ]

    # Strip the extension with splitext so that the result does not depend on
    # the extension's case (the filter above accepts ".CIF" too) and so that a
    # ".cif" substring elsewhere in the file name is left alone.
    stem = os.path.splitext(cif_file)[0]
    output_fasta = os.path.join(OUTPUT_DIR, f"{stem}_for_msa.fasta")
    with open(output_fasta, "w") as f:
        SeqIO.write(seq_records, f, "fasta")


In [None]:
# Install MAFFT first if it is not already available on the server:
# conda install -c bioconda mafft
import os
import subprocess
from tqdm import tqdm

# ========== CONFIG ==========
FASTA_DIR = "/home/yuyang/lb/data/IL23/msa_fasta"   # FASTA files produced by the previous step
OUTPUT_DIR = "/home/yuyang/lb/data/IL23/msa_aln"    # directory receiving the MAFFT alignments
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Collect the FASTA files (extension matched case-insensitively), in sorted order.
fasta_files = sorted(f for f in os.listdir(FASTA_DIR) if f.lower().endswith(".fasta"))

for fasta_file in tqdm(fasta_files, desc="Running MAFFT"):
    input_path = os.path.join(FASTA_DIR, fasta_file)
    # splitext keeps the naming correct whatever the case of the extension.
    stem = os.path.splitext(fasta_file)[0]
    output_path = os.path.join(OUTPUT_DIR, f"{stem}.aln.fasta")

    try:
        # Run MAFFT through subprocess rather than the `!` shell magic:
        #  * check=True turns a non-zero exit status into an exception, so the
        #    except clause below can actually fire (a `!` line never raises);
        #  * the argument list bypasses the shell, so paths need no quoting;
        #  * the `with` block guarantees the output handle is closed.
        with open(output_path, "w") as aln_handle:
            subprocess.run(
                ["mafft", "--auto", input_path],
                stdout=aln_handle,
                stderr=subprocess.PIPE,  # keep MAFFT's progress log out of the notebook
                text=True,
                check=True,
            )
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing `mafft` executable and an unwritable output path.
        # Remove the empty/partial alignment so the next step does not try to parse it.
        if os.path.exists(output_path):
            os.remove(output_path)
        print(f"[MAFFT ERROR] {fasta_file}: {e}")
        mafft_log = (getattr(e, "stderr", None) or "").strip()
        if mafft_log:
            # The last line of MAFFT's stderr usually carries the actual reason.
            print(f"    {mafft_log.splitlines()[-1]}")



In [11]:
# Read each MSA and build the PDB -> AF3 residue index correspondence

import os
import pandas as pd
from Bio import AlignIO
from tqdm import tqdm

# ========== CONFIG ==========
MSA_DIR = "/home/yuyang/lb/data/IL23/msa_aln"       # directory holding the MAFFT alignments
OUTPUT_FILE = "/home/yuyang/lb/data/IL23/pdb_to_af3_mapping.csv"
msa_files = sorted(f for f in os.listdir(MSA_DIR) if f.lower().endswith(".aln.fasta"))


def _map_aligned_positions(pdb_aligned, af3_aligned):
    """Map PDB sequence positions to AF3 sequence positions for one alignment.

    Both arguments are gapped rows of the same pairwise alignment, with "-"
    marking a gap. Positions are 1-based counts of non-gap characters within
    each row, i.e. positions along the sequence as written to the FASTA file,
    not author residue numbers from the structure file.

    Returns a dict {pdb_position: af3_position} containing only the columns in
    which neither row has a gap.
    """
    position_map = {}
    pdb_idx = 0
    af3_idx = 0
    for res_pdb, res_af3 in zip(pdb_aligned, af3_aligned):
        # Advance each counter before recording, which makes the indices 1-based.
        if res_pdb != "-":
            pdb_idx += 1
        if res_af3 != "-":
            af3_idx += 1
        if res_pdb != "-" and res_af3 != "-":
            position_map[pdb_idx] = af3_idx
    return position_map


# Accumulates every mapping as {pdb_position: {column_name: af3_position}}.
all_mappings = {}

for msa_file in tqdm(msa_files, desc="Processing MSA files"):
    try:
        alignment = AlignIO.read(os.path.join(MSA_DIR, msa_file), "fasta")
    except (ValueError, OSError) as e:
        # An empty or malformed alignment (e.g. left behind by a failed MAFFT
        # run) must not abort the whole table; report it and move on.
        print(f"[MSA ERROR] {msa_file}: {e}")
        continue

    if len(alignment) < 2:
        print(f"[MSA ERROR] {msa_file}: expected 2 aligned sequences, found {len(alignment)}")
        continue

    # Row 0 is taken to be the PDB sequence and row 1 the AF3 sequence, which
    # is the order the FASTA files were written in.
    # NOTE(review): this relies on MAFFT preserving input order -- confirm
    # that no reordering option was used.
    pdb_seq_aligned = str(alignment[0].seq)
    af3_seq_aligned = str(alignment[1].seq)

    # Column name is the file name minus its final extension; computed once
    # per file instead of once per aligned residue.
    col_name = os.path.splitext(msa_file)[0]

    for pdb_idx, af3_idx in _map_aligned_positions(pdb_seq_aligned, af3_seq_aligned).items():
        all_mappings.setdefault(pdb_idx, {})[col_name] = af3_idx

# One row per PDB position, one column per AF3 model.
df = pd.DataFrame.from_dict(all_mappings, orient="index")
df.index.name = "pdb_index"
df = df.sort_index()
# Positions that are unaligned in some model are missing values, which would
# silently turn that column into floats ("12.0" in the CSV). The nullable
# integer dtype keeps indices as integers and writes missing cells as empty.
df = df.astype("Int64")
df.to_csv(OUTPUT_FILE)
print(f"✅ PDB -> AF3 mapping table saved to {OUTPUT_FILE}")




Processing MSA files: 100%|██████████| 393/393 [00:00<00:00, 1500.21it/s]


✅ PDB -> AF3 mapping table saved to /home/yuyang/lb/data/IL23/pdb_to_af3_mapping.csv
