In [15]:
import os
import tempfile

import yaml
from anarci import anarci
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from bioblocks.io import read_model, write_model
from bioblocks.transform import rename_chains

# ===================== Load YAML configuration =====================
# safe_load returns None for an empty document, so fall back to an empty
# mapping to keep the .get() lookups below working.
with open("config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f) or {}

# Optional sections: a missing or empty section falls back to the defaults.
common_params = config.get("common") or {}
rename_params = config.get("rename") or {}

scheme = common_params.get("scheme", "kabat")
similarity_threshold = common_params.get("similarity_threshold", 0.9)
ncpu = common_params.get("ncpu", 1)

# Required settings: a KeyError here means the "rename" section is incomplete.
PDB_NAME = rename_params["PDB_NAME"]
PDB_DIR = rename_params["PDB_DIR"]
CIF_DIR = rename_params["CIF_DIR"]

PDB_FILE = os.path.join(PDB_DIR, PDB_NAME)
PDB_OUTPUT_DIR = PDB_DIR

# By default use the first CIF file (in sorted name order) found in CIF_DIR.
cif_files = sorted(name for name in os.listdir(CIF_DIR) if name.lower().endswith(".cif"))
if not cif_files:
    # Fail with an explicit message instead of a bare IndexError below.
    raise FileNotFoundError(f"No .cif files found in {CIF_DIR!r}")
AF3_example_cif = os.path.join(CIF_DIR, cif_files[0])


In [16]:
import requests
from Bio.PDB import PDBParser, PDBIO, Select

def fetch_pdb_first_model(pdb_id: str, assembly_id=1, output_file=None, timeout=30):
    """
    Download a biological-assembly PDB file and keep only its first model.

    Args:
        pdb_id: PDB ID (lower-cased before use).
        assembly_id: Biological assembly number (usually 1).
        output_file: Path of the PDB file to write. Defaults to
            "<pdb_id>_pdb<assembly_id>_model1.pdb" in the working directory.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The path the first model was written to.

    Raises:
        ValueError: If the server does not answer with HTTP 200, or the
            downloaded file contains no model.
    """
    pdb_id = pdb_id.lower()
    url = f"https://files.rcsb.org/download/{pdb_id}.pdb{assembly_id}"
    if output_file is None:
        output_file = f"{pdb_id}_pdb{assembly_id}_model1.pdb"

    print(f"⬇️  Downloading biological assembly {assembly_id} for {pdb_id} ...")
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        raise ValueError(f"❌ 无法下载 {pdb_id} assembly {assembly_id} (HTTP {r.status_code})")

    # The raw download is only needed for parsing, so it lives in a temporary
    # directory that is removed as soon as the structure is in memory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        raw_pdb_path = os.path.join(tmp_dir, f"{pdb_id}_temp.pdb")
        with open(raw_pdb_path, "wb") as f:
            f.write(r.content)

        parser = PDBParser(QUIET=True)
        structure = parser.get_structure(pdb_id, raw_pdb_path)

    if len(structure) == 0:
        raise ValueError(f"No model found in downloaded file for {pdb_id} assembly {assembly_id}")

    class FirstModelSelect(Select):
        """Selection filter that accepts only the model with id 0."""

        def accept_model(self, model):
            return model.id == 0

    writer = PDBIO()
    writer.set_structure(structure)
    writer.save(output_file, select=FirstModelSelect())

    print(f"✅ Saved first model -> {output_file}")
    return output_file


In [17]:
def classify_chain(seq, chain_id=None, scheme="chothia", ncpu=1):
    """
    Classify one sequence with ANARCI as heavy chain, light chain, or other.

    Args:
        seq: Sequence string submitted to ANARCI.
        chain_id: Optional label used as the query name; "seq" is used when
            it is falsy.
        scheme: Numbering scheme passed to ANARCI.
        ncpu: Number of CPUs passed to ANARCI.

    Returns:
        "H" for chain type "H", "L" for chain type "K" or "L", "A" for any
        other reported chain type, and the string "None" (not the None
        object) when nothing is reported for the sequence.
    """
    query_name = chain_id or "seq"

    # Only the second element of the returned triple is used. It is indexed
    # below as one entry per query, each entry a list of per-domain dicts.
    _, per_query_domains, _ = anarci(
        [(query_name, seq)],
        scheme=scheme,
        ncpu=ncpu,
        output=False,
    )

    if not per_query_domains:
        return "None"

    domains = per_query_domains[0]
    if domains is None or len(domains) == 0:
        return "None"

    # The first reported domain alone decides the label.
    label_by_chain_type = {"H": "H", "K": "L", "L": "L"}
    return label_by_chain_type.get(domains[0]["chain_type"], "A")

In [18]:
def sequence_similarity(seq1, seq2):
    """
    Globally align two sequences and report their identity.

    Both inputs are stripped and upper-cased, then aligned with
    pairwise2.align.globalxx. Identity is the number of identical aligned
    columns divided by the length of the longer input sequence.

    Args:
        seq1: First sequence string.
        seq2: Second sequence string.

    Returns:
        A dict with keys:
            "identity": float as described above.
            "similarity": currently the same value as "identity".
            "alignment": the formatted text of the alignment that was scored.
        If either sequence is empty after stripping (or no alignment is
        produced), identity and similarity are 0.0 and alignment is "".
    """
    seq1, seq2 = seq1.strip().upper(), seq2.strip().upper()

    no_match = {"identity": 0.0, "similarity": 0.0, "alignment": ""}

    # An empty sequence cannot be aligned; without this guard the code below
    # would fail when indexing the first alignment or dividing by zero.
    if not seq1 or not seq2:
        return no_match

    # Only the first alignment is used, so request a single one instead of
    # enumerating every equally-scoring alternative.
    alignments = pairwise2.align.globalxx(seq1, seq2, one_alignment_only=True)
    if not alignments:
        return no_match

    best_alignment = alignments[0]
    aln_seq1, aln_seq2 = best_alignment[0], best_alignment[1]

    matches = sum(a == b for a, b in zip(aln_seq1, aln_seq2))
    identity = matches / max(len(seq1), len(seq2))

    return {
        "identity": identity,
        "similarity": identity,
        "alignment": format_alignment(*best_alignment),
    }

In [19]:
# The PDB ID is the part of PDB_NAME before its first "."; the first model of
# the downloaded assembly is written to PDB_FILE.
pdb_id = PDB_NAME.partition(".")[0]
fetch_pdb_first_model(pdb_id, output_file=PDB_FILE)

# Load the freshly written structure and the example CIF model.
model = read_model(PDB_FILE)
af3_model = read_model(AF3_example_cif)


⬇️  Downloading biological assembly 1 for 5njd ...
✅ Saved first model -> /home/yuyang/lb_yaml/data/IL23/5njd.pdb


In [20]:
# Classify every chain of the downloaded structure. Each value is the label
# returned by classify_chain: "H", "L", "A", or the string "None".
mapping_dict = {}
for chain in model.get_chains():
    seq = model[chain.id].sequence
    # Pass the configured ncpu so the setting from config.yaml takes effect,
    # and label the query with the chain ID.
    classification = classify_chain(seq, chain_id=chain.id, scheme=scheme, ncpu=ncpu)
    mapping_dict[chain.id] = classification

In [21]:
# Antigen handling: chain "A" of the AF3 model supplies the reference antigen
# sequence, and every chain classified as the string "None" is a candidate.
target_antigen_seq = af3_model["A"].sequence
antigen_chain_ids = [
    chain_id for chain_id, label in mapping_dict.items() if label == "None"
]

In [22]:
# Show the per-chain classification before the antigen candidates are filtered.
mapping_dict

{'A': 'None', 'B': 'None', 'U': 'L', 'V': 'H', 'Y': 'None'}

In [23]:
# Keep only the antigen candidates whose identity to the AF3 antigen sequence
# exceeds the threshold; all other candidates are removed from the mapping.
# NOTE(review): every matching candidate is mapped to "A" - confirm how
# rename_chains behaves if more than one chain matches.
for antigen_chain_id in antigen_chain_ids:
    potential_seq = model[antigen_chain_id].sequence
    result = sequence_similarity(target_antigen_seq, potential_seq)
    if result["identity"] > similarity_threshold:
        mapping_dict[antigen_chain_id] = "A"
    else:
        # pop() rather than del: re-running this cell must not raise KeyError
        # for chains that an earlier run already removed.
        mapping_dict.pop(antigen_chain_id, None)

In [24]:
# Show the final chain mapping after the antigen candidates were filtered.
mapping_dict

{'A': 'A', 'U': 'L', 'V': 'H'}

In [25]:
# Apply the chain mapping to the downloaded structure.
# NOTE(review): chains removed from mapping_dict are not listed here - confirm
# whether rename_chains drops them or keeps them under their original IDs.
new_model = rename_chains(model=model, rename_mapping=mapping_dict)


In [26]:
# Write the renamed structure to PDB_FILE, the same path the download was
# saved to, so the downloaded file is replaced.
write_model(new_model, PDB_FILE)