In [25]:
import requests
import os
import subprocess
from itertools import combinations

In [None]:
def fetch_pdb_ids_by_ec(ec, rows_per_page=100):
    pdb_ids = []
    start = 0

    while True:
        query = {
            "query": {
                "type": "terminal",
                "service": "text",
                "parameters": {
                    "attribute": "rcsb_polymer_entity.rcsb_ec_lineage.id",
                    "operator": "exact_match",
                    "value": ec
                }
            },
            "return_type": "entry",
            "request_options": {
                "paginate": {
                    "start": start,
                    "rows": rows_per_page
                }
            }
        }

        resp = requests.post(
            "https://search.rcsb.org/rcsbsearch/v2/query?json",
            json=query
        )
        resp.raise_for_status()
        data = resp.json()

        results = data.get("result_set", [])
        if not results:
            break

        pdb_ids.extend([entry["identifier"] for entry in results])
        start += rows_per_page

    return pdb_ids

def download_pdb_files(pdb_ids, ec, out_root, file_format="pdb"):
    """下载所有 PDB 文件到指定 EC 编号的文件夹下"""
    ec_folder = ec.replace(".", "_")
    out_dir = os.path.join(out_root, ec_folder)
    os.makedirs(out_dir, exist_ok=True)

    for pdb_id in pdb_ids:
        url = f"https://files.rcsb.org/download/{pdb_id.upper()}.{file_format}"
        response = requests.get(url)
        if response.status_code == 200:
            out_path = os.path.join(out_dir, f"{pdb_id}.{file_format}")
            with open(out_path, "wb") as f:
                f.write(response.content)
            print(f"Downloaded: {pdb_id}")
        else:
            print(f"Failed to download {pdb_id} (HTTP {response.status_code})")


def run_tmalign(pdb1, pdb2):
    """调用 TM-align 进行结构比对并提取 TM-score"""
    result = subprocess.run(["TMalign", pdb1, pdb2], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True)
    for line in result.stdout.splitlines():
        if line.startswith("TM-score="):
            return float(line.split('=')[1].split()[0])
    return None

def align_all_structures(pdb_dir, output_tsv="tmalign_results.tsv"):
    """遍历并比对该目录下所有结构，输出 TM-score 表格"""
    pdb_files = [f for f in os.listdir(pdb_dir) if f.endswith(".pdb")]
    pdb_paths = [os.path.join(pdb_dir, f) for f in pdb_files]

    with open(os.path.join(pdb_dir, output_tsv), "w") as f:
        f.write("pdb1\tpdb2\ttm_score\n")
        for pdb1, pdb2 in combinations(pdb_paths, 2):
            score = run_tmalign(pdb1, pdb2)
            if score is not None:
                f.write(f"{os.path.basename(pdb1)}\t{os.path.basename(pdb2)}\t{score:.4f}\n")
                print(f"{pdb1} vs {pdb2} => TM-score: {score:.4f}")

In [23]:

ec_code = "2.4.99"
save_path = "/hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fusarium_venenatum/data/ecclass"

pdb_ids = fetch_pdb_ids_by_ec(ec_code)
print(f"Found {len(pdb_ids)} PDB entries for EC {ec_code}")
print(pdb_ids)

download_pdb_files(pdb_ids, ec_code, save_path, file_format="pdb")

Found 288 PDB entries for EC 2.4.99
['1LBE', '1PSW', '1R0S', '1R12', '1R15', '1R16', '1RO7', '1RO8', '1VKY', '1WDI', '1YH3', '1YY3', '1ZVM', '2BG1', '2C5W', '2C6W', '2DRJ', '2EF1', '2EG9', '2EX0', '2EX1', '2FFF', '2GT1', '2H1F', '2H1H', '2HCT', '2I65', '2I66', '2I67', '2IHJ', '2IHK', '2IHZ', '2II6', '2IIB', '2IIQ', '2ILV', '2JCH', '2JE5', '2LGZ', '2MGV', '2O3Q', '2O3R', '2O3S', '2O3T', '2O3U', '2OLU', '2OLV', '2OQO', '2P2V', '2P56', '2PGJ', '2PGL', '2UWX', '2V2F', '2WML', '2WNF', '2WQQ', '2X61', '2X62', '2X63', '2XCI', '2XCU', '2XD1', '2XD5', '2Y2G', '2Y2H', '2Y2I', '2Y2J', '2Y2K', '2Y2L', '2Y2M', '2Y2N', '2Y2O', '2Y2P', '2Y2Q', '2YK4', '2YK5', '2YK6', '2YK7', '2ZAG', '2ZAI', '2ZC5', '2ZC6', '2ZWI', '3AAG', '3D0F', '3D3H', '3DWK', '3DZF', '3DZG', '3DZH', '3DZI', '3DZJ', '3DZK', '3F6Y', '3FWL', '3GC6', '3GH3', '3GHH', '3HZS', '3I9J', '3I9K', '3I9L', '3I9M', '3I9N', '3I9O', '3KOU', '3NB6', '3NB7', '3OFS', '3P5S', '3RAJ', '3RCE', '3ROK', '3ROM', '3ROP', '3ROQ', '3S44', '3U4H', '3U4I', '3U

In [28]:
import pandas as pd
pd.read_feather('/hpcfs/fhome/shizhenkun/codebase/enyrnx/data/brenda/brenda_reaction_uniprot_dataset.feather')

Unnamed: 0,reaction_id,equation_string,equation,substrates_smile,products_smile,uniprot_id,organism,ec,len,seq,equation_smiles
0,brnx:1,4-nitrophenyl propanoate + H2O = 4-nitrophenol...,cid:74777 = cid:980 + cid:104745,CCC(=O)Oc1ccc([N+](=O)[O-])cc1,C1=CC(=CC=C1[N+](=O)[O-])O^CCC(=O)[O-],D0EPY0,Bacillus subtilis,,300,MSNHSSSIPELSDNGIRYYQTYNESLSLWPVRCKSFYISTRFGQTH...,CCC(=O)Oc1ccc([N+](=O)[O-])cc1>>C1=CC(=CC=C1[N...
1,brnx:1,4-nitrophenyl propanoate + H2O = 4-nitrophenol...,cid:74777 = cid:980 + cid:104745,CCC(=O)Oc1ccc([N+](=O)[O-])cc1,C1=CC(=CC=C1[N+](=O)[O-])O^CCC(=O)[O-],Q5V5N6,Haloarcula marismortui (strain ATCC 43049 / DS...,,327,MSTTARPMPVTERAPESVTVQRDIPFHEVDGETLTLDLYDAAAASG...,CCC(=O)Oc1ccc([N+](=O)[O-])cc1>>C1=CC(=CC=C1[N...
2,brnx:1,4-nitrophenyl propanoate + H2O = 4-nitrophenol...,cid:74777 = cid:980 + cid:104745,CCC(=O)Oc1ccc([N+](=O)[O-])cc1,C1=CC(=CC=C1[N+](=O)[O-])O^CCC(=O)[O-],Q7M529,Sulfolobus acidocaldarius,3.1.1.1,20,PLDPTIKCLLESGFVIPIGK,CCC(=O)Oc1ccc([N+](=O)[O-])cc1>>C1=CC(=CC=C1[N...
3,brnx:1,4-nitrophenyl propanoate + H2O = 4-nitrophenol...,cid:74777 = cid:980 + cid:104745,CCC(=O)Oc1ccc([N+](=O)[O-])cc1,C1=CC(=CC=C1[N+](=O)[O-])O^CCC(=O)[O-],Q15166,Homo sapiens (Human),3.1.1.2; 3.1.1.81; 3.1.8.1,354,MGKLVALVLLGVGLSLVGEMFLAFRERVNASREVEPVEPENCHLIE...,CCC(=O)Oc1ccc([N+](=O)[O-])cc1>>C1=CC(=CC=C1[N...
4,brnx:1,4-nitrophenyl propanoate + H2O = 4-nitrophenol...,cid:74777 = cid:980 + cid:104745,CCC(=O)Oc1ccc([N+](=O)[O-])cc1,C1=CC(=CC=C1[N+](=O)[O-])O^CCC(=O)[O-],G2QH51,Myceliophthora thermophila (strain ATCC 42464 ...,3.1.1.74,231,MKFLSLLTAAGVAAALPTSPAEVSSAGEIEARQLASTRNELENGDS...,CCC(=O)Oc1ccc([N+](=O)[O-])cc1>>C1=CC(=CC=C1[N...
...,...,...,...,...,...,...,...,...,...,...,...
29774,brnx:9247,serotonin + UDP-glucuronate = UDP + serotonin ...,cid:5202 + cid:17473 = cid:6031 + cid:129627419,NCCc1c[nH]c2ccc(O)cc12^O=C(O)[C@H]1O[C@H](OP(=...,C1=CN(C(=O)NC1=O)[C@H]2[C@@H]([C@@H]([C@H](O2)...,P35503,Homo sapiens (Human),2.4.1.17,534,MATGLQVPLPWLATGLLLLLSVQPWAESGKVLVVPIDGSHWLSMRE...,NCCc1c[nH]c2ccc(O)cc12.O=C(O)[C@H]1O[C@H](OP(=...
29775,brnx:9247,serotonin + UDP-glucuronate = UDP + serotonin ...,cid:5202 + cid:17473 = cid:6031 + cid:129627419,NCCc1c[nH]c2ccc(O)cc12^O=C(O)[C@H]1O[C@H](OP(=...,C1=CN(C(=O)NC1=O)[C@H]2[C@@H]([C@@H]([C@H](O2)...,P16662,Homo sapiens (Human),2.4.1.17,529,MSVKWTSVILLIQLSFCFSSGNCGKVLVWAAEYSHWMNIKTILDEL...,NCCc1c[nH]c2ccc(O)cc12.O=C(O)[C@H]1O[C@H](OP(=...
29776,brnx:9247,serotonin + UDP-glucuronate = UDP + serotonin ...,cid:5202 + cid:17473 = cid:6031 + cid:129627419,NCCc1c[nH]c2ccc(O)cc12^O=C(O)[C@H]1O[C@H](OP(=...,C1=CN(C(=O)NC1=O)[C@H]2[C@@H]([C@@H]([C@H](O2)...,P35504,Homo sapiens (Human),2.4.1.17,534,MATGLQVPLPQLATGLLLLLSVQPWAESGKVLVVPTDGSHWLSMRE...,NCCc1c[nH]c2ccc(O)cc12.O=C(O)[C@H]1O[C@H](OP(=...
29777,brnx:9248,ATP + N-acetyl-D-muramate = ADP + N-acetyl-alp...,cid:5957 + cid:16738680 = cid:6022 + cid:1293...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,C1=NC(=C2C(=N1)N(C=N2)[C@H]3[C@@H]([C@@H]([C@H...,Q88QT3,Pseudomonas putida (strain ATCC 47054 / DSM 61...,2.7.1.221,339,MPEHDVRLQQLTVWLDEQLNDLFRDNAWGEVPAGSLTAASSDASFR...,CC(=O)N[C@H]1C(O)O[C@H](CO)[C@@H](O)[C@@H]1O[C...


In [26]:
ec_dir = "/hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fusarium_venenatum/data/ecclass/2_4_99"
align_all_structures(ec_dir)

/hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fusarium_venenatum/data/ecclass/2_4_99/4CMH.pdb vs /hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fusarium_venenatum/data/ecclass/2_4_99/2Y2Q.pdb => TM-score: 0.3159
/hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fusarium_venenatum/data/ecclass/2_4_99/4CMH.pdb vs /hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fusarium_venenatum/data/ecclass/2_4_99/1VKY.pdb => TM-score: 0.3228
/hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fusarium_venenatum/data/ecclass/2_4_99/4CMH.pdb vs /hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fusarium_venenatum/data/ecclass/2_4_99/3DZH.pdb => TM-score: 0.9725
/hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fusarium_venenatum/data/ecclass/2_4_99/4CMH.pdb vs /hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fusarium_venenatum/data/ecclass/2_4_99/8P1U.pdb => TM-score: 0.1132
/hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fusarium_venenatum/data/ecclass/2_4_99/4CMH.pdb vs /hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fus