##  使用Brenda EC Structure 对镰刀菌进行评价  

> 2025-04-02  
> zhenkun.shi@tib.cas.cn 

In [194]:

import sys,os
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../../')
from config import conf as cfg
import pandas as pd
import subprocess
import shutil
from tools import filetool 
from itertools import combinations
from tqdm import tqdm
from Bio.PDB import PDBList
import requests
from pandarallel import pandarallel 
pandarallel.initialize(progress_bar=False)
%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 192 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [196]:


def get_3digit_ec(ec_str):
    if ec_str == '-':
        return '-'
    else:
        ec_array = ec_str.split(cfg.SPLITER)
        ec_array = [x.strip() for x in ec_array]
        for idx in range(len(ec_array)):
            if '.' in ec_array[idx]:
                ec_array[idx] = '.'.join(ec_array[idx].split('.')[:-1])
            else:
                ec_array[idx] = ec_array[idx]
        
        res = cfg.SPLITER.join(set(ec_array))
        return res

def cp_pdb_files(src, target):
    # 如果是完整目标路径（包含文件名）
    if os.path.exists(src):
        target_dir = os.path.dirname(target)
        os.makedirs(target_dir, exist_ok=True)
        shutil.copyfile(src, target)
    else:
        print(f"文件不存在，跳过：{src}")

def run_tmalign(pdb1, pdb2):
    """调用 TM-align 进行结构比对并提取 TM-score（返回6位小数）"""
    result = subprocess.run(["TMalign", pdb1, pdb2], 
                           stdout=subprocess.PIPE, 
                           stderr=subprocess.DEVNULL, 
                           text=True)
    for line in result.stdout.splitlines():
        if line.startswith("TM-score="):
            score = float(line.split('=')[1].split()[0])
            return round(score, 6)  # 返回6位小数
    return None

def run_rmsd(pdb1: str, pdb2: str) -> float:
    """通过 PyMOL 计算 RMSD（返回6位小数）"""
    cmd = f"""
    pymol -c -q -d '
    load {pdb1}, obj1;
    load {pdb2}, obj2;
    align obj1 and name CA, obj2 and name CA;
    quit'
    """
    try:
        result = subprocess.run(
            cmd,
            shell=True,
            executable="/bin/bash",
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=30
        )
        for line in result.stdout.splitlines():
            if "RMSD =" in line:
                rmsd = float(line.split("=")[1].strip().split()[0])
                return round(rmsd, 6)  # 返回6位小数
    except Exception as e:
        print(f"PyMOL error ({os.path.basename(pdb1)} vs {os.path.basename(pdb2)}): {str(e)}")
    return None

def align_all_structures(pdb_dir, output_tsv="tmalign_results.tsv"):
    """遍历并比对该目录下所有结构，输出 TM-score 表格"""
    pdb_files = [f for f in os.listdir(pdb_dir) if f.endswith(".pdb")]
    pdb_paths = [os.path.join(pdb_dir, f) for f in pdb_files]
    
    pdb_pairs = pd.DataFrame(combinations(pdb_paths, 2), columns=['pdb1', 'pdb2'])
    pdb_pairs['tm_score'] = pdb_pairs.parallel_apply(lambda x: run_tmalign(x['pdb1'], x['pdb2']), axis=1)
    pdb_pairs['rmsd'] = pdb_pairs.parallel_apply(lambda x: run_rmsd(x['pdb1'], x['pdb2']), axis=1)
    pdb_pairs['pdb1'] = pdb_pairs.pdb1.apply(lambda x:x.split('/')[-1].replace('.pdb', '').replace('-F1-model_v4', ''))            
    pdb_pairs['pdb2'] = pdb_pairs.pdb2.apply(lambda x:x.split('/')[-1].replace('.pdb', '').replace('-F1-model_v4', ''))   
    pdb_pairs = pdb_pairs.dropna().sort_values(["tm_score", "rmsd"], ascending=[False, True]).reset_index(drop=True)
         
    print(f'save to:{output_tsv}')
    return pdb_pairs
                
                
def prep(case_ec, data_brenda, case_clean_wrong):
    
    level = len(case_ec.split("."))
    
    source_base_dir ='/hpcfs/fpublic/database/alphafold/predicted_pdbs/'
    target_base_dir =f'/hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fusarium_venenatum/middle/ec_{case_ec.replace(".", "_")}/'
    esmpdb_dir = '/hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fusarium_venenatum/data/esmpdb/'
    
    if level ==4:
        case_clean_wrong = case_clean_wrong[case_clean_wrong.clean==case_ec]
        case_data = data_brenda[data_brenda.ec==case_ec].copy().reset_index(drop=True)
    elif level == 3:
        case_clean_wrong =case_clean_wrong[case_clean_wrong.clean_3digit==case_ec]
        case_data = data_brenda[data_brenda.ec_3digit==case_ec].copy().reset_index(drop=True)
    
    # 复制brenda EC 对应蛋白的PDB
    case_data.uniprot_id.apply(lambda x: filetool.cp_pdb_file(src=f'{source_base_dir}AF-{x}-F1-model_v4.pdb.gz', target=target_base_dir))
    os.system(f"gzip -d {target_base_dir}*.gz")
    
    # 复制镰刀菌pdb
    case_clean_wrong.input_id.apply(lambda x: filetool.cp_pdb_file(src=f'{esmpdb_dir}{x}.pdb', target=target_base_dir))
    
        
    # 结构比对  
    align_all_structures(pdb_dir=target_base_dir, output_tsv=f'ec_{case_ec.replace(".", "_")}_tmalign_results.tsv')

In [125]:
target_base_dir =f'/hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fusarium_venenatum/middle/ec_{case_ec.replace(".", "_")}/'
pdb_files = [f for f in os.listdir(target_base_dir) if f.endswith(".pdb")]
pdb_paths = [os.path.join(target_base_dir, f) for f in pdb_files]
pdb_pairs = pd.DataFrame(combinations(pdb_paths, 2), columns=['pdb1', 'pdb2'])


In [185]:
data_expasy = pd.read_feather('/hpcfs/fhome/shizhenkun/codebase/enyrnx/data/expasy/ec_expasy.feather')
data_expasy = data_expasy[data_expasy.ref_swissprot !=''].reset_index(drop=True)
data_expasy['ref_swissprot'] = data_expasy.ref_swissprot.apply(lambda x: (cfg.SPLITER).join([item.split(',')[0].strip() for item in x.split(';')]))
data_expasy

Unnamed: 0,ec,name,alternative_name,reaction,comments,ref_swissprot
0,1.1.1.1,alcohol dehydrogenase.,aldehyde reductase.,(1) a primary alcohol + NAD(+) = an aldehyde +...,-!- Acts on primary or secondary alcohols or h...,P07327;P28469;Q5RBP7;P25405;P25406;P00327;P003...
1,1.1.1.2,alcohol dehydrogenase (NADP(+)).,aldehyde reductase (NADPH).,a primary alcohol + NADP(+) = an aldehyde + NA...,-!- Some members of this group oxidize only pr...,Q6AZW2;Q568L5;Q24857;Q04894;P25377;O57380;P747...
2,1.1.1.3,homoserine dehydrogenase.,,(1) L-homoserine + NAD(+) = L-aspartate 4-semi...,-!- The enzyme from Saccharomyces cerevisiae a...,P00561;P27725;P00562;Q9SA18;P49079;O81852;P490...
3,1.1.1.4,"(R,R)-butanediol dehydrogenase.","(R)-2,3-butanediol dehydrogenase.(R)-diacetyl ...","(R,R)-butane-2,3-diol + NAD(+) = (R)-acetoin +...",-!- Also converts diacetyl into acetoin with N...,P14940;Q0KDL6;P39714;O34788;Q5FA46;Q00796;
4,1.1.1.6,glycerol dehydrogenase.,NAD-linked glycerol dehydrogenase.,glycerol + NAD(+) = dihydroxyacetone + NADH + ...,"-!- Also acts on 1,2-propanediol.",A4IP64;O13702;P45511;P0A9S6;P0A9S5;P32816;P501...
...,...,...,...,...,...,...
5650,7.6.2.12,ABC-type capsular-polysaccharide transporter.,capsular-polysaccharide-transporting ATPase.,ATP + H2O + capsular polysaccharide-[capsular ...,-!- ATP-binding cassette (ABC) type transporte...,P57013;P32016;
5651,7.6.2.13,ABC-type autoinducer-2 transporter.,autoinducer-2 ABC transporter.autoinducer-2 tr...,"ATP + H2O + (2R,4S)-2-methyl-2,3,3,4-tetrahydr...",-!- ATP-binding cassette (ABC) type transporte...,Q8XAY7;B1XEA1;A8A066;B1IRU7;P77257;B1LFA2;A4WE...
5652,7.6.2.14,ABC-type aliphatic sulfonate transporter.,aliphatic sulfonate ABC transporter.aliphatic ...,ATP + H2O + aliphatic sulfonate-[sulfonate-bin...,-!- ATP-binding cassette (ABC) type transporte...,Q8U8D6;A0K739;Q39GW5;Q1BWL4;Q0RT43;Q5Z0P5;A1B9...
5653,7.6.2.15,ABC-type thiamine transporter.,thiamine ABC transporter.thiamine transporting...,thiamine(out) + ATP + H2O = thiamine(in) + ADP...,-!- ATP-binding cassette (ABC) type transporte...,Q8UBY6;Q5E882;Q2YLW6;Q57BC2;Q8YJ04;Q8FYU9;Q3IY...


In [None]:
# 转换为DataFrame
def json_to_dataframe(json_data):
    # 提取UniProt ID
    uniprot_id = list(json_data.keys())[0]
    
    # 为每个条目添加UniProt ID
    for entry in json_data[uniprot_id]:
        entry['uniprot_id'] = uniprot_id
    
    # 创建DataFrame
    df = pd.DataFrame(json_data[uniprot_id])
    
    # 重新排列列顺序，将uniprot_id放在前面
    cols = ['uniprot_id'] + [col for col in df.columns if col != 'uniprot_id']
    df = df[cols]
    
    return df


# 假设df是您提供的DataFrame
def select_best_pdb(df):
    # 规则1+2：先按分辨率升序，再按实验方法排序（X-ray优先）
    df_sorted = df.sort_values(
        by=['resolution', 'experimental_method'],
        ascending=[True, False]  # resolution越小越好，method按字母倒序X-ray优先
    )
    
    # 规则3：如果分辨率相同，选择链ID字母序靠前的（A链优先）
    df_sorted = df_sorted.sort_values('chain_id', ascending=True)
    
    # 规则4（可选）：如果需要特定物种，可以添加筛选
    # df_sorted = df_sorted[df_sorted['tax_id'] == 特定物种ID]
    
    # 返回第一个（最优）条目
    best_row = df_sorted.iloc[0]
    return best_row['pdb_id'], best_row['chain_id']


def download_pdb(pdb_id, save_path=None):
    """下载PDB文件"""
    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    try:
        response = requests.get(url)
        response.raise_for_status()
        
        if save_path:
            with open(save_path, 'w') as f:
                f.write(response.text)
            print(f"PDB文件已保存到: {save_path}")
        return response.text
    except Exception as e:
        print(f"下载失败: {str(e)}")
        return None
    

def parse_pdb_info(pdb_json):
    """解析PDB JSON数据并提取关键信息"""
    # 主要信息提取
    main_info = {
        "pdb_id": pdb_json.get("entry", {}).get("id", ""),
        "resolution": pdb_json.get("rcsb_entry_info", {}).get("resolution_combined", [None])[0],
        "method": pdb_json.get("exptl", [{}])[0].get("method", "").title(),
        "journal": f"{pdb_json.get('citation', [{}])[0].get('journal_abbrev', '')} "
                  f"({pdb_json.get('citation', [{}])[0].get('year', '')})",
        "ref_doi": pdb_json.get("citation", [{}])[0].get("pdbx_database_id_doi", "")
    }

    # 配体信息提取
    ligands = []
    if "rcsb_binding_affinity" in pdb_json:
        ligands = [{
            "chemical_id": lig["chemical_id"],
            "chemical_name": lig.get("chemical_name", ""),
            "formula": lig.get("formula", "")
        } for lig in pdb_json["rcsb_binding_affinity"]]

    return main_info, ligands

def get_best_pdb(uniprot_id: str):
    """
    修正版：通过 UniProt ID 获取最优 PDB 结构
    返回: {
        "pdb_id": str,
        "resolution": float,
        "method": str,
        "ligands": list
    }
    """
    # 步骤1：通过 PDBe API 获取映射数据
    url = f"https://www.ebi.ac.uk/pdbe/api/mappings/best_structures/{uniprot_id}"
    response = requests.get(url).json()

    if not response:
        raise ValueError(f"No PDB found for UniProt ID: {uniprot_id}")
    
    response = json_to_dataframe(response)
    
    # 执行筛选
    best_pdb, best_chain = select_best_pdb(response)
    print(f"Best PDB ID: {best_pdb}")
    

    # 步骤4：获取每个 PDB 的元数据

    try:
        pdb_api = f"https://data.rcsb.org/rest/v1/core/entry/{best_pdb}"
        pdb_info = requests.get(pdb_api).json()
        main_info, ligands = parse_pdb_info(pdb_info)
        print(main_info, ligands)
        
    except Exception as e:
        print(f"Error processing PDB RCSB_{uniprot_id}_{best_pdb}: {str(e)}")
       
    download_pdb(best_pdb, f"RCSB_{uniprot_id}_{best_pdb}.pdb")


    
    # return best_pdb

In [275]:
best_structure = get_best_pdb("X5F427")  # 凝血酶原的 UniProt ID
# print(f"最优 PDB 结构: {best_structure}")

Best PDB ID: 7qtl
{'pdb_id': '7QTL', 'resolution': 2.48, 'method': 'Electron Microscopy', 'journal': 'Cell Rep (2023)', 'ref_doi': '10.1016/j.celrep.2022.111901'} []
PDB文件已保存到: RCSB_X5F427_7qtl.pdb


In [166]:
align_all_structures(pdb_dir=target_base_dir, output_tsv="tmalign_results.tsv")

save to:tmalign_results.tsv


Unnamed: 0,pdb1,pdb2,tm_score,rmsd
0,AF-P15144,AF-P15684,0.98496,0.249
1,AF-P04825,AF-P15144,0.83402,2.977
2,AF-P04825,AF-P15684,0.83226,8.905
3,XP_065465146.1,XP_025589480.1,0.42133,0.628
4,XP_065465146.1,AF-P04825,0.41582,7.267
5,XP_065465146.1,AF-P15144,0.41186,20.83
6,XP_065465146.1,AF-P15684,0.406,18.757
7,XP_025592412.1,AF-P04825,0.34454,20.511
8,XP_025592412.1,AF-P15684,0.33226,19.741
9,XP_025592412.1,AF-P15144,0.33113,20.658


In [31]:
data_brenda = pd.read_feather('/hpcfs/fhome/shizhenkun/codebase/enyrnx/data/brenda/brenda_reaction_uniprot_dataset.feather')[['uniprot_id', 'ec', 'seq', 'len', 'equation_string' ]]
data_brenda= data_brenda[~data_brenda.ec.isnull()].reset_index(drop=True)
data_brenda['ec_3digit']=data_brenda.ec.apply(lambda x: get_3digit_ec(x))
data_brenda.head(3)

Unnamed: 0,uniprot_id,ec,seq,len,equation_string,ec_3digit
0,Q7M529,3.1.1.1,PLDPTIKCLLESGFVIPIGK,20,4-nitrophenyl propanoate + H2O = 4-nitrophenol...,3.1.1
1,Q15166,3.1.1.2; 3.1.1.81; 3.1.8.1,MGKLVALVLLGVGLSLVGEMFLAFRERVNASREVEPVEPENCHLIE...,354,4-nitrophenyl propanoate + H2O = 4-nitrophenol...,3.1.8;3.1.1
2,G2QH51,3.1.1.74,MKFLSLLTAAGVAAALPTSPAEVSSAGEIEARQLASTRNELENGDS...,231,4-nitrophenyl propanoate + H2O = 4-nitrophenol...,3.1.1


In [103]:
case_clean_wrong= pd.read_feather('middle/case_clean_wrong.feather')
case_clean_wrong.head(3)

Unnamed: 0,input_id,RXNRECer,RXNRECer2EC,clean,ECRECer,MSA_RXN,MSA_EC,deepec,catfam,seq_len,...,clean_3digit,ECRECer_3digit,MSA_EC_3digit,deepec_3digit,catfam_3digit,rxnrecer_clean_3digit,rxnrecer_ecrecer_3digit,rxnrecer_msa_3digit,rxnrecer_deepec_3digit,rxnrecer_catfam_3digit
0,XP_025580784.2,-,-,2.7.7.n1,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,254,...,2.7.7,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
1,XP_025580788.2,-,-,2.3.1.48,-,-,-,NO-PREDICTION,-,96,...,2.3.1,-,-,NO-PREDICTION,-,False,True,True,False,True
2,XP_025580789.2,-,-,3.1.1.42,-,-,-,NO-PREDICTION,-,460,...,3.1.1,-,-,NO-PREDICTION,-,False,True,True,False,True


In [107]:
ec_code_1 = "2.4.99"
prep(case_ec=ec_code_1, data_brenda=data_brenda, case_clean_wrong=case_clean_wrong)

警告: 源文件不存在，跳过: /hpcfs/fpublic/database/alphafold/predicted_pdbs//AF-A0R3B1-F1-model_v4.pdb.gz
save to:ec_2_4_99_tmalign_results.tsv


0it [00:00, ?it/s]

120it [00:37,  3.24it/s]


In [110]:
ec_code_2='3.4.11'
prep(case_ec=ec_code_2, data_brenda=data_brenda, case_clean_wrong=case_clean_wrong)

警告: 源文件不存在，跳过: /hpcfs/fpublic/database/alphafold/predicted_pdbs/AF-F0HXE4-F1-model_v4.pdb.gz
警告: 源文件不存在，跳过: /hpcfs/fpublic/database/alphafold/predicted_pdbs/AF-Q96VT2-F1-model_v4.pdb.gz
警告: 源文件不存在，跳过: /hpcfs/fpublic/database/alphafold/predicted_pdbs/AF-B2IQ22-F1-model_v4.pdb.gz
警告: 源文件不存在，跳过: /hpcfs/fpublic/database/alphafold/predicted_pdbs/AF-F9W8I6-F1-model_v4.pdb.gz
警告: 源文件不存在，跳过: /hpcfs/fpublic/database/alphafold/predicted_pdbs/AF-Q4A9M4-F1-model_v4.pdb.gz
警告: 源文件不存在，跳过: /hpcfs/fpublic/database/alphafold/predicted_pdbs/AF-Q95V76-F1-model_v4.pdb.gz
警告: 源文件不存在，跳过: /hpcfs/fpublic/database/alphafold/predicted_pdbs/AF-G0UZY5-F1-model_v4.pdb.gz
警告: 源文件不存在，跳过: /hpcfs/fpublic/database/alphafold/predicted_pdbs/AF-Q7KF27-F1-model_v4.pdb.gz
警告: 源文件不存在，跳过: /hpcfs/fpublic/database/alphafold/predicted_pdbs/AF-Q95V75-F1-model_v4.pdb.gz
警告: 源文件不存在，跳过: /hpcfs/fpublic/database/alphafold/predicted_pdbs/AF-Q96VT2-F1-model_v4.pdb.gz
警告: 源文件不存在，跳过: /hpcfs/fpublic/database/alphafold/predicted_pdbs/AF-Q8I

595it [06:04,  1.63it/s]


In [None]:
case_ec_3='3.4.11.2'
prep(case_ec=case_ec_3, data_brenda=data_brenda, case_clean_wrong=case_clean_wrong)

警告: 源文件不存在，跳过: /hpcfs/fpublic/database/alphafold/predicted_pdbs//AF-Q8IEK1-F1-model_v4.pdb.gz
警告: 源文件不存在，跳过: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fusarium_venenatum/data/esmpdb/XP_025581578.2.pdb
警告: 源文件不存在，跳过: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fusarium_venenatum/data/esmpdb/XP_025582496.1.pdb
警告: 源文件不存在，跳过: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fusarium_venenatum/data/esmpdb/XP_025582990.1.pdb
警告: 源文件不存在，跳过: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fusarium_venenatum/data/esmpdb/XP_025593565.1.pdb
save to:ec_3_4_11_2_tmalign_results.tsv


0it [00:00, ?it/s]

15it [00:23,  1.56s/it]


In [115]:
prep(case_ec='5.4.99.5', data_brenda=data_brenda, case_clean_wrong=case_clean_wrong)

警告: 源文件不存在，跳过: /hpcfs/fpublic/database/alphafold/predicted_pdbs/AF-B2JYH9-F1-model_v4.pdb.gz
警告: 源文件不存在，跳过: /hpcfs/fpublic/database/alphafold/predicted_pdbs/AF-A0A2K1JMA3-F1-model_v4.pdb.gz
警告: 源文件不存在，跳过: /hpcfs/fpublic/database/alphafold/predicted_pdbs/AF-Q2SY64-F1-model_v4.pdb.gz
警告: 源文件不存在，跳过: /hpcfs/fpublic/database/alphafold/predicted_pdbs/AF-A9S498-F1-model_v4.pdb.gz
警告: 源文件不存在，跳过: /hpcfs/fpublic/database/alphafold/predicted_pdbs/AF-D8R1Y1-F1-model_v4.pdb.gz
警告: 源文件不存在，跳过: /hpcfs/fpublic/database/alphafold/predicted_pdbs/AF-W1PFX5-F1-model_v4.pdb.gz
警告: 源文件不存在，跳过: /hpcfs/fpublic/database/alphafold/predicted_pdbs/AF-U5D896-F1-model_v4.pdb.gz
警告: 源文件不存在，跳过: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fusarium_venenatum/data/esmpdb/XP_025584056.1.pdb
警告: 源文件不存在，跳过: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fusarium_venenatum/data/esmpdb/XP_025586352.2.pdb
save to:ec_5_4_99_5_tmalign_results.tsv


1225it [01:21, 15.01it/s]


In [114]:
case_clean_wrong[case_clean_wrong.clean=='5.4.99.5']

Unnamed: 0,input_id,RXNRECer,RXNRECer2EC,clean,ECRECer,MSA_RXN,MSA_EC,deepec,catfam,seq_len,...,clean_3digit,ECRECer_3digit,MSA_EC_3digit,deepec_3digit,catfam_3digit,rxnrecer_clean_3digit,rxnrecer_ecrecer_3digit,rxnrecer_msa_3digit,rxnrecer_deepec_3digit,rxnrecer_catfam_3digit
75,XP_025580970.2,-,-,5.4.99.5,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,167,...,5.4.99,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
151,XP_025581141.1,-,-,5.4.99.5,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,120,...,5.4.99,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
165,XP_025581176.1,-,-,5.4.99.5,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,489,...,5.4.99,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
534,XP_025582040.1,-,-,5.4.99.5,-,-,-,NO-PREDICTION,-,163,...,5.4.99,-,-,NO-PREDICTION,-,False,True,True,False,True
587,XP_025582179.1,-,-,5.4.99.5,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,192,...,5.4.99,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
614,XP_025582252.1,-,-,5.4.99.5,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,146,...,5.4.99,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
636,XP_025582311.1,-,-,5.4.99.5,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,116,...,5.4.99,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
644,XP_025582333.1,-,-,5.4.99.5,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,190,...,5.4.99,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
806,XP_025582733.1,-,-,5.4.99.5,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,151,...,5.4.99,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
907,XP_025582975.1,-,-,5.4.99.5,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,362,...,5.4.99,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True


In [1]:
from pathlib import Path
# 设置目标目录
target_dir = "/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/structure/pdb/ncbi/"

# 获取所有.pdb文件的完整路径
pdb_files = list(Path(target_dir).rglob("*.pdb"))

# 创建DataFrame
df = pd.DataFrame({
    "filename": [f.name for f in pdb_files],  # 文件名（不含路径）
    "filepath": [str(f) for f in pdb_files],  # 完整路径
    "parent_dir": [f.parent.name for f in pdb_files]  # 父目录名
})
df


NameError: name 'pd' is not defined

In [18]:
import os
import shutil
from pathlib import Path

# 源目录和目标目录
source_dir = "/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/structure/pdb/ncbi/XP_0654/"
target_base = "/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/structure/pdb/ncbi/"

# 遍历源目录中的所有.pdb文件
for pdb_file in Path(source_dir).glob("*.pdb"):
    # 提取前缀（如 XP_025583 -> XP_025）
    prefix = pdb_file.stem[:9]  # 取前6个字符（如 "XP_025"）
    
    # 目标目录路径（仅一层：/ncbi/XP_025/）
    target_dir = Path(target_base) / prefix
    target_dir.mkdir(parents=True, exist_ok=True)
    
    # 目标文件路径（直接放在前缀目录下，如 /ncbi/XP_025/XP_025583093.1.pdb）
    target_path = target_dir / pdb_file.name
    
    # 移动文件（如果目标已存在，自动覆盖）
    shutil.move(str(pdb_file), str(target_path))
    print(f"Moved {pdb_file.name} to {target_path}")

Moved XP_065464244.1.pdb to /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/structure/pdb/ncbi/XP_065464/XP_065464244.1.pdb
Moved XP_065466127.1.pdb to /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/structure/pdb/ncbi/XP_065466/XP_065466127.1.pdb
Moved XP_065463689.1.pdb to /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/structure/pdb/ncbi/XP_065463/XP_065463689.1.pdb
Moved XP_065465982.1.pdb to /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/structure/pdb/ncbi/XP_065465/XP_065465982.1.pdb
Moved XP_065465071.1.pdb to /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/structure/pdb/ncbi/XP_065465/XP_065465071.1.pdb
Moved XP_065463213.1.pdb to /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/structure/pdb/ncbi/XP_065463/XP_065463213.1.pdb
Moved XP_065463433.1.pdb to /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/structure/pdb/ncbi/XP_065463/XP_065463433.1.pdb
Moved XP_065465774.1.pdb to /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/structure/pdb/ncbi/XP_065465/XP_065465774.1.pdb
Moved XP_0654658