In [32]:
import sys,os
sys.path.insert(0, f"{os.path.dirname(os.path.realpath('__file__'))}/../../../")
from rxnrecer.config import config as cfg
import pandas as pd
import numpy as np
from types import SimpleNamespace
from Bio import SeqIO
import torch
import argparse
import hashlib
import Reaction as rxnTool
import json
from tqdm import tqdm
tqdm.pandas()
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
rxns = pd.read_feather(cfg.FILE_RHEA_REACTION)
rxns.head(2)

Unnamed: 0,reaction_id,equation,chebi_id,ec_number,equation_chebi,equation_smiles
0,RHEA:22636,dCTP + H2O = dCMP + diphosphate + H(+),CHEBI:61481;CHEBI:15377;CHEBI:57566;CHEBI:3301...,EC:3.6.1.9;EC:3.6.1.12;EC:3.6.1.65,CHEBI:61481 + CHEBI:15377 = CHEBI:57566 + CHEB...,Nc1ccn([C@H]2C[C@H](O)[C@@H](COP([O-])(=O)OP([...
1,RHEA:22640,NADP(+) + sphinganine = 3-oxosphinganine + H(+...,CHEBI:58349;CHEBI:57817;CHEBI:58299;CHEBI:1537...,EC:1.1.1.102,CHEBI:58349 + CHEBI:57817 = CHEBI:58299 + CHEB...,NC(=O)c1ccc[n+](c1)[C@@H]1O[C@H](COP([O-])(=O)...


In [34]:
def build_rxn_files(rxn_row):
    """构建反应JSON文件，添加错误处理"""
    rxn_id = rxn_row['reaction_id']
    rxn_smiles = rxn_row['equation_smiles']
    rxn_equation = rxn_row['equation']
    rxn_equation_ref_chebi = rxn_row['equation_chebi']
    rxn_ec = rxn_row['ec_number']
    
    try:
        reaction = rxnTool.Reaction(rxn_smiles, rxn_equation, rxn_equation_ref_chebi, rxn_id=rxn_id, rxn_ec=rxn_ec)
        json_path = f'{cfg.DIR_RXN_JSON}{rxn_id.replace(":", "_")}.json'
        reaction.save_json_file(json_path)
        return True
    except Exception as e:
        print(f'处理失败 {rxn_id}: {e}')
        return False

# 测试单个反应
print("测试单个反应...")
result = build_rxn_files(rxn_row=rxns.iloc[0])
print(f"测试结果: {'成功' if result else '失败'}")

测试单个反应...
测试结果: 成功


In [35]:
# 处理所有反应并统计结果
print("开始处理所有反应...")
results = rxns.progress_apply(build_rxn_files, axis=1)

# 统计结果
success_count = results.sum()
total_count = len(results)
failed_count = total_count - success_count

print(f"\n处理完成!")
print(f"总数: {total_count}")
print(f"成功: {success_count} ({success_count/total_count*100:.2f}%)")
print(f"失败: {failed_count} ({failed_count/total_count*100:.2f}%)")

# 保存失败的反应ID
if failed_count > 0:
    failed_indices = results[~results].index
    failed_rxn_ids = rxns.loc[failed_indices, 'reaction_id'].tolist()
    
    with open('notebook_failed_reactions.txt', 'w') as f:
        for rxn_id in failed_rxn_ids:
            f.write(f'{rxn_id}\n')
    
    print(f"失败的反应ID已保存到: notebook_failed_reactions.txt")
    print(f"前10个失败的反应ID: {failed_rxn_ids[:10]}")

results

开始处理所有反应...


 45%|████▌     | 7389/16410 [01:57<01:46, 84.40it/s] 

警告: 无法创建产物分子 beta-hematin (SMILES: C1=2N3C(C=C4[N+]5=C(C=C6N7C8=CC9=[N+](C(=C1)C(=C9CCC([O-])=O)C)[Fe-2]573OC(CCC=\%10C=\%11C=C\%12C(=C(C\%13=CC\%14=[N+]\%15C(=CC=\%16N\%17C(=C(C\%16C=C)C)C=C([N+]\%11[Fe-2]\%15\%17(N\%13\%12)OC(CCC8=C6C)=O)C\%10C)C(=C\%14C=C)C)C)CCC([O-])=O)=O)C(=C4C)C=C)=C(C2C)C=C): 无法解析SMILES字符串: C1=2N3C(C=C4[N+]5=C(C=C6N7C8=CC9=[N+](C(=C1)C(=C9CCC([O-])=O)C)[Fe-2]573OC(CCC=\%10C=\%11C=C\%12C(=C(C\%13=CC\%14=[N+]\%15C(=CC=\%16N\%17C(=C(C\%16C=C)C)C=C([N+]\%11[Fe-2]\%15\%17(N\%13\%12)OC(CCC8=C6C)=O)C\%10C)C(=C\%14C=C)C)C)CCC([O-])=O)=O)C(=C4C)C=C)=C(C2C)C=C


 48%|████▊     | 7929/16410 [02:04<01:45, 80.05it/s]

解析反应失败 RHEA:55876: SMILES字符串为空
解析反应失败 RHEA:55920: SMILES字符串为空


 49%|████▉     | 8028/16410 [02:07<03:31, 39.66it/s]

解析反应失败 RHEA:56300: SMILES字符串为空


 50%|████▉     | 8147/16410 [02:09<01:54, 72.45it/s]

解析反应失败 RHEA:56788: SMILES字符串为空
解析反应失败 RHEA:56800: SMILES字符串为空


 51%|█████     | 8372/16410 [02:11<01:52, 71.17it/s] 

解析反应失败 RHEA:57688: SMILES字符串为空


 51%|█████▏    | 8450/16410 [02:13<02:21, 56.12it/s]

警告: 无法创建反应物分子 cyclic tetraadenylate (SMILES: NC1=NC=NC2=C1N=CN2[C@@]3(O[C@@]4(COP(=O)([O-])O[C@@]5([C@](O[C@@](N6C=7N=CN=C(N)C7N=C6)([C@@H]5O)[H])(COP(=O)([O-])O[C@@]8([C@](O[C@@](N9C=\%10N=CN=C(N)C\%10N=C9)([C@@H]8O)[H])(COP(=O)([O-])O[C@@]\%11([C@](O[C@@](N\%12C=\%13N=CN=C(N)C\%13N=C\%12)([C@@H]\%11O)[H])(COP(=O)([O-])O[C@]4([C@H]3O)[H])[H])[H])[H])[H])[H])[H])[H])[H]): 无法解析SMILES字符串: NC1=NC=NC2=C1N=CN2[C@@]3(O[C@@]4(COP(=O)([O-])O[C@@]5([C@](O[C@@](N6C=7N=CN=C(N)C7N=C6)([C@@H]5O)[H])(COP(=O)([O-])O[C@@]8([C@](O[C@@](N9C=\%10N=CN=C(N)C\%10N=C9)([C@@H]8O)[H])(COP(=O)([O-])O[C@@]\%11([C@](O[C@@](N\%12C=\%13N=CN=C(N)C\%13N=C\%12)([C@@H]\%11O)[H])(COP(=O)([O-])O[C@]4([C@H]3O)[H])[H])[H])[H])[H])[H])[H])[H])[H]


 52%|█████▏    | 8516/16410 [02:14<01:42, 76.78it/s]

警告: 无法创建产物分子 cyclic hexaadenylate (SMILES: NC1=NC=NC2=C1N=CN2[C@@]3(O[C@@]4(COP(=O)([O-])O[C@@]5([C@](O[C@@](N6C=7N=CN=C(N)C7N=C6)([C@@H]5O)[H])(COP(=O)([O-])O[C@@]8([C@](O[C@@](N9C=\%10N=CN=C(N)C\%10N=C9)([C@@H]8O)[H])(COP(=O)([O-])O[C@@]\%11([C@](O[C@@](N\%12C=\%13N=CN=C(N)C\%13N=C\%12)([C@@H]\%11O)[H])(COP(=O)([O-])O[C@@]\%14([C@](O[C@@](N\%15C=\%16N=CN=C(N)C\%16N=C\%15)([C@@H]\%14O)[H])(COP(=O)([O-])O[C@@]\%17([C@](O[C@@](N\%18C=\%19N=CN=C(N)C\%19N=C\%18)([C@@H]\%17O)[H])(COP(=O)([O-])O[C@]4([C@H]3O)[H])[H])[H])[H])[H])[H])[H])[H])[H])[H])[H])[H])[H]): 无法解析SMILES字符串: NC1=NC=NC2=C1N=CN2[C@@]3(O[C@@]4(COP(=O)([O-])O[C@@]5([C@](O[C@@](N6C=7N=CN=C(N)C7N=C6)([C@@H]5O)[H])(COP(=O)([O-])O[C@@]8([C@](O[C@@](N9C=\%10N=CN=C(N)C\%10N=C9)([C@@H]8O)[H])(COP(=O)([O-])O[C@@]\%11([C@](O[C@@](N\%12C=\%13N=CN=C(N)C\%13N=C\%12)([C@@H]\%11O)[H])(COP(=O)([O-])O[C@@]\%14([C@](O[C@@](N\%15C=\%16N=CN=C(N)C\%16N=C\%15)([C@@H]\%14O)[H])(COP(=O)([O-])O[C@@]\%17([C@](O[C@@](N\%18C=\%19N=CN=C(N)C\%19N=C\%18)([

 58%|█████▊    | 9581/16410 [02:28<01:20, 85.02it/s] 

警告: 无法创建产物分子 2 cornusiin E (SMILES: [C@@H]12COC(=O)C=3C(=C(C(=C(C3)O)O)O)C4=C(C(O[C@H]1[C@H](OC(C=5C=C(C(=C(C5)O)O)O)=O)[C@H]([C@@H](O2)OC(=O)C6=CC(=C(C(=C6)O)O)O)OC(=O)C=7C=C(C(=C(C7)O)O)O)=O)C=C(C(=C4[O-])[O-])OC8=C(C(O[C@@H]9O[C@@H]\%10COC(=O)C=\%11C(=C(C(=C(C\%11)O)O)O)C\%12=C(C(O[C@H]\%10[C@H](OC(C=\%13C=C(C(=C(C\%13)O)O)O)=O)[C@H]9OC(=O)C=\%14C=C(C(=C(C\%14)O)O)O)=O)C=C(C(=C\%12[O-])O)O)=O)C=C(C(=C8O)[O-])O): 无法解析SMILES字符串: [C@@H]12COC(=O)C=3C(=C(C(=C(C3)O)O)O)C4=C(C(O[C@H]1[C@H](OC(C=5C=C(C(=C(C5)O)O)O)=O)[C@H]([C@@H](O2)OC(=O)C6=CC(=C(C(=C6)O)O)O)OC(=O)C=7C=C(C(=C(C7)O)O)O)=O)C=C(C(=C4[O-])[O-])OC8=C(C(O[C@@H]9O[C@@H]\%10COC(=O)C=\%11C(=C(C(=C(C\%11)O)O)O)C\%12=C(C(O[C@H]\%10[C@H](OC(C=\%13C=C(C(=C(C\%13)O)O)O)=O)[C@H]9OC(=O)C=\%14C=C(C(=C(C\%14)O)O)O)=O)C=C(C(=C\%12[O-])O)O)=O)C=C(C(=C8O)[O-])O


 62%|██████▏   | 10226/16410 [02:36<01:35, 64.64it/s]

警告: 无法创建反应物分子 a 5'-end (5'-triphosphoguanosine)-adenylyl-adenylyl-cytidylyl-adenosine in mRNA (SMILES: C1(=O)NC(=NC2=C1N=CN2[C@@H]3O[C@H](COP(OP(OP(=O)([O-])OC[C@H]4O[C@@H](N5C=6N=CN=C(N)C6N=C5)[C@@H]([C@@H]4OP(OC[C@H]7O[C@@H](N8C=9N=CN=C(N)C9N=C8)[C@@H]([C@@H]7OP(OC[C@H]\%10O[C@@H](N\%11C(NC(=O)C(=C\%11)C)=O)[C@@H]([C@@H]\%10OP(OC[C@H]\%12O[C@@H](N\%13C=\%14N=CN=C(N)C\%14N=C\%13)[C@@H]([C@@H]\%12*)O)(=O)[O-])O)(=O)[O-])O)(=O)[O-])O)(=O)[O-])(=O)[O-])[C@@H](O)[C@H]3O)N): 无法解析SMILES字符串: C1(=O)NC(=NC2=C1N=CN2[C@@H]3O[C@H](COP(OP(OP(=O)([O-])OC[C@H]4O[C@@H](N5C=6N=CN=C(N)C6N=C5)[C@@H]([C@@H]4OP(OC[C@H]7O[C@@H](N8C=9N=CN=C(N)C9N=C8)[C@@H]([C@@H]7OP(OC[C@H]\%10O[C@@H](N\%11C(NC(=O)C(=C\%11)C)=O)[C@@H]([C@@H]\%10OP(OC[C@H]\%12O[C@@H](N\%13C=\%14N=CN=C(N)C\%14N=C\%13)[C@@H]([C@@H]\%12*)O)(=O)[O-])O)(=O)[O-])O)(=O)[O-])O)(=O)[O-])(=O)[O-])[C@@H](O)[C@H]3O)N
警告: 无法创建产物分子 a 5'-end (N(7)-methyl 5'-triphosphoguanosine)-(2'-O-methyladenylyl)-adenylyl-cytidylyl-adenosine in mRNA (SMILES: C1(=O)NC(=N

 62%|██████▏   | 10242/16410 [02:36<01:30, 68.51it/s]

警告: 无法创建反应物分子 a 5'-end triphospho-adenylyl-adenylyl-cytidylyl-adenosine in mRNA (SMILES: [O-]P(OP(OP(=O)([O-])OC[C@H]1O[C@@H](N2C=3N=CN=C(N)C3N=C2)[C@@H]([C@@H]1OP(OC[C@H]4O[C@@H](N5C=6N=CN=C(N)C6N=C5)[C@@H]([C@@H]4OP(OC[C@H]7O[C@@H](N8C(NC(=O)C(=C8)C)=O)[C@@H]([C@@H]7OP(OC[C@H]9O[C@@H](N\%10C=\%11N=CN=C(N)C\%11N=C\%10)[C@@H]([C@@H]9*)O)(=O)[O-])O)(=O)[O-])O)(=O)[O-])O)(=O)[O-])(=O)[O-]): 无法解析SMILES字符串: [O-]P(OP(OP(=O)([O-])OC[C@H]1O[C@@H](N2C=3N=CN=C(N)C3N=C2)[C@@H]([C@@H]1OP(OC[C@H]4O[C@@H](N5C=6N=CN=C(N)C6N=C5)[C@@H]([C@@H]4OP(OC[C@H]7O[C@@H](N8C(NC(=O)C(=C8)C)=O)[C@@H]([C@@H]7OP(OC[C@H]9O[C@@H](N\%10C=\%11N=CN=C(N)C\%11N=C\%10)[C@@H]([C@@H]9*)O)(=O)[O-])O)(=O)[O-])O)(=O)[O-])O)(=O)[O-])(=O)[O-]
警告: 无法创建产物分子 a 5'-end (5'-triphosphoguanosine)-adenylyl-adenylyl-cytidylyl-adenosine in mRNA (SMILES: C1(=O)NC(=NC2=C1N=CN2[C@@H]3O[C@H](COP(OP(OP(=O)([O-])OC[C@H]4O[C@@H](N5C=6N=CN=C(N)C6N=C5)[C@@H]([C@@H]4OP(OC[C@H]7O[C@@H](N8C=9N=CN=C(N)C9N=C8)[C@@H]([C@@H]7OP(OC[C@H]\%10O[C@@H](N\%11C(NC

 64%|██████▍   | 10534/16410 [02:40<01:11, 82.30it/s] 

警告: 无法创建反应物分子 (tRNA)-3'-end (ribonucleoside 5'-phosphate)-(guanosyl 5'-phosphate)-(cytosyl 5'-phosphate)-(cytosyl 5'-phosphate)-(adenosine 5'-phosphate) (SMILES: N1([C@@H]2O[C@H](COP(O[C@H]3[C@H]([C@H](*)O[C@@H]3COP(*)([O-])=O)O)([O-])=O)[C@H]([C@H]2O)OP(OC[C@H]4O[C@@H](N5C(N=C(N)C=C5)=O)[C@@H]([C@@H]4OP(OC[C@H]6O[C@@H](N7C(N=C(N)C=C7)=O)[C@@H]([C@@H]6OP(OC[C@H]8O[C@@H](N9C=\%10N=CN=C(N)C\%10N=C9)[C@@H]([C@@H]8O)O)([O-])=O)O)([O-])=O)O)([O-])=O)C=\%11N=C(NC(=O)C\%11N=C1)N): 无法解析SMILES字符串: N1([C@@H]2O[C@H](COP(O[C@H]3[C@H]([C@H](*)O[C@@H]3COP(*)([O-])=O)O)([O-])=O)[C@H]([C@H]2O)OP(OC[C@H]4O[C@@H](N5C(N=C(N)C=C5)=O)[C@@H]([C@@H]4OP(OC[C@H]6O[C@@H](N7C(N=C(N)C=C7)=O)[C@@H]([C@@H]6OP(OC[C@H]8O[C@@H](N9C=\%10N=CN=C(N)C\%10N=C9)[C@@H]([C@@H]8O)O)([O-])=O)O)([O-])=O)O)([O-])=O)C=\%11N=C(NC(=O)C\%11N=C1)N
警告: 无法创建产物分子 GpCpCpA (SMILES: N1([C@@H]2O[C@H](CO)[C@H]([C@H]2O)OP(OC[C@H]3O[C@@H](N4C(N=C(N)C=C4)=O)[C@@H]([C@@H]3OP(OC[C@H]5O[C@@H](N6C(N=C(N)C=C6)=O)[C@@H]([C@@H]5OP(OC[C@H]7O[C@@H](N8C=9N

 78%|███████▊  | 12846/16410 [03:08<00:36, 97.96it/s] 

解析反应失败 RHEA:75967: SMILES字符串为空


100%|██████████| 16410/16410 [03:50<00:00, 71.08it/s] 


处理完成!
总数: 16410
成功: 16410 (100.00%)
失败: 0 (0.00%)





0        True
1        True
2        True
3        True
4        True
         ... 
16405    True
16406    True
16407    True
16408    True
16409    True
Length: 16410, dtype: bool