# 2018 Later 5 Years

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2025-09-2  


## 1. Import packages

In [15]:
import sys,os
# Extend the import search path so the project-local packages below can be found.
# Note: '__file__' is a string literal here (notebooks define no __file__ variable),
# so realpath() resolves it against the current working directory.
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../../')
sys.path.insert(1,'../../../')
from config import conf as cfg
from modules import commonfunction as cmfunc
from tqdm import tqdm
import numpy as np
import rxnrecer as production
from tools import filetool as ftool
import tools.bioFunctionLib as bfl
from pandarallel import pandarallel # import pandarallel
# Initialise pandarallel once at import time, with its progress bar disabled.
pandarallel.initialize(progress_bar=False)
from evaluation import evTools
from joblib import parallel_backend
from multiprocessing import Pool
import pandas as pd

FIRST_TIME_RUN = False # For the initial run, please set this flag to True. This will allow the program to download data from UniProt and RHEA, which may take longer depending on your internet speed.

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 192 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 2. Load Data

In [None]:
# Read the training and test splits from the feather files named in the config.
train_set = pd.read_feather(cfg.FILE_DS_TRAIN)
test_set = pd.read_feather(cfg.FILE_DS_TEST)

# Stack both splits, then keep only the rows whose EC number and reaction id
# are both the '-' placeholder (i.e. no EC and no reaction annotated).
data_set = (
    pd.concat([train_set, test_set], axis=0, ignore_index=True)
    .loc[lambda frame: (frame["ec_number"] == "-") & (frame["reaction_id"] == "-")]
    .reset_index(drop=True)
)

data_set.head(3)

Unnamed: 0,uniprot_id,seq,reaction_id,ec_number,functionCounts,ec_specific_level,isenzyme,label
0,Q6GZX4,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Q6GZX3,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Q197F8,MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWK...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Display the filtered table (rows where both ec_number and reaction_id are '-').
data_set

Unnamed: 0,uniprot_id,seq,reaction_id,ec_number,functionCounts,ec_specific_level,isenzyme,label
0,Q6GZX4,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Q6GZX3,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Q197F8,MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWK...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Q197F7,MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGA...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Q6GZX2,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...
290627,P0DW91,MSGAEEAGGGGPAAGPAGSVPAGVGVGAGAGAGVGVGAGPGAAAGP...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
290628,P0DTL6,MSGAEEAGGGGPAAGPAGSVPAGVGVGVGAGPGAAAGQAAAAALGE...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
290629,P0DW87,MSGAEEAGGGGPAAGPAGAVPAGVGVGAGPGAAAGPAAAALGEAAG...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
290630,P0DW89,MSGAEEAGGGGPAAGPAGAVPAGVGVGVGPGAAAGPAAAALGEAAG...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Export the (uniprot_id, seq) pairs via the project's table2fasta helper.
# The original call ended in `file_out=` with no value, which is a SyntaxError,
# so the cell could never run. An explicit output path is supplied here.
# NOTE(review): the file name follows the naming of the cached stage-1 result
# ('./res/2018previous_noneEnzyme-rxnrecer-s1.feather'); adjust if another
# location is intended.
fasta_out = './res/2018previous_noneEnzyme.fasta'
bfl.table2fasta(table=data_set[['uniprot_id', 'seq']], file_out=fasta_out)

## 3. RXNRECer-S1

In [None]:
# Stage-1 RXNRECer predictions are cached on disk. The prediction step is only
# executed when the cache file is missing, so the cell works both on a fresh
# checkout and on later re-runs (previously the compute step was commented out
# and the cell failed if the feather file did not exist).
s1_cache_file = './res/2018previous_noneEnzyme-rxnrecer-s1.feather'
if not os.path.exists(s1_cache_file):
    os.makedirs(os.path.dirname(s1_cache_file), exist_ok=True)
    production.step_by_step_prediction(
        input_data=data_set[['uniprot_id', 'seq']],
        Ensemble=False,
        output_file=None,
        batch_size=600,
    ).to_feather(s1_cache_file)

# Always load from the cache so fresh and cached runs go through the same path.
res_rxnrecer_s1 = pd.read_feather(s1_cache_file)

# Drop dictionary entries whose probability value is None.
res_rxnrecer_s1["RXNRECer_with_prob"] = (
    res_rxnrecer_s1["RXNRECer_with_prob"]
    .apply(lambda d: {k: v for k, v in d.items() if v is not None})
)
# Keep only rows whose RXNRECer value is not the '-' placeholder.
res_rxnrecer_s1 = res_rxnrecer_s1[res_rxnrecer_s1.RXNRECer!='-'].reset_index(drop=True)

## 4. Organize result tables (整理表格)

In [34]:
# Display the stage-1 predictions (rows whose RXNRECer value is '-' were removed above).
res_rxnrecer_s1

Unnamed: 0,input_id,RXNRECer,RXNRECer_with_prob
0,Q6GZS4,RHEA:14709,{'RHEA:14709': 0.496992}
1,Q06429,RHEA:21744,{'RHEA:21744': 0.995994}
2,Q8QZQ9,RHEA:17989;RHEA:46608,"{'RHEA:17989': 0.998949, 'RHEA:46608': 0.999166}"
3,D9J0Z8,RHEA:36103,{'RHEA:36103': 0.369401}
4,Q54W20,RHEA:45956,{'RHEA:45956': 0.621955}
...,...,...,...
575,O53518,RHEA:11348,{'RHEA:11348': 0.986992}
576,I6Y276,RHEA:26168,{'RHEA:26168': 0.314996}
577,Q1LVI2,RHEA:32215;RHEA:32227,"{'RHEA:32215': 0.975971, 'RHEA:32227': 0.977251}"
578,A0A0R4IQZ2,RHEA:36683,{'RHEA:36683': 0.99958}


Unnamed: 0,input_id,RXNRECer,RXNRECer_with_prob
0,Q6GZS4,RHEA:14709,{'RHEA:14709': 0.496992}
1,Q06429,RHEA:21744,{'RHEA:21744': 0.995994}
2,Q8QZQ9,RHEA:17989;RHEA:46608,"{'RHEA:17989': 0.998949, 'RHEA:46608': 0.999166}"
3,D9J0Z8,RHEA:36103,{'RHEA:36103': 0.369401}
4,Q54W20,RHEA:45956,{'RHEA:45956': 0.621955}
...,...,...,...
566,O53518,RHEA:11348,{'RHEA:11348': 0.986992}
567,I6Y276,RHEA:26168,{'RHEA:26168': 0.314996}
568,Q1LVI2,RHEA:32215;RHEA:32227,"{'RHEA:32215': 0.975971, 'RHEA:32227': 0.977251}"
569,A0A0R4IQZ2,RHEA:36683,{'RHEA:36683': 0.99958}


In [42]:
# BRENDA reaction/UniProt table.
# NOTE(review): this is an absolute, machine-specific path; consider exposing it
# through the project config like the other data files.
brenda_feather = '/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/brenda/brenda_reaction_uniprot_dataset.feather'
ds_brenda = pd.read_feather(brenda_feather)

# RHEA reaction table, located via the project config.
rxn = pd.read_feather(cfg.FILE_RHEA_REACTION)

ds_brenda.head(3)

Unnamed: 0,reaction_id,equation_string,equation,substrates_smile,products_smile,uniprot_id,organism,ec,len,seq,equation_smiles
0,brnx:1,4-nitrophenyl propanoate + H2O = 4-nitrophenol...,cid:74777 = cid:980 + cid:104745,CCC(=O)Oc1ccc([N+](=O)[O-])cc1,C1=CC(=CC=C1[N+](=O)[O-])O^CCC(=O)[O-],D0EPY0,Bacillus subtilis,,300,MSNHSSSIPELSDNGIRYYQTYNESLSLWPVRCKSFYISTRFGQTH...,CCC(=O)Oc1ccc([N+](=O)[O-])cc1>>C1=CC(=CC=C1[N...
1,brnx:1,4-nitrophenyl propanoate + H2O = 4-nitrophenol...,cid:74777 = cid:980 + cid:104745,CCC(=O)Oc1ccc([N+](=O)[O-])cc1,C1=CC(=CC=C1[N+](=O)[O-])O^CCC(=O)[O-],Q5V5N6,Haloarcula marismortui (strain ATCC 43049 / DS...,,327,MSTTARPMPVTERAPESVTVQRDIPFHEVDGETLTLDLYDAAAASG...,CCC(=O)Oc1ccc([N+](=O)[O-])cc1>>C1=CC(=CC=C1[N...
2,brnx:1,4-nitrophenyl propanoate + H2O = 4-nitrophenol...,cid:74777 = cid:980 + cid:104745,CCC(=O)Oc1ccc([N+](=O)[O-])cc1,C1=CC(=CC=C1[N+](=O)[O-])O^CCC(=O)[O-],Q7M529,Sulfolobus acidocaldarius,3.1.1.1,20,PLDPTIKCLLESGFVIPIGK,CCC(=O)Oc1ccc([N+](=O)[O-])cc1>>C1=CC(=CC=C1[N...


In [40]:
# Predictions whose input_id also appears as a uniprot_id in the BRENDA table.
in_brenda = res_rxnrecer_s1["input_id"].isin(ds_brenda["uniprot_id"])
kk1 = res_rxnrecer_s1.loc[in_brenda].reset_index(drop=True)
kk1

Unnamed: 0,input_id,RXNRECer,RXNRECer_with_prob
0,Q96A70,RHEA:22964,{'RHEA:22964': 0.998765}
1,Q9AVK1,RHEA:24604,{'RHEA:24604': 0.910718}
2,Q4GZD1,RHEA:33299,{'RHEA:33299': 0.988275}
3,P31852,RHEA:17325,{'RHEA:17325': 0.999733}
4,P52917,RHEA:13065,{'RHEA:13065': 0.98596}
5,P73408,RHEA:23516,{'RHEA:23516': 0.978619}
6,A0A384E126,RHEA:17521,{'RHEA:17521': 0.930629}
7,S0ENM8,RHEA:27954,{'RHEA:27954': 0.246609}
8,G4VGH9,RHEA:24388,{'RHEA:24388': 0.999357}


In [45]:
# Attach RHEA reaction details to each prediction.
# RXNRECer may hold several reaction ids joined by ';' (e.g.
# 'RHEA:17989;RHEA:46608'); such a string never equals a single reaction_id, so
# a direct merge would leave those rows without reaction details. Split the
# field and emit one row per predicted reaction before joining.
(
    kk1.assign(reaction_id=kk1['RXNRECer'].str.split(';'))
    .explode('reaction_id')
    .merge(rxn, on='reaction_id', how='left')
)

Unnamed: 0,input_id,RXNRECer,RXNRECer_with_prob,reaction_id,equation,chebi_id,ec_number,equation_chebi,equation_smiles
0,Q96A70,RHEA:22964,{'RHEA:22964': 0.998765},RHEA:22964,H(+) + L-ornithine = CO2 + putrescine,CHEBI:15378;CHEBI:46911;CHEBI:16526;CHEBI:326268,EC:4.1.1.17,CHEBI:15378 + CHEBI:46911 = CHEBI:16526 + CHEB...,[H+].[NH3+]CCC[C@H]([NH3+])C([O-])=O>>O=C=O.[N...
1,Q9AVK1,RHEA:24604,{'RHEA:24604': 0.910718},RHEA:24604,7-methylxanthine + S-adenosyl-L-methionine = H...,CHEBI:48991;CHEBI:59789;CHEBI:15378;CHEBI:5785...,EC:2.1.1.159;EC:2.1.1.160,CHEBI:48991 + CHEBI:59789 = CHEBI:15378 + CHEB...,Cn1cnc2[nH]c(=O)[nH]c(=O)c12.C[S+](CC[C@H]([NH...
2,Q4GZD1,RHEA:33299,{'RHEA:33299': 0.988275},RHEA:33299,[eIF5A protein]-L-lysine + spermidine = [eIF5A...,CHEBI:29969;CHEBI:57834;CHEBI:82657;CHEBI:57484,EC:2.5.1.46,CHEBI:29969 + CHEBI:57834 = CHEBI:82657 + CHEB...,C([C@@H](C(*)=O)N*)CCC[NH3+].[NH3+]CCCC[NH2+]C...
3,P31852,RHEA:17325,{'RHEA:17325': 0.999733},RHEA:17325,"(S)-2,3,4,5-tetrahydrodipicolinate + H2O + suc...",CHEBI:16845;CHEBI:15377;CHEBI:57292;CHEBI:1568...,EC:2.3.1.117,CHEBI:16845 + CHEBI:15377 + CHEBI:57292 = CHEB...,[O-]C(=O)[C@@H]1CCCC(=N1)C([O-])=O.[H]O[H].CC(...
4,P52917,RHEA:13065,{'RHEA:13065': 0.98596},RHEA:13065,ATP + H2O = ADP + H(+) + phosphate,CHEBI:30616;CHEBI:15377;CHEBI:456216;CHEBI:153...,EC:3.6.4.6;EC:3.6.4.7;EC:3.6.4.10;EC:3.6.4.12;...,CHEBI:30616 + CHEBI:15377 = CHEBI:456216 + CHE...,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP([O-])(=O)OP(...
5,P73408,RHEA:23516,{'RHEA:23516': 0.978619},RHEA:23516,(R)-glycerate + ATP = (2R)-3-phosphoglycerate ...,CHEBI:16659;CHEBI:30616;CHEBI:58272;CHEBI:4562...,EC:2.7.1.31,CHEBI:16659 + CHEBI:30616 = CHEBI:58272 + CHEB...,OC[C@@H](O)C([O-])=O.Nc1ncnc2n(cnc12)[C@@H]1O[...
6,A0A384E126,RHEA:17521,{'RHEA:17521': 0.930629},RHEA:17521,ATP + H2O + L-glutamine + L-glutamyl-tRNA(Gln)...,CHEBI:30616;CHEBI:15377;CHEBI:58359;CHEBI:7852...,EC:6.3.5.7,CHEBI:30616 + CHEBI:15377 + CHEBI:58359 + CHEB...,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP([O-])(=O)OP(...
7,S0ENM8,RHEA:27954,{'RHEA:27954': 0.246609},RHEA:27954,"(2E,6E)-farnesyl diphosphate + H2O = diphospha...",CHEBI:175763;CHEBI:15377;CHEBI:33019;CHEBI:60968,EC:4.2.3.74,CHEBI:175763 + CHEBI:15377 = CHEBI:33019 + CHE...,CC(C)=CCC\\C(C)=C\\CC\\C(C)=C\\COP([O-])(=O)OP...
8,G4VGH9,RHEA:24388,{'RHEA:24388': 0.999357},RHEA:24388,phosphate + uridine = alpha-D-ribose 1-phospha...,CHEBI:43474;CHEBI:16704;CHEBI:57720;CHEBI:17568,EC:2.4.2.2;EC:2.4.2.3,CHEBI:43474 + CHEBI:16704 = CHEBI:57720 + CHEB...,OP([O-])([O-])=O.OC[C@H]1O[C@H]([C@H](O)[C@@H]...


In [47]:
# Predictions whose input_id does NOT appear as a uniprot_id in the BRENDA table.
not_in_brenda = ~res_rxnrecer_s1["input_id"].isin(ds_brenda["uniprot_id"])
kk2 = res_rxnrecer_s1.loc[not_in_brenda].reset_index(drop=True)

In [51]:
# Attach RHEA reaction details to the predictions that are not in BRENDA.
# RXNRECer may hold several reaction ids joined by ';' (e.g.
# 'RHEA:32215;RHEA:32227'); such a string never equals a single reaction_id, so
# a direct merge leaves those rows with empty reaction columns. Split the field
# and emit one row per predicted reaction before joining.
kk3 = (
    kk2.assign(reaction_id=kk2['RXNRECer'].str.split(';'))
    .explode('reaction_id')
    .merge(rxn, on='reaction_id', how='left')
)

In [52]:
# Save the annotated table to Excel without the index column
# (the file name means "not in BRENDA").
kk3.to_excel('res/不在brenda.xlsx', index=False)