# 2018 Later 5 Years

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2025-06-23  


## 1. Import packages

In [1]:
import sys,os
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../../')
sys.path.insert(1,'../../../')
from config import conf as cfg
from modules import commonfunction as cmfunc
from tqdm import tqdm
import numpy as np
from pandarallel import pandarallel # 导入pandaralle
pandarallel.initialize(progress_bar=False)
from evaluation import evTools
from joblib import parallel_backend
from multiprocessing import Pool
import pandas as pd

FIRST_TIME_RUN = False # For the initial run, please set this flag to True. This will allow the program to download data from UniProt and RHEA, which may take longer depending on your internet speed.

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 192 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 2. Test Set for the Newest 5 Years

In [2]:
# Read the test split and keep only the protein id plus the reaction / EC / label
# columns, renamed to the `*_groundtruth` naming used in the rest of the notebook.
test_set = (
    pd.read_feather(cfg.FILE_DS_TEST)[['uniprot_id', 'reaction_id', 'ec_number', 'label']]
    .rename(columns={
        'reaction_id': 'rxn_groundtruth',
        'ec_number': 'ec_groundtruth',
        'label': 'rxn_groundtruth_label',
    })
)
test_set.head(3)

Unnamed: 0,uniprot_id,rxn_groundtruth,ec_groundtruth,rxn_groundtruth_label
0,A9JLI2,-,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,A9JLI3,-,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,A9JLI5,-,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## 3. Load Results

In [6]:
# Results of the EC-based methods. Their `reaction_*` columns are renamed to `rxn_*`
# so they share the prefix used by the direct-method table loaded next.
# NOTE(review): `reaction_ecpred` is not part of this mapping and keeps its original
# name in the merged table — confirm that ECPred is excluded on purpose.
res_ec = pd.read_feather(f'{cfg.CASE_2018LATER}res/res_methods_ec.feather').rename(
    columns={
        'reaction_ecblast': 'rxn_ecblast',
        'reaction_deepec': 'rxn_deepec',
        'reaction_clean': 'rxn_clean',
        'reaction_ecrecer': 'rxn_ecrecer',
        'reaction_catfam': 'rxn_catfam',
        'reaction_priam': 'rxn_priam',
    }
)

# Results of the methods that predict reactions directly.
res_direct = pd.read_feather(f'{cfg.CASE_2018LATER}res/res_methods_direct.feather')

# Render every column of wide frames instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
res_direct.head(3)

Unnamed: 0,uniprot_id,rxn_groundtruth,enz_groundtruth,lb_rxn_groundtruth,rxn_blast,lb_rxn_blast,rxn_eu_esm,enz_eu_esm,rxn_cos_esm,enz_cos_esm,lb_rxn_eu_esm,lb_rxn_cos_esm,rxn_eu_unirep,enz_eu_unirep,rxn_cos_unirep,enz_cos_unirep,lb_rxn_eu_unirep,lb_rxn_cos_unirep,rxn_eu_t5,enz_eu_t5,rxn_cos_t5,enz_cos_t5,lb_rxn_eu_t5,lb_rxn_cos_t5,enz_RXNRECer,rxn_RXNRECer,lb_rxn_RXNRECer
0,A9JLI2,-,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-,0,-,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-,0,-,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-,0,-,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,A9JLI3,-,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-,0,-,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-,0,-,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-,0,-,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,A9JLI5,-,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-,0,-,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-,0,-,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-,0,-,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## 4. Organize the result tables

In [36]:
# Join direct and EC-based results per protein. Column names present in both inputs
# receive pandas' default `_x` (res_direct) / `_y` (res_ec) suffixes.
res = res_direct.merge(res_ec, on='uniprot_id', how='left')

# Rows whose ground-truth reaction is '-' while the RXNRECer prediction is not '-'.
res = res[(res.rxn_groundtruth_x == '-') & (res.rxn_RXNRECer != '-')].reset_index(drop=True)

# Drop label-vector (`lb_`), `enz_` and Euclidean-distance (`eu_`) columns for readability.
# NOTE(review): this overwrites `res` and removes the `lb_` columns that
# create_dataframe('lb_') selects from `res` further down; on a top-to-bottom run that
# later call cannot find its columns — confirm the intended cell order.
res = res[[c for c in res.columns
           if not c.startswith(('lb_', 'enz_')) and 'eu_' not in c]]

# Finally keep rows whose ground-truth EC is '-' as well (index is not reset here).
res = res[res.ec_groundtruth == '-']
res.head(2)

Unnamed: 0,uniprot_id,rxn_groundtruth_x,rxn_blast,rxn_cos_esm,rxn_cos_unirep,rxn_cos_t5,rxn_RXNRECer,rxn_groundtruth_y,isenzyme_groundtruth,ec_groundtruth,ec_specific_level,ec_ecblast,rxn_ecblast,ec_deepec,rxn_deepec,ec_clean,rxn_clean,ec_ecrecer,rxn_ecrecer,ec_ecpred,reaction_ecpred,ec_catfam,rxn_catfam,ec_priam,rxn_priam
0,C0HLM8,-,NO-PREDICTION,-,-,RHEA:15801,-;RHEA:15801,-,False,-,0,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,3.6.4.10;3.2.1.1;3.4.21.55;2.1.1.86,RHEA:13065;RHEA:53492,-,-,NO-PREDICTION,NO-PREDICTION,-,-,NO-PREDICTION,NO-PREDICTION
1,L0TBY6,-,NO-PREDICTION,RHEA:20049,-,-,-;RHEA:20049,-,False,-,0,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,2.6.1.45;1.4.3.2,RHEA:19125;RHEA:13781,-,-,-,-,-,-,6.3.4.2,RHEA:26426


In [37]:
# Show the rows of `res` whose ground-truth EC number is '-'.
res.loc[res.ec_groundtruth == '-']

Unnamed: 0,uniprot_id,rxn_groundtruth_x,rxn_blast,rxn_cos_esm,rxn_cos_unirep,rxn_cos_t5,rxn_RXNRECer,rxn_groundtruth_y,isenzyme_groundtruth,ec_groundtruth,ec_specific_level,ec_ecblast,rxn_ecblast,ec_deepec,rxn_deepec,ec_clean,rxn_clean,ec_ecrecer,rxn_ecrecer,ec_ecpred,reaction_ecpred,ec_catfam,rxn_catfam,ec_priam,rxn_priam
0,C0HLM8,-,NO-PREDICTION,-,-,RHEA:15801,-;RHEA:15801,-,False,-,0,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,3.6.4.10;3.2.1.1;3.4.21.55;2.1.1.86,RHEA:13065;RHEA:53492,-,-,NO-PREDICTION,NO-PREDICTION,-,-,NO-PREDICTION,NO-PREDICTION
1,L0TBY6,-,NO-PREDICTION,RHEA:20049,-,-,-;RHEA:20049,-,False,-,0,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,2.6.1.45;1.4.3.2,RHEA:19125;RHEA:13781,-,-,-,-,-,-,6.3.4.2,RHEA:26426
7,Q59E04,-,NO-PREDICTION,RHEA:12532,RHEA:56136,-,-;RHEA:12532,-,False,-,0,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,2.4.1.21,RHEA:18189,-,-,3.-.-.-,EC-WITHOUT-REACTION,-,-,2.3.1.20,RHEA:10868
19,Q22397,-,RHEA:27950;RHEA:70947;RHEA:72807;RHEA:73011;RH...,-,RHEA:70831;RHEA:74971;RHEA:74963,RHEA:76299;RHEA:76303;RHEA:76307;RHEA:76311;RH...,-;RHEA:76299;RHEA:76303;RHEA:76307;RHEA:76311;...,-,False,-,0,-,-,NO-PREDICTION,NO-PREDICTION,2.4.1.198;7.2.2.10;2.1.1.210;2.7.10.1,RHEA:14789;RHEA:30903;RHEA:10596;RHEA:18105,-,-,3.4.-.-,EC-WITHOUT-REACTION,-,-,NO-PREDICTION,NO-PREDICTION
35,P0DPR5,-,-,-,-,RHEA:28971,-;RHEA:28971,-,False,-,0,-,-,NO-PREDICTION,NO-PREDICTION,4.1.1.114,RHEA:18345,-,-,2.4.-.-,EC-WITHOUT-REACTION,-,-,3.1.3.16,RHEA:47004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2628,P0DPP8,-,NO-PREDICTION,-,-,RHEA:16185,-;RHEA:16185,-,False,-,0,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,1.11.1.7,RHEA:56136,-,-,NO-PREDICTION,NO-PREDICTION,-,-,NO-PREDICTION,NO-PREDICTION
2629,P0DSH6,-,NO-PREDICTION,-,-,RHEA:14801,-;RHEA:14801,-,False,-,0,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,3.1.1.4;1.4.3.2;1.3.99.17,RHEA:15801;RHEA:13781;RHEA:17749,-,-,NO-PREDICTION,NO-PREDICTION,-,-,NO-PREDICTION,NO-PREDICTION
2630,G5E8B9,-,-,-,-,-,-;RHEA:10684,-,False,-,0,-,-,2.3.2.27,EC-WITHOUT-REACTION,2.3.2.27,EC-WITHOUT-REACTION,-,-,-,-,-,-,2.1.1.43;2.3.2.27;2.3.2.27,EC-WITHOUT-REACTION
2631,A0A0R4IQZ2,-,RHEA:36683,RHEA:36683,RHEA:36683,RHEA:36683,RHEA:36683;-,-,False,-,0,2.3.1.225,RHEA:36683,NO-PREDICTION,NO-PREDICTION,2.3.1.225,RHEA:36683,-,-,3.-.-.-,EC-WITHOUT-REACTION,-,-,2.3.1.225;2.3.2.27;3.1.1.47;3.1.1.5;3.5.1.1;3....,RHEA:17777;RHEA:36683;RHEA:15177;RHEA:21016


In [34]:
# Show `res` without any column whose name contains "eu_".
res.loc[:, [c for c in res.columns if "eu_" not in c]]

Unnamed: 0,uniprot_id,rxn_groundtruth_x,rxn_blast,rxn_cos_esm,rxn_cos_unirep,rxn_cos_t5,rxn_RXNRECer,rxn_groundtruth_y,isenzyme_groundtruth,ec_groundtruth,ec_specific_level,ec_ecblast,rxn_ecblast,ec_deepec,rxn_deepec,ec_clean,rxn_clean,ec_ecrecer,rxn_ecrecer,ec_ecpred,reaction_ecpred,ec_catfam,rxn_catfam,ec_priam,rxn_priam
0,C0HLM8,-,NO-PREDICTION,-,-,RHEA:15801,-;RHEA:15801,-,False,-,0,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,3.6.4.10;3.2.1.1;3.4.21.55;2.1.1.86,RHEA:13065;RHEA:53492,-,-,NO-PREDICTION,NO-PREDICTION,-,-,NO-PREDICTION,NO-PREDICTION
1,L0TBY6,-,NO-PREDICTION,RHEA:20049,-,-,-;RHEA:20049,-,False,-,0,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,2.6.1.45;1.4.3.2,RHEA:19125;RHEA:13781,-,-,-,-,-,-,6.3.4.2,RHEA:26426
2,G2X4G0,-,NO-PREDICTION,RHEA:68052,-,RHEA:21580,-;RHEA:68052;RHEA:21580,-,False,3.2.1.8,4,NO-PREDICTION,NO-PREDICTION,3.2.1.8,EC-WITHOUT-REACTION,3.2.1.8,EC-WITHOUT-REACTION,3.2.1.8,EC-WITHOUT-REACTION,3.2.1.8,EC-WITHOUT-REACTION,3.2.1.8,EC-WITHOUT-REACTION,3.2.1.8;3.1.1.72;3.2.1.8,EC-WITHOUT-REACTION
3,A0A517FNC4,-,RHEA:35499;RHEA:35503;RHEA:35507;RHEA:35511,RHEA:35499;RHEA:35503;RHEA:35507;RHEA:35511,RHEA:56748;RHEA:60936;RHEA:41728;RHEA:60928;RH...,RHEA:20585;RHEA:72667,RHEA:20585;RHEA:72667;RHEA:35503;RHEA:35499;RH...,-,False,1.14.14.-,3,1.14.13.173,EC-WITHOUT-REACTION,1.14.13.173,EC-WITHOUT-REACTION,1.14.14.115;1.14.14.85,RHEA:35499;RHEA:57576,1.14.14.-,EC-WITHOUT-REACTION,1.14.13.-,EC-WITHOUT-REACTION,-,-,1.14.13.173;1.3.3.9;1.14.14.1;1.14.13.97;1.14....,RHEA:52776;RHEA:22716;RHEA:50236;RHEA:46308;RH...
4,A0A517FNC9,-,RHEA:35499,RHEA:35499,RHEA:35499,RHEA:35499,RHEA:35499;-;RHEA:35511,-,False,1.14.14.-,3,1.14.13.173,EC-WITHOUT-REACTION,1.14.13.173,EC-WITHOUT-REACTION,1.14.14.115,RHEA:35499,1.14.14.-,EC-WITHOUT-REACTION,-,-,-,-,1.14.13.173;1.3.3.9;1.14.14.1;1.14.13.97;1.14....,RHEA:51264;RHEA:22716;RHEA:14021;RHEA:10796;RH...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2628,P0DPP8,-,NO-PREDICTION,-,-,RHEA:16185,-;RHEA:16185,-,False,-,0,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,1.11.1.7,RHEA:56136,-,-,NO-PREDICTION,NO-PREDICTION,-,-,NO-PREDICTION,NO-PREDICTION
2629,P0DSH6,-,NO-PREDICTION,-,-,RHEA:14801,-;RHEA:14801,-,False,-,0,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,3.1.1.4;1.4.3.2;1.3.99.17,RHEA:15801;RHEA:13781;RHEA:17749,-,-,NO-PREDICTION,NO-PREDICTION,-,-,NO-PREDICTION,NO-PREDICTION
2630,G5E8B9,-,-,-,-,-,-;RHEA:10684,-,False,-,0,-,-,2.3.2.27,EC-WITHOUT-REACTION,2.3.2.27,EC-WITHOUT-REACTION,-,-,-,-,-,-,2.1.1.43;2.3.2.27;2.3.2.27,EC-WITHOUT-REACTION
2631,A0A0R4IQZ2,-,RHEA:36683,RHEA:36683,RHEA:36683,RHEA:36683,RHEA:36683;-,-,False,-,0,2.3.1.225,RHEA:36683,NO-PREDICTION,NO-PREDICTION,2.3.1.225,RHEA:36683,-,-,3.-.-.-,EC-WITHOUT-REACTION,-,-,2.3.1.225;2.3.2.27;3.1.1.47;3.1.1.5;3.5.1.1;3....,RHEA:17777;RHEA:36683;RHEA:15177;RHEA:21016


In [40]:
# Location of the BRENDA reaction/UniProt dataset. The default is the original absolute
# path; set the RXNRECER_BRENDA_DATASET environment variable to point at a copy elsewhere
# so the notebook can run on machines without this directory layout.
BRENDA_DATASET_PATH = os.environ.get(
    'RXNRECER_BRENDA_DATASET',
    '/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/brenda/brenda_reaction_uniprot_dataset.feather',
)
ds_brenda = pd.read_feather(BRENDA_DATASET_PATH)
ds_brenda.head(3)

Unnamed: 0,reaction_id,equation_string,equation,substrates_smile,products_smile,uniprot_id,organism,ec,len,seq,equation_smiles
0,brnx:1,4-nitrophenyl propanoate + H2O = 4-nitrophenol...,cid:74777 = cid:980 + cid:104745,CCC(=O)Oc1ccc([N+](=O)[O-])cc1,C1=CC(=CC=C1[N+](=O)[O-])O^CCC(=O)[O-],D0EPY0,Bacillus subtilis,,300,MSNHSSSIPELSDNGIRYYQTYNESLSLWPVRCKSFYISTRFGQTH...,CCC(=O)Oc1ccc([N+](=O)[O-])cc1>>C1=CC(=CC=C1[N...
1,brnx:1,4-nitrophenyl propanoate + H2O = 4-nitrophenol...,cid:74777 = cid:980 + cid:104745,CCC(=O)Oc1ccc([N+](=O)[O-])cc1,C1=CC(=CC=C1[N+](=O)[O-])O^CCC(=O)[O-],Q5V5N6,Haloarcula marismortui (strain ATCC 43049 / DS...,,327,MSTTARPMPVTERAPESVTVQRDIPFHEVDGETLTLDLYDAAAASG...,CCC(=O)Oc1ccc([N+](=O)[O-])cc1>>C1=CC(=CC=C1[N...
2,brnx:1,4-nitrophenyl propanoate + H2O = 4-nitrophenol...,cid:74777 = cid:980 + cid:104745,CCC(=O)Oc1ccc([N+](=O)[O-])cc1,C1=CC(=CC=C1[N+](=O)[O-])O^CCC(=O)[O-],Q7M529,Sulfolobus acidocaldarius,3.1.1.1,20,PLDPTIKCLLESGFVIPIGK,CCC(=O)Oc1ccc([N+](=O)[O-])cc1>>C1=CC(=CC=C1[N...


In [41]:
# Rows of `res` whose protein also appears in the BRENDA dataset.
res.loc[res.uniprot_id.isin(ds_brenda.uniprot_id)]

Unnamed: 0,uniprot_id,rxn_groundtruth_x,rxn_blast,rxn_cos_esm,rxn_cos_unirep,rxn_cos_t5,rxn_RXNRECer,rxn_groundtruth_y,isenzyme_groundtruth,ec_groundtruth,ec_specific_level,ec_ecblast,rxn_ecblast,ec_deepec,rxn_deepec,ec_clean,rxn_clean,ec_ecrecer,rxn_ecrecer,ec_ecpred,reaction_ecpred,ec_catfam,rxn_catfam,ec_priam,rxn_priam
303,A0A384E126,-,NO-PREDICTION,-,-,RHEA:17521;RHEA:14513,-;RHEA:17521;RHEA:14513,-,False,-,0,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,2.3.1.12,RHEA:17017,-,-,-,-,-,-,NO-PREDICTION,NO-PREDICTION
1133,S0ENM8,-,RHEA:27954,RHEA:54520,RHEA:16013,RHEA:27954,RHEA:54520;RHEA:27954;-,-,False,-,0,4.2.3.74,RHEA:27954,NO-PREDICTION,NO-PREDICTION,4.2.3.165,RHEA:53668,-,-,5.-.-.-,EC-WITHOUT-REACTION,-,-,4.2.3.74;4.2.1.138;4.2.3.89;4.2.3.125;4.2.3.12...,RHEA:34703;RHEA:12516;RHEA:33103;RHEA:31815;RH...
2496,G4VGH9,-,RHEA:24388;RHEA:22824,RHEA:24388;RHEA:16037;RHEA:22824,RHEA:14877,RHEA:24388;RHEA:16037;RHEA:22824,RHEA:24388;RHEA:22824;RHEA:16037;-,-,False,-,0,2.4.2.3,RHEA:24388,NO-PREDICTION,NO-PREDICTION,2.4.2.3,RHEA:24388,-,-,NO-PREDICTION,NO-PREDICTION,-,-,2.4.2.3;2.4.2.2,RHEA:24388;RHEA:52540;RHEA:22824;RHEA:16037


In [42]:
# BRENDA records for one of the overlapping proteins.
ds_brenda.loc[ds_brenda.uniprot_id.eq('A0A384E126')]

Unnamed: 0,reaction_id,equation_string,equation,substrates_smile,products_smile,uniprot_id,organism,ec,len,seq,equation_smiles
10128,brnx:3071,succinamic acid + H2O = succinic acid + NH3,cid:12522 = cid:222 + cid:1110,NC(=O)CCC(=O)O,N^C(CC(=O)O)C(=O)O,A0A384E126,Pseudomonas sp. (strain ADP),,68,MTETEIFAYIEAASIAIGIPLEPARARAVAHHFSRTALLAEMLESV...,NC(=O)CCC(=O)O>>C(CC(=O)O)C(=O)O.N
16566,brnx:5109,"1-carboxybiuret + H2O = 1,3-dicarboxyurea + NH3",cid:121412669 = cid:222 + cid:57054417,NC(=O)NC(=O)NC(=O)O,N^C(=O)(NC(=O)O)NC(=O)O,A0A384E126,Pseudomonas sp. (strain ADP),,68,MTETEIFAYIEAASIAIGIPLEPARARAVAHHFSRTALLAEMLESV...,NC(=O)NC(=O)NC(=O)O>>N.C(=O)(NC(=O)O)NC(=O)O
17492,brnx:5459,"1-carboxybiuret + H2O = urea-1,3-dicarboxylate...",cid:121412669 = cid:222 + cid:135397931,NC(=O)NC(=O)NC(=O)O,N^C(=O)(NC(=O)[O-])NC(=O)[O-],A0A384E126,Pseudomonas sp. (strain ADP),,68,MTETEIFAYIEAASIAIGIPLEPARARAVAHHFSRTALLAEMLESV...,NC(=O)NC(=O)NC(=O)O>>N.C(=O)(NC(=O)[O-])NC(=O)...


In [5]:
# Shared mapping from raw result columns to display names. The label-vector (`lb_`) and
# string (`rxn_`) variants of a method map to the same display name, so only one
# prefix family may be selected at a time.
rename_dict = {
    'lb_rxn_groundtruth_x': 'rxn_groundtruth',
    'lb_rxn_ecblast': 'MSA-via-EC',
    'lb_rxn_deepec': 'DeepEC',
    'lb_rxn_clean': 'CLEAN',
    'lb_rxn_ecrecer': 'ECRECer',
    'lb_rxn_catfam': 'CatFam',
    'lb_rxn_priam': 'PRIAM',
    'lb_rxn_blast': 'MSA-via-RXN',
    'lb_rxn_cos_esm': 'ESM',
    'lb_rxn_cos_unirep': 'UniRep',
    'lb_rxn_cos_t5': 'T5',
    'lb_rxn_RXNRECer': 'ESMwithCLF',
    'rxn_groundtruth_x': 'rxn_groundtruth',
    'rxn_ecblast': 'MSA-via-EC',
    'rxn_deepec': 'DeepEC',
    'rxn_clean': 'CLEAN',
    'rxn_ecrecer': 'ECRECer',
    'rxn_catfam': 'CatFam',
    'rxn_priam': 'PRIAM',
    'rxn_blast': 'MSA-via-RXN',
    'rxn_cos_esm': 'ESM',
    'rxn_cos_unirep': 'UniRep',
    'rxn_cos_t5': 'T5',
    'rxn_RXNRECer': 'ESMwithCLF'
}

# Display names of the compared methods, in the column order of the generated tables.
methods = ['MSA-via-EC', 'DeepEC', 'CLEAN', 'ECRECer', 'CatFam', 'PRIAM',
           'MSA-via-RXN', 'ESM', 'UniRep', 'T5', 'ESMwithCLF']


def create_dataframe(prefix, source=None):
    """Build a per-method table from the result columns that share a name prefix.

    Args:
        prefix: Column-name prefix to select, e.g. 'lb_' or 'rxn_'.
        source: DataFrame to read the columns from. When omitted, the notebook-level
            `res` frame is used (the original behaviour); passing the frame explicitly
            makes the result independent of whatever `res` currently holds.

    Returns:
        A DataFrame with the columns 'uniprot_id', 'rxn_groundtruth' and then one
        column per entry of `methods`, named according to `rename_dict`.

    Raises:
        KeyError: if, after renaming, any of the required columns is absent from the
            selected data (for example because the prefixed columns were dropped).
    """
    if source is None:
        source = res  # backward-compatible default: the notebook-level frame

    # Columns carrying the requested prefix; 'uniprot_id' is always included once.
    selected_columns = ['uniprot_id'] + [
        col for col in source.columns
        if col.startswith(prefix) and col != 'uniprot_id'
    ]
    df = source[selected_columns].rename(columns=rename_dict)

    wanted = ['uniprot_id', 'rxn_groundtruth'] + methods
    missing = [col for col in wanted if col not in df.columns]
    if missing:
        raise KeyError(
            f"create_dataframe({prefix!r}): columns missing after renaming: {missing}"
        )
    return df[wanted]

### 4.1 For computation

In [None]:
# Table built from the `lb_`-prefixed columns of `res`; this is the frame the metric
# computation below evaluates.
df_cp = create_dataframe('lb_')
df_cp.head(2)

Unnamed: 0,uniprot_id,rxn_groundtruth,MSA-via-EC,DeepEC,CLEAN,ECRECer,CatFam,PRIAM,MSA-via-RXN,ESM,UniRep,T5,ESMwithCLF
0,A9JLI2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,A9JLI3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [8]:
# Display names of the methods to evaluate (each is a prediction column of df_cp).
methods = ['MSA-via-EC', 'DeepEC', 'CLEAN', 'ECRECer', 'CatFam', 'PRIAM',
           'MSA-via-RXN', 'ESM', 'UniRep', 'T5', 'ESMwithCLF']

# Averaging types passed to the metric computation
avg_types = ['weighted', 'micro', 'macro']

# Metric computation for a single task
def calculate_metrics(args):
    """Evaluate one (method, averaging type) pair on the notebook-level `df_cp`.

    Args:
        args: 2-tuple `(method, avg_type)`; packed into one argument so the function
            can be handed to `Pool.map` as-is. `method` is used both as the prediction
            column and as the evaluation name.

    Returns:
        Whatever `evTools.calculate_metrics` returns for that pair.
    """
    pred_col, averaging = args
    return evTools.calculate_metrics(
        eva_df=df_cp,
        ground_truth_col='rxn_groundtruth',
        pred_col=pred_col,
        eva_name=pred_col,
        avg_method=averaging,
    )

def get_metrics(methods, avg_types):
    """Compute metrics for every (method, averaging type) combination in parallel.

    Args:
        methods: Names of the prediction columns to evaluate.
        avg_types: Averaging types to evaluate each method with.

    Returns:
        One DataFrame: the per-task results of `calculate_metrics` concatenated in
        task order with a fresh index.
    """
    # One task per combination; methods vary slowest, averaging types fastest.
    params = []
    for method in methods:
        for avg_type in avg_types:
            params.append((method, avg_type))

    # The joblib backend is switched to 'threading' while the process pool is alive.
    # Author's intent: keep any joblib parallelism inside calculate_metrics on threads
    # so it does not clash with the surrounding multiprocessing pool.
    # NOTE(review): this relies on the worker processes picking up the backend setting
    # made in the parent — confirm for the process start method in use.
    with parallel_backend('threading'), Pool() as pool:
        all_metrics = pool.map(calculate_metrics, params)

    # Merge the per-task frames into a single table.
    return pd.concat(all_metrics, ignore_index=True)

# Evaluate every method under every averaging type.
eva_metrics_initial = get_metrics(methods=methods, avg_types=avg_types)

In [118]:
# Spot check: evaluate a single method / averaging-type combination directly,
# without going through the process pool.
evTools.calculate_metrics(eva_df=df_cp, ground_truth_col='rxn_groundtruth', pred_col='MSA-via-EC', eva_name='m', avg_method='macro')

Unnamed: 0,mAccuracy,mPrecision,mRecall,mF1,avgType
0,0.43337,0.956005,0.838333,0.80686,macro


### 4.2 For human reading

In [16]:
# Display the label-vector table (pandas elides the middle rows of long frames).
df_cp

Unnamed: 0,uniprot_id,rxn_groundtruth,MSA-via-EC,DeepEC,CLEAN,ECRECer,CatFam,PRIAM,MSA-via-RXN,ESM,UniRep,T5,ESMwithCLF
0,A9JLI2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,A9JLI3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,A9JLI5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,A9JLI7,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,B5KVH4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13510,P0DW91,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13511,P0DTL6,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13512,P0DW87,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13513,P0DW89,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Table built from the `rxn_`-prefixed columns of `res`: predictions as reaction-id
# strings rather than label vectors.
df_hum = create_dataframe('rxn_')
df_hum.head(2)

Unnamed: 0,uniprot_id,rxn_groundtruth,MSA-via-EC,DeepEC,CLEAN,ECRECer,CatFam,PRIAM,MSA-via-RXN,ESM,UniRep,T5,ESMwithCLF
0,A9JLI2,-,-,NO-PREDICTION,RHEA:31427;RHEA:16301;RHEA:13781,-,-,NO-PREDICTION,-,-,-,-,-
1,A9JLI3,-,-,NO-PREDICTION,EC-WITHOUT-REACTION,-,-,RHEA:49524,-,-,-,-,-


In [14]:
# Display names of the compared methods. The last entry must be 'ESMwithCLF' — the name
# rename_dict assigns to the rxn_RXNRECer column — because the loops below index df_hum
# by these names; 'RXNRECer' is not a column of df_hum and raised a KeyError.
methods = ['MSA-via-EC', 'DeepEC', 'CLEAN', 'ECRECer', 'CatFam', 'PRIAM', 'MSA-via-RXN', 'ESM', 'UniRep', 'T5', 'ESMwithCLF']

In [None]:
# For every method, flag the rows where each ground-truth entry also occurs among the
# method's predicted entries (both strings are split on cfg.SPLITER).
for item in methods:
    df_hum[f'pred_true_{item}'] = df_hum.apply(
        lambda row: set(row['rxn_groundtruth'].split(cfg.SPLITER)).issubset(
            row[item].split(cfg.SPLITER)
        ),
        axis=1,
    )

In [110]:
# Per method: share of rows whose `pred_true_*` flag is set, rounded to 6 decimals.
for item in methods:
    correct = int(df_hum[f'pred_true_{item}'].sum())
    accuracy = round(correct / len(df_hum), 6)
    print(f'{item:12s}: {accuracy}')


MSA-via-EC  : 0.43744
DeepEC      : 0.036108
CLEAN       : 0.10899
ECRECer     : 0.695154
CatFam      : 0.771143
PRIAM       : 0.118461
MSA-via-RXN : 0.516907
ESM         : 0.739327
UniRep      : 0.698705
T5          : 0.760858
ESMwithCLF  : 0.957085


In [54]:
# savedf.to_excel('res/res_case2018Later_250115.xlsx', index=None)

In [11]:
def count_num(rxnstr):
    """Count the entries of a delimited reaction string, leaving out a '-' entry.

    Args:
        rxnstr: Reaction ids joined by `cfg.SPLITER`, or the single string '-'.

    Returns:
        0 when `rxnstr` is exactly '-'. Otherwise the number of delimited entries,
        reduced by one if '-' is among them (only one is subtracted, however many
        times '-' occurs).
    """
    if rxnstr == '-':
        return 0

    parts = rxnstr.split(cfg.SPLITER)
    return len(parts) - 1 if '-' in parts else len(parts)

In [None]:
# Number of reactions per row in the ground truth and in the RXNRECer prediction.
# NOTE(review): `rxn_groundtruth` and `RXNRECer` are not columns of `res` as built in
# section 4 (those are `rxn_groundtruth_x` / `rxn_RXNRECer`); this cell appears to rely
# on a differently named `res` from an earlier session — confirm before re-running.
res['rxn_num_groundtruth'] = res.rxn_groundtruth.apply(count_num)
res['rxn_num_rxnrecer'] = res.RXNRECer.apply(count_num)

In [None]:
# Rows with a ground-truth reaction where RXNRECer predicts more reactions than the
# ground truth lists.
res.loc[res.rxn_num_rxnrecer.gt(res.rxn_num_groundtruth) & res.rxn_groundtruth.ne('-')]

In [67]:
# Export the current `res` table to Excel without the index column.
# NOTE(review): the output path is relative to the working directory, whereas the
# inputs are read from f'{cfg.CASE_2018LATER}res/' — confirm both refer to the same folder.
res.to_excel('res/res_ensemble_2018later_all_250213.xlsx', index=False)