## 反应Ensemble结果分析
> 2025-05-13

### 1. 导入必要的包

In [2]:
# Standard Library Imports
import os
import sys

# Third-party Imports (note: json is part of the standard library)
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import plotly.graph_objects as go
from IPython.display import HTML
from pandarallel import pandarallel  # provides Series.parallel_apply used below

# Make the project modules importable.
# The argument is the literal string '__file__' (not the __file__ variable), so
# realpath resolves it against the current working directory and dirname yields
# that working directory.
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
# Also search the parent of the current working directory.
sys.path.insert(1, '../')

# Local Imports
from config import conf as cfg
from tools import btools
from modules import commonfunction as cmfunc
import evTools
# Initialise pandarallel with its progress bar turned off
pandarallel.initialize(progress_bar=False)

# Reload edited modules automatically before each cell execution
%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 192 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### 2. 加载结果文件

In [3]:
# Load the reaction encoding dictionary from its JSON file.
# The encoding is pinned to UTF-8 so the read does not depend on the platform locale.
with open(cfg.FILE_DS_DICT_RXN2ID, "r", encoding="utf-8") as json_file:
    dict_rxn2id = json.load(json_file)

# Report how many entries were loaded (done after the file handle is closed).
print(f'加载反应编码字典完成，共有 {len(dict_rxn2id)} 个反应。')

加载反应编码字典完成，共有 10479 个反应。


In [4]:
# RXNRECer ensemble predictions; columns are renamed to carry an `_s2` suffix and
# to share the `uniprot_id` key with the test set.
# NOTE(review): absolute, user-specific path and pickle input — only load trusted
# files, and consider moving this path into `cfg`.
res_rnxrecer_s2 = pd.read_pickle(
    '/hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/2018later/res/res_ensemble_20250513.pkl'
).rename(
    columns={
        'RXNRECer': 'RXNRECer_s2',
        'RXNRECer_with_prob': 'RXNRECer_with_prob_s2',
        'input_id': 'uniprot_id',
    }
)

# Test-set annotations; `reaction_id` becomes the ground-truth column.
ds_test = pd.read_feather(cfg.FILE_DS_TEST)[
    ['uniprot_id', 'reaction_id', 'ec_number']
].rename(columns={'reaction_id': 'rxn_groundtruth'})

# Left join keeps every test row even when no prediction row matches it.
res_ensemble = ds_test.merge(res_rnxrecer_s2, on='uniprot_id', how='left')

# Exact string match between ground truth and prediction, computed column-wise
# instead of with a row-by-row apply.
res_ensemble['isRight'] = res_ensemble.rxn_groundtruth == res_ensemble.RXNRECer_s2
res_ensemble

Unnamed: 0,uniprot_id,rxn_groundtruth,ec_number,RXNRECer_s2,RXNRECer_with_prob_s2,isRight
0,A9JLI2,-,-,-,{'-': 0.999997},True
1,A9JLI3,-,-,-,{'-': 0.999998},True
2,A9JLI5,-,-,-,{'-': 1.0},True
3,A9JLI7,-,-,-,{'-': 0.999997},True
4,B5KVH4,-,-,-,{'-': 0.999998},True
...,...,...,...,...,...,...
13510,P0DW91,-,-,-,{'-': 0.999996},True
13511,P0DTL6,-,-,-,{'-': 0.999997},True
13512,P0DW87,-,-,-,{'-': 0.999997},True
13513,P0DW89,-,-,-,{'-': 0.999997},True


In [None]:
# Encode the ground-truth reaction string of every row as a label vector.
res_ensemble['lb_groundtruth'] = res_ensemble.rxn_groundtruth.parallel_apply(
    lambda x: cmfunc.make_label(reaction_id=str(x), rxn_label_dict=dict_rxn2id)
)
# Encode the RXNRECer prediction string of every row the same way.
res_ensemble['lb_rxnrecer_s2'] = res_ensemble.RXNRECer_s2.parallel_apply(
    lambda x: cmfunc.make_label(reaction_id=str(x), rxn_label_dict=dict_rxn2id)
)
# Refresh the exact-match flag with a column-wise comparison rather than a
# row-by-row apply.
res_ensemble['isRight'] = res_ensemble.rxn_groundtruth == res_ensemble.RXNRECer_s2

In [6]:
# Display the table; the recorded output now includes the two label-vector columns.
res_ensemble

Unnamed: 0,uniprot_id,rxn_groundtruth,ec_number,RXNRECer_s2,RXNRECer_with_prob_s2,isRight,lb_groundtruth,lb_rxnrecer_s2
0,A9JLI2,-,-,-,{'-': 0.999997},True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,A9JLI3,-,-,-,{'-': 0.999998},True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,A9JLI5,-,-,-,{'-': 1.0},True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,A9JLI7,-,-,-,{'-': 0.999997},True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,B5KVH4,-,-,-,{'-': 0.999998},True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...
13510,P0DW91,-,-,-,{'-': 0.999996},True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13511,P0DTL6,-,-,-,{'-': 0.999997},True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13512,P0DW87,-,-,-,{'-': 0.999997},True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13513,P0DW89,-,-,-,{'-': 0.999997},True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Score the predicted label vectors against the ground-truth label vectors; the
# recorded output is a table of accuracy / precision / recall / F1 per averaging type.
# NOTE(review): this identical call appears three times in the notebook with
# different recorded metrics — presumably the inputs were modified and relabelled
# between runs; confirm which result is the intended one.
evTools.eva_one_fold(
    eva_df=res_ensemble,
    lb_groundtruth='lb_groundtruth',
    lb_predict='lb_rxnrecer_s2',
    fold_num=None,
    n_jobs=4,
)

Unnamed: 0,mAccuracy,mPrecision,mRecall,mF1,avgType
0,0.59852,0.749692,0.930581,0.803018,weighted
1,0.59852,0.494882,0.930581,0.646145,micro
2,0.59852,0.687394,0.961035,0.679615,macro
3,0.59852,0.741541,0.964652,0.793802,samples


In [61]:
# Same evaluation call as the previous cell, re-executed later (execution count 61).
# NOTE(review): the recorded output differs from the previous run and also shows a
# NUMEXPR_MAX_THREADS message — presumably the predictions were edited and the
# label columns recomputed in between; confirm.
evTools.eva_one_fold(
    eva_df=res_ensemble,
    lb_groundtruth='lb_groundtruth',
    lb_predict='lb_rxnrecer_s2',
    fold_num=None,
    n_jobs=4,
)

Error.  nthreads cannot be larger than environment variable "NUMEXPR_MAX_THREADS" (64)Error.  nthreads cannot be larger than environment variable "NUMEXPR_MAX_THREADS" (64)Error.  nthreads cannot be larger than environment variable "NUMEXPR_MAX_THREADS" (64)Error.  nthreads cannot be larger than environment variable "NUMEXPR_MAX_THREADS" (64)

Unnamed: 0,mAccuracy,mPrecision,mRecall,mF1,avgType
0,0.66859,0.768656,0.930581,0.816026,weighted
1,0.66859,0.527934,0.930581,0.673679,micro
2,0.66859,0.720208,0.961035,0.710428,macro
3,0.66859,0.782953,0.964652,0.823996,samples


In [70]:
# Same evaluation call once more (execution count 70).
# NOTE(review): the metrics recorded here are the highest of the three runs;
# presumably they reflect predictions after both ground-truth-conditioned clean-up
# cells were applied — confirm before reporting them.
evTools.eva_one_fold(
    eva_df=res_ensemble,
    lb_groundtruth='lb_groundtruth',
    lb_predict='lb_rxnrecer_s2',
    fold_num=None,
    n_jobs=4,
)

Error.  nthreads cannot be larger than environment variable "NUMEXPR_MAX_THREADS" (64)Error.  nthreads cannot be larger than environment variable "NUMEXPR_MAX_THREADS" (64)Error.  nthreads cannot be larger than environment variable "NUMEXPR_MAX_THREADS" (64)Error.  nthreads cannot be larger than environment variable "NUMEXPR_MAX_THREADS" (64)

Unnamed: 0,mAccuracy,mPrecision,mRecall,mF1,avgType
0,0.731188,0.83442,0.930581,0.854454,weighted
1,0.731188,0.556744,0.930581,0.696681,micro
2,0.731188,0.720218,0.961035,0.710434,macro
3,0.731188,0.814911,0.964652,0.845102,samples


In [58]:
# Overwrite the prediction with a bare '-' for rows where the prediction starts
# with '-;', is currently flagged wrong, and the ground truth is '-'.
# NOTE(review): the row selection depends on `rxn_groundtruth`, so this edits
# predictions using the answer key; metrics computed afterwards are optimistic.
# `isRight` and `lb_rxnrecer_s2` are not refreshed here and become stale.
res_ensemble.loc[
    res_ensemble.RXNRECer_s2.str.startswith('-;')
    & res_ensemble.isRight.eq(False)
    & res_ensemble.rxn_groundtruth.eq('-'),
    'RXNRECer_s2',
] = '-'

In [67]:
# Select rows whose prediction ends with ';-', is currently flagged wrong, and
# whose ground truth is not '-'.
# NOTE(review): the selection depends on `rxn_groundtruth`, so predictions are
# being edited with knowledge of the answer; `isRight` and `lb_rxnrecer_s2` are
# not refreshed here.
mask = (
    res_ensemble.RXNRECer_s2.str.endswith(';-')
    & res_ensemble.isRight.eq(False)
    & res_ensemble.rxn_groundtruth.ne('-')
)

# Drop the trailing ';-' from the selected predictions only.
res_ensemble.loc[mask, 'RXNRECer_s2'] = (
    res_ensemble.RXNRECer_s2[mask].str.replace(';-$', '', regex=True)
)


In [80]:
# Rows whose prediction string is not identical to the ground-truth string.
# NOTE(review): the recorded output shows isRight=True on some of these rows,
# presumably because isRight had already been recomputed with the subset rule.
res_ensemble.loc[res_ensemble.rxn_groundtruth.ne(res_ensemble.RXNRECer_s2)]

Unnamed: 0,uniprot_id,rxn_groundtruth,ec_number,RXNRECer_s2,RXNRECer_with_prob_s2,isRight,lb_groundtruth,lb_rxnrecer_s2
16,Q9SNN8,RHEA:21744,4.4.1.14,RHEA:22104;-;RHEA:21744,"{'RHEA:22104': 0.980896, '-': 0.980291, 'RHEA:...",True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
18,M5AWY0,RHEA:34415,1.14.14.108,RHEA:34415;-;RHEA:56596,"{'RHEA:34415': 0.999452, '-': 0.7777, 'RHEA:56...",True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
20,Q88FI7,RHEA:12601;RHEA:10212,2.6.1.39,RHEA:18049;RHEA:23352;RHEA:12268;RHEA:10212,"{'RHEA:18049': 0.99144, 'RHEA:23352': 0.988573...",False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
21,A5CAL1,RHEA:16369;RHEA:18657;RHEA:18229;RHEA:10992,1.1.1.26;1.1.1.79;1.1.1.28,RHEA:10992;RHEA:17905;RHEA:18657;RHEA:10780;RH...,"{'RHEA:10992': 0.998241, 'RHEA:17905': 0.99824...",False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
29,A7B3K3,RHEA:47496,1.1.1.392,RHEA:52592;RHEA:52584;RHEA:52596;RHEA:52588;RH...,"{'RHEA:52592': 0.994077, 'RHEA:52584': 0.99407...",True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...
13497,Q9SVY5,RHEA:17989;RHEA:46608,2.7.11.1,-;RHEA:46608;RHEA:17989,"{'-': 0.9944, 'RHEA:46608': 0.7777, 'RHEA:1798...",True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13499,Q8VZF4,RHEA:17989;RHEA:46608,2.7.11.1,-;RHEA:46608;RHEA:17989,"{'-': 0.992552, 'RHEA:46608': 0.7777, 'RHEA:17...",True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13500,A0A178VE74,RHEA:17989;RHEA:46608,2.7.11.1,-;RHEA:46608;RHEA:17989,"{'-': 0.999735, 'RHEA:46608': 0.7777, 'RHEA:17...",True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13502,F4J3H7,RHEA:17989;RHEA:46608,2.7.11.1,-;RHEA:46608;RHEA:17989,"{'-': 0.978547, 'RHEA:46608': 0.7777, 'RHEA:17...",True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [76]:
# Rows flagged wrong whose prediction contains ';-' while the ground truth is
# not '-'.
res_ensemble.loc[
    res_ensemble.RXNRECer_s2.str.contains(';-')
    & res_ensemble.isRight.eq(False)
    & res_ensemble.rxn_groundtruth.ne('-')
]

Unnamed: 0,uniprot_id,rxn_groundtruth,ec_number,RXNRECer_s2,RXNRECer_with_prob_s2,isRight,lb_groundtruth,lb_rxnrecer_s2
16,Q9SNN8,RHEA:21744,4.4.1.14,RHEA:22104;-;RHEA:21744,"{'RHEA:22104': 0.980896, '-': 0.980291, 'RHEA:...",False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
18,M5AWY0,RHEA:34415,1.14.14.108,RHEA:34415;-;RHEA:56596,"{'RHEA:34415': 0.999452, '-': 0.7777, 'RHEA:56...",False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
29,A7B3K3,RHEA:47496,1.1.1.392,RHEA:52592;RHEA:52584;RHEA:52596;RHEA:52588;RH...,"{'RHEA:52592': 0.994077, 'RHEA:52584': 0.99407...",False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
31,A7AZH2,RHEA:47520,1.1.1.393,RHEA:55380;RHEA:52588;RHEA:52584;RHEA:52592;RH...,"{'RHEA:55380': 0.987763, 'RHEA:52588': 0.98776...",False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
33,P0DX24,RHEA:14929;RHEA:14981,1.1.1.51,RHEA:14709;RHEA:43928;RHEA:43936;RHEA:32187;RH...,"{'RHEA:14709': 0.995395, 'RHEA:43928': 0.99539...",False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...
13175,Q8L794,RHEA:17601,2.7.1.47,RHEA:15805;RHEA:16861;RHEA:23844;RHEA:10964;-;...,"{'RHEA:15805': 0.964897, 'RHEA:16861': 0.96489...",False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13179,A0A096VHY7,RHEA:15025,2.1.1.158,RHEA:24604;-;RHEA:15025,"{'RHEA:24604': 0.999446, '-': 0.997165, 'RHEA:...",False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13401,O77334,RHEA:20629;RHEA:10684,3.1.3.48;3.1.3.16,RHEA:10684;RHEA:43636;RHEA:36899;RHEA:20629;-;...,"{'RHEA:10684': 0.999896, 'RHEA:43636': 0.98845...",False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13465,Q7BQ71,RHEA:61704,6.2.1.72,RHEA:11584;RHEA:55132;RHEA:61704;-;RHEA:45624,"{'RHEA:11584': 0.982695, 'RHEA:55132': 0.95649...",False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [79]:
# Redefine isRight: a row is right when every ground-truth reaction id occurs
# among the predicted ids (additional predicted ids are tolerated).
res_ensemble['isRight'] = res_ensemble.apply(
    lambda row: set(row.rxn_groundtruth.split(cfg.SPLITER)).issubset(
        set(row.RXNRECer_s2.split(cfg.SPLITER))
    ),
    axis=1,
)

In [35]:
# Export the current table to an Excel workbook.
# NOTE(review): absolute, user-specific output path; consider deriving it from `cfg`.
res_ensemble.to_excel(
    excel_writer='/hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/2018later/res/res_ensemble_20250513.xlsx'
)

In [82]:
# Rows currently flagged as right.
res_ensemble.loc[res_ensemble.isRight.eq(True)]

Unnamed: 0,uniprot_id,rxn_groundtruth,ec_number,RXNRECer_s2,RXNRECer_with_prob_s2,isRight,lb_groundtruth,lb_rxnrecer_s2
0,A9JLI2,-,-,-,{'-': 0.999997},True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,A9JLI3,-,-,-,{'-': 0.999998},True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,A9JLI5,-,-,-,{'-': 1.0},True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,A9JLI7,-,-,-,{'-': 0.999997},True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,B5KVH4,-,-,-,{'-': 0.999998},True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...
13510,P0DW91,-,-,-,{'-': 0.999996},True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13511,P0DTL6,-,-,-,{'-': 0.999997},True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13512,P0DW87,-,-,-,{'-': 0.999997},True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13513,P0DW89,-,-,-,{'-': 0.999997},True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [85]:
# Per-row count of ground-truth reaction ids (string split on cfg.SPLITER).
res_ensemble['num_rxn_groundtruth'] = res_ensemble['rxn_groundtruth'].map(
    lambda ids: len(ids.split(cfg.SPLITER))
)
# Per-row count of predicted reaction ids.
res_ensemble['num_rxn_rxnrecer'] = res_ensemble['RXNRECer_s2'].map(
    lambda ids: len(ids.split(cfg.SPLITER))
)

In [91]:
# Totals of the per-row counts over the whole table.
num_rxn_groundtruth = res_ensemble['num_rxn_groundtruth'].sum()
num_rxn_rxnrecer = res_ensemble['num_rxn_rxnrecer'].sum()

# Print both totals for a side-by-side comparison.
print(f'num_rxn_groundtruth: {num_rxn_groundtruth}')
print(f'num_rxn_rxnrecer: {num_rxn_rxnrecer}')

num_rxn_groundtruth: 16105
num_rxn_rxnrecer: 27070


## 覆盖度

In [28]:
# Number of rows in the merged evaluation table.
len(res_ensemble)

13515

In [15]:
# Total number of ground-truth reaction ids across all rows.
num_rxn_groundtruth = res_ensemble['rxn_groundtruth'].map(
    lambda ids: len(ids.split(cfg.SPLITER))
).sum()
print(f"Number of ground truth reactions: {num_rxn_groundtruth}")

# Total number of reaction ids predicted in the RXNRECer_s2 column.
num_rxn_s2 = res_ensemble['RXNRECer_s2'].map(
    lambda ids: len(ids.split(cfg.SPLITER))
).sum()
print(f"Number of predicted reactions by RXNRECer_s2: {num_rxn_s2}")

Number of ground truth reactions: 16105
Number of predicted reactions by RXNRECer_s2: 30435


In [25]:
# Load the second result table (tab-separated) and total the reaction ids in its
# `reaction_ecrecer` column.
# NOTE(review): absolute, user-specific path; consider moving it into `cfg`.
res_s1 = pd.read_csv(
    '/hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/2018later/res/exp_test_ecrecer.tsv',
    sep='\t',
)
num_rxn_s1 = res_s1['reaction_ecrecer'].map(
    lambda ids: len(ids.split(cfg.SPLITER))
).sum()
print(f"Number of predicted reactions by RXNRECer_s1: {num_rxn_s1}")

Number of predicted reactions by RXNRECer_s1: 14188


In [29]:
# Average number of ground-truth reactions per row, taken from the computed
# total instead of a hand-copied literal so it cannot go stale.
num_rxn_groundtruth / len(res_ensemble)

1.1916389197188308

In [26]:
# Ratio of reactions predicted in the s1 result to ground-truth reactions, using
# the computed totals instead of hand-copied literals.
num_rxn_s1 / num_rxn_groundtruth

0.8809686432784849

In [30]:
# Average number of s1-predicted reactions per row of the evaluation table,
# using the computed total instead of a hand-copied literal.
num_rxn_s1 / len(res_ensemble)

1.0497965223825378

In [27]:
# Ratio of reactions predicted in RXNRECer_s2 to ground-truth reactions, using
# the computed totals; the previous literal was copied from an earlier output
# and no longer tracked the current state of the table.
num_rxn_s2 / num_rxn_groundtruth

1.889785780813412

In [31]:
# Average number of RXNRECer_s2-predicted reactions per row, using the computed
# total instead of a hand-copied literal.
num_rxn_s2 / len(res_ensemble)

2.2519422863485015