## 反应直接预测结果分析
> 2024-11-20

### 1. 导入必要的包

In [13]:
import sys,os
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../')
from config import conf as cfg
import pandas as pd
import json
import plotly.graph_objects as go
from tools import btools
from IPython.display import HTML
from pandarallel import pandarallel # 导入pandaralle
import evTools


pandarallel.initialize(progress_bar=False)
%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 112 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### 2. 加载测试数据集

In [14]:
# Load the reaction encoding dictionary (dict_rxn2id) from its JSON file.
# The encoding is given explicitly so the read does not depend on the platform's
# locale default; JSON text is UTF-8.
with open(cfg.FILE_DS_DICT_RXN2ID, "r", encoding="utf-8") as json_file:
    dict_rxn2id = json.load(json_file)
    print(f'加载反应编码字典完成，共有 {len(dict_rxn2id)} 个反应。')  # report how many entries were loaded

# Baseline method names passed to the reaction-level evaluation later in the notebook.
methods = ['blast', 'deepec', 'clean', 'ecrecer', 'ecpred', 'catfam', 'priam']

print(f'使用预测方法{methods}')



加载反应编码字典完成，共有 10479 个反应。
使用预测方法['blast', 'deepec', 'clean', 'ecrecer', 'ecpred', 'catfam', 'priam']


## 4. Load results from EC-based methods

### 4.1 Blast

In [15]:
# Evaluate the BLAST baseline (method_type='ec'). The three returned tables are
# unpacked in the order: mean/std summary, per-fold metrics, no-prediction statistics
# (matching the std_mean / metrics / no_pred arguments of display_html_results).
# NOTE(review): the saved output of this cell reports a test_size of roughly 447k rows
# per fold, whereas the other baselines in this notebook report 50858 -- confirm that
# all baselines are evaluated on the same test split before comparing them.
std_blast, metrics_blast, ec_no_rxn_blast = evTools.get_eval_results(
    baselineName='blast',
    dict_rxn2id=dict_rxn2id,
    method_type='ec',
)
# Keep the earlier, misspelled name bound so any cell still referring to it keeps working.
ec_no_rxn_blat = ec_no_rxn_blast
evTools.display_html_results(
    metrics=metrics_blast,
    std_mean=std_blast,
    no_pred=ec_no_rxn_blast,
    eva_name='Blast',
)

Getting evaluation results for blast ...
Calculating mean and std ...
Statistic ec no prediction and ec with no reaction ...


Unnamed: 0,baselineName,runFold,mAccuracy,mPrecision,mRecall,mF1,avgType
0,blast,1,0.837732,0.816875,0.790192,0.784566,weighted
1,blast,1,0.837732,0.886831,0.790192,0.835727,micro
2,blast,1,0.837732,0.177491,0.708508,0.156693,macro
3,blast,1,0.837732,0.889851,0.863472,0.870549,samples
4,blast,2,0.835901,0.810685,0.783438,0.778173,weighted
5,blast,2,0.835901,0.886197,0.783438,0.831655,micro
6,blast,2,0.835901,0.183282,0.686853,0.160974,macro
7,blast,2,0.835901,0.889949,0.86251,0.869922,samples
8,blast,3,0.832341,0.807635,0.78449,0.779202,weighted
9,blast,3,0.832341,0.883179,0.78449,0.830915,micro

Unnamed: 0,baselineName,avgType,Metric,mean,std
0,blast,macro,mAccuracy,0.836434,0.001912
1,blast,macro,mF1,0.159373,0.003008
2,blast,macro,mPrecision,0.178795,0.003609
3,blast,macro,mRecall,0.701045,0.007987
4,blast,micro,mAccuracy,0.836434,0.001912
5,blast,micro,mF1,0.833994,0.00167
6,blast,micro,mPrecision,0.885927,0.001247
7,blast,micro,mRecall,0.787815,0.002485
8,blast,samples,mAccuracy,0.836434,0.001912
9,blast,samples,mF1,0.869941,0.001316

Unnamed: 0,run_fold,test_size,no_prediction_count,ec_without_reaction_count
0,1,447532,2424,24984
1,2,447609,2386,25079
2,3,448166,2379,26637
3,4,448885,2301,25295
4,5,447594,2418,25070
5,6,447902,2391,25425
6,7,446999,2369,25404
7,8,446732,2381,25939
8,9,447445,2449,25324
9,10,447314,2431,24910


### 4.2 DeepEC

In [16]:
# DeepEC baseline: fetch the evaluation tables first, then render them as HTML.
deepec_eval = evTools.get_eval_results(baselineName='deepec', dict_rxn2id=dict_rxn2id, method_type='ec')
std_deepec, metrics_deepec, ec_no_rxn_deepec = deepec_eval
evTools.display_html_results(
    metrics=metrics_deepec,
    std_mean=std_deepec,
    no_pred=ec_no_rxn_deepec,
    eva_name='deepec',
)

Getting evaluation results for deepec ...
Calculating mean and std ...
Statistic ec no prediction and ec with no reaction ...


Unnamed: 0,baselineName,runFold,mAccuracy,mPrecision,mRecall,mF1,avgType
0,deepec,1,0.26,1.0,0.318966,0.324713,weighted
1,deepec,1,0.26,0.352381,0.318966,0.334842,micro
2,deepec,1,0.26,0.993511,0.998091,0.991634,macro
3,deepec,1,0.26,0.326667,0.294167,0.304,samples
4,deepec,2,0.31,1.0,0.330435,0.330435,weighted
5,deepec,2,0.31,0.376238,0.330435,0.351852,micro
6,deepec,2,0.31,0.993988,0.997519,0.991507,macro
7,deepec,2,0.31,0.37,0.3345,0.344,samples
8,deepec,3,0.27,1.0,0.294643,0.294643,weighted
9,deepec,3,0.27,0.323529,0.294643,0.308411,micro

Unnamed: 0,baselineName,avgType,Metric,mean,std
0,deepec,macro,mAccuracy,0.291,0.041218
1,deepec,macro,mF1,0.991389,0.000543
2,deepec,macro,mPrecision,0.993726,0.000376
3,deepec,macro,mRecall,0.997643,0.000426
4,deepec,micro,mAccuracy,0.291,0.041218
5,deepec,micro,mF1,0.331823,0.041839
6,deepec,micro,mPrecision,0.354447,0.045388
7,deepec,micro,mRecall,0.312022,0.039251
8,deepec,samples,mAccuracy,0.291,0.041218
9,deepec,samples,mF1,0.32209,0.037979

Unnamed: 0,run_fold,test_size,no_prediction_count,ec_without_reaction_count
0,1,50858,30918,1572
1,2,50858,30958,1543
2,3,50858,30749,1673
3,4,50858,30916,1587
4,5,50858,30799,1580
5,6,50858,30908,1588
6,7,50858,30984,1600
7,8,50858,30991,1569
8,9,50858,30938,1594
9,10,50858,31044,1537


### 4.3 CLEAN

In [17]:
# CLEAN baseline: fetch the evaluation tables first, then render them as HTML.
clean_eval = evTools.get_eval_results(baselineName='clean', dict_rxn2id=dict_rxn2id, method_type='ec')
std_clean, metrics_clean, ec_no_rxn_clean = clean_eval
evTools.display_html_results(
    metrics=metrics_clean,
    std_mean=std_clean,
    no_pred=ec_no_rxn_clean,
    eva_name='clean',
)

Getting evaluation results for clean ...
Calculating mean and std ...
Statistic ec no prediction and ec with no reaction ...


Unnamed: 0,baselineName,runFold,mAccuracy,mPrecision,mRecall,mF1,avgType
0,clean,1,0.26,0.96408,0.327586,0.314655,weighted
1,clean,1,0.26,0.342342,0.327586,0.334802,micro
2,clean,1,0.26,0.994616,0.998235,0.99305,macro
3,clean,1,0.26,0.341667,0.309167,0.317333,samples
4,clean,2,0.35,0.971739,0.408696,0.394493,weighted
5,clean,2,0.35,0.379032,0.408696,0.393305,micro
6,clean,2,0.35,0.993741,0.998425,0.992267,macro
7,clean,2,0.35,0.441667,0.3945,0.396571,samples
8,clean,3,0.35,0.986607,0.375,0.367857,weighted
9,clean,3,0.35,0.368421,0.375,0.371681,micro

Unnamed: 0,baselineName,avgType,Metric,mean,std
0,clean,macro,mAccuracy,0.319,0.047011
1,clean,macro,mF1,0.992315,0.000942
2,clean,macro,mPrecision,0.99405,0.000845
3,clean,macro,mRecall,0.998093,0.000343
4,clean,micro,mAccuracy,0.319,0.047011
5,clean,micro,mF1,0.351079,0.052498
6,clean,micro,mPrecision,0.348917,0.056584
7,clean,micro,mRecall,0.353771,0.04964
8,clean,samples,mAccuracy,0.319,0.047011
9,clean,samples,mF1,0.359926,0.046849

Unnamed: 0,run_fold,test_size,no_prediction_count,ec_without_reaction_count
0,1,50858,0,8422
1,2,50858,0,8454
2,3,50858,0,8445
3,4,50858,0,8359
4,5,50858,0,8505
5,6,50858,0,8526
6,7,50858,0,8498
7,8,50858,0,8381
8,9,50858,0,8542
9,10,50858,0,8550


### 4.4 ECRECer

In [18]:
# ECRECer baseline: fetch the evaluation tables first, then render them as HTML.
ecrecer_eval = evTools.get_eval_results(baselineName='ecrecer', dict_rxn2id=dict_rxn2id, method_type='ec')
std_ecrecer, metrics_ecrecer, ec_no_rxn_ecrecer = ecrecer_eval
evTools.display_html_results(
    metrics=metrics_ecrecer,
    std_mean=std_ecrecer,
    no_pred=ec_no_rxn_ecrecer,
    eva_name='ECRECer',
)

Getting evaluation results for ecrecer ...
Calculating mean and std ...
Statistic ec no prediction and ec with no reaction ...


Unnamed: 0,baselineName,runFold,mAccuracy,mPrecision,mRecall,mF1,avgType
0,ecrecer,1,0.84,0.991523,0.775862,0.771588,weighted
1,ecrecer,1,0.84,0.9,0.775862,0.833333,micro
2,ecrecer,1,0.84,0.99914,0.997614,0.996755,macro
3,ecrecer,1,0.84,0.9,0.8675,0.877333,samples
4,ecrecer,2,0.82,0.98295,0.756522,0.752303,weighted
5,ecrecer,2,0.82,0.87,0.756522,0.809302,micro
6,ecrecer,2,0.82,0.998947,0.997612,0.996562,macro
7,ecrecer,2,0.82,0.87,0.842,0.85,samples
8,ecrecer,3,0.88,0.991231,0.830357,0.829505,weighted
9,ecrecer,3,0.88,0.93,0.830357,0.877358,micro

Unnamed: 0,baselineName,avgType,Metric,mean,std
0,ecrecer,macro,mAccuracy,0.863,0.027101
1,ecrecer,macro,mF1,0.997061,0.000645
2,ecrecer,macro,mPrecision,0.99932,0.000177
3,ecrecer,macro,mRecall,0.997736,0.000585
4,ecrecer,micro,mAccuracy,0.863,0.027101
5,ecrecer,micro,mF1,0.843316,0.026463
6,ecrecer,micro,mPrecision,0.912919,0.019998
7,ecrecer,micro,mRecall,0.784183,0.037599
8,ecrecer,samples,mAccuracy,0.863,0.027101
9,ecrecer,samples,mF1,0.892789,0.021613

Unnamed: 0,run_fold,test_size,no_prediction_count,ec_without_reaction_count
0,1,50858,0,2953
1,2,50858,0,2993
2,3,50858,0,3154
3,4,50858,0,3074
4,5,50858,0,3012
5,6,50858,0,3042
6,7,50858,0,2993
7,8,50858,0,3059
8,9,50858,0,3066
9,10,50858,0,3061


### 4.5 CATFAM

In [19]:
# CatFam baseline: fetch the evaluation tables first, then render them as HTML.
catfam_eval = evTools.get_eval_results(baselineName='catfam', dict_rxn2id=dict_rxn2id, method_type='ec')
std_catfam, metrics_catfam, ec_no_rxn_catfam = catfam_eval
evTools.display_html_results(
    metrics=metrics_catfam,
    std_mean=std_catfam,
    no_pred=ec_no_rxn_catfam,
    eva_name='CatFam',
)

Getting evaluation results for catfam ...
Calculating mean and std ...
Statistic ec no prediction and ec with no reaction ...


Unnamed: 0,baselineName,runFold,mAccuracy,mPrecision,mRecall,mF1,avgType
0,catfam,1,0.76,0.921751,0.724138,0.701196,weighted
1,catfam,1,0.76,0.815534,0.724138,0.767123,micro
2,catfam,1,0.76,0.999126,0.997417,0.996554,macro
3,catfam,1,0.76,0.81,0.783333,0.791667,samples
4,catfam,2,0.71,0.886957,0.66087,0.609611,weighted
5,catfam,2,0.71,0.76,0.66087,0.706977,micro
6,catfam,2,0.71,0.99926,0.996704,0.996009,macro
7,catfam,2,0.71,0.76,0.7295,0.737333,samples
8,catfam,3,0.74,0.886128,0.714286,0.660858,weighted
9,catfam,3,0.74,0.776699,0.714286,0.744186,micro

Unnamed: 0,baselineName,avgType,Metric,mean,std
0,catfam,macro,mAccuracy,0.75,0.02708
1,catfam,macro,mF1,0.996144,0.000616
2,catfam,macro,mPrecision,0.999276,0.000132
3,catfam,macro,mRecall,0.996853,0.000586
4,catfam,micro,mAccuracy,0.75,0.02708
5,catfam,micro,mF1,0.740563,0.034388
6,catfam,micro,mPrecision,0.792265,0.027731
7,catfam,micro,mRecall,0.695594,0.041817
8,catfam,samples,mAccuracy,0.75,0.02708
9,catfam,samples,mF1,0.77508,0.030315

Unnamed: 0,run_fold,test_size,no_prediction_count,ec_without_reaction_count
0,1,50858,0,2109
1,2,50858,0,2149
2,3,50858,0,2221
3,4,50858,0,2233
4,5,50858,0,2214
5,6,50858,0,2158
6,7,50858,0,2213
7,8,50858,0,2198
8,9,50858,0,2123
9,10,50858,0,2104


### 4.6 PRIAM

In [20]:
# PRIAM baseline: fetch the evaluation tables first, then render them as HTML.
priam_eval = evTools.get_eval_results(baselineName='priam', dict_rxn2id=dict_rxn2id, method_type='ec')
std_priam, metrics_priam, ec_no_rxn_priam = priam_eval
evTools.display_html_results(
    metrics=metrics_priam,
    std_mean=std_priam,
    no_pred=ec_no_rxn_priam,
    eva_name='PRIAM',
)

Getting evaluation results for priam ...
Calculating mean and std ...
Statistic ec no prediction and ec with no reaction ...


Unnamed: 0,baselineName,runFold,mAccuracy,mPrecision,mRecall,mF1,avgType
0,priam,1,0.06,0.96954,0.387931,0.369048,weighted
1,priam,1,0.06,0.147059,0.387931,0.21327,micro
2,priam,1,0.06,0.977581,0.998807,0.976484,macro
3,priam,1,0.06,0.1518,0.331667,0.188848,samples
4,priam,2,0.12,0.978261,0.478261,0.464058,weighted
5,priam,2,0.12,0.214008,0.478261,0.295699,micro
6,priam,2,0.12,0.982552,0.999046,0.981655,macro
7,priam,2,0.12,0.239532,0.428,0.27314,samples
8,priam,3,0.04,0.965402,0.339286,0.314484,weighted
9,priam,3,0.04,0.141264,0.339286,0.199475,micro

Unnamed: 0,baselineName,avgType,Metric,mean,std
0,priam,macro,mAccuracy,0.081,0.029609
1,priam,macro,mF1,0.980676,0.00262
2,priam,macro,mPrecision,0.982226,0.002808
3,priam,macro,mRecall,0.99833,0.000444
4,priam,micro,mAccuracy,0.081,0.029609
5,priam,micro,mF1,0.237074,0.035394
6,priam,micro,mPrecision,0.173529,0.031032
7,priam,micro,mRecall,0.379523,0.054394
8,priam,samples,mAccuracy,0.081,0.029609
9,priam,samples,mF1,0.217152,0.036167

Unnamed: 0,run_fold,test_size,no_prediction_count,ec_without_reaction_count
0,1,50858,14343,2625
1,2,50858,14432,2725
2,3,50858,14313,2782
3,4,50858,14233,2876
4,5,50858,14307,2699
5,6,50858,14358,2746
6,7,50858,14319,2722
7,8,50858,14326,2794
8,9,50858,14409,2745
9,10,50858,14627,2658


## 5. Evaluation Results

In [21]:
# Stack the per-method mean/std summaries row-wise into a single table.
# NOTE(review): 'ecpred' appears in `methods` but no std_ecpred is included here -- confirm
# whether that omission is intentional.
per_method_summaries = [
    std_blast,
    std_deepec,
    std_clean,
    std_ecrecer,
    std_catfam,
    std_priam,
]
res_std = pd.concat(per_method_summaries, axis=0)
res_std.head(3)

Unnamed: 0,baselineName,avgType,Metric,mean,std
0,blast,macro,mAccuracy,0.836434,0.001912
1,blast,macro,mF1,0.159373,0.003008
2,blast,macro,mPrecision,0.178795,0.003609


In [22]:
# Figure for the macro-averaged rows of the combined summary table.
macro_stats = res_std.loc[res_std['avgType'] == 'macro'].reset_index(drop=True)
evTools.show_ec_methods_10_eva_fig(res_metrics_data=macro_stats)

In [23]:
# Figure for the micro-averaged rows of the combined summary table.
micro_stats = res_std.loc[res_std['avgType'] == 'micro'].reset_index(drop=True)
evTools.show_ec_methods_10_eva_fig(res_metrics_data=micro_stats)

In [24]:
# Figure for the weighted-average rows of the combined summary table.
weighted_stats = res_std.loc[res_std['avgType'] == 'weighted'].reset_index(drop=True)
evTools.show_ec_methods_10_eva_fig(res_metrics_data=weighted_stats)

In [25]:
# Figure for the samples-averaged rows of the combined summary table.
samples_stats = res_std.loc[res_std['avgType'] == 'samples'].reset_index(drop=True)
evTools.show_ec_methods_10_eva_fig(res_metrics_data=samples_stats)

### 5.1 isEnzyme prediction

In [16]:
print('isEnzyme Predcition Results - EC METHODs')
# NOTE(review): `res_method_ec` is not assigned in any cell shown in this notebook, and this
# cell's execution count is out of sequence -- it must be loaded before this cell can
# run on a fresh kernel.
# The method list has its own name here; the plotting cell that follows rebinds `method`
# as its loop variable. Note that this list uses 'ecblast' where `methods` uses 'blast'.
isenzyme_methods = ['ecblast', 'deepec', 'clean', 'ecrecer', 'ecpred', 'catfam', 'priam']

# One result record per baseline, as returned by btools.eva_isenzyme.
isenzyme_rows = [
    btools.eva_isenzyme(baselineName=name, res_df=res_method_ec, category='ec')
    for name in isenzyme_methods
]

# Column labels: the 8th/9th fields were previously labelled 'PPV(Sensitivity)' and
# 'NPV(Specificity)'. In the saved output the PPV column equals Precision in every row
# (sensitivity is Recall), and for baselines without unpredicted samples the NPV column
# equals TN / (TN + FN), which is the negative predictive value, not specificity.
resl = pd.DataFrame(
    isenzyme_rows,
    columns=['baselineName', 'Accuracy', 'Precision', 'Recall', 'PPV', 'NPV', 'F1', 'TP', 'FP', 'FN', 'TN', 'UP', 'UN'],
)

# Displayed ranked by F1, best first; `resl` itself keeps its original row order.
resl.sort_values(by=['F1'], ascending=False).reset_index(drop=True)

isEnzyme Predcition Results - EC METHODs


Unnamed: 0,baselineName,Accuracy,Precision,Recall,PPV(Sensitivity),NPV(Specificity),F1,TP,FP,FN,TN,UP,UN
0,ecrecer,0.789493,0.532835,0.911388,0.532835,0.964646,0.672499,2921,2561,284,7749,0,0
1,ecpred,0.597839,0.346035,0.631643,0.346035,0.703467,0.447122,2579,3930,560,5499,65,879
2,catfam,0.810285,0.749416,0.300468,0.749416,0.81668,0.428953,963,322,2242,9988,0,0
3,clean,0.237144,0.237144,1.0,0.237144,0.0,0.383373,3205,10310,0,0,0,0
4,ecblast,0.547244,0.302502,0.343502,0.302502,0.37037,0.321701,2466,1406,433,4930,306,3974
5,priam,0.218498,0.218498,0.397069,0.218498,0.0,0.281882,2953,6078,0,0,252,4232
6,deepec,0.090196,0.090196,0.093446,0.090196,0.0,0.091792,1219,470,0,0,1986,9840


In [74]:
# Transpose so that each baseline becomes a column (ordered by ascending F1) and keep
# only the first six metric rows for plotting.
fig_data = resl.sort_values(by=['F1']).set_index('baselineName').T.head(6)

# One bar trace per baseline; the bars are annotated with their values.
plotdata = [
    go.Bar(
        x=fig_data.index,
        y=fig_data[str(baseline)],
        name=str(baseline),
        text=fig_data[str(baseline)],
        textposition='auto',
        texttemplate='%{text:.2f}',  # value labels with two decimals
    )
    for baseline in fig_data.columns.values
]

fig = go.Figure(data=plotdata, layout=dict(barcornerradius=10))

fig.show()

### 5.2 Reaction prediction

In [6]:
# Build one label column per method from that method's predicted-reaction column.
# Pairs are (label column to create, source reaction column); blast reads 'reaction_ecblast'.
label_sources = [
    ('lb_rxn_blast', 'reaction_ecblast'),
    ('lb_rxn_deepec', 'reaction_deepec'),
    ('lb_rxn_clean', 'reaction_clean'),
    ('lb_rxn_ecrecer', 'reaction_ecrecer'),
    ('lb_rxn_ecpred', 'reaction_ecpred'),
    ('lb_rxn_catfam', 'reaction_catfam'),
    ('lb_rxn_priam', 'reaction_priam'),
]


def _reaction_to_label(reaction):
    """Turn one predicted-reaction cell into a label via btools.make_label."""
    return btools.make_label(reaction_id=str(reaction), rxn_label_dict=dict_rxn2id)


for label_col, reaction_col in label_sources:
    res_method_ec[label_col] = res_method_ec[reaction_col].apply(_reaction_to_label)

# Metrics on the full set.
metrics_rxn_fullset = btools.rxn_eva_metric(eva_df=res_method_ec, eva_name='[FULL SET]', methods=methods)

# Rows whose ground-truth reaction is '-' form the non-enzyme subset; the rest are enzymes.
is_non_enzyme = res_method_ec['rxn_groundtruth'] == '-'

res_noneenzyme = res_method_ec[is_non_enzyme].reset_index(drop=True)
metrics_rxn_noneenzyme = btools.rxn_eva_metric(eva_df=res_noneenzyme, eva_name='[NONE ENZYME]', methods=methods)

res_enzyme = res_method_ec[~is_non_enzyme].reset_index(drop=True)
metrics_rxn_enzyme = btools.rxn_eva_metric(eva_df=res_enzyme, eva_name='[ENZYME]', methods=methods)

Evaluating: Reaction Predcition Results [FULL SET]
Evaluating: Reaction Predcition Results [NONE ENZYME]
Evaluating: Reaction Predcition Results [ENZYME]


In [11]:
# Show the three reaction-prediction tables side by side, each ranked by mF1 (best first).
result_panels = [
    ('[FULL SET]', metrics_rxn_fullset),
    ('[NONE-ENZYME]', metrics_rxn_noneenzyme),
    ('[ENZYME]', metrics_rxn_enzyme),
]


def _panel_div(tag, table):
    """Return the floating <div> for one result table, headed by its subset tag."""
    ranked = table.sort_values(by=['mF1'], ascending=False).reset_index(drop=True)
    heading = '<h2>Reaction Predcition Results <span style="color:red"> ' + tag + '</span></h2>'
    return '<div style="float:left; width: 25%;">' + heading + ranked.to_html() + ' </div>'


# Newline plus indentation placed before, between and after the panels in the markup.
panel_sep = '\n     '
HTML(panel_sep + panel_sep.join(_panel_div(tag, table) for tag, table in result_panels) + panel_sep)

Unnamed: 0,baselineName,mAccuracy,mPrecision,mRecall,mF1
0,ecrecer,0.6899,0.967961,0.651723,0.714824
1,catfam,0.770477,0.870005,0.670351,0.620349
2,blast,0.43337,0.907868,0.40919,0.504355
3,ecpred,0.420792,0.926196,0.36349,0.455263
4,clean,0.078949,0.879031,0.145483,0.097747
5,priam,0.017832,0.853372,0.160137,0.075407
6,deepec,0.03485,0.9784,0.052406,0.059727

Unnamed: 0,baselineName,mAccuracy,mPrecision,mRecall,mF1
0,catfam,0.968768,1.0,0.968768,0.984136
1,ecrecer,0.7516,1.0,0.7516,0.858187
2,ecpred,0.53356,1.0,0.53356,0.695845
3,blast,0.478177,1.0,0.478177,0.646982
4,deepec,0.0,1.0,0.0,0.0
5,clean,0.0,1.0,0.0,0.0
6,priam,0.0,1.0,0.0,0.0

Unnamed: 0,baselineName,mAccuracy,mPrecision,mRecall,mF1
0,ecrecer,0.49142,0.979519,0.474029,0.486309
1,clean,0.332917,0.8617,0.404314,0.389627
2,blast,0.289236,0.931659,0.286454,0.297081
3,priam,0.075195,0.692765,0.445039,0.289674
4,deepec,0.146958,0.964808,0.145643,0.171148
5,catfam,0.132605,0.978662,0.139431,0.15227
6,ecpred,0.058034,0.977168,0.060915,0.073204
