## 反应直接预测结果分析
> 2024-12-03

### 1. 导入必要的包

In [1]:
# Standard Library Imports
import os
import sys

# Third-party Imports
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import plotly.graph_objects as go
from IPython.display import HTML
from pandarallel import pandarallel  # Importing pandarallel for parallel processing

# Make project-local modules importable from this notebook.
# NOTE: '__file__' below is a string literal, not the __file__ variable, so
# realpath() resolves it against the current working directory and dirname()
# yields that directory (presumably the notebook's own folder - confirm).
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
# Also search the parent of the current working directory.
sys.path.insert(1, '../')

# Local imports (expected to be found through the sys.path entries added above)
from config import conf as cfg
from tools import btools
import evTools
# Initialize pandarallel for parallel pandas operations, with progress bars disabled
pandarallel.initialize(progress_bar=False)

# Load IPython's autoreload extension. Mode 2 reloads all modules before each
# cell is executed, so edits to local modules are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### 2. 加载测试数据集

In [2]:
# Load the reaction-to-ID encoding dictionary from its JSON file.
with open(cfg.FILE_DS_DICT_RXN2ID, "r") as fh_rxn2id:
    dict_rxn2id = json.load(fh_rxn2id)

# Report how many reactions the dictionary contains.
print(f'加载反应编码字典完成，共有 {len(dict_rxn2id)} 个反应。')

print('Loading validation datasets feather path ...')

加载反应编码字典完成，共有 10479 个反应。
Loading validation datasets feather path ...


## 4. Load results
### 4.1 Blast

In [3]:
# BLAST baseline (direct prediction): fetch the evaluation results, then render
# the per-fold metrics, the mean/std summary and the no-prediction statistics.
(std_blast, metrics_blast, ec_no_rxn_blast) = evTools.get_eval_results(
    baselineName='blast',
    dict_rxn2id=dict_rxn2id,
    method_type='direct',
)
evTools.display_html_results(
    metrics=metrics_blast,
    std_mean=std_blast,
    no_pred=ec_no_rxn_blast,
    eva_name='Blast-direct',
)

Getting evaluation results for blast ...
Calculating mean and std ...
Statistic ec no prediction and ec with no reaction ...


Unnamed: 0,baselineName,runFold,mAccuracy,mPrecision,mRecall,mF1,avgType
0,blast,1,0.831669,0.879424,0.950254,0.895608,weighted
1,blast,1,0.831669,0.623579,0.950254,0.753013,micro
2,blast,1,0.831669,0.416449,0.96359,0.428415,macro
3,blast,1,0.831669,0.876632,0.946831,0.893186,samples
4,blast,2,0.832711,0.876026,0.949407,0.892931,weighted
5,blast,2,0.832711,0.63011,0.949407,0.757486,micro
6,blast,2,0.832711,0.425493,0.957683,0.437911,macro
7,blast,2,0.832711,0.877638,0.947314,0.894095,samples
8,blast,3,0.832435,0.87678,0.951163,0.894422,weighted
9,blast,3,0.832435,0.621594,0.951163,0.751848,micro

Unnamed: 0,baselineName,avgType,Metric,mean,std
0,blast,macro,mAccuracy,0.832695,0.001639
1,blast,macro,mF1,0.439365,0.006131
2,blast,macro,mPrecision,0.42649,0.006962
3,blast,macro,mRecall,0.960779,0.002316
4,blast,micro,mAccuracy,0.832695,0.001639
5,blast,micro,mF1,0.756429,0.004556
6,blast,micro,mPrecision,0.628279,0.00615
7,blast,micro,mRecall,0.950299,0.000708
8,blast,samples,mAccuracy,0.832695,0.001639
9,blast,samples,mF1,0.894027,0.001242

Unnamed: 0,run_fold,test_size,no_prediction_count
0,1,50858,2424
1,2,50858,2386
2,3,50858,2379
3,4,50858,2301
4,5,50858,2418
5,6,50858,2391
6,7,50858,2369
7,8,50858,2381
8,9,50858,2449
9,10,50858,2431


### 4.2 Unirep

In [4]:
# UniRep embeddings with Euclidean distance (direct prediction): fetch the
# evaluation results and render them.
(std_unirep_euclidean, metrics_unirep_euclidean, ec_no_rxn_unirep_euclidean) = evTools.get_eval_results(
    baselineName='unirep_euclidean',
    dict_rxn2id=dict_rxn2id,
    method_type='direct',
)
evTools.display_html_results(
    metrics=metrics_unirep_euclidean,
    std_mean=std_unirep_euclidean,
    no_pred=ec_no_rxn_unirep_euclidean,
    eva_name='Unirep Euclidean',
)

Getting evaluation results for unirep_euclidean ...
Calculating mean and std ...


Unnamed: 0,baselineName,runFold,mAccuracy,mPrecision,mRecall,mF1,avgType
0,unirep_euclidean,1,0.889575,0.892769,0.963311,0.915861,weighted
1,unirep_euclidean,1,0.889575,0.825235,0.963311,0.888943,micro
2,unirep_euclidean,1,0.889575,0.729777,0.926269,0.704546,macro
3,unirep_euclidean,1,0.889575,0.925941,0.969456,0.938116,samples
4,unirep_euclidean,2,0.889595,0.892241,0.960985,0.913871,weighted
5,unirep_euclidean,2,0.889595,0.825177,0.960985,0.887918,micro
6,unirep_euclidean,2,0.889595,0.736967,0.913366,0.70436,macro
7,unirep_euclidean,2,0.889595,0.926429,0.969719,0.93852,samples
8,unirep_euclidean,3,0.888946,0.88975,0.961411,0.913392,weighted
9,unirep_euclidean,3,0.888946,0.823032,0.961411,0.886856,micro

Unnamed: 0,baselineName,avgType,Metric,mean,std
0,unirep_euclidean,macro,mAccuracy,0.889701,0.001506
1,unirep_euclidean,macro,mF1,0.703718,0.009531
2,unirep_euclidean,macro,mPrecision,0.73161,0.011152
3,unirep_euclidean,macro,mRecall,0.918939,0.005109
4,unirep_euclidean,micro,mAccuracy,0.889701,0.001506
5,unirep_euclidean,micro,mF1,0.888701,0.002087
6,unirep_euclidean,micro,mPrecision,0.82595,0.003642
7,unirep_euclidean,micro,mRecall,0.961781,0.001047
8,unirep_euclidean,samples,mAccuracy,0.889701,0.001506
9,unirep_euclidean,samples,mF1,0.938241,0.000984


In [5]:
# UniRep embeddings with cosine distance (direct prediction): fetch the
# evaluation results and render them.
(std_unirep_cosine, metrics_unirep_cosine, ec_no_rxn_unirep_cosine) = evTools.get_eval_results(
    baselineName='unirep_cosine',
    dict_rxn2id=dict_rxn2id,
    method_type='direct',
)
evTools.display_html_results(
    metrics=metrics_unirep_cosine,
    std_mean=std_unirep_cosine,
    no_pred=ec_no_rxn_unirep_cosine,
    eva_name='Unirep Cosine',
)

Getting evaluation results for unirep_cosine ...
Calculating mean and std ...


Unnamed: 0,baselineName,runFold,mAccuracy,mPrecision,mRecall,mF1,avgType
0,unirep_cosine,1,0.890656,0.89393,0.964254,0.916958,weighted
1,unirep_cosine,1,0.890656,0.826613,0.964254,0.890144,micro
2,unirep_cosine,1,0.890656,0.732471,0.927823,0.706907,macro
3,unirep_cosine,1,0.890656,0.926858,0.970005,0.938906,samples
4,unirep_cosine,2,0.890578,0.894111,0.961342,0.914934,weighted
5,unirep_cosine,2,0.890578,0.825507,0.961342,0.888261,micro
6,unirep_cosine,2,0.890578,0.733132,0.913889,0.699769,macro
7,unirep_cosine,2,0.890578,0.927055,0.970037,0.939039,samples
8,unirep_cosine,3,0.890696,0.890728,0.961991,0.914388,weighted
9,unirep_cosine,3,0.890696,0.823444,0.961991,0.887342,micro

Unnamed: 0,baselineName,avgType,Metric,mean,std
0,unirep_cosine,macro,mAccuracy,0.890761,0.00159
1,unirep_cosine,macro,mF1,0.705396,0.009173
2,unirep_cosine,macro,mPrecision,0.733287,0.011174
3,unirep_cosine,macro,mRecall,0.91966,0.00507
4,unirep_cosine,micro,mAccuracy,0.890761,0.00159
5,unirep_cosine,micro,mF1,0.88982,0.002254
6,unirep_cosine,micro,mPrecision,0.827592,0.00386
7,unirep_cosine,micro,mRecall,0.962178,0.00104
8,unirep_cosine,samples,mAccuracy,0.890761,0.00159
9,unirep_cosine,samples,mF1,0.938799,0.001002


### 4.3 ESM

In [6]:
# ESM embeddings with Euclidean distance (direct prediction): fetch the
# evaluation results and render them.
(std_esm_euclidean, metrics_esm_euclidean, ec_no_rxn_esm_euclidean) = evTools.get_eval_results(
    baselineName='esm_euclidean',
    dict_rxn2id=dict_rxn2id,
    method_type='direct',
)
evTools.display_html_results(
    metrics=metrics_esm_euclidean,
    std_mean=std_esm_euclidean,
    no_pred=ec_no_rxn_esm_euclidean,
    eva_name='ESM Euclidean',
)

Getting evaluation results for esm_euclidean ...
Calculating mean and std ...


Unnamed: 0,baselineName,runFold,mAccuracy,mPrecision,mRecall,mF1,avgType
0,esm_euclidean,1,0.935133,0.940896,0.981476,0.953393,weighted
1,esm_euclidean,1,0.935133,0.891606,0.981476,0.934385,micro
2,esm_euclidean,1,0.935133,0.782764,0.948362,0.759314,macro
3,esm_euclidean,1,0.935133,0.958464,0.98562,0.965996,samples
4,esm_euclidean,2,0.933167,0.937562,0.978982,0.949517,weighted
5,esm_euclidean,2,0.933167,0.888709,0.978982,0.931664,micro
6,esm_euclidean,2,0.933167,0.789517,0.942028,0.762904,macro
7,esm_euclidean,2,0.933167,0.95724,0.984729,0.964734,samples
8,esm_euclidean,3,0.933875,0.937052,0.978685,0.949509,weighted
9,esm_euclidean,3,0.933875,0.886449,0.978685,0.930286,micro

Unnamed: 0,baselineName,avgType,Metric,mean,std
0,esm_euclidean,macro,mAccuracy,0.935121,0.001567
1,esm_euclidean,macro,mF1,0.761546,0.00883
2,esm_euclidean,macro,mPrecision,0.784878,0.00887
3,esm_euclidean,macro,mRecall,0.943609,0.002835
4,esm_euclidean,micro,mAccuracy,0.935121,0.001567
5,esm_euclidean,micro,mF1,0.93246,0.002193
6,esm_euclidean,micro,mPrecision,0.88938,0.003587
7,esm_euclidean,micro,mRecall,0.979933,0.000912
8,esm_euclidean,samples,mAccuracy,0.935121,0.001567
9,esm_euclidean,samples,mF1,0.965689,0.000882


In [7]:
# ESM embeddings with cosine distance (direct prediction): fetch the
# evaluation results and render them.
(std_esm_cosine, metrics_esm_cosine, ec_no_rxn_esm_cosine) = evTools.get_eval_results(
    baselineName='esm_cosine',
    dict_rxn2id=dict_rxn2id,
    method_type='direct',
)
evTools.display_html_results(
    metrics=metrics_esm_cosine,
    std_mean=std_esm_cosine,
    no_pred=ec_no_rxn_esm_cosine,
    eva_name='ESM Cosine',
)

Getting evaluation results for esm_cosine ...
Calculating mean and std ...


Unnamed: 0,baselineName,runFold,mAccuracy,mPrecision,mRecall,mF1,avgType
0,esm_cosine,1,0.936765,0.941749,0.982007,0.95448,weighted
1,esm_cosine,1,0.936765,0.894651,0.982007,0.936296,micro
2,esm_cosine,1,0.936765,0.791777,0.950621,0.769884,macro
3,esm_cosine,1,0.936765,0.959638,0.985791,0.96692,samples
4,esm_cosine,2,0.935133,0.939775,0.979118,0.950952,weighted
5,esm_cosine,2,0.935133,0.892933,0.979118,0.934042,micro
6,esm_cosine,2,0.935133,0.79384,0.941738,0.765679,macro
7,esm_cosine,2,0.935133,0.958643,0.98483,0.965821,samples
8,esm_cosine,3,0.935684,0.939547,0.979111,0.951304,weighted
9,esm_cosine,3,0.935684,0.890548,0.979111,0.932732,micro

Unnamed: 0,baselineName,avgType,Metric,mean,std
0,esm_cosine,macro,mAccuracy,0.93695,0.001635
1,esm_cosine,macro,mF1,0.769478,0.007565
2,esm_cosine,macro,mPrecision,0.793549,0.007176
3,esm_cosine,macro,mRecall,0.94424,0.003049
4,esm_cosine,micro,mAccuracy,0.93695,0.001635
5,esm_cosine,micro,mF1,0.935216,0.002274
6,esm_cosine,micro,mPrecision,0.894122,0.003874
7,esm_cosine,micro,mRecall,0.980278,0.000878
8,esm_cosine,samples,mAccuracy,0.93695,0.001635
9,esm_cosine,samples,mF1,0.966874,0.000975


### 4.4 T5

In [8]:
# T5 embeddings with Euclidean distance (direct prediction): fetch the
# evaluation results and render them.
(std_t5_euclidean, metrics_t5_euclidean, ec_no_rxn_t5_euclidean) = evTools.get_eval_results(
    baselineName='t5_euclidean',
    dict_rxn2id=dict_rxn2id,
    method_type='direct',
)
evTools.display_html_results(
    metrics=metrics_t5_euclidean,
    std_mean=std_t5_euclidean,
    no_pred=ec_no_rxn_t5_euclidean,
    eva_name='T5-Euclidean',
)

Getting evaluation results for t5_euclidean ...
Calculating mean and std ...


Unnamed: 0,baselineName,runFold,mAccuracy,mPrecision,mRecall,mF1,avgType
0,t5_euclidean,1,0.954088,0.962281,0.98432,0.967084,weighted
1,t5_euclidean,1,0.954088,0.916051,0.98432,0.94896,micro
2,t5_euclidean,1,0.954088,0.793163,0.953294,0.772146,macro
3,t5_euclidean,1,0.954088,0.97035,0.987983,0.975186,samples
4,t5_euclidean,2,0.952554,0.958974,0.983549,0.964047,weighted
5,t5_euclidean,2,0.952554,0.910823,0.983549,0.94579,micro
6,t5_euclidean,2,0.952554,0.78874,0.948093,0.764049,macro
7,t5_euclidean,2,0.952554,0.969822,0.988623,0.974864,samples
8,t5_euclidean,3,0.953282,0.962179,0.98363,0.966474,weighted
9,t5_euclidean,3,0.953282,0.918272,0.98363,0.949828,micro

Unnamed: 0,baselineName,avgType,Metric,mean,std
0,t5_euclidean,macro,mAccuracy,0.954092,0.000976
1,t5_euclidean,macro,mF1,0.77166,0.007221
2,t5_euclidean,macro,mPrecision,0.79536,0.006978
3,t5_euclidean,macro,mRecall,0.948836,0.00281
4,t5_euclidean,micro,mAccuracy,0.954092,0.000976
5,t5_euclidean,micro,mF1,0.948838,0.001863
6,t5_euclidean,micro,mPrecision,0.916466,0.003473
7,t5_euclidean,micro,mRecall,0.983587,0.000479
8,t5_euclidean,samples,mAccuracy,0.954092,0.000976
9,t5_euclidean,samples,mF1,0.975479,0.000582


In [9]:
# T5 embeddings with cosine distance (direct prediction): fetch the evaluation
# results and render them.
std_t5_cosine, metrics_t5_cosine, ec_no_rxn_t5_cosine = evTools.get_eval_results(
    baselineName='t5_cosine',
    dict_rxn2id=dict_rxn2id,
    method_type='direct',
)
# The display label must name the cosine variant; it was previously copied from
# the Euclidean cell ('T5-Euclidean'), which mislabeled these results.
evTools.display_html_results(
    metrics=metrics_t5_cosine,
    std_mean=std_t5_cosine,
    no_pred=ec_no_rxn_t5_cosine,
    eva_name='T5-Cosine',
)

Getting evaluation results for t5_cosine ...
Calculating mean and std ...


Unnamed: 0,baselineName,runFold,mAccuracy,mPrecision,mRecall,mF1,avgType
0,t5_cosine,1,0.954186,0.962347,0.984338,0.967073,weighted
1,t5_cosine,1,0.954186,0.915585,0.984338,0.948718,micro
2,t5_cosine,1,0.954186,0.792602,0.953247,0.771326,macro
3,t5_cosine,1,0.954186,0.970402,0.988038,0.97521,samples
4,t5_cosine,2,0.953341,0.959822,0.983481,0.964536,weighted
5,t5_cosine,2,0.953341,0.912065,0.983481,0.946428,micro
6,t5_cosine,2,0.953341,0.78992,0.948462,0.764916,macro
7,t5_cosine,2,0.953341,0.970347,0.988629,0.975232,samples
8,t5_cosine,3,0.953537,0.962086,0.983664,0.966485,weighted
9,t5_cosine,3,0.953537,0.918245,0.983664,0.94983,micro

Unnamed: 0,baselineName,avgType,Metric,mean,std
0,t5_cosine,macro,mAccuracy,0.954644,0.001037
1,t5_cosine,macro,mF1,0.773048,0.007379
2,t5_cosine,macro,mPrecision,0.796889,0.00729
3,t5_cosine,macro,mRecall,0.948813,0.002835
4,t5_cosine,micro,mAccuracy,0.954644,0.001037
5,t5_cosine,micro,mF1,0.949295,0.00197
6,t5_cosine,micro,mPrecision,0.9173,0.003636
7,t5_cosine,micro,mRecall,0.98361,0.000481
8,t5_cosine,samples,mAccuracy,0.954644,0.001037
9,t5_cosine,samples,mF1,0.975733,0.000623


### 4.5 ALFP

In [10]:
# RXNRECer (direct prediction): fetch the evaluation results and render them.
# NOTE(review): the display label 'ALFP' and the *_ecrecer variable names differ
# from baselineName 'RXNRECer' - confirm the naming is intentional.
(std_ecrecer, metrics_ecrecer, ec_no_rxn_ecrecer) = evTools.get_eval_results(
    baselineName='RXNRECer',
    dict_rxn2id=dict_rxn2id,
    method_type='direct',
)
evTools.display_html_results(
    metrics=metrics_ecrecer,
    std_mean=std_ecrecer,
    no_pred=ec_no_rxn_ecrecer,
    eva_name='ALFP',
)

Getting evaluation results for RXNRECer ...
Calculating mean and std ...


Unnamed: 0,baselineName,runFold,mAccuracy,mPrecision,mRecall,mF1,avgType
0,RXNRECer,1,0.987868,0.993214,0.981904,0.982549,weighted
1,RXNRECer,1,0.987868,0.989586,0.981904,0.98573,micro
2,RXNRECer,1,0.987868,0.983285,0.957753,0.94683,macro
3,RXNRECer,1,0.987868,0.991223,0.990404,0.990378,samples
4,RXNRECer,2,0.987259,0.992444,0.979576,0.980261,weighted
5,RXNRECer,2,0.987259,0.987827,0.979576,0.983684,micro
6,RXNRECer,2,0.987259,0.978083,0.947131,0.931573,macro
7,RXNRECer,2,0.987259,0.990433,0.989777,0.989713,samples
8,RXNRECer,3,0.98716,0.992512,0.979572,0.980189,weighted
9,RXNRECer,3,0.98716,0.989357,0.979572,0.98444,micro

Unnamed: 0,baselineName,avgType,Metric,mean,std
0,RXNRECer,macro,mAccuracy,0.987282,0.000576
1,RXNRECer,macro,mF1,0.938766,0.006167
2,RXNRECer,macro,mPrecision,0.981666,0.002699
3,RXNRECer,macro,mRecall,0.950997,0.00526
4,RXNRECer,micro,mAccuracy,0.987282,0.000576
5,RXNRECer,micro,mF1,0.984657,0.000957
6,RXNRECer,micro,mPrecision,0.988962,0.000877
7,RXNRECer,micro,mRecall,0.980389,0.001319
8,RXNRECer,samples,mAccuracy,0.987282,0.000576
9,RXNRECer,samples,mF1,0.98998,0.000436


## 5. 整合指标

In [69]:
# Combine the mean/std summaries of every direct-prediction method into one
# table, cached on disk as a tab-separated file.
# NOTE: when the cache file already exists it is loaded as-is and the std_*
# frames computed in the cells above are ignored. Delete the file to refresh it
# after re-running the evaluations, otherwise stale numbers are displayed.
file_metrics_all_direct = f'{cfg.DIR_PROJECT_ROOT}/results/intermediate/direct/std_10_fold_direct_methods.csv'
if not os.path.exists(file_metrics_all_direct):
    # ignore_index=True yields a clean 0..n-1 index, identical to what the
    # read_csv branch returns, instead of repeating each method's own index.
    res_std = pd.concat(
        [
            std_blast,
            std_unirep_euclidean,
            std_unirep_cosine,
            std_esm_euclidean,
            std_esm_cosine,
            std_t5_euclidean,
            std_t5_cosine,
            std_ecrecer,
        ],
        axis=0,
        ignore_index=True,
    )
    # Create the output directory first so to_csv cannot fail when it is missing.
    os.makedirs(os.path.dirname(file_metrics_all_direct), exist_ok=True)
    res_std.to_csv(file_metrics_all_direct, index=False, sep='\t')
else:
    res_std = pd.read_csv(file_metrics_all_direct, sep='\t')
res_std.head(3)
    

Unnamed: 0,baselineName,avgType,Metric,mean,std
0,blast,macro,mAccuracy,0.828,0.037357
1,blast,macro,mF1,0.994924,0.002831
2,blast,macro,mPrecision,0.994981,0.002829


In [65]:
# Compare all methods using only the macro-averaged rows of the summary table.
evTools.show_ec_methods_10_eva_fig(
    res_metrics_data=res_std.loc[res_std['avgType'] == 'macro'].reset_index(drop=True)
)

In [66]:
# Compare all methods using only the micro-averaged rows of the summary table.
evTools.show_ec_methods_10_eva_fig(
    res_metrics_data=res_std.loc[res_std['avgType'] == 'micro'].reset_index(drop=True)
)

In [63]:
# Compare all methods using only the weighted-averaged rows of the summary table.
evTools.show_ec_methods_10_eva_fig(
    res_metrics_data=res_std.loc[res_std['avgType'] == 'weighted'].reset_index(drop=True)
)

In [57]:
# Compare all methods using only the samples-averaged rows of the summary table.
evTools.show_ec_methods_10_eva_fig(
    res_metrics_data=res_std.loc[res_std['avgType'] == 'samples'].reset_index(drop=True)
)

In [25]:
# Show the micro-averaged rows of the combined summary table.
res_std.loc[res_std['avgType'] == 'micro']

Unnamed: 0,baselineName,avgType,Metric,mean,std
4,blast,micro,mAccuracy,0.828,0.037357
5,blast,micro,mF1,0.789212,0.080942
6,blast,micro,mPrecision,0.677126,0.110561
7,blast,micro,mRecall,0.958911,0.014166
4,unirep_euclidean,micro,mAccuracy,0.889701,0.001506
5,unirep_euclidean,micro,mF1,0.888701,0.002087
6,unirep_euclidean,micro,mPrecision,0.82595,0.003642
7,unirep_euclidean,micro,mRecall,0.961781,0.001047
4,unirep_cosine,micro,mAccuracy,0.890761,0.00159
5,unirep_cosine,micro,mF1,0.88982,0.002254
