## 反应直接预测结果分析
> 2024-12-03

### 1. 导入必要的包

In [10]:
# Standard Library Imports
import os
import sys

# Third-party Imports
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import plotly.graph_objects as go
from IPython.display import HTML
from pandarallel import pandarallel  # Importing pandarallel for parallel processing

# Make project-local modules importable.
# A notebook has no `__file__`, so the first entry is simply the resolved
# current working directory; the second entry is its parent directory.
# Both are placed at the very front of the import search path, in that order.
sys.path[:0] = [os.path.realpath(os.getcwd()), '../']

# Local Imports
from config import conf as cfg
from tools import btools
import evTools
# Initialize parallel processing
pandarallel.initialize(progress_bar=False)

# Enable autoreloading of modules in IPython
%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### 2. 加载测试数据集

In [2]:
# Load the reaction-encoding dictionary from the JSON file named in the config.
# NOTE(review): presumably maps reaction identifiers to integer ids — confirm
# against the file contents.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's default text encoding.
with open(cfg.FILE_DS_DICT_RXN2ID, "r", encoding="utf-8") as json_file:
    dict_rxn2id = json.load(json_file)

# Report how many entries were loaded (the file is already closed here).
print(f'加载反应编码字典完成，共有 {len(dict_rxn2id)} 个反应。')

# NOTE(review): this message announces a dataset load that this cell does not
# perform; kept so the cell output is unchanged — verify whether the loading
# step was moved or removed.
print('Loading validation datasets feather path ...')

加载反应编码字典完成，共有 10479 个反应。
Loading validation datasets feather path ...


## 4. Load results
### 4.1 Foldseek

In [11]:
# Paths of the per-fold Foldseek alignment result files; the files are
# numbered 1..10 on disk.
foldseek_structural_pred_res = [
    f'{cfg.RESULTS_DIR}structural/foldseek_aln_fold{fold_no}.feather'
    for fold_no in range(1, 11)
]
# Read each file into its own DataFrame; fold_res[k] corresponds to fold k + 1.
fold_res = list(map(pd.read_feather, foldseek_structural_pred_res))

In [12]:
# Preview the first three rows of the fold-1 Foldseek results.
fold_res[0].head(n=3)

Unnamed: 0,uniprot_id,seq,rxn_groundtruth,ref_id,fident,alntmscore,rxn_structural_aln
0,Q9UYB6,MLPDRVLEILNEMKAERIRGATWLARKGAEAFLALAEELDEALLED...,-,O58185,0.789,0.9979,-
1,C1AQW9,MRTPCSQHRRDRPSAIGSQLPDADTLDTRQPPLQEIPISSFADKTF...,RHEA:19669,P9WK97,1.0,0.9929,RHEA:19669
2,P64647,MALFSKILIFYVIGVNISFVIIWFISHEKTHIRLLSAFLVGITWPM...,-,P64646,1.0,0.8907,-


In [14]:
# Evaluate the Foldseek structural-alignment baseline ('foldseekaln') and
# render its metrics.
# NOTE(review): the result variables keep their `_blast` suffix although they
# hold Foldseek results; they are not renamed here because other cells may
# still reference these names — rename once that is confirmed.
# NOTE(review): the recorded summary reports std ≈ 0.27 for mAccuracy while the
# listed folds are ≈ 0.937, which suggests at least one fold is far off —
# check the per-fold result files.
std_blast, metrics_blast, ec_no_rxn_blast = evTools.get_eval_results(
    baselineName='foldseekaln',
    dict_rxn2id=dict_rxn2id,
    method_type='structural',
)
# Label the report after the method actually evaluated (was 'Blast-direct').
evTools.display_html_results(
    metrics=metrics_blast,
    std_mean=std_blast,
    no_pred=ec_no_rxn_blast,
    eva_name='Foldseek',
)

Getting evaluation results for foldseekaln ...
Calculating mean and std ...


Unnamed: 0,baselineName,runFold,mAccuracy,mPrecision,mRecall,mF1,avgType
0,foldseekaln,1,0.937316,0.966115,0.934882,0.941804,weighted
1,foldseekaln,1,0.937316,0.929816,0.934882,0.932342,micro
2,foldseekaln,1,0.937316,0.820339,0.913709,0.768427,macro
3,foldseekaln,1,0.937316,0.942869,0.94288,0.942014,samples
4,foldseekaln,2,0.935566,0.964966,0.93046,0.937383,weighted
5,foldseekaln,2,0.935566,0.92967,0.93046,0.930065,micro
6,foldseekaln,2,0.935566,0.823502,0.902942,0.762929,macro
7,foldseekaln,2,0.935566,0.941697,0.94163,0.940608,samples
8,foldseekaln,3,0.937355,0.966034,0.931928,0.939412,weighted
9,foldseekaln,3,0.937355,0.931928,0.931928,0.931928,micro

Unnamed: 0,baselineName,avgType,Metric,mean,std
0,foldseekaln,macro,mAccuracy,0.852574,0.26858
1,foldseekaln,macro,mF1,0.693008,0.23345
2,foldseekaln,macro,mPrecision,0.745662,0.246755
3,foldseekaln,macro,mRecall,0.875339,0.099289
4,foldseekaln,micro,mAccuracy,0.852574,0.26858
5,foldseekaln,micro,mF1,0.848386,0.265485
6,foldseekaln,micro,mPrecision,0.848207,0.263182
7,foldseekaln,micro,mRecall,0.848645,0.267548
8,foldseekaln,samples,mAccuracy,0.852574,0.26858
9,foldseekaln,samples,mF1,0.856885,0.269868


### 4.2 Unirep

In [4]:
# Evaluate the 'unirep_euclidean' baseline with the 'direct' method type.
# The call yields three values: the mean/std summary, the per-fold metrics,
# and a third result that is forwarded to the display as `no_pred`
# (presumably the entries without a predicted reaction — TODO confirm).
(
    std_unirep_euclidean,
    metrics_unirep_euclidean,
    ec_no_rxn_unirep_euclidean,
) = evTools.get_eval_results(
    baselineName='unirep_euclidean',
    dict_rxn2id=dict_rxn2id,
    method_type='direct',
)

# Render the evaluation report for this baseline.
evTools.display_html_results(
    eva_name='Unirep Euclidean',
    metrics=metrics_unirep_euclidean,
    std_mean=std_unirep_euclidean,
    no_pred=ec_no_rxn_unirep_euclidean,
)

Getting evaluation results for unirep_euclidean ...
Calculating mean and std ...


Unnamed: 0,baselineName,runFold,mAccuracy,mPrecision,mRecall,mF1,avgType
0,unirep_euclidean,1,0.889575,0.892769,0.963311,0.915861,weighted
1,unirep_euclidean,1,0.889575,0.825235,0.963311,0.888943,micro
2,unirep_euclidean,1,0.889575,0.729777,0.926269,0.704546,macro
3,unirep_euclidean,1,0.889575,0.925941,0.969456,0.938116,samples
4,unirep_euclidean,2,0.889595,0.892241,0.960985,0.913871,weighted
5,unirep_euclidean,2,0.889595,0.825177,0.960985,0.887918,micro
6,unirep_euclidean,2,0.889595,0.736967,0.913366,0.70436,macro
7,unirep_euclidean,2,0.889595,0.926429,0.969719,0.93852,samples
8,unirep_euclidean,3,0.888946,0.88975,0.961411,0.913392,weighted
9,unirep_euclidean,3,0.888946,0.823032,0.961411,0.886856,micro

Unnamed: 0,baselineName,avgType,Metric,mean,std
0,unirep_euclidean,macro,mAccuracy,0.889701,0.001506
1,unirep_euclidean,macro,mF1,0.703718,0.009531
2,unirep_euclidean,macro,mPrecision,0.73161,0.011152
3,unirep_euclidean,macro,mRecall,0.918939,0.005109
4,unirep_euclidean,micro,mAccuracy,0.889701,0.001506
5,unirep_euclidean,micro,mF1,0.888701,0.002087
6,unirep_euclidean,micro,mPrecision,0.82595,0.003642
7,unirep_euclidean,micro,mRecall,0.961781,0.001047
8,unirep_euclidean,samples,mAccuracy,0.889701,0.001506
9,unirep_euclidean,samples,mF1,0.938241,0.000984


In [5]:
# Evaluate the 'unirep_cosine' baseline with the 'direct' method type.
# The call yields three values: the mean/std summary, the per-fold metrics,
# and a third result that is forwarded to the display as `no_pred`
# (presumably the entries without a predicted reaction — TODO confirm).
(
    std_unirep_cosine,
    metrics_unirep_cosine,
    ec_no_rxn_unirep_cosine,
) = evTools.get_eval_results(
    baselineName='unirep_cosine',
    dict_rxn2id=dict_rxn2id,
    method_type='direct',
)

# Render the evaluation report for this baseline.
evTools.display_html_results(
    eva_name='Unirep Cosine',
    metrics=metrics_unirep_cosine,
    std_mean=std_unirep_cosine,
    no_pred=ec_no_rxn_unirep_cosine,
)

Getting evaluation results for unirep_cosine ...
Calculating mean and std ...


Unnamed: 0,baselineName,runFold,mAccuracy,mPrecision,mRecall,mF1,avgType
0,unirep_cosine,1,0.890656,0.89393,0.964254,0.916958,weighted
1,unirep_cosine,1,0.890656,0.826613,0.964254,0.890144,micro
2,unirep_cosine,1,0.890656,0.732471,0.927823,0.706907,macro
3,unirep_cosine,1,0.890656,0.926858,0.970005,0.938906,samples
4,unirep_cosine,2,0.890578,0.894111,0.961342,0.914934,weighted
5,unirep_cosine,2,0.890578,0.825507,0.961342,0.888261,micro
6,unirep_cosine,2,0.890578,0.733132,0.913889,0.699769,macro
7,unirep_cosine,2,0.890578,0.927055,0.970037,0.939039,samples
8,unirep_cosine,3,0.890696,0.890728,0.961991,0.914388,weighted
9,unirep_cosine,3,0.890696,0.823444,0.961991,0.887342,micro

Unnamed: 0,baselineName,avgType,Metric,mean,std
0,unirep_cosine,macro,mAccuracy,0.890761,0.00159
1,unirep_cosine,macro,mF1,0.705396,0.009173
2,unirep_cosine,macro,mPrecision,0.733287,0.011174
3,unirep_cosine,macro,mRecall,0.91966,0.00507
4,unirep_cosine,micro,mAccuracy,0.890761,0.00159
5,unirep_cosine,micro,mF1,0.88982,0.002254
6,unirep_cosine,micro,mPrecision,0.827592,0.00386
7,unirep_cosine,micro,mRecall,0.962178,0.00104
8,unirep_cosine,samples,mAccuracy,0.890761,0.00159
9,unirep_cosine,samples,mF1,0.938799,0.001002


### 4.3 ESM

### 4.4 T5

## 5. 整合指标