# Test-set case study - Fold 2

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2025-01-17  


## 1. Import packages

In [1]:
import sys,os
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../../')
sys.path.insert(1,'../../../')
sys.path.insert(1,'../methods/active-learning/')
from config import conf as cfg
from tools import uniprottool as uptool
from tools import  bioFunctionLib as bfl
import rxnrecer as production
from modules import commonfunction as cmfunc
from tqdm import tqdm
import re
from pandarallel import pandarallel # 导入pandaralle
pandarallel.initialize(progress_bar=False)
from tkinter import _flatten
import json
from tools import btools
from evaluation import evTools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn2,venn3

# For the initial run, set this flag to True. This allows the program to
# download data from UniProt and RHEA, which may take longer depending on
# your internet speed.
FIRST_TIME_RUN = False

%load_ext autoreload
%autoreload 2



INFO: Pandarallel will run on 192 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 2. Load the test data

In [2]:
# Fold 2 data: the train and validation splits are read from feather files
# located under <cfg.DIR_DATASET>validation/fold2/.
DIR_DATA=f'{cfg.DIR_DATASET}validation/fold2/'
ds_train = pd.read_feather(f'{DIR_DATA}train.feather')
# The fold's validation split (valid.feather) is loaded as `ds_test`.
ds_test = pd.read_feather(f'{DIR_DATA}valid.feather')
# Display the first three rows as the cell output.
ds_test.head(3)

Unnamed: 0,uniprot_id,seq,reaction_id,ec_number,functionCounts,ec_specific_level,isenzyme,label
189365,O35066,MASKTKASEALKVVARCRPLSRKEEAAGHEQILTMDVKLGQVTLRN...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
390973,Q6AYR8,MASSSPDAPCSCDCFVSVPPASAIPAVIFAKNSDRPRDEVQEVVFI...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
288740,Q5BE22,MDVIPSTTPGEAVRISAKRTAELFGPEYLMVTPSASNGSIGVSYRR...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [3]:
def simple_res(method_type, baselineName):
    """Build and save the fold-2 correctness table for one baseline method.

    Reads ``{baselineName}_10folds_labels_res.feather`` from the intermediate
    results directory that corresponds to ``method_type``, keeps the rows with
    ``run_fold == 2``, adds a boolean ``isTrue_{baselineName}`` column that is
    True when the row's ``lb_rxn_groundtruth`` and ``lb_rxn_{baselineName}``
    values are equal according to ``np.array_equal``, drops both label
    columns, and writes the result to ``res/{baselineName}_fold2.feather``.

    Args:
        method_type: One of ``'ec'``, ``'direct'`` or ``'structural'``.
        baselineName: Method name used in the input/output file names and in
            the per-method column names.

    Returns:
        The filtered DataFrame that was written to disk.

    Raises:
        ValueError: If ``method_type`` is not one of the supported values.
    """
    # Sub-directory of results/intermediate for each supported method type.
    result_subdirs = {
        'ec': 'ecmethods',
        'direct': 'direct',
        'structural': 'structural',
    }
    # Fail early with a clear message instead of hitting an unbound
    # `dir_path` further down.
    if method_type not in result_subdirs:
        raise ValueError(
            f'Unknown method_type {method_type!r}; '
            f'expected one of {sorted(result_subdirs)}'
        )

    dir_path = f'{cfg.DIR_PROJECT_ROOT}/results/intermediate/{result_subdirs[method_type]}'
    label_file = f'{dir_path}/{baselineName}_10folds_labels_res.feather'

    data = pd.read_feather(label_file)
    data = data[data.run_fold == 2].reset_index(drop=True)

    pred_col = f'lb_rxn_{baselineName}'
    # Compare the two label vectors row by row. Building an explicit boolean
    # Series keeps the column dtype stable even when no fold-2 rows exist.
    data[f'isTrue_{baselineName}'] = pd.Series(
        [
            np.array_equal(truth, pred)
            for truth, pred in zip(data['lb_rxn_groundtruth'], data[pred_col])
        ],
        index=data.index,
        dtype=bool,
    )
    data = data.drop(['lb_rxn_groundtruth', pred_col], axis=1)

    # Make sure the output directory exists before writing.
    os.makedirs('res', exist_ok=True)
    data.to_feather(f'res/{baselineName}_fold2.feather')
    return data
    

In [4]:
# Baseline method names, grouped into two lists. The entries are plain
# strings; the cells that follow use them to build result file names.
ecmethods=['blastvec', 'deepec', 'clean', 'ecrecer', 'catfam', 'priam']
directmethods = ['blastvrxn', 'esm_cosine', 'esm_euclidean', 'unirep_cosine', 'unirep_euclidean', 't5_cosine', 't5_euclidean', 'RXNRECer']

In [24]:
# Generate the fold-2 result table for each direct method. The slice [1:]
# skips the first entry of `directmethods`.
for item in tqdm(directmethods[1:]):
    method_type = 'direct'
    baselineName = item
    # simple_res writes res/<baselineName>_fold2.feather and returns the table;
    # the returned value is not used further in this cell.
    lbfile = simple_res(method_type=method_type, baselineName=baselineName)

100%|██████████| 6/6 [01:32<00:00, 15.34s/it]


## 3. Load Results

In [None]:
# Methods whose fold-2 result tables (res/<method>_fold2.feather) are merged.
ecmethods = ['blastvec', 'deepec', 'clean', 'ecrecer', 'catfam', 'priam']
directmethods = [
    'blastvrxn',
    'esm_cosine', 'esm_euclidean',
    'unirep_cosine', 'unirep_euclidean',
    't5_cosine', 't5_euclidean',
    'tdit5_cosine', 'tdit5_euclidean',
    'RXNRECer',
]
methods = ecmethods + directmethods

# One DataFrame per method, in the same order as `methods`.
ldatas = [pd.read_feather(f'res/{method}_fold2.feather') for method in methods]

# Columns removed from every table except the first before it is merged in
# (the first table, ldatas[0], keeps them).
cols_to_drop = ['ec_groundtruth', 'run_fold', 'rxn_groundtruth', 'rxn_blast']

# Prediction columns that come out of the merges with pandas' `_x` / `_y`
# suffixes, i.e. columns whose name occurs on both sides of a merge.
# The `_y` copy is dropped and the `_x` copy gets its plain name back.
# Deriving both lists from one place avoids the duplicated entries that the
# two hand-written lists had.
overlapping_rxn_cols = [
    'rxn_t5_euclidean', 'rxn_tdit5_euclidean',
    'rxn_t5_cosine', 'rxn_tdit5_cosine',
    'rxn_unirep_cosine', 'rxn_unirep_euclidean',
    'rxn_esm_cosine', 'rxn_esm_euclidean',
]
cols_to_drop_2 = [f'{col}_y' for col in overlapping_rxn_cols]

# Iteratively left-merge all DataFrames in ldatas on `uniprot_id`.
savedf = ldatas[0]
for df in ldatas[1:]:
    # errors='ignore' drops only the columns that actually exist in `df`.
    df = df.drop(columns=cols_to_drop, errors='ignore')
    savedf = savedf.merge(df, on='uniprot_id', how='left')

# `savedf` now holds the merge of every table in `ldatas`; clean up the
# suffixed duplicates.
savedf = savedf.drop(columns=cols_to_drop_2, errors='ignore')
savedf = savedf.rename(columns={f'{col}_x': col for col in overlapping_rxn_cols})

pd.set_option('display.max_columns', None)
# Optional Excel export of the merged table:
# savedf.to_excel('res/case_test_fold2_20250120.xlsx', index=False)
savedf

Unnamed: 0,uniprot_id,rxn_groundtruth,ec_groundtruth,ec_blast,rxn_blastvec,run_fold,isTrue_blastvec,ec_deepec,rxn_deepec,isTrue_deepec,ec_clean,rxn_clean,isTrue_clean,reaction_groundtruth,ec_ecrecer,rxn_ecrecer,isTrue_ecrecer,ec_catfam,rxn_catfam,isTrue_catfam,ec_priam,rxn_priam,isTrue_priam,rxn_blastvrxn,isTrue_blastvrxn,rxn_esm_euclidean,rxn_esm_cosine,isTrue_esm_cosine,isTrue_esm_euclidean,rxn_unirep_euclidean,rxn_unirep_cosine,isTrue_unirep_cosine,isTrue_unirep_euclidean,rxn_t5_euclidean,rxn_t5_cosine,isTrue_t5_cosine,isTrue_t5_euclidean,rxn_tdit5_euclidean,rxn_tdit5_cosine,isTrue_tdit5_cosine,isTrue_tdit5_euclidean,rxn_RXNRECer,isTrue_RXNRECer
0,O35066,-,-,-,-,2,True,NO-PREDICTION,NO-PREDICTION,False,2.7.11.1,RHEA:46608,False,-,-,-,True,-,-,True,NO-PREDICTION,NO-PREDICTION,False,-,True,-,-,True,True,-,-,True,True,-,-,True,True,-,-,True,True,-,True
1,Q6AYR8,-,-,-,-,2,True,NO-PREDICTION,NO-PREDICTION,False,2.4.1.1,RHEA:41732,False,-,-,-,True,-,-,True,NO-PREDICTION,NO-PREDICTION,False,-,True,-,-,True,True,-,-,True,True,-,-,True,True,-,-,True,True,-,True
2,Q5BE22,-,-,-,-,2,True,NO-PREDICTION,NO-PREDICTION,False,3.6.4.13;1.1.1.127;1.8.1.7,RHEA:24232;RHEA:13065;RHEA:11740,False,-,-,-,True,-,-,True,2.7.11.7;2.3.2.27;2.7.11.7,RHEA:11424,False,-,True,-,-,True,True,-,-,True,True,-,-,True,True,-,-,True,True,-,True
3,Q5R6U3,RHEA:32295;RHEA:32299;RHEA:22228;RHEA:22232;RH...,2.5.1.21,2.5.1.21,RHEA:32295,2,False,2.5.1.21,RHEA:32295,False,2.5.1.21,RHEA:32295,False,RHEA:32295;RHEA:32299;RHEA:22228;RHEA:22232;RH...,2.5.1.21,RHEA:32295,False,2.5.1.21,RHEA:32295,False,2.5.1.21;1.3.1.96;2.5.1.103;1.3.1.97,RHEA:22232;RHEA:32299;RHEA:34571;RHEA:32295;RH...,False,RHEA:22672;RHEA:32295;RHEA:32299;RHEA:22228;RH...,False,RHEA:32295;RHEA:32299;RHEA:22228;RHEA:22232;RH...,RHEA:32295;RHEA:32299;RHEA:22228;RHEA:22232;RH...,True,True,RHEA:32295;RHEA:32299;RHEA:22228;RHEA:22232;RH...,RHEA:32295;RHEA:32299;RHEA:22228;RHEA:22232;RH...,True,True,RHEA:32295;RHEA:32299;RHEA:22228;RHEA:22232;RH...,RHEA:32295;RHEA:32299;RHEA:22228;RHEA:22232;RH...,True,True,RHEA:32295;RHEA:32299;RHEA:22672;RHEA:22228;RH...,RHEA:32295;RHEA:32299;RHEA:22672;RHEA:22228;RH...,True,True,RHEA:22672;RHEA:32295;RHEA:22228;RHEA:32299;RH...,True
4,P54827,RHEA:16957,3.1.4.35,3.1.4.35,RHEA:16957,2,True,NO-PREDICTION,NO-PREDICTION,False,3.1.4.35,RHEA:16957,True,RHEA:16957,3.1.4.35,RHEA:16957,True,3.1.4.17,RHEA:14653,False,3.1.4.35,RHEA:16957,True,RHEA:16957,True,RHEA:16957,RHEA:16957,True,True,RHEA:16957,RHEA:16957,True,True,RHEA:16957,RHEA:16957,True,True,RHEA:16957,RHEA:16957,True,True,RHEA:16957,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50853,Q7JIH3,-,-,-,-,2,True,NO-PREDICTION,NO-PREDICTION,False,3.2.2.6,RHEA:16301,False,-,-,-,True,-,-,True,2.7.8.17,RHEA:13581,False,-,True,-,-,True,True,-,-,True,True,-,-,True,True,-,-,True,True,-,True
50854,A4VTK4,RHEA:23192,2.4.1.227,2.4.1.227,RHEA:23192,2,True,2.4.1.227,RHEA:23192,True,2.4.1.227,RHEA:23192,True,RHEA:23192,2.4.1.227,RHEA:23192,True,2.4.1.227,RHEA:23192,True,2.4.1.227;2.1.1.72,RHEA:31227;RHEA:15197;RHEA:23192,False,RHEA:23192,True,RHEA:23192,RHEA:23192,True,True,RHEA:23192,RHEA:23192,True,True,RHEA:23192,RHEA:23192,True,True,RHEA:23192,RHEA:23192,True,True,RHEA:23192,True
50855,Q30ZX3,-,-,-,-,2,True,NO-PREDICTION,NO-PREDICTION,False,3.6.4.12,RHEA:13065,False,-,-,-,True,-,-,True,2.7.7.27,RHEA:12120,False,-,True,-,-,True,True,-,-,True,True,-,-,True,True,-,-,True,True,-,True
50856,B2HQI2,-,-,-,-,2,True,NO-PREDICTION,NO-PREDICTION,False,3.5.5.8,RHEA:21464,False,-,-,-,True,-,-,True,NO-PREDICTION,NO-PREDICTION,False,-,True,-,-,True,True,-,-,True,True,-,-,True,True,-,-,True,True,-,True


## 4. Analyze the results and add labels

In [6]:
def countFuncs(ids):
    """Return the number of ';'-separated identifiers in ``ids``.

    Args:
        ids: A string of identifiers joined by ';' (e.g. ``'RHEA:1;RHEA:2'``).
            The placeholder ``'-'`` means "no identifier".

    Returns:
        0 for ``'-'``, for an empty string, and for any non-string value
        (e.g. ``None`` or the NaN that a left merge leaves for a missing row);
        otherwise the number of ';'-separated items.
    """
    # Missing values and the '-' placeholder carry no identifiers.
    if not isinstance(ids, str) or ids in ('', '-'):
        return 0
    # n separators delimit n + 1 items.
    return ids.count(';') + 1

In [7]:
# Count the reactions listed per protein, once for the ground truth and once
# for the RXNRECer prediction (used to find entries with more annotations).
savedf['funcCounts_groundtruth'] = savedf['rxn_groundtruth'].apply(countFuncs)
savedf['funcCounts_RXNRECer'] = savedf['rxn_RXNRECer'].apply(countFuncs)

In [8]:
# Keep the rows where RXNRECer lists more reactions than the ground truth,
# renumbering the index from 0.
more_functions = savedf.loc[
    savedf['funcCounts_RXNRECer'] > savedf['funcCounts_groundtruth']
].reset_index(drop=True)
# more_functions.to_excel('res/case_test_fold_2_more_functions.xlsx', index=False)
more_functions

Unnamed: 0,uniprot_id,rxn_groundtruth,ec_groundtruth,ec_blast,rxn_blastvec,run_fold,isTrue_blastvec,ec_deepec,rxn_deepec,isTrue_deepec,ec_clean,rxn_clean,isTrue_clean,reaction_groundtruth,ec_ecrecer,rxn_ecrecer,isTrue_ecrecer,ec_catfam,rxn_catfam,isTrue_catfam,ec_priam,rxn_priam,isTrue_priam,rxn_blastvrxn,isTrue_blastvrxn,rxn_esm_euclidean,rxn_esm_cosine,isTrue_esm_cosine,isTrue_esm_euclidean,rxn_unirep_euclidean,rxn_unirep_cosine,isTrue_unirep_cosine,isTrue_unirep_euclidean,rxn_t5_euclidean,rxn_t5_cosine,isTrue_t5_cosine,isTrue_t5_euclidean,rxn_tdit5_euclidean,rxn_tdit5_cosine,isTrue_tdit5_cosine,isTrue_tdit5_euclidean,rxn_RXNRECer,isTrue_RXNRECer,funcCounts_groundtruth,funcCounts_RXNRECer
0,P0CT43,-,-,-,-,2,True,NO-PREDICTION,NO-PREDICTION,False,2.7.7.49,RHEA:22508,False,-,-,-,True,-,-,True,2.7.7.49;2.7.7.49;2.7.7.7;2.7.7.49;2.7.7.7;2.7...,RHEA:21248;RHEA:22508,False,RHEA:22508;RHEA:22508;-;RHEA:22508,False,-,-,True,True,-,-,True,True,-,-,True,True,-,-,True,True,RHEA:22508,False,0,1
1,P85173,-,-,3.1.3.1,RHEA:15017,2,False,NO-PREDICTION,NO-PREDICTION,False,3.1.3.1,RHEA:15017,False,-,-,-,True,-,-,True,3.1.3.1,RHEA:15017,False,RHEA:15017,False,-;RHEA:15017,-;RHEA:15017,False,False,RHEA:10172;-,RHEA:10172;-,False,False,-;RHEA:15017,-;RHEA:15017,False,False,-;RHEA:15017,-;RHEA:15017,False,False,RHEA:15017,False,0,1
2,P9WIY0,RHEA:31575,3.6.1.55,3.6.1.55,RHEA:31575,2,True,NO-PREDICTION,NO-PREDICTION,False,3.6.1.55,RHEA:31575,True,RHEA:31575,3.6.1.55,RHEA:31575,True,-,-,False,3.6.1.55;3.6.1.65;2.7.1.1;3.6.1.55;3.6.1.56;5....,RHEA:25524;RHEA:31583;RHEA:31575;RHEA:27762;RH...,False,RHEA:31575,True,RHEA:23420;RHEA:31575;-,RHEA:23420;RHEA:31575;-,False,False,RHEA:12068;RHEA:31575;RHEA:53744;RHEA:13953,RHEA:12068;RHEA:31575;RHEA:53744;RHEA:13953,False,False,RHEA:31575;-,RHEA:27762;RHEA:22636;RHEA:31575,False,False,RHEA:27762;RHEA:22636;RHEA:25302;RHEA:11800;RH...,RHEA:27762;RHEA:22636;RHEA:25302;RHEA:11800;RH...,False,False,RHEA:67616;RHEA:31575,False,1,2
3,P97720,RHEA:15629;RHEA:46100;RHEA:46104;RHEA:76151;RH...,1.14.15.4,1.14.15.5;1.14.15.4,RHEA:11872;RHEA:15629,2,False,1.14.15.4,RHEA:15629,False,1.14.15.4;1.14.15.5,RHEA:15629;RHEA:11872,False,RHEA:15629;RHEA:46100;RHEA:46104;RHEA:76151;RH...,1.14.15.4,RHEA:15629,False,1.14.15.4,RHEA:15629,False,1.14.15.4;1.14.15.4;1.14.15.5;1.14.14.1;1.14.1...,RHEA:49064;RHEA:50696;RHEA:20573;RHEA:17149;RH...,False,RHEA:50292;RHEA:24964;RHEA:24972;RHEA:24976;RH...,False,RHEA:15629;RHEA:46100;RHEA:46104;RHEA:76151;RH...,RHEA:15629;RHEA:46100;RHEA:46104;RHEA:76151;RH...,False,False,RHEA:15629;RHEA:46104;RHEA:11872;RHEA:50792;RH...,RHEA:15629;RHEA:46104;RHEA:11872;RHEA:50792;RH...,False,False,RHEA:15629;RHEA:46104;RHEA:76151;RHEA:76155;RH...,RHEA:15629;RHEA:46104;RHEA:76151;RHEA:76155;RH...,False,False,RHEA:15629;RHEA:46104;RHEA:76151;RHEA:76155;RH...,RHEA:15629;RHEA:46104;RHEA:76151;RHEA:76155;RH...,False,False,RHEA:46100;RHEA:46104;RHEA:76019;RHEA:76023;RH...,False,7,8
4,P72156,RHEA:11312,3.8.1.8,3.5.4.31;3.5.4.28,RHEA:25025;RHEA:20716,2,False,3.8.1.8,RHEA:11312,True,3.8.1.8;3.5.4.45,RHEA:11312;RHEA:26197;RHEA:26201,False,RHEA:11312,3.8.1.8,RHEA:11312,True,-,-,False,3.8.1.8;3.5.4.45;3.5.4.28;3.5.4.31;3.5.4.41;3....,RHEA:26197;RHEA:20716;RHEA:23660;RHEA:12833;RH...,False,RHEA:20716;RHEA:25025;RHEA:42892;RHEA:20716;RH...,False,RHEA:23092;RHEA:23660;RHEA:23688,RHEA:23092;RHEA:23688,False,False,RHEA:20716;RHEA:25025;RHEA:23092,RHEA:20716;RHEA:25025;RHEA:23092,False,False,RHEA:20716;RHEA:25025;RHEA:42892;RHEA:20716;RH...,RHEA:20716;RHEA:25025;RHEA:42892;RHEA:20716;RH...,False,False,RHEA:23092;-;RHEA:20716;RHEA:25025,RHEA:23092;-;RHEA:20716;RHEA:25025,False,False,RHEA:26197;RHEA:26201,False,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,P30741,RHEA:18585,5.3.1.1,5.3.1.1,RHEA:18585,2,True,5.3.1.1,RHEA:18585,True,5.3.1.1,RHEA:18585,True,RHEA:18585,5.3.1.1,RHEA:18585,True,5.3.1.1,RHEA:18585,True,5.3.1.1;5.3.1.33,RHEA:49588;RHEA:18585,False,RHEA:18585;RHEA:17937;RHEA:18585;RHEA:17937;RH...,False,RHEA:17937;RHEA:18585,RHEA:17937;RHEA:18585,False,False,RHEA:18585;RHEA:17937;RHEA:18585,RHEA:18585;RHEA:17937;RHEA:18585,False,False,RHEA:18585,RHEA:18585;RHEA:17937;RHEA:18585,False,True,RHEA:17937;RHEA:18585;RHEA:18585,RHEA:17937;RHEA:18585;RHEA:18585,False,False,RHEA:17937;RHEA:18585,False,1,2
117,Q86XP0,RHEA:15801;RHEA:40427;RHEA:40811;RHEA:38779;RH...,3.1.1.4,3.1.1.4,RHEA:15801,2,False,3.1.1.4,RHEA:15801,False,3.1.1.5,RHEA:15177,False,RHEA:15801;RHEA:40427;RHEA:40811;RHEA:38779;RH...,3.1.1.4,RHEA:15801,False,3.1.1.4,RHEA:15801,False,3.1.1.4;3.1.1.4;3.1.1.5;3.1.1.4;3.1.1.5;2.3.1.64,RHEA:15801;RHEA:15177;RHEA:13405,False,RHEA:15801;RHEA:15177;RHEA:40427;RHEA:40435;RH...,False,RHEA:15801;RHEA:15177;RHEA:40815;RHEA:40431;RH...,RHEA:15801;RHEA:15177;RHEA:40815;RHEA:40431;RH...,False,False,RHEA:15801;RHEA:38783;RHEA:40811;RHEA:40427;RH...,RHEA:15801;RHEA:38783;RHEA:40811;RHEA:40427;RH...,False,False,RHEA:15801;RHEA:15177;RHEA:40815;RHEA:40431;RH...,RHEA:15801;RHEA:15177;RHEA:40815;RHEA:40431;RH...,False,False,RHEA:15801;RHEA:15177;RHEA:40427;RHEA:40435;RH...,RHEA:15801;RHEA:15177;RHEA:40427;RHEA:40435;RH...,False,False,RHEA:40427;RHEA:40435;RHEA:15801;RHEA:15177;RH...,False,6,7
118,O05151,RHEA:22624,4.1.3.39,4.1.2.52,RHEA:25788,2,False,4.1.3.39,RHEA:22624,True,4.1.2.53,RHEA:25784,False,RHEA:22624,4.1.3.39,RHEA:22624,True,-,-,False,4.1.3.39;4.1.2.52;4.1.2.53;4.1.2.20;4.1.2.58;5...,RHEA:25784;RHEA:47924;RHEA:13677;RHEA:25788;RH...,False,RHEA:25788;RHEA:22624;RHEA:25784,False,RHEA:25784,RHEA:25784,False,False,RHEA:25788,RHEA:25788,False,False,RHEA:27726;RHEA:10268,RHEA:27726;RHEA:10268,False,False,RHEA:27726;RHEA:10268,RHEA:27726;RHEA:10268,False,False,RHEA:27726;RHEA:10268,False,1,2
119,Q66J54,-,-,-,-,2,True,NO-PREDICTION,NO-PREDICTION,False,3.3.2.10,RHEA:19037,False,-,-,-,True,-,-,True,NO-PREDICTION,NO-PREDICTION,False,RHEA:73859;RHEA:73863;RHEA:73867;RHEA:73875;RH...,False,RHEA:72151;RHEA:72399;RHEA:71751;RHEA:72403;RH...,RHEA:72151;RHEA:72399;RHEA:71751;RHEA:72403;RH...,False,False,RHEA:76071;RHEA:76075;RHEA:76079;RHEA:76119;RH...,RHEA:76071;RHEA:76075;RHEA:76079;RHEA:76119;RH...,False,False,RHEA:76071;RHEA:76075;RHEA:76079;RHEA:76119;RH...,RHEA:76071;RHEA:76075;RHEA:76079;RHEA:76119;RH...,False,False,RHEA:76071;RHEA:76075;RHEA:76079;RHEA:76119;RH...,RHEA:76071;RHEA:76075;RHEA:76079;RHEA:76119;RH...,False,False,RHEA:75991;RHEA:76119;RHEA:76135;RHEA:76123;RH...,False,0,13


## 5. Organize the data and save the results