## S2 集成时间评估
> 2024-09-03

### 1. 导入必要的包

In [1]:
import sys,os
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../../')
from config import conf as cfg
import pandas as pd
from tkinter import _flatten
import numpy as np
import json
import time
from sklearn.metrics import confusion_matrix
import plotly.graph_objects as go
import rxnrecer as production
from modules import commonfunction as cmfunc
from IPython.display import HTML
import tools.bioFunctionLib as bfl
import modules.simi_caculator as simitool
import modules.predict.predRXN as predrxn
import tools.btools as btools
from pandarallel import pandarallel # 导入pandaralle
pandarallel.initialize(progress_bar=False)
%load_ext autoreload
%autoreload 2

ImportError: tokenizers>=0.19,<0.20 is required for a normal functioning of this module, but found tokenizers==0.13.3.
Try: `pip install transformers -U` or `pip install -e '.[dev]'` if you're working with git main

### 2. 加载测试数据集

In [2]:
# Read the test split from the feather file configured in cfg and report its size.
ds_test = pd.read_feather(cfg.FILE_DS_TEST)
print(f'测试集数据量: {len(ds_test)}')

# Names of the prediction methods covered by this timing comparison.
method = [
    'ecblast',
    'rxnblast',
    'deepec',
    'clean',
    'ecrecer',
    'ecpred',
    'catfam',
    'priam',
]
print(f'使用预测方法{method}')


测试集数据量: 13515
使用预测方法['ecblast', 'rxnblast', 'deepec', 'clean', 'ecrecer', 'ecpred', 'catfam', 'priam']


## 3. 方法时间测试
### 3.1 ecblast

In [17]:
# Time the EC-BLAST baseline end to end: load the reference tables, BLAST the
# test sequences against the training set (k=1), transfer the hit's EC number,
# and map that EC number to reactions.
# perf_counter() is monotonic and intended for measuring elapsed intervals;
# time.time() follows the wall clock and can jump if the system clock changes.
start = time.perf_counter()

# Reaction table; only reaction_id and ec_number are used below.
ds_rhea = pd.read_feather(cfg.FILE_DS_RHEA_REACTIONS)

# Build the reaction_id <-> EC lookup table.
# Missing values become 'REACTION-WITHOUT-EC', and two placeholder rows are
# appended so that 'NO-PREDICTION' and '-' map onto themselves.
placeholder_rows = pd.DataFrame({'reaction_id':['NO-PREDICTION', '-'],'ec_number':['NO-PREDICTION', '-']})
ec_reaction_map = pd.concat(
    [ds_rhea[['reaction_id', 'ec_number']].fillna('REACTION-WITHOUT-EC'), placeholder_rows],
    axis=0,
).reset_index(drop=True)
# Strip the 'EC:' prefix with the vectorised string accessor (literal match, no regex).
ec_reaction_map['ec_number'] = ec_reaction_map['ec_number'].str.replace('EC:', '', regex=False)
# One row per (reaction_id, ec): split ';'-separated EC lists and explode them.
ec_reaction_map = (
    ec_reaction_map
    .assign(ec_number=ec_reaction_map['ec_number'].str.split(';'))
    .explode('ec_number')
    .reset_index(drop=True)
    .rename(columns={'ec_number': 'ec'})
)

# Training set: serves as the BLAST reference and supplies the EC labels.
ds_train = pd.read_feather(cfg.FILE_DS_TRAIN)

# BLAST the test set against the training set, keeping the top hit (k=1).
blast_hits = bfl.getblast(train=ds_train[['uniprot_id', 'seq']], test=ds_test[['uniprot_id', 'seq']], k=1)
# Attach the EC number of each hit (sseqid) and rename to the prediction schema.
hits_with_ec = blast_hits[['id','sseqid']].merge(
    ds_train[['uniprot_id', 'ec_number']],
    left_on='sseqid',
    right_on='uniprot_id',
    how='left',
)
blast_res_ec = hits_with_ec[['id','ec_number']].rename(columns={'id':'uniprot_id', 'ec_number':'ec_ecblast'})

# Left-join onto the full test set so proteins without a prediction are kept;
# every missing value in the joined frame is filled with 'NO-PREDICTION'.
ground_truth = ds_test[['uniprot_id', 'reaction_id','ec_number']].rename(
    columns={'reaction_id':'reaction_groundtruth', 'ec_number': 'ec_groundtruth'}
)
blast_res_ec = ground_truth.merge(blast_res_ec, on='uniprot_id', how='left').fillna('NO-PREDICTION')

# Map each predicted EC to reaction(s) via the lookup table (parallelised by pandarallel).
blast_res_ec['reaction_ecblast'] = blast_res_ec['ec_ecblast'].parallel_apply(
    lambda x: btools.retrival_reaction_from_ec(ec_pred=x, ec_reaction_map=ec_reaction_map)
)

end = time.perf_counter()
print(f"运行时间: {end - start:.2f} 秒")

blast_res_ec.head(3)

运行时间: 31.50 秒


Unnamed: 0,uniprot_id,reaction_groundtruth,ec_groundtruth,ec_ecblast,reaction_ecblast
0,A9JLI2,-,-,-,-
1,A9JLI3,-,-,-,-
2,A9JLI5,-,-,-,-


### 3.2 rxnblast

In [23]:
# Time the reaction-level BLAST baseline: transfer the reaction_id of the best
# BLAST hit in the training set to each test protein.
# NOTE: relies on ds_train and ds_test already being loaded by earlier cells,
# so unlike the ecblast cell this timing does not include reading ds_train.
# perf_counter() is monotonic and intended for measuring elapsed intervals.
start = time.perf_counter()

# Sequence alignment of the test set against the training set, top hit only (k=1).
blast_hits = bfl.getblast(train=ds_train[['uniprot_id', 'seq']], test=ds_test[['uniprot_id', 'seq']], k=1)

# Join only the columns that are needed instead of the whole training frame.
hit_reactions = blast_hits[['id', 'sseqid']].merge(
    ds_train[['uniprot_id', 'reaction_id']],
    left_on='sseqid',
    right_on='uniprot_id',
    how='left',
)
blast_res = hit_reactions[['id', 'reaction_id']].rename(columns={'id':'uniprot_id', 'reaction_id':'reaction_blast'})

# Left-join onto the test set so proteins without a hit are kept, then mark
# every missing value as 'NO-PREDICTION'.
blast_res = ds_test[['uniprot_id', 'reaction_id']].merge(blast_res, on='uniprot_id', how='left').fillna('NO-PREDICTION')

end = time.perf_counter()
print(f"运行时间: {end - start:.2f} 秒")
blast_res.head(3)

运行时间: 13.77 秒


Unnamed: 0,uniprot_id,reaction_id,reaction_blast
0,A9JLI2,-,-
1,A9JLI3,-,-
2,A9JLI5,-,-


### 3.3 DeepEC

In [29]:
# Output location passed to DeepEC via -o.
deepec_res_file = '/tmp/deepec'

# This cell does not run DeepEC; it only renders the shell command
# (prefixed with `time`) for the test FASTA file and prints it.
deepec_cmd = f'''deepec cmd: 
time singularity exec /hpcfs/fpublic/container/singularity/app/deepec/deepec.sif python /opt/deepec/deepec.py -i {cfg.FILE_DS_TEST_FASTA} -o {deepec_res_file}
      '''
print(deepec_cmd)

deepec cmd: 
time singularity exec /hpcfs/fpublic/container/singularity/app/deepec/deepec.sif python /opt/deepec/deepec.py -i /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/datasets/task240524/ds_test.fasta -o /tmp/deepec
      


### 3.4 CLEAN

In [4]:
# This cell does not run CLEAN; it only assembles the shell command
# (prefixed with `time`) for the test FASTA file and prints it.
cmd = (
    'time singularity exec --nv /hpcfs/fpublic/container/singularity/app/clean/clean.sif '
    f'python /app/inference.py -i {cfg.FILE_DS_TEST_FASTA} -o /tmp/clean -d ~/tmp/'
)
print(cmd)

time singularity exec --nv /hpcfs/fpublic/container/singularity/app/clean/clean.sif python /app/inference.py -i /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/datasets/task240524/ds_test.fasta -o /tmp/clean -d ~/tmp/


### 3.5 RXNRECer

In [37]:
# Time RXNRECer on the test set, including re-reading the test set from disk.
# perf_counter() is monotonic and intended for measuring elapsed intervals;
# time.time() follows the wall clock and can jump if the system clock changes.
time_start = time.perf_counter()

ds_test = pd.read_feather(cfg.FILE_DS_TEST)
# Single-model run (Ensemble=False); nothing is written to disk (output_file=None).
res_rxnrecer_s1 = production.step_by_step_prediction(
    input_data=ds_test[['uniprot_id', 'seq']],
    Ensemble=False,
    output_file=None,
    batch_size=600,
)

time_end = time.perf_counter()
print(f'Time cost: {time_end-time_start}s')

Step 1: Preparing input data
Step 2: Loading predictive model
Use device: cuda
Step 3: Running prediction on 13515 proteins


Predicting reactions: 100%|██████████| 300/300 [00:38<00:00,  7.89it/s]
Predicting reactions: 100%|██████████| 300/300 [00:28<00:00, 10.36it/s]
Predicting reactions: 100%|██████████| 300/300 [00:37<00:00,  7.90it/s]
Predicting reactions: 100%|██████████| 300/300 [00:28<00:00, 10.52it/s]
Predicting reactions: 100%|██████████| 300/300 [00:35<00:00,  8.43it/s]
Predicting reactions: 100%|██████████| 300/300 [00:26<00:00, 11.50it/s]
Predicting reactions: 100%|██████████| 300/300 [00:34<00:00,  8.59it/s]
Predicting reactions: 100%|██████████| 300/300 [00:32<00:00,  9.26it/s]
Predicting reactions: 100%|██████████| 300/300 [00:47<00:00,  6.27it/s]
Predicting reactions: 100%|██████████| 300/300 [00:28<00:00, 10.48it/s]
Predicting reactions: 100%|██████████| 300/300 [00:34<00:00,  8.76it/s]
Predicting reactions: 100%|██████████| 300/300 [00:46<00:00,  6.39it/s]
Predicting reactions: 100%|██████████| 300/300 [00:41<00:00,  7.32it/s]
Predicting reactions: 100%|██████████| 300/300 [00:42<00:00,  7.

Time cost: 815.0774381160736s





### 3.6 PRIAM

In [3]:
# Output directory passed to PRIAM via -o.
priam_res_file = '/tmp/priam/'

# This cell does not run PRIAM; it only renders the shell command
# (prefixed with `time`) for the test FASTA file and prints it.
priam_cmd = f'''priam cmd: 
time singularity exec /hpcfs/fpublic/container/singularity/app/priam/priam.sif /opt/jdk1.7.0_80/bin/java -Xmx128G -jar /opt/priam/PRIAM_search.jar -p /opt/priam/PRIAM_JAN18  -i {cfg.FILE_DS_TEST_FASTA} -o {priam_res_file} --blast_path /opt/blast-2.2.26/bin -np 100
 '''
print(priam_cmd)

priam cmd: 
time singularity exec /hpcfs/fpublic/container/singularity/app/priam/priam.sif /opt/jdk1.7.0_80/bin/java -Xmx128G -jar /opt/priam/PRIAM_search.jar -p /opt/priam/PRIAM_JAN18  -i /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/datasets/task240524/ds_test.fasta -o /tmp/priam/ --blast_path /opt/blast-2.2.26/bin -np 100
 


### 3.7 ECPred

In [6]:
# Result file passed to ECPred as the last argument.
ECPred_res_file = '/tmp/ecpred/test.txt'
# Singularity image that bundles ECPred.
singularity_ecpred = '/hpcfs/fpublic/container/singularity/app/ecpred/ecpred.sif'

# This cell does not run ECPred; it only renders the shell command
# (prefixed with `time`) for the test FASTA file and prints it.
ecpred_cmd = f'''ECpred cmd: 
time singularity exec {singularity_ecpred} java  -Xmx128G -jar /ECPred/ECPred.jar spmap  {cfg.FILE_DS_TEST_FASTA} /ECPred/ /tmp {ECPred_res_file}
      '''
print(ecpred_cmd)

ECpred cmd: 
time singularity exec /hpcfs/fpublic/container/singularity/app/ecpred/ecpred.sif java  -Xmx128G -jar /ECPred/ECPred.jar spmap  /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/datasets/task240524/ds_test.fasta /ECPred/ /tmp /tmp/ecpred/test.txt
      


### 3.8 ESM simi

In [39]:
# Time the ESM-embedding similarity search: load the feature bank, embed the
# test sequences with ESM, and retrieve the top-5 most similar proteins.
# perf_counter() is monotonic and intended for measuring elapsed intervals;
# time.time() follows the wall clock and can jump if the system clock changes.
time_start = time.perf_counter()

# Imported inside the timed region, as in the original cell, so the measured
# interval keeps covering the same work.
from modules.embedding import seqEmbedding as ebdseq

featureBank = pd.read_feather(cfg.FILE_PRODUCTION_FEATURES)
# Row index -> uniprot_id lookup. Series.to_dict() already keys on the index,
# so no re-wrapping of the column with its own index values is needed.
dict_featureBank = featureBank['uniprot_id'].to_dict()

# Load the uniprot -> rhea dictionary from its JSON file.
with open(cfg.DICT_UNIPROT_RHEA, "r") as json_file:
    dict_uniprot2rhea = json.load(json_file)

# ESM embedding of the test sequences (the helper expects an 'id' column).
embd_esm = ebdseq.getEsm(ds_test[['uniprot_id', 'seq']].rename(columns={'uniprot_id':'id'}))

# ESM similarity against the feature bank; the 'simi' column is renamed to 'esm'.
esm_cos = predrxn.get_top_protein_simi(
    x_feature=np.vstack(featureBank.esm),
    y_feature=np.vstack(embd_esm.esm),
    y_uniprot_id=embd_esm.id,
    dict_featureBank=dict_featureBank,
    dict_uniprot2rhea=dict_uniprot2rhea,
    topk=5,
).rename(columns={'simi':'esm'})

time_end = time.perf_counter()
print('Time cost:', time_end - time_start)

Transferred model to GPU


100%|██████████| 13515/13515 [10:18<00:00, 21.87it/s]


Time cost: 1327.035010099411


### 3.9 T5

In [40]:
# Time the T5-embedding similarity search: load the feature bank, embed the
# test sequences with T5, and retrieve the top-10 most similar proteins.
# perf_counter() is monotonic and intended for measuring elapsed intervals;
# time.time() follows the wall clock and can jump if the system clock changes.
time_start = time.perf_counter()

# Imported inside the timed region, as in the original cell, so the measured
# interval keeps covering the same work.
from modules.embedding import t5Embedding as ebdt5

# Reloaded here (rather than reused from the ESM cell) so this cell's timing
# includes its own data loading.
featureBank = pd.read_feather(cfg.FILE_PRODUCTION_FEATURES)
# Row index -> uniprot_id lookup. Series.to_dict() already keys on the index,
# so no re-wrapping of the column with its own index values is needed.
dict_featureBank = featureBank['uniprot_id'].to_dict()

# Load the uniprot -> rhea dictionary from its JSON file.
with open(cfg.DICT_UNIPROT_RHEA, "r") as json_file:
    dict_uniprot2rhea = json.load(json_file)

# T5 embedding of the test sequences (the helper expects an 'id' column).
embd_t5 = ebdt5.get_embd_seq(
    seqdfwithid=ds_test[['uniprot_id', 'seq']].rename(columns={'uniprot_id':'id'}),
    batch_size=20,
)

# T5 similarity against the feature bank; the 'simi' column is renamed to 't5'.
t5_cos = predrxn.get_top_protein_simi(
    x_feature=np.vstack(featureBank.t5),
    y_feature=np.vstack(embd_t5.t5),
    y_uniprot_id=embd_t5.id,
    dict_featureBank=dict_featureBank,
    dict_uniprot2rhea=dict_uniprot2rhea,
    topk=10,
).rename(columns={'simi':'t5'})

end_time = time.perf_counter()
print(f"Time cost: {end_time - time_start} s")

Processing Sequences: 100%|██████████| 13515/13515 [04:56<00:00, 45.63it/s] 


Processed 13515 proteins in 296.19 seconds
Time cost: 1026.413476228714 s


### 3.10 UniRep

In [None]:
from jax_unirep import get_reps

: 

from module.embedding.