# Baseline Performance Comparison - EC-based methods
> Author: Shi Zhenkun   
> Email: zhenkun.shi@tib.cas.cn   
> Last update: 2024-09-29



## 0. Import packages

In [2]:
import os,sys
import pandas as pd

sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../')
from config import conf as cfg
import tools.bioFunctionLib as bfl
import tools.btools as btools
from tkinter import _flatten # type: ignore

from pandarallel import pandarallel # 导入pandaralle
pandarallel.initialize(progress_bar=False)

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 1. Define Common Functions

In [7]:
def caculate_reaction_pred_true(reaction_groundtruth, reaction_pred):
    set_ground = set(reaction_groundtruth.split(';'))
    set_pred = set(reaction_pred.split(';'))
    if set_ground == set_pred:
        return True
    else:
        return False

#拼合测试集与预测结果    
def merge_test_set(testset, needmergeset, mergekey='uniprot_id', merge_way='left', fillna='NO-PREDICTION'):
    testset = testset[['uniprot_id', 'reaction_id', 'ec_number']].rename(columns={'reaction_id':'reaction_groundtruth','ec_number':'ec_groundtruth'})
    res = testset.merge(needmergeset, on=mergekey, how=merge_way).fillna(fillna)
    # res = testset.merge(needmergeset, on=mergekey, how=merge_way)
    return res

## 2. Load exp data

In [10]:
print(f'Use dataset: {cfg.FILE_DS_TEST}')
ds_test = pd.read_feather(cfg.FILE_DS_TEST)
ds_test.ec_number = ds_test.ec_number.apply(lambda x: x.replace(',',';')) # 统一spliter

#add reaction data
ds_rhea = pd.read_feather(cfg.FILE_DS_RHEA_REACTIONS)

# caculate reaction_id ec_number map
ec_reaction_map = ds_rhea[['reaction_id', 'ec_number']].copy()
ec_reaction_map = ec_reaction_map.fillna('REACTION-WITHOUT-EC')
ec_reaction_map = pd.concat([ec_reaction_map, pd.DataFrame({'reaction_id':['NO-PREDICTION', '-'],'ec_number':['NO-PREDICTION', '-']})], axis=0).reset_index(drop=True)
ec_reaction_map.ec_number = ec_reaction_map.ec_number.apply(lambda x: x.replace('EC:',''))
ec_reaction_map = ec_reaction_map.assign(ec_number=ec_reaction_map['ec_number'].str.split(';')).explode('ec_number').reset_index(drop=True).rename(columns={'ec_number': 'ec'})


input_fasta = '/hpcfs/fhome/shizhenkun/codebase/preaction/data/datasets/task240524/ds_test.fasta'
print(f'Test fasta file: {input_fasta}')

ds_test.head(2)

Use dataset: /hpcfs/fhome/shizhenkun/codebase/preaction/data/datasets/task240524/ds_test.feather
Test fasta file: /hpcfs/fhome/shizhenkun/codebase/preaction/data/datasets/task240524/ds_test.fasta


Unnamed: 0,uniprot_id,seq,reaction_id,ec_number,functionCounts,ec_specific_level,isenzyme,label
0,A9JLI2,MLGLQIFTLLSIPTLLYTYEIEPLERTSTPPEKEFGYWCTYANHCR...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,A9JLI3,MRFFSYLGLLLAGLTSLQGFSTDNLLEEELRYWCQYVKNCRFCWTC...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## 3. EC-based baselines

### 3.1 DeepEC

In [4]:
deepec_res_file = '/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/deepec'
print(f'''deepec cmd: 
singularity exec /hpcfs/fpublic/container/singularity/app/deepec/deepec.sif python /opt/deepec/deepec.py -i {input_fasta} -o {deepec_res_file}
      ''')

deepec cmd: 
singularity exec /hpcfs/fpublic/container/singularity/app/deepec/deepec.sif python /opt/deepec/deepec.py -i /hpcfs/fhome/shizhenkun/codebase/preaction/data/datasets/task240524/ds_test.fasta -o /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/deepec
      


In [31]:
# load results
deepec = btools.load_deepec_resluts(filepath=f'{cfg.RESULTS_DIR}/baselines/deepec/DeepEC_Result.txt').rename(columns={'id':'uniprot_id'})
# 拼合DeepEC 预测结果
test_deepec = merge_test_set(testset=ds_test, needmergeset=deepec, fillna='NO-PREDICTION')
# add reaction prediction results
test_deepec['reaction_deepec']=test_deepec.ec_deepec.parallel_apply(lambda x : btools.retrival_reaction_from_ec(ec_pred=x, ec_reaction_map=ec_reaction_map))


test_deepec.to_csv(cfg.FILE_RESULTS_DEEPEC, sep='\t', index=False)
print(f'Write deepec results to: {cfg.FILE_RESULTS_DEEPEC} ')
test_deepec.head(3)

Write deepec results to: /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/exp_test_deepec.tsv 


Unnamed: 0,uniprot_id,reaction_groundtruth,ec_groundtruth,ec_deepec,reaction_deepec
0,A9JLI2,-,-,NO-PREDICTION,NO-PREDICTION
1,A9JLI3,-,-,NO-PREDICTION,NO-PREDICTION
2,A9JLI5,-,-,NO-PREDICTION,NO-PREDICTION


### 3.2 CLEAN

In [None]:
!sudo singularity shell --writable /hpcfs/fpublic/container/singularity/app/clean/clean_sandbox
!sudo singularity exec --writable /hpcfs/fpublic/container/singularity/app/clean/clean_sandbox python /app/clean.py -i '/hpcfs/fhome/shizhenkun/codebase/preaction/data/datasets/task240524/ds_test.fasta' -o '/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/clean/clean.txt'

In [14]:
#format EC resluts from clean
test_clean = btools.load_clean_resluts(res_file='/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/clean/clean.txt_maxsep.csv').rename(columns={'Entry':'uniprot_id'})

# 拼合groundtruth
test_clean= merge_test_set(testset=ds_test, needmergeset= test_clean)
test_clean['reaction_clean']=test_clean.ec_clean.parallel_apply(lambda x: btools.retrival_reaction_from_ec(ec_pred=x, ec_reaction_map=ec_reaction_map))
test_clean = test_clean.replace('', 'EC-WITHOUT-REACTION')
test_clean['isRight_reaction_clean']=test_clean.parallel_apply(lambda x : caculate_reaction_pred_true(reaction_groundtruth=x.reaction_groundtruth, reaction_pred=x.reaction_clean), axis=1) # type: ignore

test_clean.to_csv(cfg.FILE_RESULTS_CLEAN, sep='\t', index=False)
print(f'Write clean results to: {cfg.FILE_RESULTS_CLEAN} ')
test_clean.head(3)

Write clean results to: /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/exp_test_clean.tsv 


Unnamed: 0,uniprot_id,reaction_groundtruth,ec_groundtruth,ec_clean,reaction_clean,isRight_reaction_clean
0,A9JLI2,-,-,3.2.2.6;1.4.3.2;4.2.3.81,RHEA:31427;RHEA:16301;RHEA:13781,False
1,A9JLI3,-,-,4.6.1.18,EC-WITHOUT-REACTION,False
2,A9JLI5,-,-,1.4.3.2,RHEA:13781,False


### 3.3 ECRECer

In [23]:
ecrecer_res_file = '/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/ecrecer.tsv'
print(f'''deepec cmd: 
conda activate DMLF
python /hpcfs/fhome/shizhenkun/codebase/DMLF/production.py  -i {input_fasta} -o {deepec_res_file} -mode p
      ''')

deepec cmd: 
conda activate DMLF
python /hpcfs/fhome/shizhenkun/codebase/DMLF/production.py  -i /hpcfs/fhome/shizhenkun/codebase/preaction/data/datasets/task240524/ds_test.fasta -o /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/deepec -mode p
      


In [15]:

dmlf = pd.read_csv('/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/ecrecer/ECRECer.csv', sep=',')[['id_input','ec_pred']].rename(columns={'id_input':'uniprot_id', 'ec_pred':'ec_ecrecer'})
test_dmlf = merge_test_set(testset=ds_test, needmergeset= dmlf)

test_dmlf.ec_ecrecer = test_dmlf.ec_ecrecer.apply(lambda x: x.replace(',',';'))
test_dmlf['reaction_ecrecer']=test_dmlf.ec_ecrecer.parallel_apply(lambda x: btools.retrival_reaction_from_ec(ec_pred=x, ec_reaction_map=ec_reaction_map))
test_dmlf = test_dmlf.replace('', 'EC-WITHOUT-REACTION')
test_dmlf['isRight_reaction_ecrecer']=test_dmlf.parallel_apply(lambda x : caculate_reaction_pred_true(reaction_groundtruth=x.reaction_groundtruth, reaction_pred=x.reaction_ecrecer), axis=1) # type: ignore
test_dmlf.to_csv(cfg.FILE_RESULTS_ECRECER, sep='\t', index=False)
print(f'Write ecrecer results to: {cfg.FILE_RESULTS_ECRECER} ')
test_dmlf.head(3)

Write ecrecer results to: /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/exp_test_ecrecer.tsv 


Unnamed: 0,uniprot_id,reaction_groundtruth,ec_groundtruth,ec_ecrecer,reaction_ecrecer,isRight_reaction_ecrecer
0,A9JLI2,-,-,-,-,True
1,A9JLI3,-,-,-,-,True
2,A9JLI5,-,-,-,-,True


### 3.4 CATFAM

In [36]:
catfam_res_file = '/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/catfam/catfam.txt'
print(f'''catfam cmd:
 singularity exec /hpcfs/fpublic/container/singularity/app/catfam/catfam.sif /catfam/source/catsearch.pl  -d /catfam/CatFamDB/CatFam_v2.0/CatFam4D99R -i {input_fasta}  -o {catfam_res_file} ''')

catfam cmd:
 singularity exec /hpcfs/fpublic/container/singularity/app/catfam/catfam.sif /catfam/source/catsearch.pl  -d /catfam/CatFamDB/CatFam_v2.0/CatFam4D99R -i /hpcfs/fhome/shizhenkun/codebase/preaction/data/datasets/task240524/ds_test.fasta  -o /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/catfam/catfam.txt 


In [8]:
# load results
catfam = btools.load_catfam_res(resfile=f'/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/catfam/catfam.txt').rename(columns={'id':'uniprot_id'}) 
catfam = catfam.fillna('-') # catfam 所有输入进去的序列都可以返回结果，故返回的结果中没有EC号的被认为是非酶
catfam = catfam.groupby('uniprot_id').agg({ 'uniprot_id':'first',  'ec_catfam': ';'.join}).reset_index(drop=True)

test_catfam = merge_test_set(testset=ds_test, needmergeset=catfam)
test_catfam['reaction_catfam']=test_catfam.ec_catfam.parallel_apply(lambda x: cfunc.retrival_reaction_from_ec(ec_pred=x, ec_reaction_map=ec_reaction_map))
test_catfam = test_catfam.replace('', 'EC-WITHOUT-REACTION')
test_catfam['isRight_reaction_catfam']=test_catfam.parallel_apply(lambda x : caculate_reaction_pred_true(reaction_groundtruth=x.reaction_groundtruth, reaction_pred=x.reaction_catfam), axis=1) # type: ignore

test_catfam.to_csv(cfg.FILE_RESULTS_CATFAM, sep='\t', index=False)

test_catfam.head(3)

Unnamed: 0,uniprot_id,reaction_groundtruth,ec_groundtruth,ec_catfam,reaction_catfam,isRight_reaction_catfam
0,A9JLI2,-,-,-,-,True
1,A9JLI3,-,-,-,-,True
2,A9JLI5,-,-,-,-,True


### 3.5 PRIAM

In [5]:
priam_res_file = '/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/priam/'
print(f'''priam cmd:
 singularity exec /hpcfs/fpublic/container/singularity/app/priam/priam.sif /opt/jdk1.7.0_80/bin/java -Xmx128G -jar /opt/priam/PRIAM_search.jar -p /opt/priam/PRIAM_JAN18  -i {input_fasta} -o {priam_res_file} --blast_path /opt/blast-2.2.26/bin -np 100
 ''')

priam cmd:
 singularity exec /hpcfs/fpublic/container/singularity/app/priam/priam.sif /opt/jdk1.7.0_80/bin/java -Xmx128G -jar /opt/priam/PRIAM_search.jar -p /opt/priam/PRIAM_JAN18  -i /hpcfs/fhome/shizhenkun/codebase/preaction/data/datasets/task240524/ds_test.fasta -o /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/priam/ --blast_path /opt/blast-2.2.26/bin -np 100
 


In [6]:
# %conda activate DMLF
#! rm -rf /home/shizhenkun/codebase/DMLF/baselines/priam/PRIAM_JAN18/PROFILES/LIBRARY
#! java -Xmx128G -jar ./PRIAM_search.jar -p /home/shizhenkun/codebase/DMLF/baselines/priam/PRIAM_JAN18 -i /home/shizhenkun/codebase/preaction/data/datasets/ds_test.fasta -o /home/shizhenkun/codebase/preaction/results/baselines/priam/alfp0921 --blast_path /home/shizhenkun/downloads/blast-2.2.26/bin -np 100

# load results
priam = btools.load_praim_res(resfile=f'{cfg.RESULTS_DIR}baselines/priam/PRIAM_20240708030102/ANNOTATION/sequenceECs.txt').rename(columns={'id':'uniprot_id'})
test_priam = merge_test_set(testset=ds_test, needmergeset=priam)
test_priam['reaction_priam']=test_priam.ec_priam.parallel_apply(lambda x: btools.retrival_reaction_from_ec(ec_pred=x, ec_reaction_map=ec_reaction_map))
test_priam = test_priam.replace('', 'EC-WITHOUT-REACTION')
test_priam['isRight_reaction_catfam']=test_priam.parallel_apply(lambda x : caculate_reaction_pred_true(reaction_groundtruth=x.reaction_groundtruth, reaction_pred=x.reaction_priam), axis=1) # type: ignore
test_priam.to_csv(cfg.FILE_RESULTS_PRIAM, sep='\t', index=False)

test_priam.head(3)

Unnamed: 0,uniprot_id,reaction_groundtruth,ec_groundtruth,ec_priam,reaction_priam,isRight_reaction_catfam
0,A9JLI2,-,-,NO-PREDICTION,NO-PREDICTION,False
1,A9JLI3,-,-,1.14.11.51;2.3.2.27,RHEA:49524,False
2,A9JLI5,-,-,6.5.1.3,EC-WITHOUT-REACTION,False


### 3.6 ECPred

In [28]:
ECPred_res_file = '/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/ecpred/test.txt'
singularity_ecpred = '/hpcfs/fpublic/container/singularity/app/ecpred/ecpred.sif'
print(f'''ECpred cmd: 
singularity exec {singularity_ecpred} java  -Xmx128G -jar /ECPred/ECPred.jar spmap  {input_fasta} /ECPred/ /tmp {ECPred_res_file}
      ''')

ECpred cmd: 
singularity exec /hpcfs/fpublic/container/singularity/app/ecpred/ecpred.sif java  -Xmx128G -jar /ECPred/ECPred.jar spmap  /hpcfs/fhome/shizhenkun/codebase/preaction/data/datasets/task240524/ds_test.fasta /ECPred/ /tmp /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/ecpred/test.txt
      


In [None]:
!singularity exec /hpcfs/fpublic/container/singularity/app/ecpred/ecpred.sif  java  -Xmx128G -jar /ECPred/ECPred.jar spmap  /hpcfs/fhome/shizhenkun/codebase/preaction/data/datasets/task240524/ds_test.fasta   /ECPred/  /hpcfs/fhome/shizhenkun/codebase/preaction/temp  /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/ecpred/test.txt  

In [18]:
# ! java -Xmx128G -jar /home/shizhenkun/codebase/DMLF/baselines/ECPred/ECPred.jar spmap /home/shizhenkun/codebase/preaction/data/datasets/ds_test.fasta /home/shizhenkun/codebase/DMLF/baselines/ECPred/ /home/shizhenkun/codebase/preaction/temp /home/shizhenkun/codebase/preaction/results/baselines/ecpred/alfp0921.txt
# load results
ecpred = btools.load_ecpred_res(resfile=f'/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/ecpred/test.txt')
ecpred = ecpred.groupby('id').agg({ 'id':'first',  'ec_ecpred': ';'.join}).reset_index(drop=True).replace('no Prediction', 'NO-PREDICTION').rename(columns={'id':'uniprot_id'})

test_ecpred = merge_test_set(testset=ds_test, needmergeset=ecpred)
test_ecpred['reaction_ecpred']=test_ecpred.ec_ecpred.parallel_apply(lambda x: btools.retrival_reaction_from_ec(ec_pred=x, ec_reaction_map=ec_reaction_map))
test_ecpred = test_ecpred.replace('', 'EC-WITHOUT-REACTION')
test_ecpred['isRight_reaction_ecpred']=test_ecpred.parallel_apply(lambda x : caculate_reaction_pred_true(reaction_groundtruth=x.reaction_groundtruth, reaction_pred=x.reaction_ecpred), axis=1) # type: ignore

test_ecpred.to_csv(cfg.FILE_RESULTS_ECPRED, sep='\t', index=False)
test_ecpred.head(3)


Unnamed: 0,uniprot_id,reaction_groundtruth,ec_groundtruth,ec_ecpred,reaction_ecpred,isRight_reaction_ecpred
0,A9JLI2,-,-,-,-,True
1,A9JLI3,-,-,-,-,True
2,A9JLI5,-,-,-,-,True


### 3.7 Blast

In [7]:
#trainning set
ds_train = pd.read_feather(cfg.FILE_DS_TRAIN)

#blsting test set
blast_res_ec = bfl.getblast(train=ds_train[['uniprot_id', 'seq']], test=ds_test[['uniprot_id', 'seq']], k=1)    
blast_res_ec= blast_res_ec[['id','sseqid']].merge(ds_train[['uniprot_id', 'ec_number']], left_on='sseqid', right_on='uniprot_id', how='left')[['id','ec_number']].rename(columns={'id':'uniprot_id', 'ec_number':'ec_ecblast'})
#拼合test，加入无预测结果的数据
blast_res_ec=ds_test[['uniprot_id', 'reaction_id','ec_number']].rename(columns={'reaction_id':'reaction_groundtruth', 'ec_number': 'ec_groundtruth'}).merge(blast_res_ec, on='uniprot_id', how='left').fillna('NO-PREDICTION')
#add rxn info
blast_res_ec['reaction_ecblast']=blast_res_ec.ec_ecblast.parallel_apply(lambda x: btools.retrival_reaction_from_ec(ec_pred=x, ec_reaction_map=ec_reaction_map))

blast_res_ec.to_csv(cfg.FILE_RESULTS_BLAST_EC,sep='\t', index=False)

blast_res_ec.head(3)

Unnamed: 0,uniprot_id,reaction_groundtruth,ec_groundtruth,ec_ecblast,reaction_ecblast
0,A9JLI2,-,-,-,-
1,A9JLI3,-,-,-,-
2,A9JLI5,-,-,-,-


In [3]:
cfg.FILE_RESULTS_ECPRED

'/hpcfs/fhome/shizhenkun/codebase/RXNRECer/results/baselines/exp_test_ecpred.tsv'