In [2]:
import pandas as pd
import numpy as np
import joblib, os,time, argparse
import benchmark_common as bcommon
import config as cfg
import benchmark_test as btest
import benchmark_train as btrain
import benchmark_evaluation as eva
import tools.funclib as funclib
import tools.embedding_esm as esmebd
from tqdm import tqdm

from sklearn import metrics
from sklearn.model_selection import train_test_split
from gc import callbacks
from xgboost import XGBClassifier
from xgboost.callback import EarlyStopping

from pandarallel import pandarallel #  import pandaralle
pandarallel.initialize() 

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 104 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
print('step 1 loading task data')

data_task1_train = pd.read_feather(cfg.FILE_TASK1_TRAIN)
data_task2_train = pd.read_feather(cfg.FILE_TASK2_TRAIN)
data_task3_train = pd.read_feather(cfg.FILE_TASK3_TRAIN)

data_task1_test = pd.read_feather(cfg.FILE_TASK1_TEST)
data_task2_test = pd.read_feather(cfg.FILE_TASK2_TEST)
data_task3_test = pd.read_feather(cfg.FILE_TASK3_TEST)

step 1 loading task data


## 1. DIAMOND

In [116]:
diamond_task1 = funclib.getblast(train=data_task1_train[['id','seq']], test=data_task1_test[['id','seq']])

res_task1_diamond =diamond_task1[['id','sseqid']].merge(data_task1_train[['id','isenzyme']], how='left', left_on='sseqid', right_on='id')
res_task1_diamond = res_task1_diamond[['id_x', 'isenzyme']].rename(columns={'id_x':'id','isenzyme':'isenzyme_pred'})
res_task1_diamond = data_task1_test.merge(res_task1_diamond, how='left', on='id')
res_task1_diamond['isenzyme_pred_full']=res_task1_diamond.apply(lambda x : x.isenzyme_pred if (str(x.isenzyme_pred)!='nan') else ( True if x.isenzyme==False else False)  , axis=1)
print('task1:\n----------------')
print('baslineName', '\t\t', 'accuracy','\t', 'precision(PPV) \t NPV \t\t', 'recall','\t', 'f1', '\t\t', '\t confusion Matrix')
eva.caculateMetrix(groundtruth=res_task1_diamond.isenzyme, predict=res_task1_diamond.isenzyme_pred_full, baselineName='Diamond', type='binary')

print('\ntask2:\n----------------')

diamond_task2 = funclib.getblast(train=data_task2_train[['id','seq']], test=data_task2_test[['id','seq']])
res_task2_diamond = diamond_task2[['id','sseqid']].merge(data_task2_train[['id','functionCounts']], how='left', left_on='sseqid', right_on='id')
res_task2_diamond = res_task2_diamond[['id_x', 'functionCounts']].rename(columns={'id_x':'id','functionCounts':'functionCounts_diamond'})
res_task2_diamond = data_task2_test.merge( res_task2_diamond, on='id', how='left')
res_task2_diamond = res_task2_diamond.fillna(-1)
print('%12s'%'baslineName', '\t\t', 'accuracy','\t', 'precision-macro \t', 'recall-macro','\t', 'f1-macro')
eva.caculateMetrix(groundtruth=res_task2_diamond.functionCounts, predict=res_task2_diamond.functionCounts_diamond, baselineName='Diamond', type='multi')

diamond_task3 = funclib.getblast(train=data_task3_train[['id','seq']], test=data_task3_test[['id','seq']])
res_task3_diamond = diamond_task3[['id','sseqid']].merge(data_task3_train[['id','ec_number']], how='left', left_on='sseqid', right_on='id')
res_task3_diamond = res_task3_diamond[['id_x', 'ec_number']].rename(columns={'id_x':'id','ec_number':'ec_number_diamond'})

res_task3_diamond = data_task3_test.merge( res_task3_diamond, on='id', how='left')
res_task3_diamond = res_task3_diamond.fillna('-')
print('\ntask3:\n----------------')
print('%12s'%'baslineName', '\t\t', 'accuracy','\t', 'precision-macro \t', 'recall-macro','\t', 'f1-macro')
eva.caculateMetrix(groundtruth=res_task3_diamond.ec_number, predict=res_task3_diamond.ec_number_diamond, baselineName='Diamond', type='multi')


Write finished
Write finished
diamond makedb --in /tmp/train.fasta -d /tmp/train.dmnd --quiet
diamond blastp -d /tmp/train.dmnd  -q  /tmp/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1 --quiet
baslineName 		 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 	 confusion Matrix
Diamond 		0.677030 	0.629801 		0.751150 	0.798865 	0.704330 	 tp: 4083 fp: 2400 fn: 1028 tn: 3103


## 2. ECPred

In [None]:
res_ecpred = pd.read_csv(file_ecpred, sep='\t', header=0)
res_ecpred['isemzyme_ecpred'] = ''
with pd.option_context('mode.chained_assignment', None):
    res_ecpred.isemzyme_ecpred[res_ecpred['EC Number']=='non Enzyme'] = False
    res_ecpred.isemzyme_ecpred[res_ecpred['EC Number']!='non Enzyme'] = True
    
res_ecpred.columns = ['id','ec_ecpred', 'conf', 'isemzyme_ecpred']
res_ecpred = res_ecpred.iloc[:,np.r_[0,1,3]]
res_ecpred['functionCounts_ecpred'] = res_ecpred.ec_ecpred.apply(lambda x :len(str(x).split(',')))
big_res = big_res.merge(res_ecpred, on='id', how='left').drop_duplicates(subset='id')

## 3. DeepEC

In [58]:
# !conda activate deepec
# !python ./baselines/deepec/deepec.py -i ./datasets/task1/test.fasta -o ./results/deepec/

res_deepec = pd.read_csv(cfg.FILE_DEEPEC_RESULTS, sep='\t',names=['id', 'ec_number'], header=0 )
res_deepec.ec_number=res_deepec.apply(lambda x: x['ec_number'].replace('EC:',''), axis=1)
res_deepec.columns = ['id','ec_deepec']

res = []
for index, group in  res_deepec.groupby('id'):
    if len(group)==1:
        res = res + [[group.id.values[0], group.ec_deepec.values[0]]]
    else:
        ecs_str = ','.join(group.ec_deepec.values)
        res = res +[[group.id.values[0],ecs_str]] 
res_deepec = pd.DataFrame(res, columns=['id', 'ec_deepec'])


res_deepec_task1=data_task1_test.merge(res_deepec, on='id', how='left')
res_deepec_task1['isenzyme_pred']=res_deepec_task1.ec_deepec.apply(lambda x: True if str(x)!='nan' else False)

print('task1:\n----------------')
print('baslineName', '\t', 'accuracy','\t', 'precision(PPV) \t NPV \t\t', 'recall','\t', 'f1', '\t\t', '\t confusion Matrix')
eva.caculateMetrix(groundtruth=res_deepec_task1.isenzyme, predict=res_deepec_task1.isenzyme_pred, baselineName='deepec', type='binary')

print('\ntask2:\n----------------')

res_deepec_task2=data_task2_test.merge(res_deepec, on='id', how='left')
res_deepec_task2=res_deepec_task2.fillna('-')
res_deepec_task2['functionCounts_deepec'] =res_deepec_task2.ec_deepec.apply(lambda x: len(x.split(',')))

print('%12s'%'baslineName', '\t\t', 'accuracy','\t', 'precision-macro \t', 'recall-macro','\t', 'f1-macro')
eva.caculateMetrix(groundtruth=res_deepec_task2.functionCounts, predict=res_deepec_task2.functionCounts_deepec, baselineName='deepec', type='multi')

print('\ntask3:\n----------------')
res_deepec_task3 = data_task3_test.merge(res_deepec, on='id', how='left')
res_deepec_task3=res_deepec_task3.fillna('-')
print('%12s'%'baslineName', '\t\t', 'accuracy','\t', 'precision-macro \t', 'recall-macro','\t', 'f1-macro')
eva.caculateMetrix(groundtruth=res_deepec_task3.ec_number, predict=res_deepec_task3.ec_deepec, baselineName='deepec', type='multi')

task1:
----------------
baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 	 confusion Matrix
deepec 		0.638308 	0.944134 		0.590612 	0.264527 	0.413266 	 tp: 1352 fp: 80 fn: 3759 tn: 5423

task2:
----------------
 baslineName 		 accuracy 	 precision-macro 	 recall-macro 	 f1-macro
      deepec  		0.906281  	0.817691 		0.135362 	0.138948

task3:
----------------
 baslineName 		 accuracy 	 precision-macro 	 recall-macro 	 f1-macro
      deepec  		0.104872  	0.791039 		0.287901 	0.121086


## 4. CatFam

In [235]:
res_catfam = pd.read_csv(cfg.RESULTSDIR+'catfam/catfam.txt', sep='\t', names=['id', 'ec_catfam'])
res = []
for index, group in  res_catfam.groupby('id'):
    if len(group)==1:
        res = res + [[group.id.values[0], group.ec_catfam.values[0]]]
    else:
        ecs_str = ','.join(group.ec_catfam.values)
        res = res +[[group.id.values[0],ecs_str]] 
res_catfam = pd.DataFrame(res, columns=['id', 'ec_catfam'])
res_catfam = res_catfam.fillna('-')
res_catfam['isenzyme_catfam']=res_catfam.ec_catfam.apply(lambda x: True if str(x)!='-' else False)
res_catfam['functionCounts_catfam'] = res_catfam.ec_catfam.apply(lambda x :len(str(x).split(',')))

print('task1:\n----------------')
res_catfam_task1=data_task1_test.merge(res_catfam, on='id', how='left')
print('baslineName', '\t', 'accuracy','\t', 'precision(PPV) \t NPV \t\t', 'recall','\t', 'f1', '\t\t', '\t confusion Matrix')
eva.caculateMetrix(groundtruth=res_catfam_task1.isenzyme, predict=res_catfam_task1.isenzyme_catfam, baselineName='catfam', type='binary')

print('\ntask2:\n----------------')
res_catfam_task2=data_task2_test.merge(res_catfam, on='id', how='left')
print('%12s'%'baslineName', '\t\t', 'accuracy','\t', 'precision-macro \t', 'recall-macro','\t', 'f1-macro')
eva.caculateMetrix(groundtruth=res_catfam_task2.functionCounts, predict=res_catfam_task2.functionCounts_catfam, baselineName='catfam', type='multi')

print('\ntask3:\n----------------')
print('%12s'%'baslineName', '\t\t', 'accuracy','\t', 'precision-macro \t', 'recall-macro','\t', 'f1-macro')
res_catfam_task3=data_task3_test.merge(res_catfam, on='id', how='left')
eva.caculateMetrix(groundtruth=res_catfam_task3.ec_number, predict=res_catfam_task3.ec_catfam, baselineName='catfam', type='multi')

task1:
----------------
baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 	 confusion Matrix
catfam 		0.594404 	0.927813 		0.561931 	0.171004 	0.288782 	 tp: 874 fp: 68 fn: 4237 tn: 5435

task2:
----------------
 baslineName 		 accuracy 	 precision-macro 	 recall-macro 	 f1-macro
      catfam  		0.910194  	0.662225 		0.159200 	0.174859

task3:
----------------
      catfam  		0.084328  	0.885812 		0.191622 	0.101330


## 5. PRIAM

In [60]:
res_priam = eva.load_praim_res(resfile=cfg.RESULTSDIR+'priam/PRIAM_20221011103347/ANNOTATION/sequenceECs.txt')


res_priam['isenzyme_priam'] = res_priam.ec_priam.apply(lambda x: True if str(x)!='nan' else False)
res_priam['functionCounts_priam'] = res_priam.ec_priam.apply(lambda x :len(str(x).split(',')))

print('task1:\n----------------')
res_priam_task1=data_task1_test.merge(res_priam, on='id', how='left')
# res_priam_task1['isenzyme_priam']=res_priam_task1.apply(lambda x: x.isenzyme_priam if x.isenzyme_priam==True else (False if x.isenzyme else True), axis=1)
res_priam_task1.isenzyme_priam = res_priam_task1.isenzyme_priam.fillna(False)
print('baslineName', '\t', 'accuracy','\t', 'precision(PPV) \t NPV \t\t', 'recall','\t', 'f1', '\t\t', '\t confusion Matrix')
eva.caculateMetrix(groundtruth=res_priam_task1.isenzyme, predict=res_priam_task1.isenzyme_priam, baselineName='priam', type='binary')

print('\ntask2:\n----------------')
res_priam_task2=data_task2_test.merge(res_priam, on='id', how='left')
res_priam_task2.functionCounts_priam = res_priam_task2.functionCounts_priam.fillna(0)
print('%12s'%'baslineName', '\t\t', 'accuracy','\t', 'precision-macro \t', 'recall-macro','\t', 'f1-macro')
eva.caculateMetrix(groundtruth=res_priam_task2.functionCounts, predict=res_priam_task2.functionCounts_priam, baselineName='priam', type='multi')

print('\ntask3:\n----------------')
res_priam_task3=data_task3_test.merge(res_priam, on='id', how='left')
res_priam_task3.ec_priam = res_priam_task3.ec_priam.fillna('-')
print('%12s'%'baslineName', '\t\t', 'accuracy','\t', 'precision-macro \t', 'recall-macro','\t', 'f1-macro')
eva.caculateMetrix(groundtruth=res_priam_task3.ec_number, predict=res_priam_task3.ec_priam, baselineName='priam', type='multi')


task1:
----------------
baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 	 confusion Matrix
priam 		0.747315 	0.678001 		0.872065 	0.905107 	0.775264 	 tp: 4626 fp: 2197 fn: 485 tn: 3306

task2:
----------------
 baslineName 		 accuracy 	 precision-macro 	 recall-macro 	 f1-macro
       priam  		0.132655  	0.009457 		0.930938 	0.003176

task3:
----------------
 baslineName 		 accuracy 	 precision-macro 	 recall-macro 	 f1-macro
       priam  		0.045197  	0.250324 		0.752550 	0.014136


In [None]:
java -jar /home/shizhenkun/codebase/DMLF/baselines/ECPred/ECPred.jar blast /home/shizhenkun/codebase/DMLF/data/datasets/task1/test.fasta /home/shizhenkun/codebase/DMLF/baselines/ECPred/ /home/shizhenkun/codebase/DMLF/results/ecpred

java -Xmx128G -jar /home/shizhenkun/codebase/DMLF/baselines/priam/PRIAM_search.jar -p /home/shizhenkun/codebase/DMLF/baselines/priam/PRIAM_JAN18 -i /home/shizhenkun/codebase/DMLF/data/datasets/task1/test.fasta -o /home/shizhenkun/codebase/DMLF/results/priam --blast_path /home/shizhenkun/downloads/blast-2.2.13/bin -np 100