# Task1. Enzyme or Non-enzyme Annotation

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2021-10-20  

## 1. Import packages

In [1]:
import numpy as np
import pandas as pd
import time
import datetime
import sys
import os
from tqdm import tqdm
from functools import reduce
import joblib

sys.path.append("../tools/")
import funclib

sys.path.append("../")
import benchmark_train as btrain
import benchmark_test as btest
import config as cfg
import benchmark_evaluation as eva

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

from pandarallel import pandarallel # parallel apply/map support for pandas objects
pandarallel.initialize() # initialise pandarallel; the worker count is reported in the cell output

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 52 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 2.  Load data

In [2]:
# Read the task-1 train and test splits from feather files located under cfg.DATADIR.
train, test = (
    pd.read_feather(cfg.DATADIR + rel_path)
    for rel_path in ('task1/train.feather', 'task1/test.feather')
)
# Report how many rows each split contains.
print('train size: {0}\ntest size: {1}'.format(*map(len, (train, test))))

train size: 469134
test size: 7101


In [5]:
# Display only the training rows whose boolean `isenzyme` flag is True.
train.loc[train['isenzyme']]

Unnamed: 0,id,seq,isenzyme
124,P00958,MSFLISFDKSKKHPAHLQLANNLKIALALEYASKNLKPEVDNDNAA...,True
175,P00812,METGPHYNYYKNRELSIVLAPFSGGQGKLGVEKGPKYMLKHGLQTS...,True
248,P00959,MTQVAKKILVTCALPYANGSIHLGHMLEHIQADVWVRYQRMRGHEV...,True
249,P00348,MAFATRQLVRSLSSSSTAAASAKKILVKHVTVIGGGLMGAGIAQVA...,True
250,P00469,MLEQPYLDLAKKVLDEGHFKPDRTHTGTYSIFGHQMRFDLSKGFPL...,True
...,...,...,...
469123,Q8I6K2,MSNTAVLNDLVALYDRPTEPMFRVKAKKSFKVPKEYVTDRFKNVAV...,True
469127,O81103,MATAPSPTTMGTYSSLISTNSFSTFLPNKSQLSLSGKSKHYVARRS...,True
469129,Q21221,MSSGAPSGSSMSSTPGSPPPRAGGPNSVSFKDLCCLFCCPPFPSSI...,True
469130,Q6QJ72,MSRLLLPKLFSISRTQVPAASLFNNLYRRHKRFVHWTSKMSTDSVR...,True


## 3. Sequence alignment

In [3]:
# Align the test sequences against the training sequences via funclib.getblast
# (the cell output shows DIAMOND being invoked with -k 1).
res_data = funclib.getblast(train, test)
print(' aligment finished \n query samples:{0}\n results samples: {1}'.format(len(test), len(res_data)))

# 1) For every hit, look up the training row whose id equals `sseqid` and keep its
#    `isenzyme` label as the alignment-based prediction (`isenzyme_blast`).
# 2) Left-join those predictions onto the test ids so that queries without any hit
#    remain in the table with a missing `isenzyme_blast`.
res_data = pd.merge(
    test[['id', 'isenzyme']],
    (
        res_data[['id', 'sseqid']]
        .merge(train, left_on='sseqid', right_on='id', how='left')
        .loc[:, ['id_x', 'isenzyme']]
        .rename(columns={'id_x': 'id', 'isenzyme': 'isenzyme_blast'})
    ),
    on='id',
    how='left',
)

Write finished
Write finished
diamond makedb --in /tmp/train.fasta -d /tmp/train.dmnd


diamond v2.0.8.146 (C) Max Planck Society for the Advancement of Science
Documentation, support and updates available at http://www.diamondsearch.org

#CPU threads: 80
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Database input file: /tmp/train.fasta
Opening the database file...  [0.004s]
Loading sequences...  [0.869s]
Masking sequences...  [0.304s]
Writing sequences...  [0.163s]
Hashing sequences...  [0.05s]
Loading sequences...  [0s]
Writing trailer...  [0.003s]
Closing the input file...  [0.001s]
Closing the database file...  [0.038s]
Database hash = eed65be4bf3bb33f8407f23b2e861bca
Processed 469134 sequences, 176795800 letters.
Total time = 1.436s


diamond blastp -d /tmp/train.dmnd  -q  /tmp/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1 --quiet
 aligment finished 
 query samples:7101
 results samples: 5111


In [4]:
# Print the metrics header row, then score the alignment-based predictions against
# the ground-truth labels using the project's evaluation helper.
print(' '.join((
    'baslineName', '\t', 'accuracy', '\t', 'precision(PPV) \t NPV \t\t',
    'recall', '\t', 'f1', '\t\t', '\t confusion Matrix',
)))
eva.caculateMetrix(
    groundtruth=res_data['isenzyme'],
    predict=res_data['isenzyme_blast'],
    baselineName='Blast',
    type='include_unfind',
)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 	 confusion Matrix
Blast 		0.667934 	0.966532 		0.884197 	0.795400 	0.872655 	 tp: 2628 fp: 91 fn: 277 tn: 2115 up: 399 un: 1591


## 4. Embedding Comparison
### 4.1 one-hot + ML

In [4]:
# Work on copies so the frames loaded above are not modified by the later sequence padding.
trainset, testset = train.copy(), test.copy()

In [7]:
MAX_SEQ_LENGTH = 1500  # maximum sequence length kept per sequence

# Truncate longer sequences and right-pad shorter ones with 'X' so that every
# sequence is exactly MAX_SEQ_LENGTH characters long.
trainset['seq'] = trainset['seq'].map(lambda s: s[:MAX_SEQ_LENGTH].ljust(MAX_SEQ_LENGTH, 'X'))
testset['seq'] = testset['seq'].map(lambda s: s[:MAX_SEQ_LENGTH].ljust(MAX_SEQ_LENGTH, 'X'))

# Encode both splits with the project's one-hot helper.
f_train, f_test = funclib.dna_onehot(trainset), funclib.dna_onehot(testset)

In [8]:
# Compute the baseline metrics on the one-hot features.
# Features: every column of the encoded frames from the third one onward.
X_train, X_test = np.array(f_train.iloc[:, 2:]), np.array(f_test.iloc[:, 2:])
# Labels: the boolean `isenzyme` flag converted to integers.
Y_train = np.array(trainset['isenzyme'].astype('int'))
Y_test = np.array(testset['isenzyme'].astype('int'))
funclib.run_baseline(X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 		 confusion Matrix
knn 		0.611745 	0.666667 		0.595238 	0.331114 	0.442467 	 tp: 1094 fp: 547 fn: 2210 tn: 3250
lr 		0.672159 	0.630412 		0.718666 	0.713983 	0.669600 	 tp: 2359 fp: 1383 fn: 945 tn: 2414
xg 		0.723278 	0.730942 		0.717991 	0.641344 	0.683218 	 tp: 2119 fp: 780 fn: 1185 tn: 3017
dt 		0.617096 	0.599932 		0.629133 	0.531477 	0.563633 	 tp: 1756 fp: 1171 fn: 1548 tn: 2626
rf 		0.715111 	0.691709 		0.735904 	0.699455 	0.695561 	 tp: 2311 fp: 1030 fn: 993 tn: 2767
gbdt 		0.689621 	0.646510 		0.737974 	0.734564 	0.687730 	 tp: 2427 fp: 1327 fn: 877 tn: 2470


### 4.2 Unirep + ML

In [8]:
# Read the UniRep feature tables (named after their feather files) for both splits.
train_unirep, test_unirep = (
    pd.read_feather(cfg.DATADIR + feather_name)
    for feather_name in ('train_unirep.feather', 'test_unirep.feather')
)

In [9]:
# Left-join the UniRep features onto the labelled sequences by `id`.
# NOTE(review): this rebinds train_unirep/test_unirep to the merged frames, so the
# cell is not idempotent — re-run the loading cell before executing it again.
train_unirep = pd.merge(trainset, train_unirep, on='id', how='left')
test_unirep = pd.merge(testset, test_unirep, on='id', how='left')

In [10]:
# Features: everything after the first three columns of the merged frames.
X_train, X_test = np.array(train_unirep.iloc[:, 3:]), np.array(test_unirep.iloc[:, 3:])

# Labels: the boolean `isenzyme` flag as a flat integer vector.
Y_train = np.array(train_unirep['isenzyme'].astype('int')).flatten()
Y_test = np.array(test_unirep['isenzyme'].astype('int')).flatten()

# Run the project's set of baseline classifiers and print their metrics.
funclib.run_baseline(X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 		 confusion Matrix
knn 		0.851852 	0.854088 		0.850038 	0.822034 	0.837754 	 tp: 2716 fp: 464 fn: 588 tn: 3333
lr 		0.809604 	0.933008 		0.752218 	0.636501 	0.756747 	 tp: 2103 fp: 151 fn: 1201 tn: 3646
xg 		0.861991 	0.885790 		0.844461 	0.807506 	0.844839 	 tp: 2668 fp: 344 fn: 636 tn: 3453
dt 		0.769610 	0.789986 		0.755740 	0.687651 	0.735275 	 tp: 2272 fp: 604 fn: 1032 tn: 3193
rf 		0.841994 	0.909535 		0.801442 	0.733354 	0.811997 	 tp: 2423 fp: 241 fn: 881 tn: 3556
gbdt 		0.785101 	0.894060 		0.734365 	0.610472 	0.725540 	 tp: 2017 fp: 239 fn: 1287 tn: 3558


### 4.3 ESM REP33 + ML

In [9]:
# Read the rep33 feature tables for both splits.
train_esm_33, test_esm_33 = (
    pd.read_feather(cfg.DATADIR + feather_name)
    for feather_name in ('train_rep33.feather', 'test_rep33.feather')
)

# Left-join the features onto the labelled sequences by `id`.
train_esm = pd.merge(trainset, train_esm_33, on='id', how='left')
test_esm = pd.merge(testset, test_esm_33, on='id', how='left')

In [10]:
# Features: everything after the first four columns of the merged frames.
# NOTE(review): the "Ours" section slices the rep32 merge with iloc[:, 3:] while the
# ESM baseline cells use 4: — confirm which offset matches the feather schema.
X_train = np.array(train_esm.iloc[:,4:])
X_test = np.array(test_esm.iloc[:,4:])

# Labels: the label column is spelled `isenzyme` in the train/test frames; the stale
# `isemzyme` spelling used here previously does not exist in the current data and
# raised an AttributeError.
Y_train = np.array(train_esm.isenzyme.astype('int')).flatten()
Y_test = np.array(test_esm.isenzyme.astype('int')).flatten()

# Run the project's set of baseline classifiers and print their metrics.
funclib.run_baseline(X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 		 confusion Matrix
knn 		0.927300 	0.935953 		0.920835 	0.898296 	0.916738 	 tp: 3215 fp: 220 fn: 364 tn: 4234
lr 		0.908378 	0.927005 		0.895196 	0.862252 	0.893457 	 tp: 3086 fp: 243 fn: 493 tn: 4211
xg 		0.928047 	0.952913 		0.910593 	0.882090 	0.916135 	 tp: 3157 fp: 156 fn: 422 tn: 4298
dt 		0.833811 	0.848664 		0.823884 	0.763062 	0.803590 	 tp: 2731 fp: 487 fn: 848 tn: 3967
rf 		0.916096 	0.960965 		0.887136 	0.846046 	0.899851 	 tp: 3028 fp: 123 fn: 551 tn: 4331
gbdt 		0.865804 	0.901703 		0.843089 	0.784297 	0.838912 	 tp: 2807 fp: 306 fn: 772 tn: 4148


### 4.4 ESM REP32 + ML

In [10]:
# Read the rep32 feature tables for both splits.
train_esm_32, test_esm_32 = (
    pd.read_feather(cfg.DATADIR + feather_name)
    for feather_name in ('train_rep32.feather', 'test_rep32.feather')
)

# Left-join the features onto the labelled sequences by `id`.
train_esm = pd.merge(trainset, train_esm_32, on='id', how='left')
test_esm = pd.merge(testset, test_esm_32, on='id', how='left')

# Features: everything after the first four columns of the merged frames.
# NOTE(review): the "Ours" section slices this same merge with iloc[:, 3:] — confirm
# which offset matches the feather schema.
X_train, X_test = np.array(train_esm.iloc[:, 4:]), np.array(test_esm.iloc[:, 4:])

# Labels: the boolean `isenzyme` flag as a flat integer vector.
Y_train = np.array(train_esm['isenzyme'].astype('int')).flatten()
Y_test = np.array(test_esm['isenzyme'].astype('int')).flatten()

# Run the project's set of baseline classifiers and print their metrics.
funclib.run_baseline(X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 		 confusion Matrix
knn 		0.924799 	0.939125 		0.913352 	0.896489 	0.917312 	 tp: 2962 fp: 192 fn: 342 tn: 3605
lr 		0.908604 	0.927536 		0.893894 	0.871671 	0.898736 	 tp: 2880 fp: 225 fn: 424 tn: 3572
xg 		0.921420 	0.949869 		0.899975 	0.877421 	0.912209 	 tp: 2899 fp: 153 fn: 405 tn: 3644
dt 		0.830587 	0.855981 		0.812530 	0.764528 	0.807674 	 tp: 2526 fp: 425 fn: 778 tn: 3372
rf 		0.908604 	0.961418 		0.872633 	0.837167 	0.895001 	 tp: 2766 fp: 111 fn: 538 tn: 3686
gbdt 		0.874947 	0.905641 		0.852777 	0.816283 	0.858644 	 tp: 2697 fp: 281 fn: 607 tn: 3516


### 4.5 ESM REP0 + ML

In [7]:
# Read the rep0 feature tables for both splits.
train_esm_0 = pd.read_feather(cfg.DATADIR + 'train_rep0.feather')
test_esm_0 = pd.read_feather(cfg.DATADIR + 'test_rep0.feather')

# Left-join the features onto the labelled sequences by `id`.
train_esm = trainset.merge(train_esm_0, on='id', how='left')
test_esm = testset.merge(test_esm_0, on='id', how='left')

# Features: everything after the first four columns of the merged frames.
# NOTE(review): the "Ours" section uses iloc[:, 3:] on an equivalent merge — confirm
# which offset matches the feather schema.
X_train = np.array(train_esm.iloc[:,4:])
X_test = np.array(test_esm.iloc[:,4:])

# Labels: the label column is spelled `isenzyme` in the train/test frames; the stale
# `isemzyme` spelling used here previously does not exist in the current data and
# raised an AttributeError.
Y_train = np.array(train_esm.isenzyme.astype('int')).flatten()
Y_test = np.array(test_esm.isenzyme.astype('int')).flatten()

# Run the project's set of baseline classifiers and print their metrics.
funclib.run_baseline(X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 		 confusion Matrix
knn 		0.824599 	0.789179 		0.855641 	0.827326 	0.807802 	 tp: 2961 fp: 791 fn: 618 tn: 3663
lr 		0.757251 	0.721031 		0.787948 	0.742386 	0.731553 	 tp: 2657 fp: 1028 fn: 922 tn: 3426
xg 		0.847504 	0.844555 		0.849686 	0.806091 	0.824875 	 tp: 2885 fp: 531 fn: 694 tn: 3923
dt 		0.760612 	0.739722 		0.776370 	0.713887 	0.726575 	 tp: 2555 fp: 899 fn: 1024 tn: 3555
rf 		0.853853 	0.863623 		0.847017 	0.797988 	0.829509 	 tp: 2856 fp: 451 fn: 723 tn: 4003
gbdt 		0.820988 	0.810020 		0.829258 	0.781503 	0.795506 	 tp: 2797 fp: 656 fn: 782 tn: 3798


## 5. Ours

In [None]:
# Collect the alignment-based predictions that are later combined with the ML model.
# The alignment results live in `res_data` (built in the sequence-alignment section),
# whose columns are `id`, `isenzyme` (ground truth) and `isenzyme_blast` (label of the
# matched training sequence, missing when the query had no hit). The previous version
# read from an undefined `res` frame with outdated column names.
blastres = pd.DataFrame({
    'id': res_data['id'],
    'isemzyme_groundtruth': res_data['isenzyme'],
    'isEmzyme_pred_blast': res_data['isenzyme_blast'],
})

In [6]:
# rep32 features used by the combined ("ours") predictor.
train_esm_32, test_esm_32 = (
    pd.read_feather(cfg.DATADIR + feather_name)
    for feather_name in ('train_rep32.feather', 'test_rep32.feather')
)

# Left-join the features onto the labelled sequences by `id`.
train_esm = pd.merge(trainset, train_esm_32, on='id', how='left')
test_esm = pd.merge(testset, test_esm_32, on='id', how='left')

# Features: everything after the first three columns of the merged frames.
# NOTE(review): the rep32 baseline cell slices this same merge with iloc[:, 4:] —
# confirm which offset matches the feather schema.
X_train, X_test = np.array(train_esm.iloc[:, 3:]), np.array(test_esm.iloc[:, 3:])

# Labels: the boolean `isenzyme` flag as a flat integer vector.
Y_train = np.array(train_esm['isenzyme'].astype('int')).flatten()
Y_test = np.array(test_esm['isenzyme'].astype('int')).flatten()

In [None]:
# groundtruth, predict, predictprob = funclib.xgmain(X_train, Y_train, X_test, Y_test, type='binary')
# funclib.knnmain is unpacked into four values (the last being the fitted model) in
# the model-saving cell below, so unpack four values here as well; unpacking three
# raised "too many values to unpack".
groundtruth, predict, predictprob, model = funclib.knnmain(X_train, Y_train, X_test, Y_test, type='binary')

# The column keeps its historical `_xg` name even though the KNN model produces it.
blastres['isEmzyme_pred_xg'] = predict
blastres.isEmzyme_pred_xg = blastres.isEmzyme_pred_xg.astype('bool')

# Final prediction: use the alignment-based label when there is one, otherwise fall
# back to the ML prediction. A vectorised missing-value test replaces the row-wise
# `str(value) == 'nan'` comparison.
blastres['isEmzyme_pred_slice'] = np.where(
    blastres['isEmzyme_pred_blast'].isna(),
    blastres['isEmzyme_pred_xg'],
    blastres['isEmzyme_pred_blast'],
).astype('bool')

# Print the metrics header row, then score the combined prediction.
print('baslineName', '\t', 'accuracy','\t', 'precision(PPV) \t NPV \t\t', 'recall','\t', 'f1', '\t\t', '\t confusion Matrix')
eva.caculateMetrix( groundtruth=blastres.isemzyme_groundtruth,  predict=blastres.isEmzyme_pred_slice, baselineName='ours', type='binary')

In [7]:
# Fit the KNN classifier on the rep32 features; the fourth returned value is the model object.
groundtruth, predict, predictprob, model = funclib.knnmain(
    X_train, Y_train, X_test, Y_test, type='binary'
)
# Persist the model to the path configured in cfg.ISENZYME_MODEL.
joblib.dump(value=model, filename=cfg.ISENZYME_MODEL)

['/home/shizhenkun/codebase/DMLF/model/isenzyme.model']

In [62]:
# Save the combined predictions as a tab-separated file without the row index.
# `index` is documented as a boolean, so pass False explicitly instead of relying on
# None being treated as falsy.
blastres.to_csv(cfg.FILE_SLICE_ISENZYME_RESULTS, sep='\t', index=False)