# Task1. 预测是酶还是非酶

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2021-08-26  

## 任务简介
该任务通过给定蛋白序列，预测该蛋白是酶还是非酶。本任务所使用的数据集为Sprot（Swiss-Prot），先在该数据集上进行学习，然后对新给定的蛋白序列预测其是酶还是非酶。


## 数据统计
- 数据源Sprot，酶 219,227 条， 非酶226,539条。
- 将数据集中的所有数据按照时间排序，～90%作为训练集，～10%作为测试集，找到对应时间节点为2009年12月14日。
- 以2009年12月14日为时间节点，之前的数据为训练集，之后的数据为测试集，具体数据集统计如下： 

###  三级以下EC号的算作是酶 







## 1. 导入必要的包、定义公共函数

In [1]:
import numpy as np
import pandas as pd
import random
import time
import datetime
import sys
import os
from tqdm import tqdm
from functools import reduce

sys.path.append("../../tools/")
import commontools
import funclib
import ucTools

sys.path.append("../../")
import benchmark_common as bcommon
import benchmark_train as btrain
import benchmark_test as btest
import benchmark_config as cfg
import benchmark_evaluation as eva

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

from pandarallel import pandarallel # import pandarallel
pandarallel.initialize() # initialize the parallel library; it reports its worker count on startup

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 2. 加载数据

In [2]:
# Read the pre-split train / test tables from the configured data directory.
train, test = (
    pd.read_feather(cfg.DATADIR + file_name)
    for file_name in ('train.feather', 'test.feather')
)

size_report = 'train size: {0}\ntest size: {1}'
print(size_report.format(len(train), len(test)))

# Slim working copies holding only the identifier, label and sequence columns.
base_columns = ['id', 'isemzyme','seq', 'seqlength']
trainset = train.loc[:, base_columns].reset_index(drop=True)
testset = test.loc[:, base_columns].reset_index(drop=True)

train size: 469129
test size: 8033


In [3]:
# Display the first two rows to inspect the column layout of the training table.
train.head(2)

Unnamed: 0,id,isemzyme,functionCounts,ec_number,ec_specific_level,seq,seqlength,f1,f2,f3,...,f1891,f1892,f1893,f1894,f1895,f1896,f1897,f1898,f1899,f1900
0,A0A023GPI8,False,0,-,0,ADTIVAVELDTYPNTDIGDPSYPHIGIDIKSVRSKKTAKWNMQNGK...,236,0.000115,0.105367,0.037413,...,0.052682,-0.020702,0.081852,0.008791,-0.036727,-0.027552,-0.254128,0.020778,0.140904,0.140635
1,A0A023GPJ0,False,0,-,0,MFGIFSKGEPVSMEGELVQPSSIVINDYEEELHLPLSYWDIKDYKN...,143,0.001688,-0.017209,0.029518,...,0.04057,-0.031992,-0.174783,0.000952,-0.136715,0.042773,0.132279,0.015899,0.15293,-0.003412


## 3. 同源比对

In [4]:
# Read the alignment results (test FASTA queried against the training FASTA).
res_data = funclib.getblast_fasta(
    testfasta=cfg.TEST_FASTA,
    trainfasta=cfg.TRAIN_FASTA,
)
alignment_report = ' aligment finished \n query samples:{0}\n results samples: {1}'
print(alignment_report.format(len(test), len(res_data)))

 aligment finished 
 query samples:8033
 results samples: 6034


In [5]:
# Attach labels to the alignment results.
# Predicted label = enzyme flag of the training sequence that was hit (sseqid);
# dict.get yields None when the hit id is not present in the training table.
isEmzyme_dict = dict(zip(train.id, train.isemzyme))
res_data['isEmzyme_pred'] = res_data['sseqid'].apply(isEmzyme_dict.get)

# Ground-truth label = enzyme flag of the query sequence in the test table.
isEmzyme_dict_test = dict(zip(test.id, test.isemzyme))
res_data['isEmzyme_groudtruth'] = res_data['id'].apply(isEmzyme_dict_test.get)

# Keep only the query id and its predicted label from the alignment results.
res_data_sub = res_data[['id','isEmzyme_pred']]

# Select the id and enzyme-flag columns of the test set.
# reset_index returns a fresh frame, so no in-place mutation of a column slice is needed.
test_data_sub = test[['id', 'isemzyme']].reset_index(drop=True)

# Left-join the alignment predictions onto the test set; queries without a hit get NaN.
# validate='many_to_one' raises pandas.errors.MergeError if any query id appears more
# than once in the alignment results. Without the check such duplicates would silently
# fan out rows of `res`, inflating the metrics below and breaking any later step that
# assumes `res` has exactly one row per test sample in test order.
res = pd.merge(test_data_sub, res_data_sub, on='id', how='left', validate='many_to_one')

print('baslineName', '\t', 'accuracy','\t', 'precision(PPV) \t NPV \t\t', 'recall','\t', 'f1', '\t\t', '\t confusion Matrix')
eva.caculateMetrix(groundtruth=res.isemzyme, predict=res.isEmzyme_pred, baselineName='Blast', type='include_unfind')


baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 	 confusion Matrix
Blast 		0.703598 	0.969148 		0.904980 	0.807488 	0.880963 	 tp: 2890 fp: 92 fn: 290 tn: 2762 up: 399 un: 1600


## 4. 机器学习方法预测
### 4.1 one-hot + 机器学习

In [6]:
MAX_SEQ_LENGTH = 1500  # maximum sequence length kept for the one-hot encoding


def _to_fixed_length(sequence):
    """Cut the sequence to MAX_SEQ_LENGTH characters and right-pad shorter ones with 'X'."""
    return sequence[:MAX_SEQ_LENGTH].ljust(MAX_SEQ_LENGTH, 'X')


trainset.seq = trainset.seq.map(_to_fixed_length)
testset.seq = testset.seq.map(_to_fixed_length)

f_train = funclib.dna_onehot(trainset)  # encode the training set
f_test = funclib.dna_onehot(testset)  # encode the test set

In [7]:
# Compute the baseline metrics on the one-hot encoding.
# Labels: enzyme flag cast to int. Features: every encoded column from position 2 onward.
Y_train = np.array(f_train['isemzyme'].astype('int'))
Y_test = np.array(f_test['isemzyme'].astype('int'))

X_train = np.array(f_train.iloc[:, 2:])
X_test = np.array(f_test.iloc[:, 2:])

funclib.run_baseline(X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 		 confusion Matrix
knn 		0.633014 	0.668626 		0.622201 	0.349539 	0.459083 	 tp: 1251 fp: 620 fn: 2328 tn: 3834
lr 		0.657413 	0.597547 		0.724302 	0.707740 	0.647992 	 tp: 2533 fp: 1706 fn: 1046 tn: 2748
xg 		0.742064 	0.733788 		0.747609 	0.660799 	0.695384 	 tp: 2365 fp: 858 fn: 1214 tn: 3596
dt 		0.626914 	0.586197 		0.656431 	0.552948 	0.569087 	 tp: 1979 fp: 1397 fn: 1600 tn: 3057
rf 		0.716171 	0.676831 		0.749312 	0.694607 	0.685604 	 tp: 2486 fp: 1187 fn: 1093 tn: 3267
gbdt 		0.677455 	0.616290 		0.746103 	0.731489 	0.668966 	 tp: 2618 fp: 1630 fn: 961 tn: 2824


## 5. Unirep + 机器学习

In [8]:
# Labels: enzyme flag cast to int, as flat 1-D arrays.
Y_train = np.array(train['isemzyme'].astype('int')).flatten()
Y_test = np.array(test['isemzyme'].astype('int')).flatten()

# Features: every column of the full tables from position 7 onward.
X_train = np.array(train.iloc[:, 7:])
X_test = np.array(test.iloc[:, 7:])

funclib.run_baseline(X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 		 confusion Matrix
knn 		0.848624 	0.867039 		0.836311 	0.779827 	0.821124 	 tp: 2791 fp: 428 fn: 788 tn: 4026
lr 		0.838043 	0.862739 		0.822195 	0.756915 	0.806370 	 tp: 2709 fp: 431 fn: 870 tn: 4023
xg 		0.880866 	0.907649 		0.862985 	0.815591 	0.859161 	 tp: 2919 fp: 297 fn: 660 tn: 4157
dt 		0.792232 	0.788520 		0.794834 	0.729254 	0.757730 	 tp: 2610 fp: 700 fn: 969 tn: 3754
rf 		0.863438 	0.915050 		0.832838 	0.764459 	0.833004 	 tp: 2736 fp: 254 fn: 843 tn: 4200
gbdt 		0.833188 	0.873790 		0.809051 	0.731210 	0.796167 	 tp: 2617 fp: 378 fn: 962 tn: 4076


## 6. ESM + 机器学习
### 6.1 REP33

In [9]:
# Read the rep33 embedding tables.
train_esm_33, test_esm_33 = (
    pd.read_feather(cfg.DATADIR + file_name)
    for file_name in ('train_rep33.feather', 'test_rep33.feather')
)

# Attach the embeddings to the base frames by id (left join keeps every base row).
train_esm = pd.merge(trainset, train_esm_33, on='id', how='left')
test_esm = pd.merge(testset, test_esm_33, on='id', how='left')

In [10]:
# Labels: enzyme flag cast to int, as flat 1-D arrays.
Y_train = np.array(train_esm['isemzyme'].astype('int')).flatten()
Y_test = np.array(test_esm['isemzyme'].astype('int')).flatten()

# Features: every merged column from position 4 onward (after the base-frame columns).
X_train = np.array(train_esm.iloc[:, 4:])
X_test = np.array(test_esm.iloc[:, 4:])

funclib.run_baseline(X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 		 confusion Matrix
knn 		0.927300 	0.935953 		0.920835 	0.898296 	0.916738 	 tp: 3215 fp: 220 fn: 364 tn: 4234
lr 		0.908378 	0.927005 		0.895196 	0.862252 	0.893457 	 tp: 3086 fp: 243 fn: 493 tn: 4211
xg 		0.928047 	0.952913 		0.910593 	0.882090 	0.916135 	 tp: 3157 fp: 156 fn: 422 tn: 4298
dt 		0.833811 	0.848664 		0.823884 	0.763062 	0.803590 	 tp: 2731 fp: 487 fn: 848 tn: 3967
rf 		0.916096 	0.960965 		0.887136 	0.846046 	0.899851 	 tp: 3028 fp: 123 fn: 551 tn: 4331
gbdt 		0.865804 	0.901703 		0.843089 	0.784297 	0.838912 	 tp: 2807 fp: 306 fn: 772 tn: 4148


### 6.2 REP32

In [6]:
# Read the rep32 embedding tables.
train_esm_32, test_esm_32 = (
    pd.read_feather(cfg.DATADIR + file_name)
    for file_name in ('train_rep32.feather', 'test_rep32.feather')
)

# Attach the embeddings to the base frames by id (left join keeps every base row).
train_esm = pd.merge(trainset, train_esm_32, on='id', how='left')
test_esm = pd.merge(testset, test_esm_32, on='id', how='left')

# Labels: enzyme flag cast to int, as flat 1-D arrays.
Y_train = np.array(train_esm['isemzyme'].astype('int')).flatten()
Y_test = np.array(test_esm['isemzyme'].astype('int')).flatten()

# Features: every merged column from position 4 onward (after the base-frame columns).
X_train = np.array(train_esm.iloc[:, 4:])
X_test = np.array(test_esm.iloc[:, 4:])

funclib.run_baseline(X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 		 confusion Matrix
knn 		0.929416 	0.941125 		0.920762 	0.897737 	0.918919 	 tp: 3213 fp: 201 fn: 366 tn: 4253




lr 		0.909374 	0.924888 		0.898247 	0.867002 	0.895010 	 tp: 3103 fp: 252 fn: 476 tn: 4202
xg 		0.926180 	0.950513 		0.909091 	0.880134 	0.913971 	 tp: 3150 fp: 164 fn: 429 tn: 4290
dt 		0.842898 	0.855914 		0.834031 	0.778430 	0.815335 	 tp: 2786 fp: 469 fn: 793 tn: 3985
rf 		0.915723 	0.965661 		0.884076 	0.840738 	0.898880 	 tp: 3009 fp: 107 fn: 570 tn: 4347
gbdt 		0.872028 	0.905049 		0.850737 	0.796312 	0.847206 	 tp: 2850 fp: 299 fn: 729 tn: 4155


### 6.3 REP0

In [7]:
# Read the rep0 embedding tables.
train_esm_0, test_esm_0 = (
    pd.read_feather(cfg.DATADIR + file_name)
    for file_name in ('train_rep0.feather', 'test_rep0.feather')
)

# Attach the embeddings to the base frames by id (left join keeps every base row).
train_esm = pd.merge(trainset, train_esm_0, on='id', how='left')
test_esm = pd.merge(testset, test_esm_0, on='id', how='left')

# Labels: enzyme flag cast to int, as flat 1-D arrays.
Y_train = np.array(train_esm['isemzyme'].astype('int')).flatten()
Y_test = np.array(test_esm['isemzyme'].astype('int')).flatten()

# Features: every merged column from position 4 onward (after the base-frame columns).
X_train = np.array(train_esm.iloc[:, 4:])
X_test = np.array(test_esm.iloc[:, 4:])

funclib.run_baseline(X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 		 confusion Matrix
knn 		0.824599 	0.789179 		0.855641 	0.827326 	0.807802 	 tp: 2961 fp: 791 fn: 618 tn: 3663
lr 		0.757251 	0.721031 		0.787948 	0.742386 	0.731553 	 tp: 2657 fp: 1028 fn: 922 tn: 3426
xg 		0.847504 	0.844555 		0.849686 	0.806091 	0.824875 	 tp: 2885 fp: 531 fn: 694 tn: 3923
dt 		0.760612 	0.739722 		0.776370 	0.713887 	0.726575 	 tp: 2555 fp: 899 fn: 1024 tn: 3555
rf 		0.853853 	0.863623 		0.847017 	0.797988 	0.829509 	 tp: 2856 fp: 451 fn: 723 tn: 4003
gbdt 		0.820988 	0.810020 		0.829258 	0.781503 	0.795506 	 tp: 2797 fp: 656 fn: 782 tn: 3798


## 7. Ours

In [8]:
# Collect the BLAST outcome per test sample: id, true label and BLAST-predicted label.
blastres = pd.DataFrame({
    'id': res.id,
    'isemzyme_groundtruth': res.isemzyme,
    'isEmzyme_pred_blast': res.isEmzyme_pred,
})

In [9]:
# rep32 embeddings: read both tables from the data directory.
train_esm_32, test_esm_32 = (
    pd.read_feather(cfg.DATADIR + file_name)
    for file_name in ('train_rep32.feather', 'test_rep32.feather')
)

# Attach the embeddings to the base frames by id (left join keeps every base row).
train_esm = pd.merge(trainset, train_esm_32, on='id', how='left')
test_esm = pd.merge(testset, test_esm_32, on='id', how='left')

# Labels: enzyme flag cast to int, as flat 1-D arrays.
Y_train = np.array(train_esm['isemzyme'].astype('int')).flatten()
Y_test = np.array(test_esm['isemzyme'].astype('int')).flatten()

# Features: every merged column from position 4 onward (after the base-frame columns).
X_train = np.array(train_esm.iloc[:, 4:])
X_test = np.array(test_esm.iloc[:, 4:])

In [10]:
# Alternative classifier that was tried here:
# groundtruth, predict, predictprob = funclib.xgmain(X_train, Y_train, X_test, Y_test, type='binary')
groundtruth, predict, predictprob = funclib.knnmain(X_train, Y_train, X_test, Y_test, type='binary')

# NOTE: the column keeps its "_xg" name although the predictions currently come from
# knnmain; the name is left unchanged so the saved result file keeps the same schema.
# The assignment is positional, so `blastres` must hold one row per test sample in test order.
blastres['isEmzyme_pred_xg'] = predict
blastres.isEmzyme_pred_xg = blastres.isEmzyme_pred_xg.astype('bool')

# Final prediction: take the BLAST label where one exists, otherwise fall back to the
# classifier. notna() treats NaN, None and pd.NA alike as "no BLAST label"; the previous
# str(value) == 'nan' test only caught NaN and let None through as a final prediction.
# The vectorized where() also replaces a row-by-row apply over the whole frame.
blastres['isEmzyme_pred_slice'] = (
    blastres['isEmzyme_pred_blast']
    .where(blastres['isEmzyme_pred_blast'].notna(), blastres['isEmzyme_pred_xg'])
    .astype('bool')
)
print('baslineName', '\t', 'accuracy','\t', 'precision(PPV) \t NPV \t\t', 'recall','\t', 'f1', '\t\t', '\t confusion Matrix')
eva.caculateMetrix( groundtruth=blastres.isemzyme_groundtruth,  predict=blastres.isEmzyme_pred_slice, baselineName='ours', type='binary')

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 	 confusion Matrix
ours 		0.931159 	0.952452 		0.915974 	0.889913 	0.920121 	 tp: 3185 fp: 159 fn: 394 tn: 4295


In [62]:
# Save the combined predictions as a tab-separated file, without writing the row index.
blastres.to_csv(cfg.FILE_SLICE_ISENZYME_RESULTS, index=False, sep='\t')