# Task1. 预测是酶还是非酶

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2021-08-26  

## 任务简介
该任务根据给定的蛋白序列，预测该蛋白是酶还是非酶。本任务所使用的数据集为Sprot：先对数据集中的数据进行学习，然后对新给定的蛋白序列预测其是酶还是非酶。


## 数据统计
- 数据源Sprot，酶 219,227 条， 非酶226,539条。
- 将数据集中的所有数据按照时间排序，～90%作为训练集，～10%作为测试集，找到对应时间节点为2009年12月14日。
- 以2009年12月14日为时间节点，之前的数据为训练集，之后的数据为测试集，具体数据集统计如下： 

###  三级以下EC号的算作是酶 







## 1. 导入必要的包、定义公共函数

In [1]:
import numpy as np
import pandas as pd
import random
import time
import datetime
import sys
import os
from tqdm import tqdm
from functools import reduce

sys.path.append("../../tools/")
import commontools
import funclib
import ucTools

sys.path.append("../../")
import benchmark_common as bcommon
import benchmark_train as btrain
import benchmark_test as btest
import benchmark_config as cfg
import benchmark_evaluation as eva

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

from pandarallel import pandarallel # import pandarallel
pandarallel.initialize() # start the parallel backend (the cell output reports the worker count and memory-file-system transfer)

# IPython autoreload: re-import edited modules (funclib, benchmark_* ...) before each cell runs.
%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 2. 加载数据

In [2]:
# Read the train/test splits stored as feather files under cfg.DATADIR.
train, test = (
    pd.read_feather(cfg.DATADIR + file_name)
    for file_name in ('train.feather', 'test.feather')
)

print('train size: {0}\ntest size: {1}'.format(len(train), len(test)))

# Slim frames holding only id, label, sequence and sequence length; the
# one-hot and ESM sections below build on these instead of the wide frames.
base_columns = ['id', 'isemzyme', 'seq', 'seqlength']
trainset = train[base_columns].reset_index(drop=True)
testset = test[base_columns].reset_index(drop=True)

train size: 469129
test size: 8033


In [3]:
# Preview the first two rows to check the column layout (metadata columns, then f1..f1900).
train.head(n=2)

Unnamed: 0,id,isemzyme,functionCounts,ec_number,ec_specific_level,seq,seqlength,f1,f2,f3,...,f1891,f1892,f1893,f1894,f1895,f1896,f1897,f1898,f1899,f1900
0,A0A023GPI8,False,0,-,0,ADTIVAVELDTYPNTDIGDPSYPHIGIDIKSVRSKKTAKWNMQNGK...,236,0.000115,0.105367,0.037413,...,0.052682,-0.020702,0.081852,0.008791,-0.036727,-0.027552,-0.254128,0.020778,0.140904,0.140635
1,A0A023GPJ0,False,0,-,0,MFGIFSKGEPVSMEGELVQPSSIVINDYEEELHLPLSYWDIKDYKN...,143,0.001688,-0.017209,0.029518,...,0.04057,-0.031992,-0.174783,0.000952,-0.136715,0.042773,0.132279,0.015899,0.15293,-0.003412


## 3. 同源比对

In [4]:
# Load the alignment results of the test sequences against the training set.
# NOTE(review): judging by the cell output, getblast_fasta runs
# `diamond makedb` + `diamond blastp -k 1` and writes to /tmp — confirm in funclib.
res_data = funclib.getblast_fasta(
    trainfasta=cfg.TRAIN_FASTA,
    testfasta=cfg.TEST_FASTA,
)

# Queries without any hit are absent from res_data, so the two counts can differ.
summary_template = ' query samples:{0}\n results samples: {1}'
print(summary_template.format(len(test), len(res_data)))

diamond makedb --in /home/shizhenkun/codebase/BioUniprot/data/benchmark/data/train.fasta -d /tmp/train.dmnd
diamond blastp -d /tmp/train.dmnd  -q  /home/shizhenkun/codebase/BioUniprot/data/benchmark/data/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1
 query samples:8033
 results samples: 6034


In [5]:
# Attach labels to the alignment hits.
# id -> isemzyme lookup for the training set: every hit inherits the label of
# the training sequence it matched (sseqid).
isEmzyme_dict = dict(zip(train.id, train.isemzyme))
res_data['isEmzyme_pred'] = res_data['sseqid'].map(isEmzyme_dict.get)

# id -> isemzyme lookup for the test set, stored next to the prediction.
# (Column name kept as-is, including its spelling, in case other cells read it.)
isEmzyme_dict_test = dict(zip(test.id, test.isemzyme))
res_data['isEmzyme_groudtruth'] = res_data['id'].map(isEmzyme_dict_test.get)

# Keep only the query id and the transferred label.
res_data_sub = res_data[['id', 'isEmzyme_pred']]

# Test-set ids with their true labels; built without in-place mutation so the
# cell stays idempotent and never touches `test` itself.
test_data_sub = test[['id', 'isemzyme']].reset_index(drop=True)

# Left-join so test sequences without any hit stay in the table with a missing
# prediction. validate='many_to_one' makes the merge fail loudly if a query id
# appears more than once in the hit table, which would otherwise silently
# duplicate test rows and distort the metrics.
res = pd.merge(test_data_sub, res_data_sub, on='id', how='left', validate='many_to_one')

print('baslineName', '\t', 'accuracy','\t', 'precision(PPV) \t NPV \t\t', 'recall','\t', 'f1', '\t\t', '\t confusion Matrix')
eva.caculateMetrix(groundtruth=res.isemzyme, predict=res.isEmzyme_pred, baselineName='Blast', type='include_unfind')


baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 	 confusion Matrix
Blast 		0.703598 	0.969148 		0.904980 	0.807488 	0.880963 	 tp: 2890 fp: 92 fn: 290 tn: 2762 up: 399 un: 1600


## 4. 机器学习方法预测
### 4.1 one-hot + 机器学习

In [6]:
MAX_SEQ_LENGTH = 1500  # maximum sequence length kept for encoding

# Bring every sequence to exactly MAX_SEQ_LENGTH characters: longer sequences
# are cut off, shorter ones are right-padded with 'X'.
for frame in (trainset, testset):
    frame['seq'] = frame['seq'].map(
        lambda s: s[0:MAX_SEQ_LENGTH].ljust(MAX_SEQ_LENGTH, 'X')
    )

# Encode both splits.
# NOTE(review): the helper is named dna_onehot but is fed protein sequences here —
# confirm its alphabet covers amino acids and the 'X' padding symbol.
f_train = funclib.dna_onehot(trainset)  # encoded training set
f_test = funclib.dna_onehot(testset)  # encoded test set

In [7]:
# Compute the metrics for the one-hot features.
# Features: every column from position 2 onward of the encoded frames
# (presumably the first two columns are id and label — TODO confirm against dna_onehot).
X_train, X_test = (np.array(encoded.iloc[:, 2:]) for encoded in (f_train, f_test))

# Labels: the boolean isemzyme flag cast to integers.
Y_train, Y_test = (
    np.array(encoded['isemzyme'].astype('int')) for encoded in (f_train, f_test)
)

funclib.run_baseline(X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 auroc 		 auprc 		 confusion Matrix
lr 		0.647330 	0.588599 		0.712006 	0.692372 	0.636282 	 tp: 2478 fp: 1732 fn: 1101 tn: 2722
xg 		0.699863 	0.674850 		0.717665 	0.629785 	0.651539 	 tp: 2254 fp: 1086 fn: 1325 tn: 3368
dt 		0.630400 	0.591317 		0.658214 	0.551830 	0.570892 	 tp: 1975 fp: 1365 fn: 1604 tn: 3089
rf 		0.716171 	0.676831 		0.749312 	0.694607 	0.685604 	 tp: 2486 fp: 1187 fn: 1093 tn: 3267
gbdt 		0.677455 	0.616290 		0.746103 	0.731489 	0.668966 	 tp: 2618 fp: 1630 fn: 961 tn: 2824


### 5. Unirep + 机器学习

In [8]:
# Embedding features are taken positionally: everything from column 7 onward.
# NOTE(review): this assumes the frame starts with exactly seven metadata columns
# (id ... seqlength) followed by f1..f1900 — verify if the feather schema changes.
feature_slice = slice(7, None)
X_train = np.array(train.iloc[:, feature_slice])
X_test = np.array(test.iloc[:, feature_slice])

# Integer 0/1 labels as flat vectors.
Y_train = np.array(train['isemzyme'].astype('int')).flatten()
Y_test = np.array(test['isemzyme'].astype('int')).flatten()

funclib.run_baseline(X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 auroc 		 auprc 		 confusion Matrix
lr 		0.836798 	0.859316 		0.822227 	0.757754 	0.805345 	 tp: 2712 fp: 444 fn: 867 tn: 4010
xg 		0.866302 	0.891284 		0.849752 	0.797150 	0.841593 	 tp: 2853 fp: 348 fn: 726 tn: 4106
dt 		0.796465 	0.794724 		0.797677 	0.732327 	0.762251 	 tp: 2621 fp: 677 fn: 958 tn: 3777
rf 		0.863438 	0.915050 		0.832838 	0.764459 	0.833004 	 tp: 2736 fp: 254 fn: 843 tn: 4200
gbdt 		0.833188 	0.873790 		0.809051 	0.731210 	0.796167 	 tp: 2617 fp: 378 fn: 962 tn: 4076


### 6. ESM + 机器学习
#### 6.1 REP33

In [9]:
# Load the rep33 ESM embedding tables for both splits.
train_esm_33, test_esm_33 = (
    pd.read_feather(cfg.DATADIR + file_name)
    for file_name in ('train_rep33.feather', 'test_rep33.feather')
)

# Left-join on id so every row of the slim base frames is kept, with the
# embedding columns appended after id / isemzyme / seq / seqlength.
train_esm = pd.merge(trainset, train_esm_33, on='id', how='left')
test_esm = pd.merge(testset, test_esm_33, on='id', how='left')

In [None]:
# Skip the four base columns (id, isemzyme, seq, seqlength); the rest are the
# merged embedding columns.
n_base_columns = 4
X_train = np.array(train_esm.iloc[:, n_base_columns:])
X_test = np.array(test_esm.iloc[:, n_base_columns:])

# Integer 0/1 labels as flat vectors.
Y_train = np.array(train_esm['isemzyme'].astype('int')).flatten()
Y_test = np.array(test_esm['isemzyme'].astype('int')).flatten()

funclib.run_baseline(X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 auroc 		 auprc 		 confusion Matrix
lr 		0.903150 	0.920444 		0.890897 	0.856664 	0.887410 	 tp: 3066 fp: 265 fn: 513 tn: 4189
xg 		0.912859 	0.939006 		0.894825 	0.860296 	0.897929 	 tp: 3079 fp: 200 fn: 500 tn: 4254
dt 		0.828956 	0.841860 		0.820300 	0.758592 	0.798060 	 tp: 2715 fp: 510 fn: 864 tn: 3944
rf 		0.916096 	0.960965 		0.887136 	0.846046 	0.899851 	 tp: 3028 fp: 123 fn: 551 tn: 4331


#### 6.2 REP32

In [None]:
# Load the rep32 ESM embedding tables for both splits.
train_esm_32 = pd.read_feather(cfg.DATADIR + 'train_rep32.feather')
test_esm_32 = pd.read_feather(cfg.DATADIR + 'test_rep32.feather')

# Left-join the embeddings onto the slim base frames by id.
# Note: this overwrites train_esm / test_esm from the previous ESM section.
train_esm = pd.merge(trainset, train_esm_32, on='id', how='left')
test_esm = pd.merge(testset, test_esm_32, on='id', how='left')

# Features: everything after the four base columns (id, isemzyme, seq, seqlength).
X_train, X_test = (np.array(merged.iloc[:, 4:]) for merged in (train_esm, test_esm))

# Labels: integer 0/1 flat vectors.
Y_train, Y_test = (
    np.array(merged.isemzyme.astype('int')).flatten()
    for merged in (train_esm, test_esm)
)

funclib.run_baseline(X_train, Y_train, X_test, Y_test)

#### 6.3 REP0

In [None]:
# Load the rep0 ESM embedding tables for both splits.
train_esm_0, test_esm_0 = (
    pd.read_feather(cfg.DATADIR + file_name)
    for file_name in ('train_rep0.feather', 'test_rep0.feather')
)

# Left-join the embeddings onto the slim base frames by id.
# Note: this overwrites train_esm / test_esm from the previous ESM section.
train_esm = trainset.merge(right=train_esm_0, how='left', on='id')
test_esm = testset.merge(right=test_esm_0, how='left', on='id')

# Features: everything after the four base columns (id, isemzyme, seq, seqlength).
embedding_columns = slice(4, None)
X_train = np.array(train_esm.iloc[:, embedding_columns])
X_test = np.array(test_esm.iloc[:, embedding_columns])

# Labels: integer 0/1 flat vectors.
Y_train = np.array(train_esm['isemzyme'].astype('int')).flatten()
Y_test = np.array(test_esm['isemzyme'].astype('int')).flatten()

funclib.run_baseline(X_train, Y_train, X_test, Y_test)