# Task1. 预测是酶还是非酶，使用DeepRE数据

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2021-05-11  

## 任务简介
该任务通过给定蛋白序列，预测该该蛋白是酶还是非酶。本任务所使用的数据集为DeepRE，数据集中有标定好的数据。

参考文章：DEEPre: sequence-based enzyme EC number prediction by deep learning  
URL：https://pubmed.ncbi.nlm.nih.gov/29069344/   
发表日期：2018年3月


## 数据统计
- 数据源DeepRE，共有数据44,336条，其中有酶数据22,168条，非酶数据22,168条。
- ～90%作为训练集，～10%作为测试集, 训练集数据39902条，测试集数据4434条。

### 0. 导入必要的包

In [116]:
import numpy as np
import pandas as pd
import random
import time
import gzip
import re
from Bio import SeqIO
import datetime
import sys
from tqdm import tqdm

from functools import reduce
import matplotlib.pyplot as plt

sys.path.append("../../tools/")
import commontools
import funclib

from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB

from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.faker import Faker
from pyecharts.globals import ThemeType

from jax_unirep import get_reps

### 1. 数据预处理

In [117]:
f = open("./data/deepre/new_data_label_sequence.txt")
line = f.readline()

counter = 0
enzyme = []
while line:
    seq = line.split('>')[1].replace('\n','')
    enzyme.append([seq,1])
    line = f.readline()
    counter +=1
#     if counter ==5:
#         break
f.close()

enzyme = pd.DataFrame(enzyme)
enzyme.columns=['seq', 'label']
enzyme['seqlength'] = enzyme['seq'].map(lambda x : len(x))


f = open("./data/deepre/non_enzyme_new_data_sequence.txt")
line = f.readline()

counter = 0
non_enzyme = []
while line:
    seq = line.split('>')[1].replace('\n','')
    non_enzyme.append([seq,0])
    line = f.readline()
    counter +=1
#     if counter ==5:
#         break
f.close()

non_enzyme = pd.DataFrame(non_enzyme)
non_enzyme.columns=['seq', 'label']
non_enzyme['seqlength'] = non_enzyme['seq'].map(lambda x : len(x))


In [118]:
train = enzyme.iloc[:19951,]
test = enzyme.iloc[19951:-1,]

train = train.append(non_enzyme.iloc[:19951,])
test = test.append(non_enzyme.iloc[19951:-1,])

train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

train['id'] = train.index
test['id'] = test.index

In [119]:
train.to_csv('./data/train_deepec.tsv', sep='\t', index=0)
test.to_csv('./data/test_deepec.tsv', sep='\t',  index=0)

funclib.table2fasta(train, './data/train_deepec.fasta')
funclib.table2fasta(test, './data/test_deepec.fasta')

Write finished
Write finished


### 3. 同源比对

In [120]:
! diamond makedb --in ./data/train_deepec.fasta -d ./data/train_deepec.dmnd     #建库
! diamond blastp -d ./data/train_deepec.dmnd  -q ./data/test_deepec.fasta -o ./data/test_deepec_fasta_results.tsv -b5 -c1 -k 1  #生成比对文件

diamond v2.0.8.146 (C) Max Planck Society for the Advancement of Science
Documentation, support and updates available at http://www.diamondsearch.org

#CPU threads: 32
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Database input file: ./data/train_deepec.fasta
Opening the database file...  [0.406s]
Loading sequences...  [0.059s]
Masking sequences...  [0.052s]
Writing sequences...  [0.008s]
Hashing sequences...  [0.004s]
Loading sequences...  [0s]
Writing trailer...  [0s]
Closing the input file...  [0.001s]
Closing the database file...  [0.026s]
Database hash = 73b25e3f21156a992e5d170dbc7223bd
Processed 39902 sequences, 17343489 letters.
Total time = 0.559s
diamond v2.0.8.146 (C) Max Planck Society for the Advancement of Science
Documentation, support and updates available at http://www.diamondsearch.org

#CPU threads: 32
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Temporary directory: ./data
#Target sequences to report alignment

In [122]:
#读入比对结果
res_data = pd.read_csv('./data/test_deepec_fasta_results.tsv',header=0, sep='\t', names=['id', 'sseqid', 'pident', 'length','mismatch','gapopen','qstart','qend','sstart','send','evalue','bitscore'])

#匹配查询结果
data_match = pd.merge(test,res_data, on=['id'], how='inner')
data_match.head(5)

Unnamed: 0,seq,label,seqlength,id,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,MQRPGPFSTLYGRVLAPLPGRAGGAASGGGGNSWDLPGSHVRLPGR...,1,311,1,19941,45.5,235,125,2,75,306,31,265,3.2800000000000003e-66,209.0
1,MSEAKRRAAEKAIEYVENDMIIGVGTGSTVAYFIDALGHTPKRIKG...,1,215,2,19950,41.4,220,111,5,4,208,11,227,1.19e-38,134.0
2,MSKQPENSFSSDKFFPIKQKLALEAVALVEPGMCVGLGSGSTAREF...,1,242,4,19942,38.4,219,133,2,17,234,9,226,3.71e-41,140.0
3,MSQKPTSHPYKPNMTQDELKALVGQAALPYVEPGSIVGVGTGSTVN...,1,240,5,19942,39.4,226,130,3,16,238,6,227,1.38e-40,139.0
4,MSVTPIEELPSLGDALEDAKRAASYRAVDENLDPAKHRVVGIGSGS...,1,255,6,19941,52.0,244,106,8,8,249,27,261,9.250000000000001e-73,224.0


In [123]:
counter =0
resArray =[]
for i in range(len(res_data)):
    counter+=1
    mn = train[train['id']== res_data['sseqid'][i]]['label'].values
    resArray.append(mn)
    if counter %1000 ==0:
        print(counter)
data_match['q_label']=np.array(resArray) 
data_match.head(3)

1000


Unnamed: 0,seq,label,seqlength,id,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,q_label
0,MQRPGPFSTLYGRVLAPLPGRAGGAASGGGGNSWDLPGSHVRLPGR...,1,311,1,19941,45.5,235,125,2,75,306,31,265,3.2800000000000003e-66,209.0,1
1,MSEAKRRAAEKAIEYVENDMIIGVGTGSTVAYFIDALGHTPKRIKG...,1,215,2,19950,41.4,220,111,5,4,208,11,227,1.19e-38,134.0,1
2,MSKQPENSFSSDKFFPIKQKLALEAVALVEPGMCVGLGSGSTAREF...,1,242,4,19942,38.4,219,133,2,17,234,9,226,3.71e-41,140.0,1


In [124]:
# 计算指标
data_match['iscorrect'] = data_match[['label', 'q_label']].apply(lambda x: x['label'] == x['q_label'], axis=1) #判断EC号是否一致
correct = sum(data_match['iscorrect'])
find  = len(data_match)
total = len(test)
print('Total query records are: {0}'.format(total))
print('Matched records are: {0}'.format(find))
print('Accuracy: {0}({1}/{2})'.format(correct/total, correct, total))
print('Pricision: {0}({1}/{2})'.format(correct/find, correct, find))
print('Recall: {0}({1}/{2})'.format(find/total, find, total))

Total query records are: 4432
Matched records are: 1348
Accuracy: 0.24413357400722022(1082/4432)
Pricision: 0.8026706231454006(1082/1348)
Recall: 0.30415162454873645(1348/4432)


### 4. 传统机器学习

In [136]:
trainset = train[['id', 'label','seq', 'seqlength']].reset_index(drop=True)
testset = test[['id', 'label','seq', 'seqlength']].reset_index(drop=True)

In [137]:
MAX_SEQ_LENGTH = 1000 #定义序列最长的长度
trainset.seq = trainset.seq.map(lambda x : x[0:MAX_SEQ_LENGTH].ljust(MAX_SEQ_LENGTH, 'X'))
testset.seq = testset.seq.map(lambda x : x[0:MAX_SEQ_LENGTH].ljust(MAX_SEQ_LENGTH, 'X'))

In [138]:
f_train = funclib.dna_onehot(trainset) #训练集编码
f_test = funclib.dna_onehot(testset) #测试集编码

train_full = pd.concat([trainset, f_train], axis=1, join='inner' ) #拼合训练集
test_full = pd.concat([testset, f_test], axis=1, join='inner' )    #拼合测试集


X_train = train_full.iloc[:,4:]
X_test = test_full.iloc[:,4:]
Y_train = train_full.label.astype('int')
Y_test = test_full.label.astype('int')

X_train = np.array(X_train)
X_test = np.array(X_test)
Y_train = np.array(Y_train)
Y_test = np.array(Y_test)

In [139]:
methods=['lr', 'xg', 'dt', 'rf', 'gbdt']
print('baslineName', '\t', 'accuracy','\t', 'precision(PPV) \t NPV \t\t', 'recall','\t', 'f1', '\t\t', 'auroc','\t\t', 'auprc', '\t\t confusion Matrix')
for method in methods:
    funclib.evaluate(method, X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 auroc 		 auprc 		 confusion Matrix
lr 		0.602211 	0.582454 		0.634421 	0.722022 	0.644771 	0.641392 	0.587693 	 tp: 1600 fp: 1147 fn: 616 tn: 1069
xg 		0.671706 	0.646854 		0.706681 	0.756318 	0.697316 	0.732797 	0.701976 	 tp: 1676 fp: 915 fn: 540 tn: 1301
dt 		0.555731 	0.555011 		0.556470 	0.562274 	0.558619 	0.555731 	0.530931 	 tp: 1246 fp: 999 fn: 970 tn: 1217
rf 		0.644856 	0.608008 		0.719863 	0.815433 	0.696608 	0.684255 	0.626909 	 tp: 1807 fp: 1165 fn: 409 tn: 1051
gbdt 		0.668998 	0.611691 		0.847081 	0.925542 	0.736577 	0.722414 	0.667868 	 tp: 2051 fp: 1302 fn: 165 tn: 914


### 5. UniRep

In [142]:
X_train = trainset['seq']
Y_train = trainset.label.astype('int')

X_test = testset['seq']
Y_test = testset.label.astype('int')

X_train = np.array(X_train)
X_test = np.array(X_test)
Y_train = np.array(Y_train)
Y_test = np.array(Y_test)

In [143]:
X_train_res = []
counter = 0
for i in tqdm(range(len(X_train))):
    train_h_avg, train_h_final, train_c_final= get_reps(X_train[i])
    X_train_res.append(train_h_final)
    counter += 1
#     if counter %100 == 0:
#         print(counter)

 59%|#####8    | 23363/39902 [4:11:07<2:57:46,  1.55it/s] 


KeyboardInterrupt: 

In [141]:
X_train

array([[ 1, 19,  1, ...,  0,  0,  0],
       [14,  1, 19, ...,  0,  0,  0],
       [14,  9, 16, ...,  0,  0,  0],
       ...,
       [14, 12, 13, ...,  0,  0,  0],
       [14, 19,  4, ...,  0,  0,  0],
       [14, 22,  2, ...,  0,  0,  0]])