# Task1. 预测是酶还是非酶

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2021-04-27  

## 任务简介
该任务通过给定蛋白序列，预测该蛋白是酶还是非酶。本任务所使用的数据集为Sprot（Swiss-Prot），模型从该数据集中进行学习：数据中有EC号的蛋白被认为是酶，没有EC号的被认为是非酶。


## 数据统计
- 数据源Sprot，共有数据564,638条，其中有EC号的数据270,236条，无EC号的数据294,402条。
- 将数据集中的所有数据按照时间排序，～90%作为训练集，～10%作为测试集，找到对应时间节点为2010年2月9日。
- 以2010年2月10日为时间节点，该日期及之前的数据为训练集，之后的数据为测试集，具体数据集统计如下： 


 |     name     | 训练集 |             测试集                |
| ------------ | --- | ------------------------------- |
| 数据量 |  510490（510490/564638≈90.41%） | 54148（54148/564638≈9.59%） |

## 实验方法

- 同源比对：使用训练集建立比对库，然后将测试集与之比对，取最优的比对结果，比对结果的（酶/非酶）属性当作测试集的测试结果
- 传统机器学习方法
- 深度学习方法


## 实验结果

|Methods   | Accuracy                        |             Precision           |           Recall               |
| ---------| ------------------------------- | ------------------------------- |--------------------------------|
| 同源比对  |  0.6243628573539189(33808/54148) | 0.8220590380781014(33808/41126) |0.7595109699342543(41126/54148)|
| LR.      |  32                             |                                 |                                |

In [35]:
! wc -l ../../data/sprot_without_ec.tsv

294402 ../../data/sprot_without_ec.tsv


In [34]:
Accuracy: 0.6243628573539189(33808/54148)
Pricision: 0.8220590380781014(33808/41126)
Recall: 0.7595109699342543(41126/54148)

564438

In [31]:
# Share of all records that ends up in the test split (test size / total size)
54148/564638

0.09589861114554812

In [28]:
# Sanity check: the two splits together should account for every record
len(train) + len(test)

564638

In [29]:
# Total number of records loaded from the full table
len(sprot)

564638

## 1. 导入必要的包、定义公共函数

In [1]:
import numpy as np
import pandas as pd
import random
import time
import gzip
import re
from Bio import SeqIO
import datetime
import sys

from functools import reduce

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn import metrics
from sklearn import linear_model, datasets
from sklearn.svm import SVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost
from xgboost import XGBClassifier

import matplotlib.pyplot as plt

import random

def table2fasta(table, file_out):
    """Write the sequences of ``table`` to ``file_out`` in FASTA format.

    Each row becomes two lines: a ``>`` header carrying the row's ``id``
    value, followed by a line with the row's ``seq`` value.

    Args:
        table: DataFrame-like object exposing ``iterrows()`` whose rows have
            ``id`` and ``seq`` entries.
        file_out: Path of the FASTA file to create (overwritten if present).
    """
    # The context manager closes the handle even if a row raises midway,
    # which a bare open()/close() pair does not guarantee.
    with open(file_out, 'w') as fasta:
        for _, row in table.iterrows():
            fasta.write('>{0}\n{1}\n'.format(row['id'], row['seq']))
    print('Write finished')

# Amino-acid letter -> integer code. 'X' (unknown residue, code 0) is also the
# character used to pad sequences so that they all share one length.
prot_dict = dict(
                    A=1,  R=2,  N=3,  D=4,  C=5,  E=6,  Q=7,  G=8,  H=9,  O=10, I=11, L=12,
                    K=13, M=14, F=15, P=16, U=17, S=18, T=19, W=20, Y=21, V=22, B=23, Z=24, X=0
                )

# Integer encoding of protein sequences. The function name is historical: the
# result holds one integer code per residue, not one-hot vectors.
def dna_onehot(Xdna):
    """Encode every sequence in ``Xdna['seq']`` as a row of integer codes.

    Args:
        Xdna: DataFrame with a ``seq`` column of residue strings.

    Returns:
        A new DataFrame with a default integer index; row ``i`` holds the
        ``prot_dict`` codes of the ``i``-th sequence, one column per residue
        position. Residues missing from ``prot_dict`` are encoded as the
        unknown code (``prot_dict['X']``) so the table stays purely numeric.
    """
    unknown_code = prot_dict['X']
    # Iterating the column directly avoids building a Series per row, which
    # is what iterrows() does.
    encoded_rows = [
        [prot_dict.get(residue, unknown_code) for residue in seq]
        for seq in Xdna['seq']
    ]
    return pd.DataFrame(encoded_rows)

## 2. 加载数据

In [2]:
# Column names assigned to the TSV on load. The spellings 'isemzyme' and
# 'date_integraged' are kept as-is because later cells address the columns
# by exactly these names.
table_head = [
    'id',
    'isemzyme',
    'isMultiFunctional',
    'functionCounts',
    'ec_number',
    'date_integraged',
    'date_sequence_update',
    'date_annotation_update',
    'seq',
    'seqlength',
]

# Read the full table, then convert the three date columns to datetimes
sprot = pd.read_csv('../../data/sprot_full.tsv', sep='\t', names=table_head)
for date_col in ('date_integraged', 'date_sequence_update', 'date_annotation_update'):
    sprot[date_col] = pd.to_datetime(sprot[date_col])

sprot.head(5)

Unnamed: 0,id,isemzyme,isMultiFunctional,functionCounts,ec_number,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength
0,P02802,False,False,1,-,1986-07-21,1986-07-21,2021-04-07,MDPNCSCSTGGSCTCTSSCACKNCKCTSCKKSCCSCCPVGCSKCAQ...,61
1,P02732,False,False,1,-,1986-07-21,1986-07-21,2019-12-11,AATAATAATAATAATAATAATAATAATAATA,31
2,P02733,False,False,1,-,1986-07-21,1986-07-21,2019-12-11,DTASDAAAAAALTAABAAAAAKLTABBAAAAAAATAA,37
3,P02734,False,False,1,-,1986-07-21,1987-08-13,2019-12-11,MRITEANPDPDAKAVPAAAAPSTASDAAAAAAATAATAAAAAAATA...,85
4,P00484,True,False,1,2.3.1.28,1986-07-21,1988-08-01,2021-04-07,MNYTKFDVKNWVRREHFEFYRHRLPCGFSLTSKIDITTLKKSLDDS...,213


## 3. 划分训练集、测试集

In [22]:
# Cut-off of the chronological split: records integrated at or before
# 2010-02-10 00:00 go to training, strictly later records go to testing.
thres = datetime.datetime(2010, 2, 10, 0, 0)

# Training set, ordered by integration date
train = sprot.loc[sprot['date_integraged'] <= thres].sort_values(by='date_integraged')
# Test set, ordered by integration date
test = sprot.loc[sprot['date_integraged'] > thres].sort_values(by='date_integraged')

# Optional exports, left disabled: uncomment to write the TSV / FASTA files
# train.to_csv('./data/train.tsv', sep='\t', columns=['id', 'isemzyme','seq'], index=0)
# test.to_csv('./data/test.tsv', sep='\t', columns=['id', 'isemzyme','seq'], index=0)

# table2fasta(train, './data/train.fasta')
# table2fasta(test, './data/test.fasta')

## 4. 二分类
### 4.1 同源比对

In [None]:
! diamond makedb --in ./data/train.fasta -d ./data/train.dmnd     #建库
! diamond blastp -d ./data/train.dmnd  -q ./data/test.fasta -o ./data/test_fasta_results.tsv -b5 -c1 -k 1   #生成比对文件

In [7]:
# Names given to the 12 tab-separated fields of the alignment result file
alignment_columns = [
    'id', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen',
    'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore',
]

# Load the alignment results
res_data = pd.read_csv('./data/test_fasta_results.tsv', sep='\t', names=alignment_columns)

# Inner join on the query id: only test records that obtained a hit remain
data_match = test.merge(res_data, on=['id'], how='inner')

In [None]:
# Attach the EC number of each hit (the matched training sequence).
# The lookup is keyed on data_match's own 'sseqid' column, so every row gets
# the EC number of *its* hit regardless of how the merge ordered the rows;
# filling the column positionally from res_data relied on both tables sharing
# the same row order. A dict lookup also replaces one full scan of `train`
# per hit.
ec_by_train_id = dict(zip(train['id'], train['ec_number']))
data_match['sresults_ec'] = data_match['sseqid'].map(ec_by_train_id)
data_match.head(3)

In [14]:
# Compute the metrics of the alignment baseline.
# NOTE(review): a hit counts as correct only when its EC number equals the
# query's EC number exactly; for the enzyme / non-enzyme task a comparison of
# the binary label may be what is intended — confirm.
# NOTE(review): the value printed as "Recall" is matched/total, i.e. the share
# of queries that obtained any hit.
data_match['iscorrect'] = data_match['ec_number'] == data_match['sresults_ec']  # vectorised row-wise equality
correct = int(data_match['iscorrect'].sum())
find = len(data_match)
total = len(test)
print('Total query records are: {0}'.format(total))
print('Matched records are: {0}'.format(find))
print('Accuracy: {0}({1}/{2})'.format(correct/total, correct, total))
print('Precision: {0}({1}/{2})'.format(correct/find, correct, find))
print('Recall: {0}({1}/{2})'.format(find/total, find, total))

Total query records are: 54148
Matched records are: 41126
Accuracy: 0.6243628573539189(33808/54148)
Pricision: 0.8220590380781014(33808/41126)
Recall: 0.7595109699342543(41126/54148)


In [47]:
# Summary statistics of the sequence lengths
sprot['seqlength'].describe()

count    564638.000000
mean        360.442643
std         336.460236
min           2.000000
25%         169.000000
50%         294.000000
75%         449.000000
max       35213.000000
Name: seqlength, dtype: float64

### 4.2 使用机器学习方法

In [6]:
# Columns carried over into the machine-learning tables
feature_columns = ['id', 'isemzyme', 'seq', 'seqlength']
trainset = train[feature_columns].reset_index(drop=True)
testset = test[feature_columns].reset_index(drop=True)

MAX_SEQ_LENGTH = 500  # every sequence is cut or padded to this length


def _fit_to_max_length(seq):
    """Truncate ``seq`` to MAX_SEQ_LENGTH and right-pad shorter ones with 'X'."""
    return seq[:MAX_SEQ_LENGTH].ljust(MAX_SEQ_LENGTH, 'X')


trainset['seq'] = trainset['seq'].map(_fit_to_max_length)
testset['seq'] = testset['seq'].map(_fit_to_max_length)

In [7]:
# Training set: encode the sequences, then append the codes column-wise
f_train = dna_onehot(trainset)
train_full = pd.concat([trainset, f_train], join='inner', axis=1)

# Test set: same two steps
f_test = dna_onehot(testset)
test_full = pd.concat([testset, f_test], join='inner', axis=1)

In [8]:
# Columns 0-3 are id, isemzyme, seq and seqlength; everything from column 4
# onwards is the per-position residue encoding used as model input.
X_train = np.array(train_full.iloc[:, 4:])
X_test = np.array(test_full.iloc[:, 4:])

# Labels: the boolean enzyme flag cast to integers
Y_train = np.array(train_full['isemzyme'].astype('int'))
Y_test = np.array(test_full['isemzyme'].astype('int'))

In [9]:
import function
    
    

In [None]:
# Baseline identifiers passed one by one to function.evaluate
methods = ['lr', 'xg', 'dt', 'rf', 'gbdt']

# Header printed once before the per-method evaluation calls
header_fields = [
    'baslineName', '\t', 'accuracy', '\t', 'precision(PPV) \t NPV \t\t',
    'recall', '\t', 'f1', '\t\t', 'auroc', '\t\t', 'auprc', '\t\t confusion Matrix',
]
print(*header_fields)

for method in methods:
    function.evaluate(method, X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 auroc 		 auprc 		 confusion Matrix


In [None]:
# NOTE(review): X_train_std / X_test_std are not defined in any visible cell,
# so this call raised NameError on a fresh run. They are assumed to be the
# standardized feature matrices (StandardScaler is imported at the top of the
# notebook but not used elsewhere) — confirm this matches the original run.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)  # statistics come from the training data only
X_test_std = scaler.transform(X_test)        # test data reuses the training statistics
function.xgmain(X_train_std, Y_train, X_test_std, Y_test)

In [1]:
# Print every entry of the 'In' list found in the current namespace
# (presumably IPython's input history — one entry per executed cell).
for line in locals()['In']:
    print(line)


for line in locals()['In']:
    print(line)
