# Task2. 预测酶是几功能酶

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2021-08-28  

## 任务简介
该任务通过给定酶序列，预测该酶几功能酶。


## 1. 导入必要的包

In [1]:
import numpy as np
import pandas as pd
import random
import time
import datetime
import sys
import os
from tqdm import tqdm
from functools import reduce

sys.path.append("../../tools/")
import commontools
import funclib
import ucTools

sys.path.append("../../")
import benchmark_common as bcommon
import benchmark_train as btrain
import benchmark_test as btest
import benchmark_config as cfg
import benchmark_evaluation as eva

from pandarallel import pandarallel # 导入pandaralle
pandarallel.initialize() # 初始化该这个b...并行库

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 2. 加载数据

In [33]:
train = pd.read_feather(cfg.DATADIR+'train.feather')
test = pd.read_feather(cfg.DATADIR+'test.feather')

train = train[train.isemzyme]
test = test[test.isemzyme]

print('train size: {0}\ntest size: {1}'.format(len(train), len(test)))

# 取子集
trainset = train[['id', 'functionCounts','seq', 'seqlength']].reset_index(drop=True)
testset = test[['id', 'functionCounts','seq', 'seqlength']].reset_index(drop=True)

train.head(3)

train size: 222566
test size: 3579


Unnamed: 0,id,isemzyme,functionCounts,ec_number,ec_specific_level,seq,seqlength,f1,f2,f3,...,f1891,f1892,f1893,f1894,f1895,f1896,f1897,f1898,f1899,f1900
99,A0A023W421,True,2,"3.1.1.96, 3.1.1.-",4,MKLVVQRVTDASVTVDGAVAGRIGPGIMALVGVTHEDTEEDAAYLA...,147,0.005019,0.019584,0.181748,...,-0.013379,0.404682,-0.046457,0.026537,-0.246087,0.334321,-0.02101,0.137102,-0.195889,-0.021511
101,A0A024B7W1,True,6,"3.4.21.91, 3.6.1.15, 3.6.4.13, 2.1.1.56, 2.1.1...",4,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...,3423,7.7e-05,0.019192,0.008966,...,0.030219,0.065129,-0.482739,0.022922,-0.08198,0.036783,0.010605,0.001952,-0.04481,-0.013897
102,A0A024BTN9,True,1,1.4.3.2,4,SCADDRNPLEECFQETDYEEFLEIARNGLKATSNPKHVVIVGAGMS...,497,0.000237,-0.024782,0.161787,...,0.013369,0.055697,-0.039346,0.009738,-0.391294,0.057232,-0.026859,0.15359,-0.236265,0.778679


## 3. 同源比对

In [38]:
# 写fasta文件
bcommon.save_table2fasta(dataset=trainset, file_out=cfg.TEMPDIR+'train.fasta')
bcommon.save_table2fasta(dataset=testset, file_out=cfg.TEMPDIR+'test.fasta')

# 读取比对结果
res_data=funclib.getblast_fasta(trainfasta=cfg.TEMPDIR+'train.fasta',testfasta=cfg.TEMPDIR+'test.fasta')
print(' query samples:{0}\n results samples: {1}'.format(len(test), len(res_data)))

# 给比对结果添加标签
functionCounts = {v: k for k,v in zip(train.functionCounts, train.id )} 
res_data['functionCounts_blast'] = res_data['sseqid'].apply(lambda x: functionCounts.get(x))
# 拼合结果
blast_res = testset.merge(res_data[['id', 'functionCounts_blast','pident' ]], on='id', how='left')
# 计算结果

print('\n\n')

print('%12s'%'baslineName', '\t\t', 'accuracy','\t', 'precision-macro \t', 'recall-macro','\t', 'f1-macro')
eva.caculateMetrix( groundtruth=blast_res.functionCounts, predict=blast_res['functionCounts_blast'].fillna('0').astype('int'),  baselineName='blast', type='multi')

print('\n\n')


Write finished
Write finished
diamond makedb --in /home/shizhenkun/codebase/BioUniprot/data/benchmark/tmp/train.fasta -d /tmp/train.dmnd
diamond blastp -d /tmp/train.dmnd  -q  /home/shizhenkun/codebase/BioUniprot/data/benchmark/tmp/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1
 query samples:3579
 results samples: 3030



 baslineName 		 accuracy 	 precision-macro 	 recall-macro 	 f1-macro
       blast  		0.742666  	0.500151 		0.402750 	0.309967





## 4. 机器学习方法预测
### 4.1 onehot + 机器学习

In [39]:
MAX_SEQ_LENGTH = 1500 #定义序列最长的长度
trainset.seq = trainset.seq.map(lambda x : x[0:MAX_SEQ_LENGTH].ljust(MAX_SEQ_LENGTH, 'X'))
testset.seq = testset.seq.map(lambda x : x[0:MAX_SEQ_LENGTH].ljust(MAX_SEQ_LENGTH, 'X'))

In [45]:
f_train = funclib.dna_onehot(trainset) #训练集编码
f_test = funclib.dna_onehot(testset) #测试集编码

# 计算指标
X_train = np.array(f_train.iloc[:,2:])
X_test = np.array(f_test.iloc[:,2:])
Y_train = np.array(f_train.functionCounts.astype('int'))
Y_test = np.array(f_test.functionCounts.astype('int'))



In [None]:
X_test

In [9]:
funclib.run_baseline(X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 auroc 		 auprc 		 confusion Matrix
lr 		0.910640 	0.271739 		0.922299 	0.059988 	0.098280 	0.551935 	0.124858 	 tp: 100 fp: 268 fn: 1567 tn: 18600
xg 		0.918578 	0.490909 		0.924383 	0.080984 	0.139032 	0.626453 	0.193860 	 tp: 135 fp: 140 fn: 1532 tn: 18728
dt 		0.851668 	0.173378 		0.929386 	0.219556 	0.193753 	0.563515 	0.101421 	 tp: 366 fp: 1745 fn: 1301 tn: 17123
rf 		0.924032 	0.957265 		0.923842 	0.067187 	0.125561 	0.668622 	0.269895 	 tp: 112 fp: 5 fn: 1555 tn: 18863
gbdt 		0.917507 	0.398496 		0.920890 	0.031794 	0.058889 	0.601392 	0.161334 	 tp: 53 fp: 80 fn: 1614 tn: 18788


### 4.2 Unirep + 机器学习

In [10]:
X_train = train.iloc[:,12:]
X_test = test.iloc[:,12:]

Y_train = train.iloc[:,3].astype('int')
Y_test = test.iloc[:,3].astype('int')

X_train = np.array(X_train)
X_test = np.array(X_test)
Y_train = np.array(Y_train).flatten()
Y_test = np.array(Y_test).flatten()

In [11]:
funclib.run_baseline(X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 auroc 		 auprc 		 confusion Matrix
lr 		0.906306 	0.268468 		0.924024 	0.089382 	0.134113 	0.658508 	0.161993 	 tp: 149 fp: 406 fn: 1518 tn: 18462
xg 		0.922669 	0.613181 		0.928019 	0.128374 	0.212302 	0.730793 	0.294830 	 tp: 214 fp: 135 fn: 1453 tn: 18733
dt 		0.867835 	0.204738 		0.930498 	0.217756 	0.211047 	0.571513 	0.108084 	 tp: 363 fp: 1410 fn: 1304 tn: 17458
rf 		0.923789 	0.947368 		0.923657 	0.064787 	0.121280 	0.693704 	0.297006 	 tp: 108 fp: 6 fn: 1559 tn: 18862
gbdt 		0.918578 	0.474747 		0.920728 	0.028194 	0.053228 	0.649263 	0.171179 	 tp: 47 fp: 52 fn: 1620 tn: 18816
