# Task2. 预测酶是几功能酶

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2021-08-28  

## 任务简介
该任务通过给定酶序列，预测该酶几功能酶。


## 1. 导入必要的包

In [55]:
import numpy as np
import pandas as pd
import random
import time
import datetime
import sys
import os
from tqdm import tqdm
from functools import reduce

sys.path.append("../../tools/")
import funclib
import ucTools
from sklearn.model_selection import train_test_split

sys.path.append("../../")
import benchmark_common as bcommon
import benchmark_train as btrain
import benchmark_test as btest
import benchmark_config as cfg
import benchmark_evaluation as eva

from pandarallel import pandarallel # 导入pandaralle
pandarallel.initialize() # 初始化该这个b...并行库

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 2. 加载数据

In [33]:
train = pd.read_feather(cfg.DATADIR+'train.feather')
test = pd.read_feather(cfg.DATADIR+'test.feather')

train = train[train.isemzyme]
test = test[test.isemzyme]

print('train size: {0}\ntest size: {1}'.format(len(train), len(test)))

# 取子集
trainset = train[['id', 'functionCounts','seq', 'seqlength']].reset_index(drop=True)
testset = test[['id', 'functionCounts','seq', 'seqlength']].reset_index(drop=True)

train.head(3)

train size: 222566
test size: 3579


Unnamed: 0,id,isemzyme,functionCounts,ec_number,ec_specific_level,seq,seqlength,f1,f2,f3,...,f1891,f1892,f1893,f1894,f1895,f1896,f1897,f1898,f1899,f1900
99,A0A023W421,True,2,"3.1.1.96, 3.1.1.-",4,MKLVVQRVTDASVTVDGAVAGRIGPGIMALVGVTHEDTEEDAAYLA...,147,0.005019,0.019584,0.181748,...,-0.013379,0.404682,-0.046457,0.026537,-0.246087,0.334321,-0.02101,0.137102,-0.195889,-0.021511
101,A0A024B7W1,True,6,"3.4.21.91, 3.6.1.15, 3.6.4.13, 2.1.1.56, 2.1.1...",4,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...,3423,7.7e-05,0.019192,0.008966,...,0.030219,0.065129,-0.482739,0.022922,-0.08198,0.036783,0.010605,0.001952,-0.04481,-0.013897
102,A0A024BTN9,True,1,1.4.3.2,4,SCADDRNPLEECFQETDYEEFLEIARNGLKATSNPKHVVIVGAGMS...,497,0.000237,-0.024782,0.161787,...,0.013369,0.055697,-0.039346,0.009738,-0.391294,0.057232,-0.026859,0.15359,-0.236265,0.778679


## 3. 同源比对

In [38]:
# 写fasta文件
bcommon.save_table2fasta(dataset=trainset, file_out=cfg.TEMPDIR+'train.fasta')
bcommon.save_table2fasta(dataset=testset, file_out=cfg.TEMPDIR+'test.fasta')

# 读取比对结果
res_data=funclib.getblast_fasta(trainfasta=cfg.TEMPDIR+'train.fasta',testfasta=cfg.TEMPDIR+'test.fasta')
print(' query samples:{0}\n results samples: {1}'.format(len(test), len(res_data)))

# 给比对结果添加标签
functionCounts = {v: k for k,v in zip(train.functionCounts, train.id )} 
res_data['functionCounts_blast'] = res_data['sseqid'].apply(lambda x: functionCounts.get(x))
# 拼合结果
blast_res = testset.merge(res_data[['id', 'functionCounts_blast','pident' ]], on='id', how='left')
# 计算结果

print('\n\n')

print('%12s'%'baslineName', '\t\t', 'accuracy','\t', 'precision-macro \t', 'recall-macro','\t', 'f1-macro')
eva.caculateMetrix( groundtruth=blast_res.functionCounts, predict=blast_res['functionCounts_blast'].fillna('0').astype('int'),  baselineName='blast', type='multi')

print('\n\n')


Write finished
Write finished
diamond makedb --in /home/shizhenkun/codebase/BioUniprot/data/benchmark/tmp/train.fasta -d /tmp/train.dmnd
diamond blastp -d /tmp/train.dmnd  -q  /home/shizhenkun/codebase/BioUniprot/data/benchmark/tmp/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1
 query samples:3579
 results samples: 3030



 baslineName 		 accuracy 	 precision-macro 	 recall-macro 	 f1-macro
       blast  		0.742666  	0.500151 		0.402750 	0.309967





## 4. 机器学习方法预测
### 4.1 onehot + 机器学习

In [39]:
MAX_SEQ_LENGTH = 1500 #定义序列最长的长度
trainset.seq = trainset.seq.map(lambda x : x[0:MAX_SEQ_LENGTH].ljust(MAX_SEQ_LENGTH, 'X'))
testset.seq = testset.seq.map(lambda x : x[0:MAX_SEQ_LENGTH].ljust(MAX_SEQ_LENGTH, 'X'))

In [45]:
f_train = funclib.dna_onehot(trainset) #训练集编码
f_test = funclib.dna_onehot(testset) #测试集编码

# 计算指标
X_train = np.array(f_train.iloc[:,2:])
X_test = np.array(f_test.iloc[:,2:])
Y_train = np.array(f_train.functionCounts.astype('int'))
Y_test = np.array(f_test.functionCounts.astype('int'))



In [85]:
groundtruth, predict, predictprob = funclib.lrmain(X_train_std=X_train[0:10000], Y_train=Y_train[0:10000], X_test_std=X_test, Y_test=Y_test)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 31 concurrent workers.
[Parallel(n_jobs=-2)]: Done   1 out of   1 | elapsed:   19.1s finished


In [58]:
x_train, x_vali, y_train, y_vali = train_test_split(X_train,np.array(Y_train).ravel(),test_size=0.2,random_state=1)
eval_set = [(x_train, y_train), (x_vali, y_vali)]

In [91]:
eva.caculateMetrix( groundtruth=Y_test, predict=predict, baselineName='k', type='multi')

           k  		0.845767  	0.882534 		0.118553 	0.123938


In [94]:
Y_test.shape

(3579,)

In [88]:
predictprob

array([7.70970933e-01, 1.21178191e-07, 8.32083576e-04, ...,
       4.95316192e-12, 1.16449381e-07, 1.19062537e-05])

In [82]:
model.decision_function(X_test)

array([[ 16.2233299 ,  13.2854452 ,   1.24928584, ...,  -8.78330026,
         -9.86510396,  -9.38334797],
       [ 18.22146167,  15.70596825,   6.92081989, ..., -10.17520219,
        -10.18988632, -10.29083957],
       [ 13.54664224,  17.55313138,   3.02475578, ..., -10.49306758,
        -10.26633932,  -9.73467565],
       ...,
       [ 22.16565754,  10.7347327 ,   4.25487239, ...,  -7.95684368,
        -10.44155889,  -9.93828283],
       [ 23.25878065,  10.44149013,   4.82809425, ...,  -7.94035348,
        -10.120435  ,  -9.66313978],
       [ 17.56767144,  12.46973213,   6.80789297, ...,  -8.71558793,
         -9.70552087,  -9.9162679 ]])

In [81]:
predict,Y_test

(array([1, 1, 2, ..., 1, 1, 1]), array([1, 1, 1, ..., 1, 1, 1]))

In [51]:
def lrmain(X_train_std, Y_train, X_test_std, Y_test):
    logreg = linear_model.LogisticRegression(solver = 'lbfgs', n_jobs=-2)
    logreg.fit(X_train_std, Y_train)
    predict = logreg.predict(X_test_std)
    lrpredpro = logreg.predict_proba(X_test_std)
    groundtruth = Y_test
    predictprob = lrpredpro[:,1]
    return groundtruth, predict, predictprob

In [None]:


model = XGBClassifier(
                        min_child_weight=6, 
                        max_depth=6, 
                        objective='multi:softmax', 
                        num_class=10, 
                        use_label_encoder=False,
                        n_estimators=cfg.TRAIN_HOWMANY_ENZYME_LEARNING_STEPS
                    )
print("-" * 100)
print("几功能酶xgboost模型：", model)
model.fit(x_train, y_train, eval_metric="mlogloss", eval_set=eval_set, verbose=True)
# # 打印重要性指数
bcommon.importance_features_top(model, x_train, topN=50)
# 保存模型
joblib.dump(model, model_file)
return model

In [50]:
funclib.run_baseline(X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 auroc 		 auprc 		 confusion Matrix


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

### 4.2 Unirep + 机器学习

In [10]:
X_train = train.iloc[:,12:]
X_test = test.iloc[:,12:]

Y_train = train.iloc[:,3].astype('int')
Y_test = test.iloc[:,3].astype('int')

X_train = np.array(X_train)
X_test = np.array(X_test)
Y_train = np.array(Y_train).flatten()
Y_test = np.array(Y_test).flatten()

In [11]:
funclib.run_baseline(X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 auroc 		 auprc 		 confusion Matrix
lr 		0.906306 	0.268468 		0.924024 	0.089382 	0.134113 	0.658508 	0.161993 	 tp: 149 fp: 406 fn: 1518 tn: 18462
xg 		0.922669 	0.613181 		0.928019 	0.128374 	0.212302 	0.730793 	0.294830 	 tp: 214 fp: 135 fn: 1453 tn: 18733
dt 		0.867835 	0.204738 		0.930498 	0.217756 	0.211047 	0.571513 	0.108084 	 tp: 363 fp: 1410 fn: 1304 tn: 17458
rf 		0.923789 	0.947368 		0.923657 	0.064787 	0.121280 	0.693704 	0.297006 	 tp: 108 fp: 6 fn: 1559 tn: 18862
gbdt 		0.918578 	0.474747 		0.920728 	0.028194 	0.053228 	0.649263 	0.171179 	 tp: 47 fp: 52 fn: 1620 tn: 18816
