# Task2. 预测酶是几功能酶

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2021-08-28  

## 任务简介
该任务通过给定酶序列，预测该酶几功能酶。


## 1. 导入必要的包

In [1]:
import numpy as np
import pandas as pd
import random
import time
import datetime
import sys
import os
from tqdm import tqdm
from functools import reduce

sys.path.append("../../tools/")
import funclib
import ucTools
from sklearn.model_selection import train_test_split

sys.path.append("../../")
import benchmark_common as bcommon
import benchmark_train as btrain
import benchmark_test as btest
import benchmark_config as cfg
import benchmark_evaluation as eva

from pandarallel import pandarallel # 导入pandaralle
pandarallel.initialize() # 初始化该这个b...并行库

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 2. 加载数据

In [2]:
train = pd.read_feather(cfg.DATADIR+'train.feather')
test = pd.read_feather(cfg.DATADIR+'test.feather')

train = train[train.isemzyme]
test = test[test.isemzyme]

print('train size: {0}\ntest size: {1}'.format(len(train), len(test)))

# 取子集
trainset = train[['id', 'functionCounts','seq', 'seqlength']].reset_index(drop=True)
testset = test[['id', 'functionCounts','seq', 'seqlength']].reset_index(drop=True)


train size: 222566
test size: 3579


## 3. 同源比对

In [3]:
# 写fasta文件
bcommon.save_table2fasta(dataset=trainset, file_out=cfg.TEMPDIR+'train.fasta')
bcommon.save_table2fasta(dataset=testset, file_out=cfg.TEMPDIR+'test.fasta')

# 读取比对结果
res_data=funclib.getblast_fasta(trainfasta=cfg.TEMPDIR+'train.fasta',testfasta=cfg.TEMPDIR+'test.fasta')
print(' query samples:{0}\n results samples: {1}'.format(len(test), len(res_data)))

# 给比对结果添加标签
functionCounts = {v: k for k,v in zip(train.functionCounts, train.id )} 
res_data['functionCounts_blast'] = res_data['sseqid'].apply(lambda x: functionCounts.get(x))
# 拼合结果
blast_res = testset.merge(res_data[['id', 'functionCounts_blast','pident' ]], on='id', how='left')
# 计算结果

print('\n\n')

print('%12s'%'baslineName', '\t\t', 'accuracy','\t', 'precision-macro \t', 'recall-macro','\t', 'f1-macro')
eva.caculateMetrix( groundtruth=blast_res.functionCounts, predict=blast_res['functionCounts_blast'].fillna('0').astype('int'),  baselineName='blast', type='multi')

print('\n\n')


Write finished
Write finished
diamond makedb --in /home/shizhenkun/codebase/BioUniprot/data/benchmark/tmp/train.fasta -d /tmp/train.dmnd
diamond blastp -d /tmp/train.dmnd  -q  /home/shizhenkun/codebase/BioUniprot/data/benchmark/tmp/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1
 query samples:3579
 results samples: 3030



 baslineName 		 accuracy 	 precision-macro 	 recall-macro 	 f1-macro
       blast  		0.742666  	0.500151 		0.402750 	0.309967





## 4. 机器学习方法预测
### 4.1 onehot + 机器学习

In [4]:
MAX_SEQ_LENGTH = 1500 #定义序列最长的长度
trainset.seq = trainset.seq.map(lambda x : x[0:MAX_SEQ_LENGTH].ljust(MAX_SEQ_LENGTH, 'X'))
testset.seq = testset.seq.map(lambda x : x[0:MAX_SEQ_LENGTH].ljust(MAX_SEQ_LENGTH, 'X'))

In [5]:
f_train = funclib.dna_onehot(trainset) #训练集编码
f_test = funclib.dna_onehot(testset) #测试集编码

# 计算指标
X_train = np.array(f_train.iloc[:,2:])
X_test = np.array(f_test.iloc[:,2:])
Y_train = np.array(f_train.functionCounts.astype('int'))
Y_test = np.array(f_test.functionCounts.astype('int'))

funclib.run_baseline(X_train, Y_train-1, X_test, Y_test-1, type='multi')

 baslineName 		 accuracy 	 precision-macro 	 recall-macro 	 f1-macro
          lr  		0.884325  	0.582364 		0.220718 	0.206256
          xg  		0.895781  	0.784499 		0.225636 	0.214530
          dt  		0.821459  	0.383141 		0.209000 	0.161651
          rf  		0.896340  	0.810065 		0.209194 	0.191518
        gbdt  		0.862252  	0.117531 		0.276361 	0.127693


### 4.2 Unirep + 机器学习

In [6]:
X_train = np.array(train.iloc[:,7:])
X_test = np.array(test.iloc[:,7:])

Y_train = np.array(f_train.functionCounts.astype('int'))
Y_test = np.array(f_test.functionCounts.astype('int'))

funclib.run_baseline(X_train, Y_train-1, X_test, Y_test-1, type='multi')

 baslineName 		 accuracy 	 precision-macro 	 recall-macro 	 f1-macro
          lr  		0.888796  	0.527859 		0.237323 	0.231206
          xg  		0.893266  	0.590666 		0.226934 	0.217486
          dt  		0.845488  	0.372373 		0.152294 	0.170345
          rf  		0.895781  	0.845938 		0.151723 	0.176762
        gbdt  		0.883766  	0.442790 		0.127752 	0.135389


### 4.3. ESM + 机器学习
#### 4.3.1 REP33

In [4]:
train_esm_33 = pd.read_feather(cfg.DATADIR + 'train_rep33.feather')
test_esm_33 = pd.read_feather(cfg.DATADIR + 'test_rep33.feather')

train_esm = trainset.merge(train_esm_33, on='id', how='left')
test_esm = testset.merge(test_esm_33, on='id', how='left')

X_train = np.array(train_esm.iloc[:,4:])
X_test = np.array(test_esm.iloc[:,4:])

Y_train = np.array(train_esm.isemzyme.astype('int')).flatten()
Y_test = np.array(test_esm.isemzyme.astype('int')).flatten()

funclib.run_baseline(X_train, Y_train-1, X_test, Y_test-1, type='multi')

#### 4.3.2 REP32

In [None]:
train_esm_32 = pd.read_feather(cfg.DATADIR + 'train_rep32.feather')
test_esm_32 = pd.read_feather(cfg.DATADIR + 'test_rep32.feather')

train_esm = trainset.merge(train_esm_32, on='id', how='left')
test_esm = testset.merge(test_esm_32, on='id', how='left')

X_train = np.array(train_esm.iloc[:,4:])
X_test = np.array(test_esm.iloc[:,4:])

Y_train = np.array(train_esm.isemzyme.astype('int')).flatten()
Y_test = np.array(test_esm.isemzyme.astype('int')).flatten()

funclib.run_baseline(X_train, Y_train-1, X_test, Y_test-1, type='multi')

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 auroc 		 auprc 		 confusion Matrix


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 31 concurrent workers.
[Parallel(n_jobs=-2)]: Done   1 out of   1 | elapsed: 33.4min finished


lr 		0.906635 	0.920857 		0.896404 	0.864767 	0.891931 	0.960401 	0.945699 	 tp: 3095 fp: 266 fn: 484 tn: 4188
xg 		0.922196 	0.945954 		0.905528 	0.875384 	0.909302 	0.975555 	0.967653 	 tp: 3133 fp: 179 fn: 446 tn: 4275
dt 		0.844143 	0.855919 		0.836062 	0.781783 	0.817173 	0.838017 	0.766367 	 tp: 2798 fp: 471 fn: 781 tn: 3983
rf 		0.915723 	0.965661 		0.884076 	0.840738 	0.898880 	0.973941 	0.969638 	 tp: 3009 fp: 107 fn: 570 tn: 4347


#### 4.3.3 REP0

In [None]:
train_esm_0 = pd.read_feather(cfg.DATADIR + 'train_rep0.feather')
test_esm_0 = pd.read_feather(cfg.DATADIR + 'test_rep0.feather')

train_esm = trainset.merge(train_esm_0, on='id', how='left')
test_esm = testset.merge(test_esm_0, on='id', how='left')

X_train = np.array(train_esm.iloc[:,4:])
X_test = np.array(test_esm.iloc[:,4:])

Y_train = np.array(train_esm.isemzyme.astype('int')).flatten()
Y_test = np.array(test_esm.isemzyme.astype('int')).flatten()

funclib.run_baseline(X_train, Y_train-1, X_test, Y_test-1, type='multi')

In [7]:
test


Unnamed: 0,id,isemzyme,functionCounts,ec_number,ec_specific_level,seq,seqlength,f1,f2,f3,...,f1891,f1892,f1893,f1894,f1895,f1896,f1897,f1898,f1899,f1900
4454,Q2U0J6,True,1,1.-.-.-,1,MASLTGAAAVLLAFILAYSTALTIYRLFFHPLARFPGPRLAAATKW...,521,0.000223,-0.094108,-0.023778,...,0.206355,0.050707,-0.095354,-0.098204,-0.694465,0.350149,0.056105,0.010750,-0.318422,0.033008
4455,A0A2I6PIZ2,True,1,1.-.-.-,1,MSTPEFKVIIVGGSLAGLTLAHCLLRAGISHIVLERRSVIAPEEGA...,463,0.001232,-0.138543,0.053197,...,0.009380,0.078908,-0.016205,-0.025848,-0.057122,0.325287,-0.016512,0.453471,-0.171643,0.037360
4456,D1MX87,True,1,1.-.-.-,1,MLEGTLQDCWTSISKMQLHWTVLGLLPVLFIAILGPRVRQIWVNYV...,523,0.000639,0.001423,0.113487,...,0.010867,0.034025,-0.595541,-0.041428,-0.089531,0.100896,0.430943,0.007430,-0.318933,-0.023877
4457,A0A075TR33,True,1,1.-.-.-,1,MRLHQSPPRLLVCILSVLQVSAGLSSNCRCMPGDSCWPSLNDWARF...,570,-0.016385,0.490243,0.020230,...,0.031702,0.005262,-0.360198,0.306562,-0.155451,-0.004852,-0.025915,-0.156622,-0.083728,0.006148
4458,Q2U0K0,True,1,1.-.-.-,1,MNSISALFSAGGFQWILLSLSLAFIVVYSLFYLAVGLYNLYFHPLA...,500,0.000235,-0.073429,-0.017966,...,0.025771,-0.000300,-0.006243,-0.054577,-0.267540,0.304406,-0.017257,0.056702,0.135750,0.092589
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8028,Q9NTI2,True,1,7.6.2.1,4,MLNGAGLDKALKMSLPRRSRIRSSVGPVRSSLGYKKAEDEMSRATS...,1188,0.005596,-0.013434,0.016993,...,0.014816,0.005686,-0.035825,0.014562,-0.331922,-0.014392,0.058262,0.010602,-0.037202,0.170220
8029,D4AA47,True,1,7.6.2.1,4,MNTERDSETTFDEDSQPNDEVVPYSDDETEDELEDQGPAVEPEQNR...,1251,0.002222,-0.045842,0.012453,...,0.032280,0.019047,-0.010349,0.023213,-0.163688,-0.053849,0.173443,0.007751,0.089689,0.116427
8030,F1RBC8,True,1,7.6.2.4,4,MSVYSKLPSQLKKPLVKKAVVLLIALYGVKKLSPYFFGKLKGRTSK...,766,0.000588,-0.009237,0.044856,...,0.001776,0.128297,-0.097380,0.047742,-0.041062,-0.357653,0.024547,0.031392,0.021815,0.140106
8031,Q7JUN3,True,1,7.6.2.4,4,MSVLSKYVDRIAEKCEHNGFTKHAFSYALVTSAILALTIKVTIPYV...,730,0.002590,0.152473,0.049695,...,0.002177,-0.179690,0.000880,-0.047634,-0.424014,0.578205,0.004410,0.014373,-0.116012,0.078008
