# Task. Benchmark 任务

* 1. 预测是否是酶
* 2. 预测是几功能酶
* 3. 分别是什么功能

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2021-07-22  

## 任务简介
该任务通过给定的蛋白序列，预测该蛋白是酶还是非酶；如果是酶，则进一步预测其具有几个功能，以及这些功能分别是什么。本任务所使用的数据集为 Sprot（UniProtKB/Swiss-Prot），模型先在该数据集上进行学习，然后完成上述任务。


## 数据统计
- 训练数据取自2018年2月份的快照数据，该数据包括序列566,825条，按照核苷酸序列去除重复后还剩余469,129条唯一的数据。在这些数据中，没有酶号的数据为246,562条，有酶号的数据为222,567条。训练数据涉及4855个独立的EC编号，其中4734个具有4级完整的EC号，164个三级EC号，37个二级EC号，以及6个一级EC号。
- 训练集涉及的多功能酶与不同EC级别对应的序列数量如下

| 酶功能数 | 序列条数   | EC精确级别 | 序列条数   |
|-----:|-------:|-------:|-------:|
| 1    | 209729 | 1      | 191128 |
| 2    | 10762  | 2      | 23720  |
| 3    | 934    | 3      | 6097   |
| 4    | 652    | 4      | 1622   |
| 5    | 199    |        |        |
| 6    | 101    |        |        |
| 7    | 124    |        |        |
| 8    | 18     |        |        |
| 9    | 14     |        |        |
| 10   | 25     |        |        |
|      | 222558 |        | 222567 （因为多功能酶有多个EC编号，故序列条数大于左边）|



- 测试集采用2020年6月份快照数据，构建方法为  <font color=red size=4>筛选</font>  在2020年6月份快照数据中出现的序列 <font color=red size=6>且</font> 未在2018年2月份快照数据中出现的序列。最终序列条数为8033条，涉及1036个独立的EC编号，其中有728个EC编号在训练数据中出现，308个为新增EC号；这些新增EC号中，属于EC迁移的有188个，纯新增的有120个。

 


## 1. 导入必要的包

In [1]:
import numpy as np
import pandas as pd
import random
from random import randint
import time
import gzip
import re
import datetime
import sys
import os
from tqdm import tqdm

from functools import reduce
import matplotlib.pyplot as plt
import joblib

from pandarallel import pandarallel

sys.path.append("../../tools/")
import commontools
import funclib
from pyecharts.globals import CurrentConfig, OnlineHostType
CurrentConfig.ONLINE_HOST = OnlineHostType.NOTEBOOK_HOST
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB

from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.faker import Faker
from pyecharts.globals import ThemeType

%load_ext autoreload
%autoreload 2

## 2. 定义公共数据

In [2]:
# Directories for the benchmark inputs, the result files and the saved models.
DATADIR = '/home/shizhenkun/codebase/BioUniprot/data/benchmark/data/'
RESULTSDIR = '/home/shizhenkun/codebase/BioUniprot/data/benchmark/results/'
MODELDIR = '/home/shizhenkun/codebase/BioUniprot/data/benchmark/model/'

# FASTA files handed to the sequence-alignment step.
train_fasta = DATADIR + 'train.fasta'
test_fasta = DATADIR + 'test.fasta'

# Label dictionary (presumably EC number -> label; verify against the file)
# and its inverse mapping.
# NOTE(review): allow_pickle=True unpickles arbitrary objects, so this file
# must come from a trusted source.
dict_ec_label = np.load(DATADIR + 'ec_label_dict.npy', allow_pickle=True).item()
dict_label_ec = {value: key for key, value in dict_ec_label.items()}

# Training and test tables.
train = pd.read_feather(DATADIR + 'train.feather')
test = pd.read_feather(DATADIR + 'test.feather')

In [3]:
def get_blast_pred(train, test, blast_res):
    """Annotate alignment hits with predicted and ground-truth enzyme flags.

    Two columns are written into ``blast_res`` (the frame is modified in
    place and also returned):

    * ``isEmzyme_pred``       - ``isemzyme`` value of the training row whose
      ``id`` equals the hit's ``sseqid``.
    * ``isEmzyme_groudtruth`` - ``isemzyme`` value of the test row whose
      ``id`` equals the hit's ``id``.

    Ids that are not found in the corresponding table yield ``None``.

    Parameters
    ----------
    train, test : objects exposing ``id`` and ``isemzyme`` columns
    blast_res : DataFrame with ``sseqid`` and ``id`` columns

    Returns
    -------
    The same ``blast_res`` object that was passed in.
    """
    annotations = (
        ('isEmzyme_pred', 'sseqid', train),
        ('isEmzyme_groudtruth', 'id', test),
    )
    for target_column, key_column, source in annotations:
        # id -> isemzyme lookup; a later duplicate id overrides an earlier one.
        flag_by_id = dict(zip(source.id, source.isemzyme))
        blast_res[target_column] = blast_res[key_column].apply(flag_by_id.get)
    return blast_res

## 3. 酶非酶预测

In [4]:
# Align the test sequences against the training sequences, then attach the
# predicted / ground-truth enzyme flags to every hit.
blast_res = get_blast_pred(
    train,
    test,
    funclib.getblast_fasta(train_fasta, test_fasta),
)

diamond makedb --in /home/shizhenkun/codebase/BioUniprot/data/benchmark/data/train.fasta -d /tmp/train.dmnd
diamond blastp -d /tmp/train.dmnd  -q  /home/shizhenkun/codebase/BioUniprot/data/benchmark/data/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1


In [5]:
# Show the annotated alignment table.
blast_res

Unnamed: 0,id,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,isEmzyme_pred,isEmzyme_groudtruth
0,P02462,P02462,99.8,1669,4,0,1,1669,1,1669,6.370000e-149,503.0,False,False
1,P00939,P00939,100.0,248,0,0,2,249,1,248,4.870000e-182,501.0,True,True
2,P02340,P02340,100.0,387,0,0,4,390,1,387,5.320000e-286,776.0,False,False
3,P01848,P01848,100.0,140,0,0,1,140,3,142,4.280000e-96,275.0,False,False
4,P01733,P01733,100.0,114,0,0,1,114,1,114,1.840000e-82,239.0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6030,Q6YZW2,Q9LQI9,64.5,403,120,9,51,446,32,418,5.480000e-168,481.0,False,False
6031,A0A061ACU2,M9MSG8,26.1,2702,1523,77,13,2435,12,2518,2.870000e-196,665.0,False,False
6032,Q6NRV0,Q9BWF2,56.4,470,195,5,1,463,1,467,1.490000e-144,424.0,True,True
6033,C5DLH0,Q6CNF8,63.1,385,137,3,1,381,1,384,3.490000e-180,508.0,True,True


In [6]:
# 获取比对结果
res_data_sub = blast_res[['id','isEmzyme_pred']]

#选取测试集的ID与是否是酶2列
test_data_sub=test[['id', 'isemzyme']]


#拼合比对结果到测试集
final_pred_table = pd.merge(test_data_sub,res_data_sub, on='id', how='left')
final_pred_table = final_pred_table.rename(columns={'isEmzyme_pred':'isEmzyme_pred_blast'})

In [7]:
def caculate_isEnzymeBlastMetrics(pred_table, fill_value=False):
    """Print the metrics of the alignment-based enzyme / non-enzyme prediction.

    Parameters
    ----------
    pred_table : pandas.DataFrame
        Needs the columns ``isemzyme`` (ground truth) and
        ``isEmzyme_pred_blast`` (prediction). The prediction may contain
        missing values for sequences that produced no alignment hit.
        The table is NOT modified.
    fill_value : bool, default False
        Prediction assumed for rows with a missing prediction. ``False``
        treats them as non-enzymes (the previous hard-coded behaviour);
        ``True`` treats them as enzymes.

    Returns
    -------
    None. A header line is printed and the confusion-matrix counts are handed
    to ``funclib.caculateMetrix_1`` under the baseline name '同源比对'.
    """
    # Derive boolean Series instead of calling fillna(inplace=True) on the
    # argument: the caller's frame stays untouched and only the two columns
    # that matter are filled. Missing ground truth counts as False, as before.
    actual = pred_table['isemzyme'].fillna(False).astype(bool)
    predicted = pred_table['isEmzyme_pred_blast'].fillna(fill_value).astype(bool)

    # Confusion-matrix counts.
    tp = int((actual & predicted).sum())
    fp = int((~actual & predicted).sum())
    tn = int((~actual & ~predicted).sum())
    fn = int((actual & ~predicted).sum())

    print('baslineName', '\t', 'accuracy','\t', 'precision(PPV) \t NPV \t\t', 'recall','\t', 'f1', '\t\t', 'auroc','\t\t', 'auprc', '\t\t confusion Matrix')
    funclib.caculateMetrix_1('同源比对',tp, fp, tn,fn)

In [8]:
# Print the metrics of the alignment baseline for the merged prediction table.
caculate_isEnzymeBlastMetrics(final_pred_table)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 auroc 		 auprc 		 confusion Matrix
同源比对 	 0.902776  	0.969148 		 0.863591  	0.807488	 0.880963		 	 	 	 tp:2890  fp:92  fn:689  tn:4362


In [67]:
# In the displayed train table the first four columns are id, isemzyme,
# functionCounts and ec_number, so position 1 is the enzyme flag and
# everything from position 4 onward is a feature column.
# (assumes test shares this column layout - TODO confirm)
X_train = np.array(train.iloc[:, 4:])
X_test = np.array(test.iloc[:, 4:])

# Enzyme flag as a flat integer vector.
Y_train = np.array(train.iloc[:, 1].astype('int')).flatten()
Y_test = np.array(test.iloc[:, 1].astype('int')).flatten()

# Run the baselines from funclib on these arrays.
funclib.run_baseline(X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 auroc 		 auprc 		 confusion Matrix
lr 		0.837296 	0.860406 		0.822372 	0.757754 	0.805824 	0.915852 	0.897813 	 tp: 2712 fp: 440 fn: 867 tn: 4014
xg 		0.870908 	0.901199 		0.851182 	0.797709 	0.846302 	0.941582 	0.929918 	 tp: 2855 fp: 313 fn: 724 tn: 4141
dt 		0.794846 	0.792842 		0.796242 	0.730372 	0.760326 	0.788513 	0.699199 	 tp: 2614 fp: 683 fn: 965 tn: 3771
rf 		0.863438 	0.913116 		0.833764 	0.766415 	0.833359 	0.941952 	0.928926 	 tp: 2743 fp: 261 fn: 836 tn: 4193
gbdt 		0.834184 	0.873877 		0.810461 	0.733725 	0.797691 	0.915952 	0.899464 	 tp: 2626 fp: 379 fn: 953 tn: 4075


In [9]:
# Inspect a handful of training rows (positions 287570-287575).
train.iloc[287570:287576,]

Unnamed: 0,id,isemzyme,functionCounts,ec_number,f1,f2,f3,f4,f5,f6,...,f1891,f1892,f1893,f1894,f1895,f1896,f1897,f1898,f1899,f1900
287570,Q1QRZ1,False,0,-,0.000639,0.006981,0.034417,-0.003639,-0.502829,0.095296,...,0.003791,-0.184995,0.182114,-0.232813,-0.03569,-0.039311,-0.049576,0.01413,-0.092997,0.923809
287571,Q0ADD6,False,0,-,0.014472,-0.033905,0.025476,-0.000542,-0.550734,0.079359,...,0.000594,-0.349276,0.197637,-0.099127,-0.022826,-0.004061,-0.007294,0.032211,-0.04483,0.866529
287572,A5UA93,False,0,-,0.005446,-0.017807,0.001387,-0.073694,-0.120669,0.008307,...,0.022011,-0.025101,-0.037702,0.035336,-0.34315,-0.107576,0.053807,-0.022555,0.144398,0.398742
287573,Q2GC38,False,0,-,0.005395,0.002486,0.003525,-0.001258,-0.415502,0.371641,...,-0.000278,-0.366016,0.008274,0.007525,-0.012305,-0.006669,0.04844,0.01007,-0.02531,0.497609
287574,A6TAW0,True,1,4.2.2.-,0.000489,-0.035972,0.075146,-0.104253,-0.000412,0.002889,...,0.696999,-0.074395,-0.269464,0.019761,-0.071869,0.054137,0.627089,-0.099114,-0.17097,0.00387
287575,A4Z027,True,1,2.7.7.77,0.000356,0.201447,0.01121,-0.797472,0.56617,0.00188,...,0.050469,0.144775,-0.059071,-0.420709,-0.520245,-0.302569,-0.038439,0.001017,0.176532,0.041215


In [16]:
# Number of entries in the label dictionary.
len(dict_ec_label)

5499

In [78]:
# Training rows annotated with more than one function.
train[train.functionCounts>1]

Unnamed: 0,id,isemzyme,functionCounts,ec_number,f1,f2,f3,f4,f5,f6,...,f1891,f1892,f1893,f1894,f1895,f1896,f1897,f1898,f1899,f1900
469,P03015,True,2,"3.1.22.-, 6.5.1.-",0.006160,0.080437,0.083994,-0.009620,-0.340445,-0.020711,...,-0.032003,0.016285,0.013976,0.015423,0.258364,0.181318,-0.290237,0.008333,-0.014759,-0.027153
938,P00910,True,2,"4.1.1.48, 5.3.1.24",0.004756,0.212085,-0.084529,-0.953187,-0.019440,-0.038280,...,0.193054,-0.006679,0.099732,-0.095228,-0.463356,0.021307,0.039246,0.202721,0.404699,0.150168
988,P00909,True,2,"4.1.1.48, 5.3.1.24",0.001914,0.219504,0.060341,-0.903888,-0.043283,-0.011315,...,0.131357,-0.007649,-0.070414,0.179522,-0.272367,0.157647,0.050266,0.124927,0.082607,0.163048
1028,P03523,True,4,"2.7.7.48, 2.1.1.56, 2.7.7.88, 2.1.1.296",0.000146,0.145330,0.133737,-0.018988,0.563564,0.132913,...,0.066348,0.525396,-0.139279,0.108641,-0.196162,0.017553,0.099437,0.066769,-0.184558,-0.002692
1222,P00570,True,2,"2.7.4.3, 2.7.4.6",-0.000118,-0.200865,0.020188,-0.866116,-0.284671,-0.013679,...,0.004929,-0.012565,0.003535,-0.220537,-0.169378,0.337800,-0.000862,0.018484,0.026648,0.511712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
468971,P0DPE2,True,3,"3.4.24.71, 2.1.1.-, 3.4.24.71",0.040085,-0.126931,-0.770235,-0.697415,-0.207704,0.444320,...,0.000690,-0.154874,-0.051967,-0.248093,-0.158645,0.030366,-0.083022,-0.618706,-0.341732,0.000423
468996,M2XHU6,True,6,"2.3.1.86, 3.1.2.14, 4.2.1.59, 1.3.1.9, 2.3.1.3...",0.006657,0.122323,0.026862,-0.037653,-0.142093,0.045685,...,0.250210,-0.057459,0.156916,-0.149094,-0.063334,0.352573,0.005251,0.120411,-0.023563,0.110590
469028,A7TUG9,True,3,"2.3.1.86, 1.1.1.100, 2.3.1.41",0.003121,0.226084,0.035073,-0.089781,-0.083339,0.208106,...,-0.059146,-0.275957,-0.186232,0.020055,-0.061880,-0.122103,-0.001545,0.087627,0.028066,-0.002276
469038,M2YJJ3,True,3,"2.3.1.86, 1.1.1.100, 2.3.1.41",0.001452,0.160689,0.174533,-0.013354,-0.616443,0.193632,...,0.074523,-0.257112,-0.177444,-0.139653,-0.062659,0.018374,0.015152,-0.002087,-0.095050,0.153361


In [82]:
# Training rows annotated with exactly one function.
# NOTE(review): `aaa` is not referenced again in the visible part of the notebook.
aaa=train[train.functionCounts==1]

In [107]:
# NOTE: this binds a second name to the SAME DataFrame object, it is not a
# copy - any column assigned through `ss` is also changed in `train`.
ss=train

In [116]:

# Start pandarallel before any `parallel_apply` call is made.
pandarallel.initialize()

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [117]:
# Complete EC numbers that end with a dangling '.' by appending '-'.
# Only the string column is inspected (vectorised), instead of a row-wise
# apply over every column of the frame; missing values are left untouched.
ss['ec_number'] = ss['ec_number'].mask(
    ss['ec_number'].str.endswith('.', na=False),
    ss['ec_number'] + '-',
)

In [113]:
# Debug check: prints every ec_number that still ends with '.'. For such a
# value the lambda yields None (the result of print); for every other value it
# yields `x==0`, which is False for a string. An all-False result with nothing
# printed therefore means no value ends with '.'.
ss.ec_number.apply(lambda x: print(x) if x.endswith('.') else x==0)

0         False
1         False
2         False
3         False
4         False
          ...  
469124    False
469125    False
469126    False
469127    False
469128    False
Name: ec_number, Length: 469129, dtype: bool

In [None]:
# Complete EC numbers that end with a dangling '.' by appending '-'.
# Only the string column is inspected (vectorised), instead of a row-wise
# apply over every column of the frame; missing values are left untouched.
train['ec_number'] = train['ec_number'].mask(
    train['ec_number'].str.endswith('.', na=False),
    train['ec_number'] + '-',
)