# Task2. Enzyme Catalytic Function Quantity Annotation

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2021-09-28  




## 1. Import packages

In [9]:
import numpy as np
import pandas as pd


import sys
import os
from tqdm import tqdm
from functools import reduce

sys.path.append("../tools/")
import funclib

sys.path.append("../")
import benchmark_common as bcommon
import benchmark_train as btrain
import benchmark_test as btest
import config as cfg
import benchmark_evaluation as eva


from pandarallel import pandarallel # import pandarallel
pandarallel.initialize() # initialize the pandarallel parallel-processing library

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 80 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 2. Load data

In [3]:
# Load the task-2 train and test splits from feather files under cfg.DATADIR.
train_path = cfg.DATADIR+'task2/train.feather'
test_path = cfg.DATADIR+'task2/test.feather'
train = pd.read_feather(train_path)
test = pd.read_feather(test_path)

# Report how many records each split contains.
size_report = 'train size: {0}\ntest size: {1}'.format(len(train), len(test))
print(size_report)

train size: 222567
test size: 3304


## 3. Gather features

In [4]:
# Feature tables for the train and test records; both are joined on 'id'.
trainf = pd.read_feather(cfg.DATADIR+'train_rep32.feather')
testf = pd.read_feather(cfg.DATADIR+'test_rep32.feather')

# Left joins keep every train/test record even when no feature row matches.
# NOTE: train/test are overwritten here, so re-running this cell without
# reloading the data first would merge the feature columns a second time.
train = pd.merge(train, trainf, on='id', how='left')
test = pd.merge(test, testf, on='id', how='left')

## 3. Homology alignment (同源比对)

In [6]:
# Homology search of the test sequences against the training sequences.
# Per the captured output, funclib.getblast writes FASTA files under /tmp,
# builds a DIAMOND database from train (`diamond makedb`) and queries it with
# test (`diamond blastp ... -k 1`).
# The result is expected to carry at least the columns 'id' (query) and
# 'sseqid' (matched training id), which the next cell reads.
res_data=funclib.getblast(train,test)

Write finished
Write finished
diamond makedb --in /tmp/train.fasta -d /tmp/train.dmnd


diamond v2.0.8.146 (C) Max Planck Society for the Advancement of Science
Documentation, support and updates available at http://www.diamondsearch.org

#CPU threads: 80
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Database input file: /tmp/train.fasta
Opening the database file...  [0.004s]
Loading sequences...  [0.473s]
Masking sequences...  [0.277s]
Writing sequences...  [0.096s]
Hashing sequences...  [0.027s]
Loading sequences...  [0s]
Writing trailer...  [0.001s]
Closing the input file...  [0.003s]
Closing the database file...  [0.024s]
Database hash = d455115f7ab276d8f0c450236f13a30a
Processed 222567 sequences, 93551643 letters.
Total time = 0.908s
diamond v2.0.8.146 (C) Max Planck Society for the Advancement of Science
Documentation, support and updates available at http://www.diamondsearch.org

#CPU threads: 80
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Temporary directory: /tmp
#Target sequences to report alignments for:

diamond blastp -d /tmp/train.dmnd  -q  /tmp/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1


 [0.08s]
Database: /tmp/train.dmnd (type: Diamond database, sequences: 222567, letters: 93551643)
Block size = 5000000000
Opening the input file...  [0.001s]
Opening the output file...  [0s]
Loading query sequences...  [0.009s]
Masking queries...  [0.029s]
Building query seed set...  [0.024s]
Algorithm: Double-indexed
Building query histograms...  [0.012s]
Allocating buffers...  [0s]
Loading reference sequences...  [0.178s]
Masking reference...  [0.204s]
Initializing temporary storage...  [0s]
Building reference histograms...  [0.122s]
Allocating buffers...  [0s]
Processing query block 1, reference block 1/1, shape 1/2.
Building reference seed array...  [0.115s]
Building query seed array...  [0.021s]
Computing hash join...  [0.056s]
Building seed filter...  [0.005s]
Searching alignments...  [0.017s]
Processing query block 1, reference block 1/1, shape 2/2.
Building reference seed array...  [0.09s]
Building query seed array...  [0.018s]
Computing hash join...  [0.055s]
Building seed fil

In [7]:
# Transfer the label from the aligned training sequence to each query:
#   1. look up the matched training record (sseqid -> train.id) and keep its
#      functionCounts as the prediction,
#   2. rename the columns so the query id is 'id' and the match is 'id_ref',
#   3. attach the true functionCounts of the query from the test set.
# NOTE: this overwrites res_data, so the cell cannot be re-run on its own
# output (the 'sseqid' column no longer exists afterwards).
res_data = (
    res_data[['id', 'sseqid']]
    .merge(train, left_on='sseqid', right_on='id', how='left')
    .loc[:, ['id_x', 'sseqid', 'functionCounts']]
    .rename(columns={
        'id_x': 'id',
        'sseqid': 'id_ref',
        'functionCounts': 'functionCounts_pred',
    })
    .merge(test, on='id', how='left')
    .loc[:, ['id', 'functionCounts_pred', 'functionCounts']]
)

In [10]:
# Compute evaluation metrics for the homology-based prediction.
# A record is correct when the function count transferred from the matched
# training sequence equals the true function count of the query.
res_data['iscorrect'] = res_data['functionCounts_pred'] == res_data['functionCounts']

correct = int(res_data['iscorrect'].sum())  # matched records with the right count
find = len(res_data)                        # records that received an alignment hit
total = len(test)                           # all query records

# Guard the ratios so an empty test set or zero hits reports nan instead of
# raising ZeroDivisionError.
accuracy = correct / total if total else float('nan')
precision = correct / find if find else float('nan')
# "Recall" as defined here is the fraction of queries that received a hit.
recall = find / total if total else float('nan')

print('Total query records are: {0}'.format(total))
print('Matched records are: {0}'.format(find))
print('Accuracy: {0}({1}/{2})'.format(accuracy, correct, total))
print('Precision: {0}({1}/{2})'.format(precision, correct, find))
print('Recall: {0}({1}/{2})'.format(recall, find, total))

Total query records are: 3304
Matched records are: 2761
Accuracy: 0.7566585956416465(2500/3304)
Pricision: 0.9054690329590728(2500/2761)
Recall: 0.8356537530266344(2761/3304)


## 4. Prediction


In [11]:
# Stage-1 working copies with a binary label column 'lb':
#   0 -> functionCounts == 1 (single-function enzyme)
#   1 -> any other functionCounts (multi-function enzyme)
train_s1 = train.copy()
test_s1 = test.copy()

# Vectorized comparison instead of a per-element Python lambda; yields the
# same 0/1 integer labels.
train_s1['lb'] = (train_s1['functionCounts'] != 1).astype('int64')
test_s1['lb'] = (test_s1['functionCounts'] != 1).astype('int64')

### 4.1 Single-function vs. multi-function (单功能 / 多功能)

In [12]:
# Positional columns 3..1282 are used as the model input. Per the frame
# displayed later in this notebook these follow id / seq / functionCounts and
# are expected to be f1..f1280 -- the selection depends on that column order.
feature_cols = np.r_[3:1283]

X_train = np.array(train.iloc[:, feature_cols])
X_test = np.array(test.iloc[:, feature_cols])

# Binary targets come from the 'lb' column of the stage-1 copies.
Y_train = np.array(train_s1['lb'].astype('int')).flatten()
Y_test = np.array(test_s1['lb'].astype('int')).flatten()

In [67]:
# Run the baseline classifiers on the binary single-/multi-function labels.
# Per the captured output, funclib.run_baseline prints one row per model
# (knn, lr, xg, dt, rf, gbdt) with accuracy, precision, NPV, recall, f1 and
# the confusion-matrix counts.
funclib.run_baseline(X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 		 confusion Matrix
knn 		0.925545 	0.517857 		0.947385 	0.345238 	0.414286 	 tp: 87 fp: 81 fn: 165 tn: 2971




lr 		0.917070 	0.392157 		0.933791 	0.158730 	0.225989 	 tp: 40 fp: 62 fn: 212 tn: 2990
xg 		0.940375 	0.802198 		0.944289 	0.289683 	0.425656 	 tp: 73 fp: 18 fn: 179 tn: 3034
dt 		0.889831 	0.252212 		0.936647 	0.226190 	0.238494 	 tp: 57 fp: 169 fn: 195 tn: 2883
rf 		0.937651 	0.883333 		0.938656 	0.210317 	0.339744 	 tp: 53 fp: 7 fn: 199 tn: 3045
gbdt 		0.928874 	0.869565 		0.929290 	0.079365 	0.145455 	 tp: 20 fp: 3 fn: 232 tn: 3049


In [None]:
# Call funclib.xgmain in 'binary' mode on the same train/test arrays.
# Presumably the XGBoost pipeline -- confirm in funclib.
# The unpacked names suggest it returns the ground-truth labels, the predicted
# labels and the predicted probabilities -- TODO confirm against funclib.xgmain.
groundtruth, predict, predictprob = funclib.xgmain(X_train, Y_train, X_test, Y_test, type='binary')

### 4.2 2-function to 8-function enzymes (2功能酶 -> 8功能酶)

In [71]:
# Keep only the training records annotated with two or more functions.
train_sub = train.loc[train['functionCounts'] >= 2]

Unnamed: 0,id,seq,functionCounts,f1,f2,f3,f4,f5,f6,f7,...,f1271,f1272,f1273,f1274,f1275,f1276,f1277,f1278,f1279,f1280
27,P03015,MLIGYVRVSTNDQNTDLQRNALVCAGCEQIFEDKLSGTRTDRPGLK...,2,1.556724,-0.593310,-10.047720,5.533885,-6.512511,-4.169526,-5.231729,...,16.137537,4.311558,-2.210803,1.476109,-6.806911,-4.541852,6.125546,-4.968339,-2.224045,0.001526
126,P00910,MQTVLAKIVADKAIWVEARKQQQPLASFQNEIQPSTRHFYDALQGA...,2,4.099043,1.080716,-3.884742,3.274055,-1.007856,-3.018649,-6.074471,...,13.423177,4.039546,-8.830684,1.700052,-6.829605,-1.267010,8.662738,-6.820977,4.833040,1.778101
143,P00909,MMQTVLAKIVADKAIWVEARKQQQPLASFQNEVQPSTRHFYDALQG...,2,4.683400,0.891993,-4.028535,2.958478,-1.292257,-2.839869,-5.985564,...,13.146235,4.146377,-8.457584,1.872828,-7.688915,-1.787896,8.243024,-6.723485,4.899431,1.185993
153,P03523,MEVHDFETDEFNDFNEDDYATREFLNPDERMTYLNHADYNLNSPLI...,4,-1.315872,-3.867160,4.101826,7.873257,-8.007924,-9.206687,-6.444164,...,13.512088,1.091756,-2.695173,2.587013,-1.384843,-2.827420,4.011899,-2.082527,1.242697,1.335304
212,P00570,MEEKLKKTKIIFVVGGPGSGKGTQCEKIVQKYGYTHLSTGDLLRAE...,2,-8.690660,3.961315,-2.758022,5.869511,-11.578798,6.382885,-8.468960,...,13.227867,0.329022,1.540725,9.215970,-11.617360,5.515145,6.928047,-8.592218,-7.446095,-6.900333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222497,P0DPE2,MACLGPSAQVPELPEKNCGYREVQYWDQRYQGAADSAPYEWFGDFS...,2,-2.139962,1.980610,-4.372785,2.083648,-2.528535,-0.780301,-5.243655,...,10.998007,-0.065250,-3.129717,7.670015,-8.574594,-0.853619,3.360317,-0.730811,-9.671906,3.159673
222504,M2XHU6,MGSQHQSQHNSALIQAARDGEATLSVAFGGQGPSNLNCFNDLLELN...,6,0.616900,3.144179,-4.481573,1.086842,-7.828979,-0.493626,-6.117258,...,9.823048,-0.789921,-2.756095,2.640041,-6.786637,1.194315,5.363291,-2.853688,-6.029834,2.368524
222525,A7TUG9,AAAWMLNGCLQVMDSRTIPANRNADNVDPALQTATHLCFPTRPVRV...,3,5.927090,1.961192,-1.540268,6.250709,-8.029306,-7.804775,-2.339700,...,12.293169,-2.184128,2.933439,2.573843,-22.215565,-8.830070,3.892642,-3.441507,1.987736,-0.475868
222528,M2YJJ3,MGQKTIKRKIQSAERPAEADVAFLASTQHSKDLCYEYDAPEEVAVE...,3,1.033936,0.906112,0.650086,-0.072426,-8.039215,-3.622916,-5.147133,...,10.030788,-1.790666,-0.547477,5.562757,3.633627,-3.204210,2.977791,-1.391446,-2.141112,1.314265


In [66]:
# Display the homology-based predictions: predicted vs. true function count
# per query id, with the 'iscorrect' flag computed in the metrics cell.
res_data

Unnamed: 0,id,functionCounts_pred,functionCounts,iscorrect
0,Q5RF96,1,1,True
1,P9WIA9,1,1,True
2,H2E7Q7,1,1,True
3,E2JFG2,1,1,True
4,H2E7Q8,1,1,True
...,...,...,...,...
2756,Q753P9,1,2,False
2757,A0A068BGA5,2,1,False
2758,A0A2R6Q324,2,1,False
2759,Q6NRV0,1,1,True
