# Task2. Enzyme Catalytic Function Quantity Annotation

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2021-09-28  




## 1. Import packages

In [1]:
import numpy as np
import pandas as pd


import sys
import os
from tqdm import tqdm
from functools import reduce

sys.path.append("../tools/")
import funclib

sys.path.append("../")
import benchmark_common as bcommon
import benchmark_train as btrain
import benchmark_test as btest
import config as cfg
import benchmark_evaluation as eva
import joblib



from sklearn import metrics

from pandarallel import pandarallel # import pandarallel (parallel apply for pandas)
pandarallel.initialize() # initialize the parallel library

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 80 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 2. Load data

In [2]:
# Load the task-2 train and test splits (feather files under cfg.DATADIR)
# and report how many rows each split contains.
train, test = (
    pd.read_feather(cfg.DATADIR+'task2/train.feather'),
    pd.read_feather(cfg.DATADIR+'task2/test.feather'),
)
print('train size: {0}\ntest size: {1}'.format(*map(len, (train, test))))

train size: 222567
test size: 3304


## 3. Gather features

In [3]:
# Load the pre-computed "rep32" feature tables for both splits.
trainf = pd.read_feather(cfg.DATADIR+'train_rep32.feather')
testf = pd.read_feather(cfg.DATADIR+'test_rep32.feather')

# Left-join the features onto each split by 'id' so every original row is kept.
# NOTE: this overwrites `train`/`test`, so the cell is not idempotent --
# re-run the loading cell before executing this one a second time.
train = pd.merge(train, trainf, on='id', how='left')
test = pd.merge(test, testf, on='id', how='left')

## 4. Prediction


### 4.1 single-to-multi

In [4]:
# Binary task: label 'lb' is 0 when functionCounts == 1 and 1 otherwise.
train_s1 = train.copy()
test_s1 = test.copy()
for split_df in (train_s1, test_s1):
    split_df['lb'] = np.where(split_df.functionCounts == 1, 0, 1)

# Column positions 3..1282 are used as the model inputs
# (presumably the feature columns merged in from the rep32 tables -- TODO confirm).
feature_positions = np.arange(3, 1283)

X_train = np.array(train.iloc[:, feature_positions])
Y_train = train_s1['lb'].astype('int').to_numpy().flatten()

X_test = np.array(test.iloc[:, feature_positions])
Y_test = test_s1['lb'].astype('int').to_numpy().flatten()

In [None]:
# Run the funclib baseline evaluation on the binary (single vs. multi) split;
# it prints one metrics row per baseline (see the output below).
funclib.run_baseline(X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 		 confusion Matrix
knn 		0.925545 	0.517857 		0.947385 	0.345238 	0.414286 	 tp: 87 fp: 81 fn: 165 tn: 2971


In [18]:
# Fit the binary model with funclib.xgmain (presumably XGBoost, judging by the
# helper name -- confirm in funclib) and keep the fitted model object.
groundtruth, predict, predictprob, model = funclib.xgmain(X_train, Y_train, X_test, Y_test, type='binary')
# Persist the fitted model under cfg.MODELDIR so the integration step can reload it.
joblib.dump(model, cfg.MODELDIR+'/single_multi.model')

['/home/shizhenkun/codebase/DMLF/model/single_multi.model']

In [20]:
# Quick visual inspection of the training feature matrix (display only; nothing is stored).
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1270,1271,1272,1273,1274,1275,1276,1277,1278,1279
0,-0.220921,4.081616,-3.814746,2.202727,-6.239662,-4.988444,-5.822192,-4.460664,-1.200592,-3.804104,...,18.058687,-1.100924,1.724075,0.143535,-6.999560,-6.251357,8.360884,-1.662625,-5.621017,1.682485
1,3.521839,4.318994,-1.853766,1.102569,-9.321230,-4.854475,-7.815628,-2.012899,-5.534803,-6.105119,...,8.096443,0.931363,-4.604083,0.339199,-6.082336,-3.915988,7.511534,-2.378523,-3.667611,4.634388
2,1.032287,-1.094576,-4.722376,4.625370,-1.973799,-1.705471,-4.638579,-6.950621,-1.872663,-3.980067,...,18.173588,-1.603059,0.301227,-2.721340,-1.693633,-4.528199,5.910798,-2.073927,-7.305595,0.270684
3,-2.420424,-1.460299,-2.409102,1.643003,-1.612438,-1.568280,2.726623,2.233466,-10.240229,-5.307086,...,15.082927,-0.857677,0.444023,1.980751,-1.351836,-4.664156,1.289754,-2.213044,2.253615,3.059324
4,-3.996948,-6.180151,-4.634508,3.155759,-10.585983,0.715175,-15.944781,-7.783895,-6.171251,-8.116997,...,24.049959,-0.592115,-8.274217,-0.269268,-7.028537,-4.726937,13.501459,-11.718015,0.038843,2.551557
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222562,0.172802,2.350760,4.350393,3.693239,-5.923174,1.420440,-8.875187,4.948562,-9.260919,-9.089705,...,20.170313,-0.140875,-7.601892,0.804636,-11.607852,-6.285739,4.216749,2.142383,-14.011300,10.606584
222563,0.272997,2.976982,1.862016,5.408012,-5.642054,-5.404109,-11.418259,0.153906,-5.349937,-5.087845,...,17.778130,-1.399726,-7.715403,-1.014039,-9.040051,-3.262766,2.823239,2.903638,-3.791561,3.065740
222564,-1.024953,5.833755,1.151545,1.393820,-3.737662,0.271311,-4.385376,-0.669128,-8.530610,-3.202958,...,8.665022,-3.316796,-5.142276,6.789438,-5.080449,-4.224988,7.700522,-4.602151,-2.171469,3.767569
222565,-0.132955,0.242595,-2.817183,4.983530,-6.607008,-1.489796,-5.255298,-2.211435,-1.485693,-6.607338,...,11.285470,-1.765559,-4.504529,-3.887650,-9.018026,-5.810025,3.122884,3.802729,10.711333,-4.930251


### 4.2 2-8 functions

In [10]:
# Keep only the proteins with two or more functions (functionCounts >= 2)
# and renumber the rows from zero.
train_s2 = train[train.functionCounts >= 2].reset_index(drop=True)
test_s2 = test[test.functionCounts >= 2].reset_index(drop=True)

# Define X, Y. Class labels are shifted by -2 so that they start at 0
# (functionCounts 2 -> class 0, 3 -> class 1, ...).
feature_positions = np.arange(3, 1283)

X_train = np.array(train_s2.iloc[:, feature_positions])
Y_train = (train_s2['functionCounts'].astype('int') - 2).to_numpy().flatten()

X_test = np.array(test_s2.iloc[:, feature_positions])
Y_test = (test_s2['functionCounts'].astype('int') - 2).to_numpy().flatten()

In [11]:
# Run the funclib baseline evaluation on the multi-class (2+ functions) split;
# it prints one macro-averaged metrics row per baseline (see the output below).
funclib.run_baseline(X_train, Y_train, X_test, Y_test, type='multi')

 baslineName 		 accuracy 	 precision-macro 	 recall-macro 	 f1-macro
         knn  		0.833333  	0.678550 		0.623593 	0.646456
          lr  		0.761905  	0.707956 		0.637942 	0.521033
          xg  		0.849206  	0.854167 		0.630704 	0.623925
          dt  		0.702381  	0.479052 		0.482337 	0.479975
          rf  		0.853175  	0.864239 		0.546546 	0.594133
        gbdt  		0.853175  	0.856578 		0.573413 	0.602268


In [12]:
# Fit the multi-class model with funclib.xgmain on the shifted labels (functionCounts - 2).
groundtruth, predict, predictprob, model = funclib.xgmain(X_train, Y_train, X_test, Y_test, type='multi')
# Persist the fitted model under cfg.MODELDIR so the integration step can reload it.
joblib.dump(model, cfg.MODELDIR+'/multi_many.model')

['/home/shizhenkun/codebase/DMLF/model/multi_many.model']

## 5. Integration

### 5.1 sequence alignment

In [8]:
# Align every test sequence against the training set (funclib.getblast shells out
# to DIAMOND, see the log below) and transfer the functionCounts of the matched
# training sequence ('sseqid') to the query as `functionCounts_pred`.
res_data = funclib.getblast(train, test)

# Only 'id' and 'functionCounts' are needed from train/test for the joins, so the
# frames are narrowed first instead of merging every feature column and then
# discarding it; the resulting table is unchanged.
res_data = (
    res_data[['id', 'sseqid']]
    # Look up the label of the training hit; the query id becomes 'id_x'.
    .merge(train[['id', 'functionCounts']], left_on='sseqid', right_on='id', how='left')
    .loc[:, ['id_x', 'sseqid', 'functionCounts']]
    .rename(columns={'id_x': 'id', 'sseqid': 'id_ref', 'functionCounts': 'functionCounts_pred'})
    # Attach the ground-truth label of the query itself.
    .merge(test[['id', 'functionCounts']], on='id', how='left')
    .loc[:, ['id', 'functionCounts_pred', 'functionCounts']]
)

Write finished
Write finished
diamond makedb --in /tmp/train.fasta -d /tmp/train.dmnd


diamond v2.0.8.146 (C) Max Planck Society for the Advancement of Science
Documentation, support and updates available at http://www.diamondsearch.org

#CPU threads: 80
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Database input file: /tmp/train.fasta
Opening the database file...  [0.004s]
Loading sequences...  [0.472s]
Masking sequences...  [0.174s]
Writing sequences...  [0.097s]
Hashing sequences...  [0.027s]
Loading sequences...  [0s]
Writing trailer...  [0.001s]
Closing the input file...  [0.002s]
Closing the database file...  [0.024s]
Database hash = d455115f7ab276d8f0c450236f13a30a
Processed 222567 sequences, 93551643 letters.
Total time = 0.806s


diamond blastp -d /tmp/train.dmnd  -q  /tmp/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1 --quiet


In [14]:
# Display the alignment-based predictions next to the ground truth.
res_data

Unnamed: 0,id,functionCounts_pred,functionCounts
0,Q5RF96,1,1
1,P9WIA9,1,1
2,H2E7Q7,1,1
3,E2JFG2,1,1
4,H2E7Q8,1,1
...,...,...,...
2756,Q753P9,1,2
2757,A0A068BGA5,2,1
2758,A0A2R6Q324,2,1
2759,Q6NRV0,1,1


In [21]:
# Alignment predictions for *all* test proteins: take the id / functionCounts
# columns of test (positions 0 and 2), left-join the alignment prediction, and
# replace the missing prediction of proteins without a hit by 0.
aa = (
    test.iloc[:, [0, 2]]
    .merge(res_data.iloc[:, [0, 1]], on='id', how='left')
    .fillna(0)
)

In [15]:
# Score the alignment predictions on the test proteins that received a hit (rows of res_data).
# NOTE(review): the saved output row below is labelled 'ours' although baselineName='diamond'
# is passed here -- the stored output looks stale; re-run to confirm.
eva.caculateMetrix(groundtruth=res_data.functionCounts, predict=res_data.functionCounts_pred, baselineName='diamond', type='multi')

        ours  		0.905469  	0.582284 		0.553278 	0.560797


In [23]:
# Score the alignment predictions on the full test set (`aa`), where proteins
# without a hit carry a prediction of 0.
eva.caculateMetrix(groundtruth=aa.functionCounts, predict=aa.functionCounts_pred, baselineName='diamond', type='multi')

     diamond  		0.756659  	0.509499 		0.585681 	0.477800


### 5.2 XGBoost

In [9]:
# Feature matrix for the complete test set. Use exactly the same column positions
# (3..1282) that were used to train both models, so the feature width stays
# consistent even if `test` has gained extra trailing columns.
X_test = np.array(test.iloc[:, np.r_[3:1283]])
Y_test = np.array(test.functionCounts.astype('int')).flatten()

# Reload the two models saved in section 4:
# model_s: single vs. multi (binary); model_m: number of functions for multi-function proteins.
# NOTE: joblib.load unpickles the file -- only load model files you produced yourself.
model_s = joblib.load(cfg.MODELDIR+'/single_multi.model')
model_m = joblib.load(cfg.MODELDIR+'/multi_many.model')

In [10]:
# Predict with both models on every test protein; the two outputs are combined
# in the integration step.
pred_s=model_s.predict(X_test)
pred_m=model_m.predict(X_test)

### 5.3 Results integration

In [11]:
# Assemble one row per test protein holding the ground truth and the three
# candidate predictions:
#   * alignment result (NaN when the protein had no hit in res_data),
#   * pred_s flipped so that 1 marks a predicted single-function protein,
#   * pred_m shifted back by +2 to undo the label offset used for training.
# Position 3 (the duplicated ground-truth column coming from res_data) is dropped.
pred_final = (
    test.iloc[:, [0, 2]]
    .merge(res_data, on='id', how='left')
    .assign(pred_s=1 - pred_s, pred_m=pred_m + 2)
    .iloc[:, [0, 1, 2, 4, 5]]
)

colnames = [
    'id',
    'functionCounts_groundtruth',
    'functionCounts_blast',
    'functionCounts_s',
    'functionCounts_m',
]
pred_final.columns = colnames

def choose_functioncounts(blast, s,m):
    """Pick the final function-count prediction for one protein.

    Priority: the alignment result wins whenever it exists; otherwise the
    single/multi classifier decides whether the answer is 1 or the count
    proposed by the multi-class model.

    Args:
        blast: function count transferred from the alignment hit, or a missing
            value (NaN / None / pd.NA) when there was no hit.
        s: single-function flag; 1 means "predicted to have exactly one function".
        m: function count proposed by the multi-class model.

    Returns:
        `blast` if it is not missing, else 1 when `s == 1`, else `m`.
    """
    # pd.isna recognises NaN, None and pd.NA alike; the previous string
    # comparison against 'nan' only caught float NaN and would have returned
    # None / pd.NA as if it were a valid prediction.
    if not pd.isna(blast):
        return blast
    if s == 1:
        return 1
    return m

# Combine the three candidate predictions row by row into the final answer.
pred_final['functionCounts_dmlf'] = pred_final.apply(
    lambda row: choose_functioncounts(
        row['functionCounts_blast'],
        row['functionCounts_s'],
        row['functionCounts_m'],
    ),
    axis=1,
)
# Keep only id, ground truth and the final prediction (positions 0, 1 and 5).
pred_final = pred_final.iloc[:, [0, 1, 5]]

In [81]:
# Per-class breakdown: for each true function count 1..8 print
# "<count>\t<correctly predicted>/<proteins with that count>".
for i in range(1, 9):
    in_class = pred_final.functionCounts_groundtruth == i
    is_correct = pred_final.functionCounts_groundtruth == pred_final.functionCounts_dmlf
    right = int((is_correct & in_class).sum())
    total = int(in_class.sum())
    print(str(i)+'\t'+str(right)+'/'+str(total))

1	2942/3052
2	48/183
3	31/53
4	2/6
5	0/2
6	5/7
7	1/1
8	0/0


In [12]:
# Display the final table: id, ground truth and the integrated prediction.
pred_final

Unnamed: 0,id,functionCounts_groundtruth,functionCounts_dmlf
0,Q5RF96,1,1.0
1,P9WIA9,1,1.0
2,H2E7Q7,1,1.0
3,A0A0D4BSN8,1,1.0
4,E2JFG2,1,1.0
...,...,...,...
3299,A0A2R6Q324,1,2.0
3300,Q6VE93,1,1.0
3301,A0A509AKI1,1,1.0
3302,Q6NRV0,1,1.0


In [13]:
# Score the integrated prediction against the ground truth on the full test set.
eva.caculateMetrix(groundtruth=pred_final.functionCounts_groundtruth, predict=pred_final.functionCounts_dmlf, baselineName='ours', type='multi')

        ours  		0.917070  	0.583702 		0.552035 	0.560549
