# Task2. Enzyme Catalytic Function Quantity Annotation

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2021-09-28  




## 1. Import packages

In [1]:
import numpy as np
import pandas as pd


import sys
import os
from tqdm import tqdm
from functools import reduce

sys.path.append("../tools/")
import funclib

sys.path.append("../")
import benchmark_common as bcommon
import benchmark_train as btrain
import benchmark_test as btest
import config as cfg
import benchmark_evaluation as eva
import joblib

from sklearn import metrics

from pandarallel import pandarallel # import pandarallel
pandarallel.initialize() # initialize the parallel library (it logs its worker count on startup)

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 80 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 2. Load data

In [2]:
# Read the task-2 train and test splits (feather files under cfg.DATADIR)
# and report how many rows each one holds.
train, test = (
    pd.read_feather(cfg.DATADIR+'task2/train.feather'),
    pd.read_feather(cfg.DATADIR+'task2/test.feather'),
)
print('train size: {0}\ntest size: {1}'.format(len(train), len(test)))

train size: 222567
test size: 3304


## 3. Gather features

In [3]:
# Feature tables for the two splits (feather files keyed by `id`)
trainf = pd.read_feather(cfg.DATADIR+'train_rep32.feather')
testf = pd.read_feather(cfg.DATADIR+'test_rep32.feather')

# Left-join the features onto each split so every original row is kept.
# NOTE(review): this reassigns `train`/`test`; re-running the cell without
# re-running the load cell would merge the features a second time.
train = pd.merge(train, trainf, how='left', on='id')
test = pd.merge(test, testf, how='left', on='id')

## 4. Prediction


### 4.1 single-to-multi

In [5]:
# Binary task: label 0 = exactly one function, label 1 = anything else.
train_s1 = train.copy()
test_s1 = test.copy()

train_s1['lb'] = (train_s1.functionCounts != 1).astype('int')
test_s1['lb'] = (test_s1.functionCounts != 1).astype('int')

# Feature matrix = columns at positions 3..1282 (1280 columns) of the merged frames.
feature_idx = np.arange(3, 1283)

X_train = np.array(train.iloc[:, feature_idx])
Y_train = np.array(train_s1['lb'].astype('int')).flatten()

X_test = np.array(test.iloc[:, feature_idx])
Y_test = np.array(test_s1['lb'].astype('int')).flatten()

In [6]:
# Evaluate the baseline classifiers on the single-vs-multi binary labels.
# The call prints one metrics row per baseline (see the table in the cell output).
funclib.run_baseline(X_train, Y_train, X_test, Y_test)

baslineName 	 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 		 confusion Matrix
knn 		0.925545 	0.517857 		0.947385 	0.345238 	0.414286 	 tp: 87 fp: 81 fn: 165 tn: 2971
lr 		0.919189 	0.407407 		0.932051 	0.130952 	0.198198 	 tp: 33 fp: 48 fn: 219 tn: 3004
xg 		0.940375 	0.802198 		0.944289 	0.289683 	0.425656 	 tp: 73 fp: 18 fn: 179 tn: 3034
dt 		0.886804 	0.261719 		0.939304 	0.265873 	0.263780 	 tp: 67 fp: 189 fn: 185 tn: 2863
rf 		0.937651 	0.883333 		0.938656 	0.210317 	0.339744 	 tp: 53 fp: 7 fn: 199 tn: 3045
gbdt 		0.928874 	0.869565 		0.929290 	0.079365 	0.145455 	 tp: 20 fp: 3 fn: 232 tn: 3049


In [8]:
# Fit the `xgmain` binary model on the single-vs-multi labels, then persist the
# fitted model with joblib; section 5.2 reloads it from this same path.
groundtruth, predict, predictprob, model = funclib.xgmain(
    X_train,
    Y_train,
    X_test,
    Y_test,
    type='binary',
)
joblib.dump(model, cfg.MODELDIR+'/single_multi.model')

['/home/shizhenkun/codebase/DMLF/model/single_multi.model']

### 4.2 2-8 functions

In [10]:
# Keep only the sequences with at least two functions (functionCounts >= 2),
# re-indexed from 0.
train_s2 = train[train.functionCounts>=2].reset_index(drop=True)
test_s2 = test[test.functionCounts>=2].reset_index(drop=True)

# Features: columns at positions 3..1282.
# Target: functionCounts shifted down by 2, so the smallest class is 0.
feature_idx = np.arange(3, 1283)

X_train = np.array(train_s2.iloc[:, feature_idx])
Y_train = np.array(train_s2['functionCounts'].astype('int') - 2).flatten()

X_test = np.array(test_s2.iloc[:, feature_idx])
Y_test = np.array(test_s2['functionCounts'].astype('int') - 2).flatten()

In [11]:
# Evaluate the baseline classifiers on the multi-class target (functionCounts - 2).
# With type='multi' the printed table reports macro-averaged precision/recall/f1
# (see the cell output).
funclib.run_baseline(X_train, Y_train, X_test, Y_test, type='multi')

 baslineName 		 accuracy 	 precision-macro 	 recall-macro 	 f1-macro
         knn  		0.833333  	0.678550 		0.623593 	0.646456
          lr  		0.761905  	0.707956 		0.637942 	0.521033
          xg  		0.849206  	0.854167 		0.630704 	0.623925
          dt  		0.702381  	0.479052 		0.482337 	0.479975
          rf  		0.853175  	0.864239 		0.546546 	0.594133
        gbdt  		0.853175  	0.856578 		0.573413 	0.602268


In [12]:
# Fit the `xgmain` multi-class model on the (functionCounts - 2) target, then
# persist the fitted model with joblib; section 5.2 reloads it from this path.
groundtruth, predict, predictprob, model = funclib.xgmain(
    X_train,
    Y_train,
    X_test,
    Y_test,
    type='multi',
)
joblib.dump(model, cfg.MODELDIR+'/multi_many.model')

['/home/shizhenkun/codebase/DMLF/model/multi_many.model']

## 5. Integration

### 5.1 sequence alignment

In [6]:
# Align every test sequence against the training set (the cell output shows
# diamond blastp being run with -k 1).
res_data = funclib.getblast(train, test)

# Only `id` and `functionCounts` are needed from train/test below. Merging these
# two-column projections avoids dragging the ~1280 feature columns through two
# joins and gives the same final columns.
train_counts = train[['id', 'functionCounts']]
test_counts = test[['id', 'functionCounts']]

res_data = (
    res_data[['id', 'sseqid']]
    # Look up the function count of the matched training sequence.
    .merge(train_counts, left_on='sseqid', right_on='id', how='left')
    .loc[:, ['id_x', 'sseqid', 'functionCounts']]
    .rename(columns={'id_x': 'id', 'sseqid': 'id_ref', 'functionCounts': 'functionCounts_pred'})
    # Attach the ground-truth count of the query (test) sequence.
    .merge(test_counts, on='id', how='left')
    .loc[:, ['id', 'functionCounts_pred', 'functionCounts']]
)

Write finished
Write finished
diamond makedb --in /tmp/train.fasta -d /tmp/train.dmnd


diamond v2.0.8.146 (C) Max Planck Society for the Advancement of Science
Documentation, support and updates available at http://www.diamondsearch.org

#CPU threads: 32
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Database input file: /tmp/train.fasta
Opening the database file...  [0.004s]
Loading sequences...  [0.376s]
Masking sequences...  [0.282s]
Writing sequences...  [0.097s]
Hashing sequences...  [0.025s]
Loading sequences...  [0s]
Writing trailer...  [0.001s]
Closing the input file...  [0.002s]
Closing the database file...  [0.023s]
Database hash = d455115f7ab276d8f0c450236f13a30a
Processed 222567 sequences, 93551643 letters.
Total time = 0.814s


diamond blastp -d /tmp/train.dmnd  -q  /tmp/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1 --quiet


### 5.2 XGBoost

In [22]:
# Use exactly the feature columns both models were trained on (positions 3..1282).
# The previous open-ended slice `3:` would silently include any extra column
# present in `test`, changing the feature width seen by the models.
X_test = np.array(test.iloc[:, np.r_[3:1283]])
Y_test = np.array(test.functionCounts.astype('int')).flatten()

# Reload the two models saved in section 4.
# NOTE: joblib.load unpickles the file; only load model files from a trusted source.
model_s = joblib.load(cfg.MODELDIR+'/single_multi.model')
model_m = joblib.load(cfg.MODELDIR+'/multi_many.model')

In [23]:
# Stage 1: binary single-vs-multi model (trained in 4.1 with label 0 = one
# function, 1 = otherwise; assumes the loaded file is the one saved there).
pred_s=model_s.predict(X_test)
# Stage 2: count model (trained in 4.2 on functionCounts - 2, so its outputs are
# offset by 2). It is applied to every test row here; the integration step
# decides for which rows this prediction is actually used.
pred_m=model_m.predict(X_test)

### 5.3 Results integration

In [80]:
# Assemble one row per test sequence: id, ground truth (columns 0 and 2 of
# `test`), the alignment-based count, and the two model predictions.
#   pred_s is flipped (1 - pred_s) so that 1 means "single function".
#   pred_m is shifted back by +2 to undo the training offset.
# Position 3 (the ground-truth copy carried in res_data) is dropped.
pred_final = (
    test.iloc[:, [0, 2]]
    .merge(res_data, on='id', how='left')
    .assign(pred_s=1 - pred_s, pred_m=pred_m + 2)
    .iloc[:, [0, 1, 2, 4, 5]]
)

colnames = [
    'id',
    'functionCounts_groundtruth',
    'functionCounts_blast',
    'functionCounts_s',
    'functionCounts_m',
]
pred_final.columns = colnames

def choose_functioncounts(blast, s, m):
    """Pick the final function-count prediction for one sequence.

    Priority order:
      1. ``blast`` - the count transferred from the alignment hit, if present.
      2. ``1``     - when the binary model flags the sequence as single (``s == 1``).
      3. ``m``     - the multi-class model's count otherwise.

    Args:
        blast: count from the alignment hit, or a missing value (NaN/None/pd.NA).
        s: single-function flag; 1 means "single function".
        m: count predicted by the multi-class model.

    Returns:
        The selected function count.
    """
    # pd.isna recognises NaN, None and pd.NA. The former check,
    # str(blast) != 'nan', only caught float NaN and would have returned
    # None/pd.NA as if it were a valid count.
    if not pd.isna(blast):
        return blast
    if s == 1:
        return 1
    return m

# Combine the three candidate predictions row by row, then keep only
# id, ground truth and the combined prediction (positions 0, 1 and 5).
pred_final['functionCounts_dmlf'] = pred_final.apply(
    lambda row: choose_functioncounts(
        row.functionCounts_blast,
        row.functionCounts_s,
        row.functionCounts_m,
    ),
    axis=1,
)
pred_final = pred_final.iloc[:, [0, 1, 5]]

In [81]:
# Per-class hit rate: for each true function count 1..8 print
# "<count>\t<correctly predicted>/<rows with that true count>".
truth = pred_final.functionCounts_groundtruth
is_hit = truth == pred_final.functionCounts_dmlf
for i in range(1,9):
    in_class = truth == i
    right = int((is_hit & in_class).sum())
    total = int(in_class.sum())
    print(str(i)+'\t'+str(right)+'/'+str(total))

1	2942/3052
2	48/183
3	31/53
4	2/6
5	0/2
6	5/7
7	1/1
8	0/0
