# Task3. Enzyme Commission Number Assignment

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2021-10-21  


## 1. Import packages

In [12]:
import numpy as np
import pandas as pd
import sys
import os
from tqdm import tqdm
sys.path.append("../tools/")
import funclib

from xgboost.sklearn import XGBClassifier


from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import jaccard_score
from sklearn.metrics import hinge_loss

sys.path.append("../")
import benchmark_common as bcommon
import benchmark_train as btrain
import benchmark_test as btest
import config as cfg
import benchmark_evaluation as eva

from pandarallel import pandarallel #  import pandaralle
pandarallel.initialize() # init

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 80 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 2. Load data

In [14]:
#read train test data
train = pd.read_feather(cfg.DATADIR+'task3/train.feather')
test = pd.read_feather(cfg.DATADIR+'task3/test.feather')
print('train size: {0}\ntest size: {1}'.format(len(train), len(test)))

train size: 222567
test size: 3304


## 3. Gather features

In [43]:
trainf=pd.read_feather(cfg.DATADIR+'train_rep32.feather')
testf=pd.read_feather(cfg.DATADIR+'test_rep32.feather')


## 4. sequence aligment

In [8]:
res_data=funclib.getblast(train,test)

Write finished
Write finished
diamond makedb --in /tmp/train.fasta -d /tmp/train.dmnd


diamond v2.0.8.146 (C) Max Planck Society for the Advancement of Science
Documentation, support and updates available at http://www.diamondsearch.org

#CPU threads: 80
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Database input file: /tmp/train.fasta
Opening the database file...  [0.005s]
Loading sequences...  [0.473s]
Masking sequences...  [0.175s]
Writing sequences...  [0.097s]
Hashing sequences...  [0.027s]
Loading sequences...  [0s]
Writing trailer...  [0.001s]
Closing the input file...  [0.002s]
Closing the database file...  [0.023s]
Database hash = d455115f7ab276d8f0c450236f13a30a
Processed 222567 sequences, 93551643 letters.
Total time = 0.807s


diamond blastp -d /tmp/train.dmnd  -q  /tmp/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1 --quiet


In [9]:
res_data = res_data[['id', 'sseqid']].merge(train, left_on='sseqid',right_on='id', how='left')[['id_x','sseqid','ec_number']]
res_data =res_data.rename(columns={'id_x':'id','sseqid':'id_ref', 'ec_number':'ec_number_pred'})
res_data=res_data.merge(test, on='id', how='left')[['id','ec_number','ec_number_pred']]

In [10]:
res_data[res_data.ec_number==res_data.ec_number_pred]

Unnamed: 0,id,ec_number,ec_number_pred
0,Q5RF96,3.4.-.-,3.4.-.-
1,P9WIA9,3.1.4.3,3.1.4.3
2,H2E7Q7,3.4.21.26,3.4.21.26
3,E2JFG2,3.4.21.26,3.4.21.26
4,H2E7Q8,3.4.21.26,3.4.21.26
...,...,...,...
2753,A7TS67,3.6.1.-,3.6.1.-
2754,A3LNL5,3.6.1.-,3.6.1.-
2755,A0A059TC02,1.2.1.44,1.2.1.44
2759,Q6NRV0,2.3.2.27,2.3.2.27


## 5. Make label

In [49]:
train_set= funclib.split_ecdf_to_single_lines(train)
test_set=funclib.split_ecdf_to_single_lines(test)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 222567/222567 [04:38<00:00, 798.47it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3304/3304 [00:00<00:00, 127142.10it/s]


In [19]:
#4. 加载EC号训练数据
print('loading ec to label dict')
if os.path.exists(cfg.FILE_EC_LABEL_DICT):
    dict_ec_label = np.load(cfg.FILE_EC_LABEL_DICT, allow_pickle=True).item()
else:
    dict_ec_label = btrain.make_ec_label(train_label=train_set['ec_number'], test_label=test_set['ec_number'], file_save= cfg.FILE_EC_LABEL_DICT, force_model_update=cfg.UPDATE_MODEL)


loading ec to label dict


In [51]:
train_set['ec_label'] = train_set.ec_number.parallel_apply(lambda x: dict_ec_label.get(x))
test_set['ec_label'] = test_set.ec_number.parallel_apply(lambda x: dict_ec_label.get(x))

In [55]:
train_set.ec_label.astype('int')
test_set.ec_label.astype('int')

0       3429
1       3207
2       3537
3        229
4       3537
        ... 
3655    1799
3656    1799
3657    3563
3658    2018
3659    3946
Name: ec_label, Length: 3660, dtype: int64

In [57]:
train_set2 = train_set.copy()
test_set2=test_set.copy()

In [61]:
train_set2 = train_set2.merge(trainf, on='id', how='left')
test_set2 = test_set2.merge(testf, on='id', how='left')

## 6. EC Learning

In [30]:
train_set = train_set.merge(trainf, on='id', how='left')
test_set = test_set.merge(testf, on='id', how='left')

In [70]:
train_X = train_set2.iloc[:, 4:]
train_Y = pd.DataFrame(train_set2['ec_label'])

test_X = test_set2.iloc[:, 4:]
test_Y = pd.DataFrame(test_set2['ec_label'])

In [81]:
cfg.FEATURE_NUM = 1280
#train
bcommon.prepare_slice_file(x_data=train_X, y_data=train_Y, x_file=cfg.DATADIR+'slice_train_x_esm32.txt', y_file=cfg.DATADIR+'slice_train_y_esm32.txt', ec_label_dict=dict_ec_label)
#test
bcommon.prepare_slice_file(x_data=test_X, y_data=test_Y, x_file=cfg.DATADIR+'slice_test_x_esm32.txt', y_file=cfg.DATADIR+'slice_test_y_esm32.txt', ec_label_dict=dict_ec_label)

slice files prepared success
slice files prepared success


In [99]:
print('step 6 trainning slice model')
#6. 训练Slice模型
btrain.train_ec_slice(trainX=cfg.DATADIR+'slice_train_x_esm32.txt', trainY=cfg.DATADIR+'slice_train_y_esm32.txt', modelPath=cfg.MODELDIR+'/slice_esm32')
slice_pred = btest.get_slice_res(slice_query_file=cfg.DATADIR+'slice_test_x_esm32.txt', model_path= cfg.MODELDIR+'/slice_esm32',dict_ec_label=dict_ec_label,test_set=test_set, res_file='/tmp/test.txt')

step 6 trainning slice model
 ./slice_train /home/shizhenkun/codebase/DMLF/data/slice_train_x_esm32.txt /home/shizhenkun/codebase/DMLF/data/slice_train_y_esm32.txt /home/shizhenkun/codebase/DMLF/model/slice_esm32 -m 100 -c 300 -s 300 -k 700 -o 32 -t 32 -C 1 -f 0.000001 -siter 20 -stype 0 -q 0 
Parameter Setting
-------------------------------------------------
Number of training examples=237492
Number of features=1280
Number of labels=5151
Number of train/test threads=32
Quiet=0
M=100
efConstruction=300
efSearch=300
Number of nearest neighbors=700
Number of threads for I/O=32
Cost co-efficient for discriminative classifier=1
Threshold for discriminative classifier=1e-06
Maximum number of iterations for the discriminative classifier=20
Separator Type=0
b_gen=0
-------------------------------------------------
Number of labels removed = 297


Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


Training time of ANNS datastructure = 1.653427


Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


Time taken to find approx nearest neighbors = 26.902141
Training generative model ...
Finding the most confusing negatives ...
Training discriminative classifiers ...
Total training time: 758.635 s
train finished
./slice_predict /home/shizhenkun/codebase/DMLF/data/slice_test_x_esm32.txt /home/shizhenkun/codebase/DMLF/model/slice_esm32 /tmp/test.txt -o 32 -b 0 -t 32 -q 0


Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


Time taken to find approx nearest neighbors = 0.409780
Total prediction time: 1.64823 s
Prediction time per point: 0.450337 ms


## 7. EC prediction

In [101]:
s1res = test_set2.iloc[:,np.r_[0:5]].merge(slice_pred, on='id', how='left')
s1res[s1res.ec_number==s1res.top0]

Unnamed: 0,id,seq,ec_number,ec_label,f1,top0,top1,top2,top3,top4,...,top10,top11,top12,top13,top14,top15,top16,top17,top18,top19
0,Q5RF96,MARGGDTGCTGPSETSASGAVAIAFPGLEGPPADAQYQTLALTVPK...,3.4.-.-,3429,4.739295,3.4.-.-,2.3.1.-,3.4.23.-,3.1.2.-,2.4.99.18,...,2.7.1.82,3.4.21.98,1.1.1.-,2.7.7.41,3.1.1.31,2.7.1.127,3.4.21.32,3.2.2.21,3.4.19.6,2.7.1.-
5,Q6EZC2,MLKILWTYILFLLFISASARAEKPWYFDAIGLTETTMSLTDKNTPV...,3.4.21.-,3508,2.125966,3.4.21.-,2.3.1.-,3.4.21.62,3.4.22.-,3.5.1.-,...,3.1.26.4,3.1.3.1,1.1.1.47,3.4.11.19,3.4.21.75,4.2.2.-,2.7.7.65,3.6.4.13,3.4.21.93,3.4.21.94
15,F4JSV3,MMNGLRRTFWSSIHKKKDNNRVDDSLDRQKPTTTSRFGFFSNPSTP...,2.3.2.27,2018,0.374186,2.3.2.27,2.7.11.1,3.1.3.16,6.3.2.-,2.4.1.-,...,2.4.1.46,2.7.1.159,2.7.1.134,3.6.4.3,4.2.99.22,1.16.1.7,3.6.1.3,3.2.1.2,3.1.3.86,3.1.3.36
20,Q95QG8,MDIKFEGNDAECTAGLKKASEGSFVLKDHVLIEFKINGKVAGKIKT...,3.1.3.16,3124,0.609076,3.1.3.16,2.1.1.-,3.1.-.-,3.4.19.12,3.1.4.4,...,2.3.1.48,2.1.1.221,3.1.11.-,3.1.3.32,3.1.3.91,2.7.1.-,3.1.3.48,2.3.1.12,2.3.1.61,2.5.1.58
25,Q9LRI8,MSAMALSSTMALSLPQSSMSLSHCRHNRITILIPSSSLRRRGGSSI...,1.3.3.4,1184,1.873969,1.3.3.4,1.3.5.6,1.4.99.-,3.1.1.-,1.4.3.4,...,1.3.5.5,1.14.13.-,1.8.3.5,5.5.1.19,1.1.99.2,1.14.15.21,5.5.1.18,1.14.99.44,1.3.99.23,1.8.-.-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4693,Q8I4V8,MTTEQEFEKVELTADGGVIKTILKKGDEGEENIPKKGNEVTVHYVG...,5.2.1.8,4758,0.117628,5.2.1.8,3.1.-.-,2.3.2.27,3.1.11.6,2.5.1.59,...,2.3.1.57,1.13.12.7,3.4.21.42,1.1.1.303,1.3.1.88,3.7.1.19,2.4.1.293,1.14.99.51,3.1.4.2,2.4.1.246
4694,I6X8D2,MADVAESQENAPAERAELTVPEMRQWLRNWVGKAVGKAPDSIDESV...,2.3.1.-,1799,0.366090,2.3.1.-,2.3.1.41,2.3.1.39,3.4.21.-,2.3.1.240,...,2.3.1.165,2.7.1.71,2.3.1.38,1.3.1.39,6.3.2.26,2.3.1.86,1.2.1.95,3.5.1.-,2.3.1.236,2.3.1.252
4702,A0A068BGA5,MASFPPSLVFTVRRKEPILVLPSKPTPRELKQLSDIDDQEGLRFQV...,2.3.1.-,1799,-3.046058,2.3.1.-,2.3.1.20,2.3.1.21,1.14.-.-,6.3.2.-,...,2.3.1.162,2.3.1.75,2.3.1.12,2.3.1.133,2.3.1.214,2.3.1.140,2.1.1.68,3.4.24.64,2.3.1.224,2.3.1.61
4703,A0A2R6Q324,MASFPPSLVFTVRRKEPTLVLPSKPTPRELKQLSDIDDQEGLRFQV...,2.3.1.-,1799,-3.076983,2.3.1.-,2.3.1.20,2.3.1.21,1.14.-.-,6.3.2.-,...,2.3.1.162,2.3.1.75,2.3.1.12,2.3.1.133,2.1.1.68,2.3.1.214,2.3.1.140,3.4.24.64,2.3.1.224,2.3.1.61


In [106]:
prset = pd.read_feather(cfg.DATADIR+'task1/test.feather')
prediction_data = prset.merge(testf, on='id', how='left')
pr_X = prediction_data.iloc[:, 3:]

In [112]:
bcommon.prepare_slice_file_onlyx(x_data=pr_X,  x_file=cfg.DATADIR+'ftest_x.txt')

slice files prepared success


In [113]:



slice_pred = btest.get_slice_res(slice_query_file=cfg.DATADIR+'ftest_x.txt', model_path= cfg.MODELDIR+'/slice_esm32',dict_ec_label=dict_ec_label,test_set=prediction_data, res_file='/tmp/test.txt')




./slice_predict /home/shizhenkun/codebase/DMLF/data/ftest_x.txt /home/shizhenkun/codebase/DMLF/model/slice_esm32 /tmp/test.txt -o 32 -b 0 -t 32 -q 0


Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


Time taken to find approx nearest neighbors = 0.750282
Total prediction time: 2.61021 s
Prediction time per point: 0.367584 ms


In [114]:
slice_pred

Unnamed: 0,id,top0,top1,top2,top3,top4,top5,top6,top7,top8,...,top10,top11,top12,top13,top14,top15,top16,top17,top18,top19
0,P69031,3.4.24.-,3.4.22.-,3.2.1.39,3.1.27.-,2.3.1.251,3.4.21.-,2.3.2.27,3.1.27.5,3.4.24.39,...,3.4.21.6,3.2.1.-,3.2.-.-,3.6.1.15,3.4.17.-,3.1.1.3,3.1.3.-,3.4.23.-,3.1.30.-,3.2.1.6
1,P69019,3.4.24.-,3.1.27.-,3.4.24.39,2.3.1.251,3.4.21.-,3.4.22.-,3.4.21.6,3.1.27.5,3.4.17.-,...,3.1.3.48,3.4.-.-,3.2.1.-,3.1.3.-,3.1.1.3,3.1.3.2,2.1.1.-,2.1.1.86,3.2.-.-,3.1.4.41
2,Q5RF96,3.4.-.-,2.3.1.-,3.4.23.-,3.1.2.-,2.4.99.18,2.4.1.198,4.2.99.22,3.4.22.-,3.1.21.-,...,2.7.1.82,3.4.21.98,1.1.1.-,2.7.7.41,3.1.1.31,2.7.1.127,3.4.21.32,3.2.2.21,3.4.19.6,2.7.1.-
3,Q9UM00,2.3.1.-,3.1.-.-,3.6.4.13,3.4.-.-,3.4.23.-,3.4.24.-,2.4.99.18,2.4.1.198,1.1.1.-,...,1.1.1.34,3.4.21.98,2.1.-.-,3.4.21.91,2.7.7.41,2.5.1.87,2.1.1.6,1.3.1.94,2.3.1.76,2.3.1.17
4,P0C250,3.1.27.-,3.1.27.5,4.1.1.39,3.2.1.14,3.4.21.-,2.3.2.27,1.6.5.-,3.4.22.-,3.2.1.-,...,2.1.1.86,3.4.17.-,3.4.23.-,3.1.1.-,3.6.1.15,3.2.2.22,3.1.3.2,2.4.99.20,2.4.-.-,3.2.-.-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7096,A0A061ACU2,3.6.4.13,3.1.-.-,3.4.23.-,3.4.-.-,2.4.1.198,2.4.1.34,3.4.24.-,2.7.11.1,2.3.1.26,...,2.1.1.17,2.-.-.-,2.3.2.26,3.1.1.5,2.4.1.256,3.6.1.15,2.4.1.258,3.4.22.-,2.3.2.-,4.6.1.1
7097,Q8GYS8,3.1.1.4,3.2.1.17,2.3.2.27,1.11.1.7,2.7.11.-,2.4.1.-,3.1.1.74,3.1.1.-,3.1.1.11,...,3.1.27.-,1.8.-.-,3.4.23.-,1.8.3.2,3.2.1.39,3.2.1.-,1.16.1.9,2.7.1.-,3.2.1.14,3.2.1.15
7098,Q6NRV0,2.3.2.27,3.1.-.-,1.-.-.-,3.6.4.-,2.3.2.-,3.6.4.13,3.1.22.-,2.7.13.3,2.7.-.-,...,3.4.-.-,2.7.7.-,2.7.1.-,3.6.4.3,3.6.5.5,3.1.2.-,3.1.21.4,4.2.1.-,2.7.11.25,2.3.1.168
7099,C5DLH0,3.1.-.-,3.6.1.-,3.1.22.-,4.6.1.16,3.1.13.-,2.3.2.-,2.7.1.158,2.7.1.159,1.13.11.58,...,2.7.1.134,3.5.1.52,3.1.4.1,1.13.11.12,2.1.1.211,2.4.1.198,6.5.1.3,3.1.3.33,6.3.4.11,5.5.1.19


In [122]:
test_set[test_set.id=='Q6NRV0']

Unnamed: 0,id,seq,ec_number,ec_label
3658,Q6NRV0,MPIRAYCTICSDFFDNARDVAAITCGHTFHQECLLQWFHSAPHRTC...,2.3.2.27,2018
