# Task3. Enzyme Commission Number Assignment

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2021-10-21  


## 1. Import packages

In [1]:
import numpy as np
import pandas as pd
import sys
import os
from tqdm import tqdm
sys.path.append("../tools/")
import funclib

from xgboost.sklearn import XGBClassifier


from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import jaccard_score
from sklearn.metrics import hinge_loss

sys.path.append("../")
import benchmark_common as bcommon
import benchmark_train as btrain
import benchmark_test as btest
import config as cfg
import benchmark_evaluation as eva

%load_ext autoreload
%autoreload 2

## 2. Load data

In [2]:
#read train test data
train = pd.read_feather(cfg.DATADIR+'task3/train.feather')
test = pd.read_feather(cfg.DATADIR+'task3/test.feather')
print('train size: {0}\ntest size: {1}'.format(len(train), len(test)))

train size: 222567
test size: 3304


## 3. Gather features

In [3]:
trainf=pd.read_feather(cfg.DATADIR+'train_rep32.feather')
testf=pd.read_feather(cfg.DATADIR+'test_rep32.feather')

train = train.merge(trainf, on='id', how='left')
test = test.merge(testf, on='id', how='left')

## 4. sequence aligment

In [4]:
res_data=funclib.getblast(train,test)

Write finished
Write finished
diamond makedb --in /tmp/train.fasta -d /tmp/train.dmnd


diamond v2.0.8.146 (C) Max Planck Society for the Advancement of Science
Documentation, support and updates available at http://www.diamondsearch.org

#CPU threads: 32
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Database input file: /tmp/train.fasta
Opening the database file...  [0.004s]
Loading sequences...  [0.38s]
Masking sequences...  [0.255s]
Writing sequences...  [0.096s]
Hashing sequences...  [0.027s]
Loading sequences...  [0s]
Writing trailer...  [0.001s]
Closing the input file...  [0.003s]
Closing the database file...  [0.024s]
Database hash = d455115f7ab276d8f0c450236f13a30a
Processed 222567 sequences, 93551643 letters.
Total time = 0.793s


diamond blastp -d /tmp/train.dmnd  -q  /tmp/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1 --quiet


In [5]:
res_data = res_data[['id', 'sseqid']].merge(train, left_on='sseqid',right_on='id', how='left')[['id_x','sseqid','ec_number']]
res_data =res_data.rename(columns={'id_x':'id','sseqid':'id_ref', 'ec_number':'ec_number_pred'})
res_data=res_data.merge(test, on='id', how='left')[['id','ec_number','ec_number_pred']]

In [6]:
res_data[res_data.ec_number==res_data.ec_number_pred]

Unnamed: 0,id,ec_number,ec_number_pred
0,Q5RF96,3.4.-.-,3.4.-.-
1,P9WIA9,3.1.4.3,3.1.4.3
2,H2E7Q7,3.4.21.26,3.4.21.26
3,E2JFG2,3.4.21.26,3.4.21.26
4,H2E7Q8,3.4.21.26,3.4.21.26
...,...,...,...
2753,A7TS67,3.6.1.-,3.6.1.-
2754,A3LNL5,3.6.1.-,3.6.1.-
2755,A0A059TC02,1.2.1.44,1.2.1.44
2759,Q6NRV0,2.3.2.27,2.3.2.27


## 5. Make label

In [None]:
train_set= funclib.split_ecdf_to_single_lines(train)

162000it [19:36:12,  1.30it/s]

In [28]:
#4. 加载EC号训练数据
print('loading ec to label dict')
if os.path.exists(cfg.FILE_EC_LABEL_DICT):
    dict_ec_label = np.load(cfg.FILE_EC_LABEL_DICT, allow_pickle=True).item()
else:
    dict_ec_label = make_ec_label(train_label=train['ec_number'], test_label=test['ec_number'], file_save= cfg.FILE_EC_LABEL_DICT, force_model_update=cfg.UPDATE_MODEL)


loading ec to label dict


In [29]:
dict_ec_label

{'1.-.-.-': 0,
 '1.1.-.-': 1,
 '1.1.1.-': 2,
 '1.1.1.-,4.2.1.107,4.2.1.119': 3,
 '1.1.1.-,4.2.1.119': 4,
 '1.1.1.-,5.1.3.-,4.2.1.76': 5,
 '1.1.1.1': 6,
 '1.1.1.1,1.1.1.284': 7,
 '1.1.1.1,1.1.1.284,1.1.1.73': 8,
 '1.1.1.1,1.1.1.4,1.2.1.3': 9,
 '1.1.1.1,1.2.1.10': 10,
 '1.1.1.1,1.2.1.3': 11,
 '1.1.1.10': 12,
 '1.1.1.10,1.1.1.162': 13,
 '1.1.1.100': 14,
 '1.1.1.100,2.3.1.41,2.3.1.39,2.3.1.38,1.3.1.39,4.2.1.59,3.1.2.14,2.3.1.85': 15,
 '1.1.1.101': 16,
 '1.1.1.102': 17,
 '1.1.1.103': 18,
 '1.1.1.105': 19,
 '1.1.1.105,1.1.1.62,1.1.1.239': 20,
 '1.1.1.107': 21,
 '1.1.1.108': 22,
 '1.1.1.11': 23,
 '1.1.1.116': 24,
 '1.1.1.117': 25,
 '1.1.1.119': 26,
 '1.1.1.12': 27,
 '1.1.1.122': 28,
 '1.1.1.124': 29,
 '1.1.1.127': 30,
 '1.1.1.130': 31,
 '1.1.1.132': 32,
 '1.1.1.133': 33,
 '1.1.1.133,5.1.3.13': 34,
 '1.1.1.135': 35,
 '1.1.1.136': 36,
 '1.1.1.138': 37,
 '1.1.1.14': 38,
 '1.1.1.140': 39,
 '1.1.1.141': 40,
 '1.1.1.144,1.1.1.347': 41,
 '1.1.1.144,1.14.14.52,1.1.1.243,1.14.14.51': 42,
 '1.1.1.145,5

## 4. Slice+esm32

In [26]:
train

Unnamed: 0,id,seq,ec_number,f1,f2,f3,f4,f5,f6,f7,...,f1271,f1272,f1273,f1274,f1275,f1276,f1277,f1278,f1279,f1280
0,P00958,MSFLISFDKSKKHPAHLQLANNLKIALALEYASKNLKPEVDNDNAA...,6.1.1.10,-0.220921,4.081616,-3.814746,2.202727,-6.239662,-4.988444,-5.822192,...,18.058687,-1.100924,1.724075,0.143535,-6.999560,-6.251357,8.360884,-1.662625,-5.621017,1.682485
1,P00812,METGPHYNYYKNRELSIVLAPFSGGQGKLGVEKGPKYMLKHGLQTS...,3.5.3.1,3.521839,4.318994,-1.853766,1.102569,-9.321230,-4.854475,-7.815628,...,8.096443,0.931363,-4.604083,0.339199,-6.082336,-3.915988,7.511534,-2.378523,-3.667611,4.634388
2,P00959,MTQVAKKILVTCALPYANGSIHLGHMLEHIQADVWVRYQRMRGHEV...,6.1.1.10,1.032287,-1.094576,-4.722376,4.625370,-1.973799,-1.705471,-4.638579,...,18.173588,-1.603059,0.301227,-2.721340,-1.693633,-4.528199,5.910798,-2.073927,-7.305595,0.270684
3,P00348,MAFATRQLVRSLSSSSTAAASAKKILVKHVTVIGGGLMGAGIAQVA...,1.1.1.35,-2.420424,-1.460299,-2.409102,1.643003,-1.612438,-1.568280,2.726623,...,15.082927,-0.857677,0.444023,1.980751,-1.351836,-4.664156,1.289754,-2.213044,2.253615,3.059324
4,P00469,MLEQPYLDLAKKVLDEGHFKPDRTHTGTYSIFGHQMRFDLSKGFPL...,2.1.1.45,-3.996948,-6.180151,-4.634508,3.155759,-10.585983,0.715175,-15.944781,...,24.049959,-0.592115,-8.274217,-0.269268,-7.028537,-4.726937,13.501459,-11.718015,0.038843,2.551557
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222562,Q8I6K2,MSNTAVLNDLVALYDRPTEPMFRVKAKKSFKVPKEYVTDRFKNVAV...,1.14.18.-,0.172802,2.350760,4.350393,3.693239,-5.923174,1.420440,-8.875187,...,20.170313,-0.140875,-7.601892,0.804636,-11.607852,-6.285739,4.216749,2.142383,-14.011300,10.606584
222563,O81103,MATAPSPTTMGTYSSLISTNSFSTFLPNKSQLSLSGKSKHYVARRS...,1.10.3.1,0.272997,2.976982,1.862016,5.408012,-5.642054,-5.404109,-11.418259,...,17.778130,-1.399726,-7.715403,-1.014039,-9.040051,-3.262766,2.823239,2.903638,-3.791561,3.065740
222564,Q21221,MSSGAPSGSSMSSTPGSPPPRAGGPNSVSFKDLCCLFCCPPFPSSI...,3.1.2.22,-1.024953,5.833755,1.151545,1.393820,-3.737662,0.271311,-4.385376,...,8.665022,-3.316796,-5.142276,6.789438,-5.080449,-4.224988,7.700522,-4.602151,-2.171469,3.767569
222565,Q6QJ72,MSRLLLPKLFSISRTQVPAASLFNNLYRRHKRFVHWTSKMSTDSVR...,4.2.1.96,-0.132955,0.242595,-2.817183,4.983530,-6.607008,-1.489796,-5.255298,...,11.285470,-1.765559,-4.504529,-3.887650,-9.018026,-5.810025,3.122884,3.802729,10.711333,-4.930251


In [27]:
train.iloc[:,3:]

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f1271,f1272,f1273,f1274,f1275,f1276,f1277,f1278,f1279,f1280
0,-0.220921,4.081616,-3.814746,2.202727,-6.239662,-4.988444,-5.822192,-4.460664,-1.200592,-3.804104,...,18.058687,-1.100924,1.724075,0.143535,-6.999560,-6.251357,8.360884,-1.662625,-5.621017,1.682485
1,3.521839,4.318994,-1.853766,1.102569,-9.321230,-4.854475,-7.815628,-2.012899,-5.534803,-6.105119,...,8.096443,0.931363,-4.604083,0.339199,-6.082336,-3.915988,7.511534,-2.378523,-3.667611,4.634388
2,1.032287,-1.094576,-4.722376,4.625370,-1.973799,-1.705471,-4.638579,-6.950621,-1.872663,-3.980067,...,18.173588,-1.603059,0.301227,-2.721340,-1.693633,-4.528199,5.910798,-2.073927,-7.305595,0.270684
3,-2.420424,-1.460299,-2.409102,1.643003,-1.612438,-1.568280,2.726623,2.233466,-10.240229,-5.307086,...,15.082927,-0.857677,0.444023,1.980751,-1.351836,-4.664156,1.289754,-2.213044,2.253615,3.059324
4,-3.996948,-6.180151,-4.634508,3.155759,-10.585983,0.715175,-15.944781,-7.783895,-6.171251,-8.116997,...,24.049959,-0.592115,-8.274217,-0.269268,-7.028537,-4.726937,13.501459,-11.718015,0.038843,2.551557
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222562,0.172802,2.350760,4.350393,3.693239,-5.923174,1.420440,-8.875187,4.948562,-9.260919,-9.089705,...,20.170313,-0.140875,-7.601892,0.804636,-11.607852,-6.285739,4.216749,2.142383,-14.011300,10.606584
222563,0.272997,2.976982,1.862016,5.408012,-5.642054,-5.404109,-11.418259,0.153906,-5.349937,-5.087845,...,17.778130,-1.399726,-7.715403,-1.014039,-9.040051,-3.262766,2.823239,2.903638,-3.791561,3.065740
222564,-1.024953,5.833755,1.151545,1.393820,-3.737662,0.271311,-4.385376,-0.669128,-8.530610,-3.202958,...,8.665022,-3.316796,-5.142276,6.789438,-5.080449,-4.224988,7.700522,-4.602151,-2.171469,3.767569
222565,-0.132955,0.242595,-2.817183,4.983530,-6.607008,-1.489796,-5.255298,-2.211435,-1.485693,-6.607338,...,11.285470,-1.765559,-4.504529,-3.887650,-9.018026,-5.810025,3.122884,3.802729,10.711333,-4.930251


In [133]:
train_X = train_esm32.iloc[:, 7:]
train_Y = pd.DataFrame(train_esm32['ec_label'])

test_X = test_esm32.iloc[:, 7:]
test_Y = pd.DataFrame(test_esm32['ec_label'])

In [134]:
cfg.FEATURE_NUM = 1280
#train
bcommon.prepare_slice_file(x_data=train_X, y_data=train_Y, x_file=cfg.DATADIR+'slice_train_x_esm32.txt', y_file=cfg.DATADIR+'slice_train_y_esm32.txt', ec_label_dict=dict_ec_label)
#test
bcommon.prepare_slice_file(x_data=test_X, y_data=test_Y, x_file=cfg.DATADIR+'slice_test_x_esm32.txt', y_file=cfg.DATADIR+'slice_test_y_esm32.txt', ec_label_dict=dict_ec_label)

slice files prepared success
slice files prepared success


In [135]:
print('step 6 trainning slice model')
#6. 训练Slice模型
btrain.train_ec_slice(trainX=cfg.DATADIR+'slice_train_x_esm32.txt', trainY=cfg.DATADIR+'slice_train_y_unirep.txt', modelPath=cfg.MODELDIR+'/slice_esm32')
slice_pred = btest.get_slice_res(slice_query_file=cfg.DATADIR+'slice_test_x_esm32.txt', model_path= cfg.MODELDIR+'/slice_esm32',dict_ec_label=dict_ec_label,test_set=test, res_file='/tmp/test.txt')

step 6 trainning slice model
 ./slice_train /home/shizhenkun/codebase/BioUniprot/data/benchmark/data/slice_train_x_esm32.txt /home/shizhenkun/codebase/BioUniprot/data/benchmark/data/slice_train_y_unirep.txt /home/shizhenkun/codebase/BioUniprot/data/benchmark/model/slice_esm32 -m 100 -c 300 -s 300 -k 700 -o 32 -t 32 -C 1 -f 0.000001 -siter 20 -stype 0 -q 0 
train finished
./slice_predict /home/shizhenkun/codebase/BioUniprot/data/benchmark/data/slice_test_x_esm32.txt /home/shizhenkun/codebase/BioUniprot/data/benchmark/model/slice_esm32 /tmp/test.txt -o 32 -b 0 -t 32 -q 0


In [136]:
s1res = test.iloc[:,np.r_[0:5]].merge(slice_pred, on='id', how='left')
s1res[s1res.ec_number==s1res.top0]

Unnamed: 0,id,ec_label,isemzyme,functionCounts,ec_number,top0,top1,top2,top3,top4,...,top10,top11,top12,top13,top14,top15,top16,top17,top18,top19
4852,A0A3B1EFP9,13.0,True,1,1.1.-.-,1.1.-.-,1.1.99.29,4.1.2.10,3.2.1.21,1.11.1.16,...,1.1.99.1,1.4.3.16,1.1.3.4,1.1.5.4,1.1.1.-,1.1.3.20,1.4.3.4,1.11.1.5,1.14.14.-,1.1.5.3
4870,A0A1Y0BRF3,18.0,True,1,1.1.1.-,1.1.1.-,1.3.1.-,1.1.1.1,5.1.3.2,2.4.1.-,...,1.2.1.38,1.1.1.2,1.2.1.44,1.3.1.33,1.1.1.270,1.1.1.170,1.5.1.46,4.2.1.47,1.4.3.3,1.3.1.45
4871,A0A1A9TAK5,18.0,True,1,1.1.1.-,1.1.1.-,1.1.1.1,1.1.1.100,1.1.-.-,3.1.-.-,...,1.1.1.138,1.1.1.300,1.3.1.38,1.1.1.236,2.7.10.2,1.1.1.153,1.1.1.36,1.1.1.141,1.1.1.30,1.1.1.332
4872,Q5M8N4,18.0,True,1,1.1.1.-,1.1.1.-,2.1.1.163,1.4.99.-,2.4.1.227,5.1.3.2,...,1.1.1.133,1.2.1.84,1.1.1.170,1.1.1.271,1.1.1.37,2.7.-.-,1.3.3.4,2.3.1.20,1.17.1.8,1.2.1.-
4873,M1VWN4,18.0,True,1,1.1.1.-,1.1.1.-,2.3.1.-,1.17.1.8,1.1.1.330,1.1.-.-,...,1.6.5.9,1.1.5.3,1.1.1.270,1.18.1.-,2.7.1.-,1.3.1.75,1.5.1.46,3.4.14.5,1.1.1.49,1.2.1.-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7857,C5NN18,5836.0,True,1,6.2.1.-,6.2.1.-,6.2.1.12,6.2.1.3,3.6.4.13,1.6.2.2,...,6.2.1.26,1.13.12.7,2.6.1.-,2.4.1.-,1.6.5.4,3.5.1.4,3.4.14.5,1.1.1.9,6.2.1.41,1.1.1.195
7930,Q8GZ29,5904.0,True,1,6.3.2.-,6.3.2.-,6.2.1.3,6.2.1.-,6.2.1.16,2.8.2.-,...,2.7.7.9,6.2.1.12,4.4.1.14,6.3.2.17,6.2.1.1,6.3.2.7,6.3.5.4,2.1.1.320,6.3.2.20,6.2.1.7
7948,I3R9K1,5955.0,True,1,6.3.3.3,6.3.3.3,1.3.7.7,6.3.4.2,6.3.5.11,2.7.1.-,...,3.6.1.15,2.7.7.2,6.3.2.8,2.7.8.28,6.3.5.9,3.4.19.15,2.3.1.8,1.1.1.47,4.1.1.-,2.7.1.78
7951,G0HV10,5977.0,True,1,6.3.4.2,6.3.4.2,6.3.5.5,6.3.5.2,6.3.5.11,2.3.1.31,...,2.7.7.27,1.20.4.4,1.20.4.-,2.3.1.8,2.3.1.46,3.4.19.9,1.3.99.37,2.2.1.10,1.7.1.16,4.2.1.155
