# Task3. Enzyme Commission Number Assignment

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2021-10-21  


## 1. Import packages

In [1]:
import numpy as np
import pandas as pd
import sys
import os
from tqdm import tqdm
sys.path.append("../tools/")
import funclib
import time
from xgboost.sklearn import XGBClassifier


from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import jaccard_score
from sklearn.metrics import hinge_loss

sys.path.append("../")
import benchmark_common as bcommon
import benchmark_train as btrain
import benchmark_test as btest
import config as cfg
import benchmark_evaluation as eva

from pandarallel import pandarallel # provides the `parallel_apply` used later on pandas Series
pandarallel.initialize() # initialise pandarallel (the worker count is reported in the cell output)

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 80 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 2. Load data

In [2]:
# Load the Task 3 train and test splits from feather files under cfg.DATADIR,
# then report how many rows each split contains.
train = pd.read_feather(f'{cfg.DATADIR}task3/train.feather')
test = pd.read_feather(f'{cfg.DATADIR}task3/test.feather')
print(f'train size: {len(train)}\ntest size: {len(test)}')

train size: 222567
test size: 3304


## 3. Gather features

In [3]:
# Load the precomputed feature tables (`*_rep32.feather`) for both splits;
# they are joined onto the labelled rows by `id` in later cells.
trainf, testf = (
    pd.read_feather(f'{cfg.DATADIR}{split}_rep32.feather')
    for split in ('train', 'test')
)

## 4. Sequence alignment

In [12]:
# Align every test sequence against the training set. Per the commands echoed in
# the cell output, this builds a DIAMOND database from the training sequences and
# runs `diamond blastp` with `-k 1` for the test sequences.
res_data=funclib.getblast(train,test)

Write finished
Write finished
diamond makedb --in /tmp/train.fasta -d /tmp/train.dmnd


diamond v2.0.8.146 (C) Max Planck Society for the Advancement of Science
Documentation, support and updates available at http://www.diamondsearch.org

#CPU threads: 80
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Database input file: /tmp/train.fasta
Opening the database file...  [0.005s]
Loading sequences...  [0.49s]
Masking sequences...  [0.199s]
Writing sequences...  [0.677s]
Hashing sequences...  [0.031s]
Loading sequences...  [0s]
Writing trailer...  [0.032s]
Closing the input file...  [0.003s]
Closing the database file...  [0.214s]
Database hash = d455115f7ab276d8f0c450236f13a30a
Processed 222567 sequences, 93551643 letters.
Total time = 1.655s


diamond blastp -d /tmp/train.dmnd  -q  /tmp/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1 --quiet


In [13]:
# Turn the raw alignment hits into a prediction table:
#   1. look up the EC number(s) of the matched training sequence (`sseqid`),
#   2. call that transferred annotation `ec_number_pred`,
#   3. attach the ground-truth `ec_number` of the test sequence.
# NOTE: this overwrites `res_data`, so the cell cannot be re-run without first
# re-running the alignment cell (the `sseqid` column is gone afterwards).
res_data = (
    res_data[['id', 'sseqid']]
    .merge(train, left_on='sseqid', right_on='id', how='left')
    .loc[:, ['id_x', 'sseqid', 'ec_number']]
    .rename(columns={'id_x': 'id', 'sseqid': 'id_ref', 'ec_number': 'ec_number_pred'})
    .merge(test, on='id', how='left')
    .loc[:, ['id', 'ec_number', 'ec_number_pred']]
)

In [14]:
# Preview the alignment-based predictions: test id, true EC number(s), and the
# EC number(s) transferred from the matched training sequence.
res_data

Unnamed: 0,id,ec_number,ec_number_pred
0,Q5RF96,3.4.-.-,3.4.-.-
1,P9WIA9,3.1.4.3,3.1.4.3
2,H2E7Q7,3.4.21.26,3.4.21.26
3,E2JFG2,3.4.21.26,3.4.21.26
4,H2E7Q8,3.4.21.26,3.4.21.26
...,...,...,...
2756,Q753P9,"3.6.1.-,3.1.13.-",3.6.1.-
2757,A0A068BGA5,2.3.1.-,"2.3.1.232,2.3.1.196"
2758,A0A2R6Q324,2.3.1.-,"2.3.1.232,2.3.1.196"
2759,Q6NRV0,2.3.2.27,2.3.2.27


In [17]:
# Rows where the transferred EC annotation differs from the ground truth.
# Multi-label annotations are comma-joined strings whose label order is not
# canonical, so they are compared as sets; a raw string comparison would report
# identical label sets written in a different order as mismatches.
# Missing values are left as-is (na_action='ignore') and still count as mismatches.
ec_true = res_data.ec_number.str.split(',').map(frozenset, na_action='ignore')
ec_pred = res_data.ec_number_pred.str.split(',').map(frozenset, na_action='ignore')
res_data[ec_true != ec_pred]

Unnamed: 0,id,ec_number,ec_number_pred
6,L8EUQ6,"1.14.13.38,1.14.13.234",1.14.13.38
7,Q4VKV0,1.2.99.10,1.2.1.3
9,Q65YW9,1.14.14.35,1.14.14.5
13,Q845S9,1.5.1.42,1.5.1.38
14,Q65YX0,1.5.1.42,1.5.1.38
...,...,...,...
2749,B8PD44,"4.2.3.125,4.2.3.127,4.2.3.126,4.2.3.133,4.2.3.128","4.2.3.129,4.2.3.91,4.2.3.127,4.2.3.128"
2752,I6X8D2,2.3.1.-,2.3.1.41
2756,Q753P9,"3.6.1.-,3.1.13.-",3.6.1.-
2757,A0A068BGA5,2.3.1.-,"2.3.1.232,2.3.1.196"


In [60]:
# Test rows whose EC annotation contains no '-' placeholder, i.e. every listed
# EC number is given at all four levels.
test.loc[~test['ec_number'].str.contains('-')]

Unnamed: 0,id,seq,ec_number
1,P9WIA9,DAGVSWKVYRNKTLGPISSVLTYGSLVTSFKQSADPRSDLVRFGVA...,3.1.4.3
2,H2E7Q7,MARTPWLPNAYPPARRSDHVDIYKSALRGDVRVQDPYQWLEEYTDE...,3.4.21.26
3,A0A0D4BSN8,MKLDDKRILIIGAGEVGTAVAEDLVNRSDPTEIIIHTSRQQTMDMR...,1.1.1.397
4,E2JFG2,MPPTPWAPHSYPPTRRSDHVDVYQSASRGEVPVPDPYQWLEENSNE...,3.4.21.26
6,A0A0D4BS77,MTRTDFAQSAVASIFTGAIASHAAVLADDLGLFDALAKGKLRNRDL...,2.1.1.47
...,...,...,...
3290,B8PD44,MPSTPRQFVLPDLFPLVPFKGSTNPHYVKAAAESSAWINSYNVFTD...,"4.2.3.128,4.2.3.126,4.2.3.133,4.2.3.125,4.2.3.127"
3292,Q8I4V8,MTTEQEFEKVELTADGGVIKTILKKGDEGEENIPKKGNEVTVHYVG...,5.2.1.8
3296,A0A059TC02,MRSVSGQVVCVTGAGGFIASWLVKILLEKGYTVRGTVRNPDDPKNG...,1.2.1.44
3301,A0A509AKI1,MVLLNGKLKYIAVVAIFYNLIILLVKEKFPYICTKKKFHAISNRIL...,3.4.21.62


In [67]:
# Show the predictions with one row per test id (first occurrence kept).
# Display only: the result is not assigned, so `res_data` itself is unchanged.
res_data.drop_duplicates(subset=['id'])

Unnamed: 0,id,ec_number,ec_number_pred
0,Q5RF96,3.4.-.-,3.4.-.-
1,P9WIA9,3.1.4.3,3.1.4.3
2,H2E7Q7,3.4.21.26,3.4.21.26
3,E2JFG2,3.4.21.26,3.4.21.26
4,H2E7Q8,3.4.21.26,3.4.21.26
...,...,...,...
2756,Q753P9,"3.6.1.-,3.1.13.-",3.6.1.-
2757,A0A068BGA5,2.3.1.-,"2.3.1.232,2.3.1.196"
2758,A0A2R6Q324,2.3.1.-,"2.3.1.232,2.3.1.196"
2759,Q6NRV0,2.3.2.27,2.3.2.27


In [55]:
# Mispredicted rows among test sequences whose true annotation has no '-'
# placeholder. Multi-label annotations are comma-joined strings whose label order
# is not canonical, so truth and prediction are compared as sets; a raw string
# comparison would report identical label sets in a different order as mismatches.
# Missing values are left as-is (na_action='ignore') and still count as mismatches.
ec_true = res_data.ec_number.str.split(',').map(frozenset, na_action='ignore')
ec_pred = res_data.ec_number_pred.str.split(',').map(frozenset, na_action='ignore')
res_data[(~res_data.ec_number.str.contains('-')) & (ec_true != ec_pred)]

Unnamed: 0,id,ec_number,ec_number_pred
6,L8EUQ6,"1.14.13.234,1.14.13.38",1.14.13.38
7,Q4VKV0,1.2.99.10,1.2.1.3
9,Q65YW9,1.14.14.35,1.14.14.5
13,Q845S9,1.5.1.42,1.5.1.38
14,Q65YX0,1.5.1.42,1.5.1.38
...,...,...,...
2734,Q6E593,"2.3.1.196,2.3.1.224",2.3.1.196
2740,E9KBR8,1.14.19.76,1.14.13.87
2746,A0A348B779,"4.2.3.126,4.2.3.133,4.2.3.125,4.2.3.127","4.2.3.126,4.2.3.125,4.2.3.23"
2748,A0A348B788,4.2.3.7,4.2.3.135


In [69]:
# Rows where the transferred EC annotation differs from the ground truth.
# Multi-label annotations are comma-joined strings whose label order is not
# canonical, so they are compared as sets; a raw string comparison would report
# identical label sets written in a different order as mismatches.
# Missing values are left as-is (na_action='ignore') and still count as mismatches.
ec_true = res_data.ec_number.str.split(',').map(frozenset, na_action='ignore')
ec_pred = res_data.ec_number_pred.str.split(',').map(frozenset, na_action='ignore')
res_data[ec_true != ec_pred]

Unnamed: 0,id,ec_number,ec_number_pred
6,L8EUQ6,"1.14.13.234,1.14.13.38",1.14.13.38
7,Q4VKV0,1.2.99.10,1.2.1.3
9,Q65YW9,1.14.14.35,1.14.14.5
13,Q845S9,1.5.1.42,1.5.1.38
14,Q65YX0,1.5.1.42,1.5.1.38
...,...,...,...
2749,B8PD44,"4.2.3.128,4.2.3.126,4.2.3.133,4.2.3.125,4.2.3.127","4.2.3.91,4.2.3.128,4.2.3.127,4.2.3.129"
2752,I6X8D2,2.3.1.-,2.3.1.41
2756,Q753P9,"3.6.1.-,3.1.13.-",3.6.1.-
2757,A0A068BGA5,2.3.1.-,"2.3.1.232,2.3.1.196"


In [54]:
# Keep rendered DataFrames compact: show at most 10 columns and 20 rows.
pd.options.display.max_columns = 10
pd.options.display.max_rows = 20

In [56]:
# Rows where the transferred EC annotation differs from the ground truth.
# Multi-label annotations are comma-joined strings whose label order is not
# canonical, so they are compared as sets; a raw string comparison would report
# identical label sets written in a different order as mismatches.
# Missing values are left as-is (na_action='ignore') and still count as mismatches.
ec_true = res_data.ec_number.str.split(',').map(frozenset, na_action='ignore')
ec_pred = res_data.ec_number_pred.str.split(',').map(frozenset, na_action='ignore')
res_data[ec_true != ec_pred]

Unnamed: 0,id,ec_number,ec_number_pred
6,L8EUQ6,"1.14.13.234,1.14.13.38",1.14.13.38
7,Q4VKV0,1.2.99.10,1.2.1.3
9,Q65YW9,1.14.14.35,1.14.14.5
13,Q845S9,1.5.1.42,1.5.1.38
14,Q65YX0,1.5.1.42,1.5.1.38
...,...,...,...
2749,B8PD44,"4.2.3.128,4.2.3.126,4.2.3.133,4.2.3.125,4.2.3.127","4.2.3.91,4.2.3.128,4.2.3.127,4.2.3.129"
2752,I6X8D2,2.3.1.-,2.3.1.41
2756,Q753P9,"3.6.1.-,3.1.13.-",3.6.1.-
2757,A0A068BGA5,2.3.1.-,"2.3.1.232,2.3.1.196"


## 5. Make label

In [7]:
# Run both splits through funclib.split_ecdf_to_single_lines (train first, then
# test). Presumably this yields one row per (id, EC number) pair: the test split
# grows from 3304 to 3660 rows in the outputs below. TODO confirm against funclib.
train_set, test_set = (
    funclib.split_ecdf_to_single_lines(frame) for frame in (train, test)
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 222567/222567 [04:40<00:00, 794.01it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3304/3304 [00:00<00:00, 202312.19it/s]


In [8]:
# 4. Load the EC-number training data: the mapping from EC number to label.
# It is built with btrain.make_ec_label when no saved dictionary exists,
# otherwise it is read back from cfg.FILE_EC_LABEL_DICT.
# NOTE(review): when the file exists it is always reused, so cfg.UPDATE_MODEL is
# never consulted on that path. Confirm this is intended.
print('loading ec to label dict')
if not os.path.exists(cfg.FILE_EC_LABEL_DICT):
    dict_ec_label = btrain.make_ec_label(
        train_label=train_set['ec_number'],
        test_label=test_set['ec_number'],
        file_save=cfg.FILE_EC_LABEL_DICT,
        force_model_update=cfg.UPDATE_MODEL,
    )
else:
    # allow_pickle=True unpickles arbitrary objects: only load files you trust.
    # .item() unwraps the 0-d object array back into the stored Python object.
    dict_ec_label = np.load(cfg.FILE_EC_LABEL_DICT, allow_pickle=True).item()


loading ec to label dict


In [9]:
# Add an `ec_label` column to both splits by looking each EC number up in
# dict_ec_label (in parallel via pandarallel). EC numbers that are missing from
# the dictionary map to None.
for labelled in (train_set, test_set):
    labelled['ec_label'] = labelled['ec_number'].parallel_apply(
        lambda ec: dict_ec_label.get(ec)
    )

In [10]:
# Cast the label columns to integers. `Series.astype` returns a new Series rather
# than converting in place, so the result has to be assigned back; without the
# assignment the cast is computed and discarded and the columns keep their dtype.
train_set['ec_label'] = train_set.ec_label.astype('int')
test_set['ec_label'] = test_set.ec_label.astype('int')

# Display the converted test labels as a sanity check.
test_set.ec_label

0       3429
1       3207
2       3537
3        229
4       3537
        ... 
3655    1799
3656    1799
3657    3563
3658    2018
3659    3946
Name: ec_label, Length: 3660, dtype: int64

In [11]:
# Work on independent copies so the feature merge below does not touch
# `train_set` / `test_set`.
train_set2, test_set2 = train_set.copy(), test_set.copy()

In [12]:
# Attach the precomputed feature columns to each labelled row by `id`
# (left join: every labelled row is kept even if it has no features).
# NOTE: re-running this cell merges the features a second time.
train_set2 = pd.merge(train_set2, trainf, on='id', how='left')
test_set2 = pd.merge(test_set2, testf, on='id', how='left')

## 6. EC Learning

In [13]:
# Attach the feature columns to `train_set` / `test_set` as well (left join on
# `id`). NOTE(review): the cells visible after this one use `train_set2` /
# `test_set2` instead, and re-running this cell merges the features again.
train_set = pd.merge(train_set, trainf, on='id', how='left')
test_set = pd.merge(test_set, testf, on='id', how='left')

In [14]:
# Split each merged frame into model inputs and targets.
# The first four columns are id, seq, ec_number and ec_label (see the column
# header of the `s1res` display further down), so the features start at position 4.
train_X = train_set2.iloc[:, 4:]
test_X = test_set2.iloc[:, 4:]

# Targets are kept as single-column DataFrames rather than Series.
train_Y = train_set2['ec_label'].to_frame()
test_Y = test_set2['ec_label'].to_frame()

## 7. EC prediction

In [17]:
# Join the first five columns of the test frame (id, seq, ec_number, ec_label, f1)
# with the ranked predictions, then show the rows whose top-ranked prediction
# (`top0`) equals the true EC number.
# NOTE(review): `slice_pred` is not defined in any cell visible here; it must
# already exist in the kernel for this cell to run.
s1res = pd.merge(test_set2.iloc[:, :5], slice_pred, on='id', how='left')
s1res.loc[s1res['ec_number'] == s1res['top0']]

Unnamed: 0,id,seq,ec_number,ec_label,f1,top0,top1,top2,top3,top4,...,top10,top11,top12,top13,top14,top15,top16,top17,top18,top19
0,Q5RF96,MARGGDTGCTGPSETSASGAVAIAFPGLEGPPADAQYQTLALTVPK...,3.4.-.-,3429,4.739295,3.4.-.-,2.3.1.-,3.4.23.-,3.1.2.-,2.4.99.18,...,2.7.7.41,3.1.3.4,3.4.21.32,2.4.1.198,3.2.1.14,1.1.1.-,2.7.1.127,3.2.2.21,3.1.1.31,3.4.19.6
1,P9WIA9,DAGVSWKVYRNKTLGPISSVLTYGSLVTSFKQSADPRSDLVRFGVA...,3.1.4.3,3207,1.053501,3.1.4.3,3.1.6.-,3.6.1.-,3.1.3.2,2.7.8.20,...,3.1.13.4,3.2.-.-,3.4.24.-,3.1.6.14,2.-.-.-,3.2.1.78,3.2.1.28,2.7.7.27,3.1.1.72,3.2.1.99
5,Q6EZC2,MLKILWTYILFLLFISASARAEKPWYFDAIGLTETTMSLTDKNTPV...,3.4.21.-,3508,2.125966,3.4.21.-,2.3.1.-,3.4.21.62,3.4.22.-,3.5.1.-,...,3.6.1.15,1.1.1.47,3.1.3.1,3.6.4.13,3.4.11.19,3.4.21.75,2.7.7.65,3.4.21.94,3.4.21.93,3.4.21.74
15,F4JSV3,MMNGLRRTFWSSIHKKKDNNRVDDSLDRQKPTTTSRFGFFSNPSTP...,2.3.2.27,2018,0.374186,2.3.2.27,2.7.11.1,3.1.3.16,6.3.2.-,2.4.1.-,...,2.4.1.46,3.6.4.3,2.7.1.134,2.7.1.159,1.16.1.7,4.2.99.22,3.6.1.3,3.2.1.2,3.1.3.36,3.1.3.86
20,Q95QG8,MDIKFEGNDAECTAGLKKASEGSFVLKDHVLIEFKINGKVAGKIKT...,3.1.3.16,3124,0.609076,3.1.3.16,2.1.1.-,3.1.-.-,3.4.19.12,3.1.13.-,...,3.1.3.77,3.1.11.-,2.1.1.221,3.1.3.48,3.1.3.91,3.1.3.32,2.3.1.12,2.3.1.168,2.7.1.-,2.3.1.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4693,Q8I4V8,MTTEQEFEKVELTADGGVIKTILKKGDEGEENIPKKGNEVTVHYVG...,5.2.1.8,4758,0.117628,5.2.1.8,3.1.-.-,2.3.2.27,3.1.11.6,2.5.1.59,...,2.3.1.57,1.13.12.7,3.6.4.3,1.5.3.7,1.1.1.303,3.4.21.42,3.7.1.19,2.1.1.287,6.2.1.12,3.1.4.2
4694,I6X8D2,MADVAESQENAPAERAELTVPEMRQWLRNWVGKAVGKAPDSIDESV...,2.3.1.-,1799,0.366090,2.3.1.-,2.3.1.41,3.4.21.-,2.3.1.39,2.3.1.239,...,2.3.1.165,2.3.1.38,1.3.1.39,2.7.1.71,6.3.2.26,2.7.11.1,2.3.1.86,3.5.1.-,1.2.1.95,2.3.1.236
4702,A0A068BGA5,MASFPPSLVFTVRRKEPILVLPSKPTPRELKQLSDIDDQEGLRFQV...,2.3.1.-,1799,-3.046058,2.3.1.-,2.3.1.21,2.3.1.20,1.14.-.-,2.3.1.196,...,2.3.1.162,2.3.1.64,2.3.1.28,2.3.1.140,2.3.1.224,2.3.1.75,2.3.1.12,2.3.1.137,2.3.1.160,2.3.1.61
4703,A0A2R6Q324,MASFPPSLVFTVRRKEPTLVLPSKPTPRELKQLSDIDDQEGLRFQV...,2.3.1.-,1799,-3.076983,2.3.1.-,2.3.1.21,2.3.1.20,1.14.-.-,2.3.1.196,...,2.3.1.162,2.3.1.64,2.3.1.28,2.3.1.140,2.3.1.75,2.3.1.224,2.3.1.137,2.3.1.12,2.1.1.68,2.3.1.160


In [18]:
# Load the task1 test split and attach the precomputed test features by `id`.
# NOTE(review): this notebook covers Task 3 but the prediction set is read from
# `task1/`. Confirm that this is intended.
prset = pd.read_feather(cfg.DATADIR+'task1/test.feather')
prediction_data = prset.merge(testf, on='id', how='left')
# Keep the columns from position 3 onward as the model input; presumably these are
# the feature columns. TODO confirm the column layout of task1/test.feather.
pr_X = prediction_data.iloc[:, 3:]