# Task3. 酶的EC号预测集成模型-Slice

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2021-06-22  

## 任务简介
该任务根据给定的酶序列预测该酶的反应类别（EC号），所构建的模型为集成模型：
1. 首先进行同源比对，对有比对结果的序列直接采用比对得到的EC号
2. 对没有比对结果的序列，使用slice模型的预测结果进行补充
3. 。。。


## 0. 导入必要的包

In [20]:
import numpy as np
import pandas as pd
import random
import sys
import os
from tqdm import tqdm
sys.path.append("../../tools/")
import commontools
import ucTools
import funclib
import time
from sklearn.metrics import precision_score, recall_score, f1_score
sys.path.append("../../")
import train


%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 2. 添加自定义函数

In [2]:
def to_file_matrix(file, ds, col_num,stype='label'):
    """
    Write ``ds`` as a Slice data file.

    The file consists of a header line ``"<number of rows> <col_num>"``
    followed by one row per record, written without index or column header.

    Args:
        file: Destination path; an existing file is overwritten.
        ds: pandas DataFrame to serialise.
        col_num: Second number of the header line (callers in this notebook
            pass the feature dimension or the number of labels).
        stype: ``'label'`` joins the columns with ``':'``;
            ``'feature'`` joins them with a single space.

    Raises:
        ValueError: If ``stype`` is neither ``'label'`` nor ``'feature'``.
    """
    separators = {'label': ':', 'feature': ' '}
    if stype not in separators:
        # Previously this fell through with the separator unbound.
        raise ValueError("stype must be 'label' or 'feature', got {0!r}".format(stype))

    # Header and rows are written through one handle, so no shell tool is
    # needed and the header is present even when ``ds`` has no rows.
    with open(file, 'w', newline='', encoding='utf-8') as handle:
        handle.write('{0} {1}\n'.format(len(ds), col_num))
        ds.to_csv(handle, index=False, header=False, sep=separators[stype])
    
def sort_results(result_slice):
    """
    Order the Slice predictions of every record by descending score.

    Each cell of ``result_slice`` is a ``"<label>:<score>"`` string. For every
    row the cells are split, the scores are converted to float and the pairs
    are ranked from highest to lowest score.

    Args:
        result_slice: DataFrame with one row per record and one
            ``"<label>:<score>"`` string per cell.

    Returns:
        (pred_top, pred_pb_top): two DataFrames with the same row order as
        ``result_slice``. ``pred_top`` holds the label ids (int) in ranked
        order, ``pred_pb_top`` the matching scores (float).
    """
    pred_top = []
    pred_pb_top = []
    for _, row in result_slice.iterrows():
        pairs = [cell.split(':') for cell in row.values]
        labels = np.array([pair[0] for pair in pairs]).astype('int')
        scores = np.array([pair[1] for pair in pairs]).astype('float')
        # Rank on the numeric scores. Ranking the raw strings is lexicographic
        # and misorders values such as '9e-05' or negative scores.
        order = np.argsort(-scores, kind='stable')
        pred_top.append(list(labels[order]))
        pred_pb_top.append(list(scores[order]))
    return pd.DataFrame(pred_top), pd.DataFrame(pred_pb_top)

def cal_precision_k_single(pred_top, test_res):
    """
    Print the Top1..Top5 hit rates of the stand-alone Slice model.

    A record counts as a Top-k hit when its ``ec_label`` equals any of the
    first k ranked predictions.

    Args:
        pred_top: DataFrame of ranked label ids with integer column labels
            0..4 (at least), row-aligned with ``test_res``.
        test_res: DataFrame with an ``ec_label`` column holding the true
            label id of every record.

    Returns:
        None. The rates are printed.
    """
    # The denominator comes from the argument, not from a notebook global.
    records = len(test_res)

    # Put truth and predictions side by side.
    merged = pd.concat([test_res, pred_top], axis=1)

    hits = []
    matched = np.zeros(len(merged), dtype=bool)
    for rank in range(5):
        # Cumulative mask: hit at this rank or at any earlier rank.
        matched = matched | (merged.ec_label == merged[rank]).to_numpy()
        hits.append(int(matched.sum()))

    print('\n\nslice 单独模型 \n')
    print('Top1 \t\t Top2 \t\t Top3 \t\t Top4 \t\t Top5')
    print('{0:.6f} \t {1:.6f} \t {2:.6f} \t {3:.6f} \t {4:.6f}'.format(*[hit / records for hit in hits]))

def cal_precision_k_integrated(pred_top,result_diamond, test_res):
    """
    Print Top1..Top5 hit rates and macro metrics of the Slice + homology ensemble.

    Args:
        pred_top: DataFrame of ranked Slice label ids with integer column
            labels 0..4 (at least), row-aligned with ``test_res``.
        result_diamond: DataFrame with an ``id`` column and a
            ``diamond_lb_pred`` column (label id predicted by homology search).
            Records without a hit are simply absent.
        test_res: DataFrame with ``id`` and ``ec_label`` columns.

    Returns:
        None. The metrics are printed.
    """
    # The denominator comes from the argument, not from a notebook global.
    records = len(test_res)

    # Truth + Slice ranking side by side, then attach the homology prediction.
    # Records without a hit get NaN in diamond_lb_pred.
    merged = pd.concat([test_res, pred_top], axis=1)
    merged = merged.merge(result_diamond, how='left', on='id')

    # Integrated single prediction: homology label when available,
    # otherwise the Slice top-1 label.
    merged['inti'] = merged['diamond_lb_pred'].fillna(merged[0]).astype(int)

    pr_macro = precision_score(merged.ec_label, merged.inti, average='macro', zero_division=1)
    recall_macro = recall_score(merged.ec_label, merged.inti, average='macro', zero_division=1)
    f1_macro = f1_score(merged.ec_label, merged.inti, average='macro', zero_division=1)

    # NOTE(review): Top-k counts a hit when EITHER the homology label OR one of
    # the first k Slice labels is correct, so "Top1" may credit two candidates
    # per record. Kept as in the original; confirm this is the intended metric.
    matched = (merged.ec_label == merged.diamond_lb_pred).to_numpy()
    hits = []
    for rank in range(5):
        matched = matched | (merged.ec_label == merged[rank]).to_numpy()
        hits.append(int(matched.sum()))

    print('\n\nslice + 同源比对集成模型 \n')
    print('Top1 \t\t Top2 \t\t Top3 \t\t Top4 \t\t Top5 \t\t Macro_Precision \t Macro_Recall \t Macro_F1')
    print('{0:.6f} \t {1:.6f} \t {2:.6f} \t {3:.6f} \t {4:.6f} \t {5:.6f} \t\t {6:.6f} \t {7:.6f}'.format(hits[0]/records, hits[1]/records, hits[2]/records, hits[3]/records, hits[4]/records, pr_macro, recall_macro, f1_macro))
    
def get_ec_level_1(eclist):
    """
    Map EC-number strings to zero-based first-level class indices.

    For every item, the text before the first ``'.'`` is parsed as an integer
    and decremented by one (``'3.2.1.1'`` -> ``2``).

    Args:
        eclist: Iterable of EC-number strings.

    Returns:
        List of ints in input order. Items that cannot be parsed (no ``'.'``,
        non-numeric first field, or a non-string entry such as None/NaN)
        yield the sentinel ``-9``.
    """
    ec_l1 = []
    for item in eclist:
        try:
            ec_l1.append(int(item[:item.index('.')]) - 1)
        except (ValueError, TypeError, AttributeError):
            # Only parse failures map to the sentinel; KeyboardInterrupt and
            # other non-parse exceptions now propagate.
            ec_l1.append(-9)
    return ec_l1

## 3. 加载训练与测试数据

In [3]:
# Load the train/test splits and move the stored index into a regular column.
# NOTE(review): `train` rebinds the name bound by `import train` in the import
# cell, so later cells see a DataFrame under that name.
train = pd.read_hdf('./data/train.h5', key='data').reset_index()
test = pd.read_hdf('./data/test.h5', key='data').reset_index()

print('loading success')

loading success


In [6]:
# Load the "1515" dataset (tab-separated file under ./data/ecoli1515/).
sprot1515 = pd.read_csv('./data/ecoli1515/1515testset_with_unirep.tsv',  sep='\t')

In [4]:
# Display the training frame (the rendered output is truncated by pandas).
train

Unnamed: 0,index,id,name,isemzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update,...,f1893,f1894,f1895,f1896,f1897,f1898,f1899,f1900,ec_label,ec_appears
0,0,P00693,AMY1_HORVU,True,False,1,3.2.1.1,4,1986-07-21,1986-07-21,...,0.091042,0.331697,-0.060142,-0.067148,-0.039508,-0.044459,-0.043506,0.125468,2563,118
1,454,P0A7A7,PLSB_ECOLI,True,False,1,2.3.1.15,4,1986-07-21,2007-01-23,...,-0.136475,0.025944,-0.249188,0.007864,-0.028408,0.008682,-0.023703,-0.068251,1379,125
2,455,P00829,ATPB_BOVIN,True,False,1,7.1.2.2,4,1986-07-21,1990-01-01,...,-0.101159,-0.163463,-0.071701,-0.003490,-0.001925,0.013616,-0.024341,-0.052469,2765,1761
3,456,P00826,ATPB_TOBAC,True,False,1,7.1.2.2,4,1986-07-21,1986-07-21,...,-0.065063,-0.243096,-0.028025,-0.001211,-0.007412,0.043579,-0.043802,-0.107109,2765,1761
4,457,P36369,K1B26_MOUSE,True,False,1,3.4.21.35,4,1986-07-21,1994-06-01,...,0.067026,0.217421,-0.589387,0.669929,-0.019626,-0.113081,0.184676,-0.027589,629,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185448,185351,Q7Z1V1,CP51_TRYCC,True,False,1,1.14.14.154,4,2009-11-24,2003-10-01,...,-0.058692,-0.142551,-0.129749,0.361156,-0.004137,0.048520,-0.113003,-0.065445,365,32
185449,185352,O04086,Y1105_ARATH,True,False,1,2.7.11.1,4,2009-11-24,1997-07-01,...,0.015088,-0.029609,-0.751144,-0.013137,0.053330,0.010189,0.065682,0.003802,1577,2358
185450,185353,O82318,SKM1_ARATH,True,False,1,2.7.11.1,4,2009-11-24,1998-11-01,...,0.046748,0.009634,0.408559,0.019050,0.046962,0.031351,0.132357,-0.168701,1577,2358
185451,185346,Q9LT96,Y5977_ARATH,True,False,1,2.7.11.1,4,2009-11-24,2000-10-01,...,0.191198,0.002104,-0.282884,-0.603126,-0.040193,0.004247,0.229937,0.043901,1577,2358


In [5]:
# Display the test frame (the rendered output is truncated by pandas).
test

Unnamed: 0,index,id,name,isemzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update,...,f1893,f1894,f1895,f1896,f1897,f1898,f1899,f1900,ec_label,ec_appears
0,185453,B0VRF9,ACCD_ACIBS,True,False,1,2.1.3.15,4,2009-12-15,2008-04-08,...,0.224951,0.412453,-0.100746,-0.581480,-0.003753,0.020245,-0.029395,0.148873,423,757
1,185837,Q339X2,BGL34_ORYSJ,True,False,1,3.2.1.21,4,2009-12-15,2005-12-06,...,-0.030850,0.062774,-0.194467,0.033042,-0.003844,-0.097847,-0.374020,0.052672,549,189
2,185838,A0LEQ5,NUOK1_SYNFM,True,False,1,7.1.1.-,3,2009-12-15,2006-12-12,...,-0.051995,-0.463824,-0.309717,-0.030179,0.049086,0.025952,-0.012106,-0.026114,462,3618
3,185839,Q0DIT2,BGL19_ORYSJ,True,False,1,3.2.1.21,4,2009-12-15,2006-10-17,...,-0.012665,0.082158,-0.078772,-0.071780,0.005139,0.116421,0.199764,0.044296,549,189
4,185840,Q28FQ5,TYDP2_XENTR,True,False,1,3.1.4.-,3,2009-12-15,2009-12-15,...,-0.134549,0.034979,-0.042452,0.074300,0.081314,0.057384,-0.079496,-0.031218,1852,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18863,204222,Q9RN59,SNOAL_STRNO,True,False,1,5.5.1.26,4,2021-04-07,2000-05-01,...,0.153777,0.043218,-0.184997,-0.191364,-0.001056,-0.002186,0.123507,-0.126100,2343,1
18864,204223,G3F5K2,BKT_PROBT,True,False,1,1.14.99.63,4,2021-04-07,2011-11-16,...,-0.113104,0.014811,-0.414794,-0.077786,-0.067015,0.046054,-0.030360,0.019115,3711,5
18865,204224,P93789,SGT1_SOLTU,True,False,1,2.4.1.-,3,2021-04-07,2006-01-10,...,0.033277,-0.165813,-0.493065,0.255037,-0.003941,0.037680,0.005358,-0.186378,321,811
18866,204226,A0A1Z3GBS4,CYPH3_ISORU,True,False,1,1.14.14.175,4,2021-04-07,2017-09-27,...,-0.053471,-0.044440,-0.855253,0.103256,-0.021321,0.142498,-0.021635,-0.000273,2268,2


In [7]:
# Display the "1515" frame (the rendered output is truncated by pandas).
sprot1515

Unnamed: 0.1,Unnamed: 0,id,ec_number,seq,isemzyme,isMultiFunctional,blattner_id,blattner_ec_number,0,1,...,1890,1891,1892,1893,1894,1895,1896,1897,1898,1899
0,0,P42641,3.6.5.-,MKFVDEASILVVAGDGGNGCVSFRREKYIPKGGPDGGDGGDGGDVW...,True,False,b3183,,0.003056,-0.087184,...,0.000332,-0.065542,0.117107,-0.236777,-0.622398,0.012113,0.010506,0.058984,0.061533,0.968014
1,1,P0AE22,3.1.3.2,MRKITQAISAVCLLFALNSSAVALASSPSPLNPGTNVARLAEQAPI...,True,False,b4055,3.1.3.2,0.001026,0.618393,...,0.030307,-0.244636,-0.038585,0.151263,-0.020389,0.095321,-0.031043,-0.881730,-0.196211,0.054455
2,2,P21499,3.1.13.1,MSQDPFQEREAEKYANPIPSREFILEHLTKREKPASRDELAVELHI...,True,False,b4179,3.1.-.-,0.073632,-0.003258,...,0.002123,0.060351,-0.008826,0.006141,-0.129352,-0.075010,0.023712,-0.002246,0.893310,0.140025
3,3,P39286,3.6.1.-,MSKNKLSKGQQRRVNANHQRRLKTSKEKPDYDDNLFGEPDEGIVIS...,True,False,b4161,3.6.1.-,0.000775,-0.008305,...,0.066885,0.020991,0.073577,-0.034962,-0.036755,0.053803,-0.017355,0.100456,-0.202344,0.351100
4,4,P56100,7.1.1.7,MWYFAWILGTLLACSFGVITALALEHVESGKAGQEDI,True,False,b4515,1.10.3.10,0.001811,-0.050181,...,0.042016,-0.028950,0.305668,-0.105587,-0.113769,0.680413,0.138797,-0.268528,-0.076093,-0.038212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4745,4745,P36837,,MNTTTPMGMLQQPRPFFMIFFVELWERFGYYGVQGVLAVFFVKQLG...,False,False,b3496,,0.000070,-0.021424,...,0.005911,-0.016855,0.377301,-0.500727,-0.247861,0.752678,-0.005501,0.396191,-0.215832,-0.062812
4746,4746,P77304,,MSTANQKPTESVSLNAFKQPKAFYLIFSIELWERFGYYGLQGIMAV...,False,False,b1634,,0.008628,-0.065382,...,-0.001197,-0.101824,0.013007,-0.304450,-0.295525,-0.124492,0.010846,0.032853,-0.155702,-0.183535
4747,4747,P27248,2.1.2.10,MAQQTPLYEQHTLCGARMVDFHGWMMPLHYGSQIDEHHAVRTDAGM...,True,False,b2905,2.1.2.10,0.006723,0.227349,...,-0.124573,-0.032099,-0.170996,0.104425,-0.163041,0.183720,-0.038814,0.036993,-0.044558,0.010447
4748,4748,P0AC88,4.2.1.47,MSKVALITGVTGQDGSYLAEFLLEKGYEVHGIKRRASSFNTERVDH...,True,False,b2053,4.2.1.47,0.000665,-0.029179,...,0.145032,0.867074,-0.263008,0.009123,0.036783,-0.073805,0.439632,0.014501,-0.205913,-0.005663


In [8]:
# Split each frame into its feature block and its label column.
# Column positions 13..1912 are the 1900 feature columns.
train_ds, train_lb = train.iloc[:, 13:1913], train['ec_label']
test_ds, test_lb = test.iloc[:, 13:1913], test['ec_label']

In [9]:
# Split features and labels.
# Repeat of the previous cell for the training split only; the test-split
# assignments are commented out.
train_ds = train.iloc[:,13:1913]
# test_ds = test.iloc[:,13:1913]
train_lb = train.ec_label
# test_lb = test.ec_label

In [10]:
# Display the training feature block.
train_ds

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f1891,f1892,f1893,f1894,f1895,f1896,f1897,f1898,f1899,f1900
0,0.001718,0.191248,0.015037,-0.013877,-0.284253,0.006640,-0.425640,0.011292,-0.006632,0.157273,...,0.074995,-0.098609,0.091042,0.331697,-0.060142,-0.067148,-0.039508,-0.044459,-0.043506,0.125468
1,0.014651,0.041089,0.089012,-0.002785,0.899913,0.007275,-0.036483,-0.007882,-0.001112,-0.308554,...,0.008301,0.082664,-0.136475,0.025944,-0.249188,0.007864,-0.028408,0.008682,-0.023703,-0.068251
2,0.014467,-0.427641,0.059957,-0.011153,0.008796,0.045480,0.175077,-0.018369,-0.005354,0.021986,...,0.006490,0.007475,-0.101159,-0.163463,-0.071701,-0.003490,-0.001925,0.013616,-0.024341,-0.052469
3,-0.039211,-0.262056,0.038029,-0.002485,-0.106830,-0.006537,0.767491,-0.024982,-0.000902,0.003590,...,0.040472,0.184513,-0.065063,-0.243096,-0.028025,-0.001211,-0.007412,0.043579,-0.043802,-0.107109
4,0.000643,0.057234,0.165912,-0.482218,-0.111323,-0.001087,-0.127547,-0.020965,0.056458,0.019106,...,-0.021904,-0.001924,0.067026,0.217421,-0.589387,0.669929,-0.019626,-0.113081,0.184676,-0.027589
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185448,0.001158,-0.308516,-0.115683,-0.573809,-0.377520,-0.003909,-0.439820,-0.008451,-0.021835,0.021223,...,0.057593,0.028667,-0.058692,-0.142551,-0.129749,0.361156,-0.004137,0.048520,-0.113003,-0.065445
185449,0.006551,-0.062314,-0.014859,-0.018300,-0.748150,-0.083225,-0.413944,-0.002918,-0.002820,-0.004568,...,0.055431,-0.093472,0.015088,-0.029609,-0.751144,-0.013137,0.053330,0.010189,0.065682,0.003802
185450,0.000417,-0.015063,0.024288,-0.068539,-0.605343,0.027857,-0.120554,-0.015149,-0.016744,0.070723,...,0.051789,-0.024866,0.046748,0.009634,0.408559,0.019050,0.046962,0.031351,0.132357,-0.168701
185451,0.005470,-0.263507,-0.049972,-0.117916,-0.203359,-0.087754,-0.609994,-0.003667,-0.001052,0.048694,...,0.019009,-0.049262,0.191198,0.002104,-0.282884,-0.603126,-0.040193,0.004247,0.229937,0.043901


In [11]:
# Display the training labels.
train_lb

0         2563
1         1379
2         2765
3         2765
4          629
          ... 
185448     365
185449    1577
185450    1577
185451    1577
185452    1577
Name: ec_label, Length: 185453, dtype: int64

In [13]:
# Number of distinct label ids among the training labels.
len(set(train_lb))

3364

In [20]:


# Number of distinct EC numbers across train and test; used later as the
# label count in the header of the Slice label files.
max_ec = len(set(list(train.ec_number) + list(test.ec_number)))

# Number of decimal places kept for the UniRep features.
unit_thres = 10
print('开始精度采样，精度:{0}'.format(unit_thres))
train_ds = train_ds.round(unit_thres)
test_ds = test_ds.round(unit_thres)


time_start=time.time()
# Training features -> ./data/slice/train.txt (header: "<rows> 1900").
file = r'./data/slice/train.txt'
print('开始写入特征文件：{0}'.format(file))
to_file_matrix(file=file, ds=train_ds, col_num=1900, stype='feature')

# Test features -> ./data/slice/test.txt (no progress message is printed here).
file = r'./data/slice/test.txt'
to_file_matrix(file=file, ds=test_ds, col_num=1900, stype='feature')
time_end=time.time()
print('time cost',time_end-time_start,'s')

开始精度采样，精度:10
开始写入特征文件：./data/slice/train.txt
time cost 309.1047832965851 s


In [22]:
# The data-loading cell rebinds the name `train` to the training DataFrame, so
# `train.get_ec_train_set` fails when the notebook is run top to bottom.
# Resolve the project module explicitly instead of relying on that name.
import importlib
train_module = importlib.import_module('train')

# NOTE(review): the database host is hard-coded; consider moving it to a
# configuration value or environment variable.
uctools =  ucTools.ucTools('172.16.25.20')
cnx_ecnumber = uctools.db_conn()
DATE_TRAIN ='2018-01-01'
file_ec_train_x = '../../data/preprocess/ec_train_x.feather'
file_ec_train_y = '../../data/preprocess/ec_train_y.feather'
# Feature frame (ec_X) and label frame (ec_Y) as returned by the project helper.
ec_X, ec_Y = train_module.get_ec_train_set(train_date=DATE_TRAIN,
                                eclabelfile='../../data/dict_ec_label.npy',
                                featurefile='../../data/sprot_unirep.feather',
                                cnx=cnx_ecnumber,
                                trainX=file_ec_train_x,
                                trainY=file_ec_train_y
                            )

In [26]:
# Display the label column of the ec_Y frame.
ec_Y.ec_label

0         5108
1         5275
2         5275
3         4994
4         5005
          ... 
243972    5311
243973    5275
243974    3672
243975    5124
243976    2409
Name: ec_label, Length: 243977, dtype: int64

In [31]:
# Training features from ec_X -> ./data/slice/trainbbbb.txt (header: "<rows> 1900").
file = r'./data/slice/trainbbbb.txt'
# Number of decimal places kept for the features.
unit_thres = 10
print('开始写入特征文件：{0}'.format(file))
ec_X = ec_X.round(unit_thres)
to_file_matrix(file=file, ds=ec_X, col_num=1900, stype='feature')

开始写入特征文件：./data/slice/trainbbbb.txt


In [39]:
# Training labels from ec_Y -> ./data/slice/train_lbbbb.txt, one "<label>:1" row per record.
# NOTE(review): this rebinds `train_lb`, which other cells use for the labels
# taken from `train`; a cell that reads `train_lb` afterwards sees the ec_Y labels.
file  = r'./data/slice/train_lbbbb.txt'
train_lb = pd.DataFrame(ec_Y.ec_label)
train_lb['indicator'] = 1
# NOTE(review): 5420 is hard-coded; presumably the size of the ec_Y label space — confirm.
to_file_matrix(file=file, ds=train_lb, col_num=5420, stype='label')

In [38]:
# Display the label frame (label id plus constant indicator column).
train_lb

Unnamed: 0,ec_label,indicator
0,5108,1
1,5275,1
2,5275,1
3,4994,1
4,5005,1
...,...,...
243972,5311,1
243973,5275,1
243974,3672,1
243975,5124,1


In [23]:
time_start=time.time()
# Training labels, one "<label>:1" row per record.
# Built directly from `train` instead of from the current value of `train_lb`:
# another cell rebinds `train_lb` to the ec_Y labels, which would no longer
# match the rows of ./data/slice/train.txt.
train_lb = train[['ec_label']].copy()
train_lb['indicator'] = 1

file  = r'./data/slice/train_lb.txt'
to_file_matrix(file=file, ds=train_lb, col_num=max_ec, stype='label')

# Test labels, built from `test` for the same reason.
test_lb = test[['ec_label']].copy()
test_lb['indicator'] = 1

file  = r'./data/slice/test_lb.txt'
to_file_matrix(file=file, ds=test_lb, col_num=max_ec, stype='label')

time_end=time.time()
print('time cost',time_end-time_start,'s')

time cost 0.6416594982147217 s


## 4. 训练模型

In [40]:
# Train Slice on the ec_X / ec_Y files; the option values are echoed in the
# "Parameter Setting" log printed by the tool.
# NOTE(review): ./model is presumably the model output location, and the other
# training cell passes the same path — confirm which model should be kept.
! ./slice_train ./data/slice/trainbbbb.txt ./data/slice/train_lbbbb.txt ./model -m 100 -c 300 -s 300 -k 700 -o 32 -t 32 -C 1 -f 0.000001 -siter 20 -stype 0 -q 0

Parameter Setting
-------------------------------------------------
Number of training examples=243977
Number of features=1900
Number of labels=5420
Number of train/test threads=32
Quiet=0
M=100
efConstruction=300
efSearch=300
Number of nearest neighbors=700
Number of threads for I/O=32
Cost co-efficient for discriminative classifier=1
Threshold for discriminative classifier=1e-06
Maximum number of iterations for the discriminative classifier=20
Separator Type=0
b_gen=0
-------------------------------------------------
Number of labels removed = 529
Training generative model ...
Training time of ANNS datastructure = 1.028082
Finding the most confusing negatives ...
Time taken to find approx nearest neighbors = 28.432863
Training discriminative classifiers ...
Total training time: 959.162 s


In [22]:
# Train Slice on the train.txt / train_lb.txt files; the option values are
# echoed in the "Parameter Setting" log printed by the tool.
# NOTE(review): ./model is presumably the model output location, and the other
# training cell passes the same path — confirm which model should be kept.
! ./slice_train ./data/slice/train.txt ./data/slice/train_lb.txt ./model -m 100 -c 300 -s 300 -k 700 -o 32 -t 32 -C 1 -f 0.000001 -siter 20 -stype 0 -q 0

Parameter Setting
-------------------------------------------------
Number of training examples=185453
Number of features=1900
Number of labels=4767
Number of train/test threads=32
Quiet=0
M=100
efConstruction=300
efSearch=300
Number of nearest neighbors=700
Number of threads for I/O=32
Cost co-efficient for discriminative classifier=1
Threshold for discriminative classifier=1e-06
Maximum number of iterations for the discriminative classifier=20
Separator Type=0
b_gen=0
-------------------------------------------------
Number of labels removed = 1403
Training generative model ...
Training time of ANNS datastructure = 0.699523
Finding the most confusing negatives ...
Time taken to find approx nearest neighbors = 17.476056
Training discriminative classifiers ...
Total training time: 629.28 s


## 5. 测试模型

In [24]:
# Run Slice prediction on the test features with the model at ./model;
# the scores are written to ./results/slice_results.txt.
! ./slice_predict ./data/slice/test.txt ./model ./results/slice_results.txt -o 32 -b 0 -t 32 -q 0

Time taken to find approx nearest neighbors = 1.567295
Total prediction time: 5.17639 s
Prediction time per point: 0.274348 ms


In [29]:
# Score the predictions against the test label file; prints Precision@1..20.
! ./precision_k ./results/slice_results.txt ./data/slice/test_lb.txt 20

score file read ./results/slice_results.txt
lbl file read ./data/slice/test_lb.txt
num_inst=18868 num_lbl=4767
Precision@1 = 0.369197
Precision@2 = 0.219525
Precision@3 = 0.157003
Precision@4 = 0.122840
Precision@5 = 0.100922
Precision@6 = 0.085992
Precision@7 = 0.074866
Precision@8 = 0.066561
Precision@9 = 0.059843
Precision@10 = 0.054431
Precision@11 = 0.049955
Precision@12 = 0.046167
Precision@13 = 0.042926
Precision@14 = 0.040109
Precision@15 = 0.037697
Precision@16 = 0.035526
Precision@17 = 0.033589
Precision@18 = 0.031859
Precision@19 = 0.030321
Precision@20 = 0.028935


In [30]:
# Slice predictions: skip the header line and read space-separated cells.
result_slice = pd.read_csv('./results/slice_results.txt', sep=' ', header=None, skiprows=1)

# Metadata and true label of every test record.
test_res = test[['id', 'name', 'ec_number', 'date_integraged', 'ec_specific_level', 'ec_appears', 'ec_label']]

# Homology search of the test set against the training set (project helper).
result_diamond = funclib.getblast(train, test)

# Attach the EC number of each hit: training id -> EC number.
id_ec_dict = dict(zip(train.id, train.ec_number))
result_diamond['diamond_ec_pred'] = result_diamond.sseqid.apply(lambda hit_id: id_ec_dict.get(hit_id))

# Translate the predicted EC number into its label id. Despite its name,
# `label_ec_dict` maps EC number -> label id (built over train and test).
ec_label = pd.concat([train[['ec_number', 'ec_label']], test[['ec_number', 'ec_label']]], axis=0)
label_ec_dict = dict(zip(ec_label.ec_number, ec_label.ec_label))
result_diamond['diamond_lb_pred'] = result_diamond.diamond_ec_pred.apply(lambda ec: int(label_ec_dict.get(ec)))

# Keep only the columns used by the evaluation.
result_diamond_fed = result_diamond[['id', 'diamond_lb_pred', 'diamond_ec_pred']]

Write finished
Write finished
diamond makedb --in /tmp/train.fasta -d /tmp/train.dmnd
diamond blastp -d /tmp/train.dmnd  -q  /tmp/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1


In [89]:
# Rank the Slice predictions, then print the metrics of the stand-alone Slice
# model and of the Slice + homology ensemble.
pred_top, pred_top_pb = sort_results(result_slice)
cal_precision_k_single(pred_top, test_res)
cal_precision_k_integrated(pred_top,result_diamond_fed, test_res )



slice 单独模型 

Top1 		 Top2 		 Top3 		 Top4 		 Top5
0.369197 	 0.439050 	 0.471009 	 0.491308 	 0.504611


slice + 同源比对集成模型 

Top1 		 Top2 		 Top3 		 Top4 		 Top5 		 Macro_Precision 	 Macro_Recall 	 Macro_F1
0.661649 	 0.676542 	 0.685128 	 0.691912 	 0.696046 	 0.689518 		 0.513355 	 0.281952


In [92]:
# Display the metadata columns at positions 1..9 of the training frame.
train.iloc[:,1:10]

Unnamed: 0,id,name,isemzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update
0,P00693,AMY1_HORVU,True,False,1,3.2.1.1,4,1986-07-21,1986-07-21
1,P0A7A7,PLSB_ECOLI,True,False,1,2.3.1.15,4,1986-07-21,2007-01-23
2,P00829,ATPB_BOVIN,True,False,1,7.1.2.2,4,1986-07-21,1990-01-01
3,P00826,ATPB_TOBAC,True,False,1,7.1.2.2,4,1986-07-21,1986-07-21
4,P36369,K1B26_MOUSE,True,False,1,3.4.21.35,4,1986-07-21,1994-06-01
...,...,...,...,...,...,...,...,...,...
185448,Q7Z1V1,CP51_TRYCC,True,False,1,1.14.14.154,4,2009-11-24,2003-10-01
185449,O04086,Y1105_ARATH,True,False,1,2.7.11.1,4,2009-11-24,1997-07-01
185450,O82318,SKM1_ARATH,True,False,1,2.7.11.1,4,2009-11-24,1998-11-01
185451,Q9LT96,Y5977_ARATH,True,False,1,2.7.11.1,4,2009-11-24,2000-10-01
