# Task3. 酶的EC号预测集成模型-Slice

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2021-06-22  

## 任务简介
该任务通过给定酶序列，预测该酶的反应类别（EC号），构建的模型为集成模型
1. 首先进行同源比对：对于有比对结果的序列，直接采用比对得到的EC号
2. 对于没有比对结果的序列，使用slice模型的预测结果进行补充
3. 。。。


## 0. 导入必要的包

In [78]:
import numpy as np
import pandas as pd
import random
import sys
import os
from tqdm import tqdm
sys.path.append("../../tools/")
import commontools
import funclib
import time
from sklearn.metrics import precision_score, recall_score, f1_score
%load_ext autoreload


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 2. 添加自定义函数

In [88]:
def to_file_matrix(file, ds, col_num, stype='label'):
    """
    Create a data file in the layout required by slice.

    The file starts with a header line "<number of rows> <col_num>" and is
    followed by the rows of ``ds`` written without index or column names.

    Args:
        file: Path of the output file; an existing file is overwritten.
        ds: pandas DataFrame to dump.
        col_num: Column count recorded in the header line. It is written as
            given and is not derived from ``ds``.
        stype: 'label' separates fields with ':', 'feature' with a space.

    Raises:
        ValueError: If ``stype`` is neither 'label' nor 'feature'.
    """
    separators = {'label': ':', 'feature': ' '}
    seps = separators.get(stype)
    if seps is None:
        # Validate before opening so a bad call never truncates an existing file.
        raise ValueError(
            "stype must be 'label' or 'feature', got {0!r}".format(stype)
        )

    # Write the header first instead of patching the file afterwards with
    # `sed -i`: no shell dependency, and the header is also present when
    # `ds` has no rows. newline='' lets pandas control line endings.
    with open(file, 'w', newline='', encoding='utf-8') as handle:
        handle.write('{0} {1}\n'.format(len(ds), col_num))
        ds.to_csv(handle, index=False, header=False, sep=seps)
    
def sort_results(result_slice):
    """
    Rank the slice predictions of every sample by score, best first.

    Args:
        result_slice: DataFrame in which every cell is a "label:score"
            string; one row per sample.

    Returns:
        A tuple ``(pred_top, pred_pb_top)`` of DataFrames with one row per
        sample: ``pred_top`` holds the integer labels in descending score
        order, ``pred_pb_top`` holds the matching float scores.
    """
    pred_top = []
    pred_pb_top = []
    for _, row in result_slice.iterrows():
        pairs = [cell.split(':') for cell in row.values]
        labels = np.array([pair[0] for pair in pairs]).astype('int')
        scores = np.array([pair[1] for pair in pairs]).astype('float')
        # Sort on the numeric score. Sorting the raw strings would order
        # them lexicographically (e.g. '1e-05' ahead of '0.3').
        # A stable sort keeps the input order for tied scores.
        order = np.argsort(-scores, kind='stable')
        pred_top.append(list(labels[order]))
        pred_pb_top.append(list(scores[order]))
    return pd.DataFrame(pred_top), pd.DataFrame(pred_pb_top)

def cal_precision_k_single(pred_top, test_res):
    """
    Print the top-1 to top-5 hit rate of the slice model on its own.

    A sample counts as a top-k hit when its ``ec_label`` equals any of the
    first k ranked predictions. Each rate is hits divided by the number of
    rows in ``test_res``.

    Args:
        pred_top: DataFrame of ranked predicted labels with integer columns
            0..4 (best first), index-aligned with ``test_res``.
        test_res: DataFrame with an ``ec_label`` column.

    Raises:
        ValueError: If ``test_res`` has no rows.
    """
    # Use the argument for the denominator rather than a notebook global.
    records = len(test_res)
    if records == 0:
        raise ValueError('test_res is empty; cannot compute top-k hit rates')

    # Join labels and predictions side by side
    merged = pd.concat([test_res, pred_top], axis=1)

    # Accumulate the hit mask rank by rank: top-k = top-(k-1) OR rank-k match.
    hit = pd.Series(False, index=merged.index)
    ratios = []
    for rank in range(5):
        hit = hit | (merged.ec_label == merged[rank])
        ratios.append(hit.sum() / records)

    print('\n\nslice 单独模型 \n')
    print('Top1 \t\t Top2 \t\t Top3 \t\t Top4 \t\t Top5')
    print('{0:.6f} \t {1:.6f} \t {2:.6f} \t {3:.6f} \t {4:.6f}'.format(*ratios))

def cal_precision_k_integrated(pred_top, result_diamond, test_res):
    """
    Print metrics for the slice + homology-alignment ensemble.

    The integrated prediction of a sample is its alignment label
    (``diamond_lb_pred``) when one exists, otherwise the top-1 slice label.
    Macro precision, recall and F1 are computed on that single prediction.
    The top-k rates count a sample as a hit when ``ec_label`` equals the
    alignment label or any of the first k slice labels.

    Args:
        pred_top: DataFrame of ranked slice labels with integer columns 0..4
            (best first), index-aligned with ``test_res``.
        result_diamond: DataFrame with ``id`` and ``diamond_lb_pred`` columns;
            left-joined onto ``test_res`` on ``id``.
        test_res: DataFrame with ``id`` and ``ec_label`` columns.

    Raises:
        ValueError: If ``test_res`` has no rows.
    """
    # Use the argument for the denominator rather than a notebook global.
    records = len(test_res)
    if records == 0:
        raise ValueError('test_res is empty; cannot compute top-k hit rates')

    # Join labels, slice predictions and alignment predictions
    merged = pd.concat([test_res, pred_top], axis=1)
    merged = merged.merge(result_diamond, how='left', on='id')

    # Prefer the alignment label; fall back to slice top-1 where the left
    # join produced no alignment hit (NaN).
    diamond = merged['diamond_lb_pred']
    merged['inti'] = diamond.where(diamond.notna(), merged[0]).astype(int)

    pr_macro = precision_score(merged.ec_label, merged.inti, average='macro', zero_division=True)
    recall_macro = recall_score(merged.ec_label, merged.inti, average='macro', zero_division=True)
    f1_macro = f1_score(merged.ec_label, merged.inti, average='macro', zero_division=True)

    # Accumulate the hit mask: start from the alignment match, then OR in
    # each further slice rank.
    hit = merged.ec_label == diamond
    ratios = []
    for rank in range(5):
        hit = hit | (merged.ec_label == merged[rank])
        ratios.append(hit.sum() / records)

    print('\n\nslice + 同源比对集成模型 \n')
    print('Top1 \t\t Top2 \t\t Top3 \t\t Top4 \t\t Top5 \t\t Macro_Precision \t Macro_Recall \t Macro_F1')
    print('{0:.6f} \t {1:.6f} \t {2:.6f} \t {3:.6f} \t {4:.6f} \t {5:.6f} \t\t {6:.6f} \t {7:.6f}'.format(ratios[0], ratios[1], ratios[2], ratios[3], ratios[4], pr_macro, recall_macro, f1_macro))
    
def get_ec_level_1(eclist):
    """
    Extract the zero-based first-level EC class from each EC number.

    Args:
        eclist: Iterable of EC number strings such as '3.2.1.1'.

    Returns:
        A list with one integer per item: the number before the first '.'
        minus one (so '3.2.1.1' gives 2), or -9 when the item has no '.',
        has a non-numeric first field, or is not a string.
    """
    ec_l1 = []
    for item in eclist:
        try:
            ec_l1.append(int(item[:item.index('.')]) - 1)
        except (ValueError, TypeError, AttributeError):
            # ValueError: no '.' or a non-numeric first field.
            # TypeError / AttributeError: the item is not a string (e.g. None, NaN).
            ec_l1.append(-9)
    return ec_l1

## 3. 加载训练与测试数据

In [3]:
# Load the training and test sets; reset_index() moves the stored index
# into a regular column and renumbers the rows from 0.
train = pd.read_hdf('./data/train.h5', key='data').reset_index()
test = pd.read_hdf('./data/test.h5', key='data').reset_index()

print('loading success')

loading success


In [20]:
# Split features and labels
train_ds, test_ds = (frame.iloc[:, 13:1913] for frame in (train, test))
train_lb, test_lb = train.ec_label, test.ec_label

# Number of distinct EC numbers across train and test
max_ec = len(set(train.ec_number) | set(test.ec_number))

# Number of decimals kept for the unirep features
unit_thres = 10
print('开始精度采样，精度:{0}'.format(unit_thres))
train_ds, test_ds = train_ds.round(unit_thres), test_ds.round(unit_thres)

time_start = time.time()

# Training features
file = r'./data/slice/train.txt'
print('开始写入特征文件：{0}'.format(file))
to_file_matrix(file=file, ds=train_ds, col_num=1900, stype='feature')

# Test features
file = r'./data/slice/test.txt'
to_file_matrix(file=file, ds=test_ds, col_num=1900, stype='feature')

time_end = time.time()
print('time cost', time_end - time_start, 's')

开始精度采样，精度:10
开始写入特征文件：./data/slice/train.txt
time cost 309.1047832965851 s


In [27]:
# Display the training labels as the cell output
train_lb

Unnamed: 0,ec_label,indicator
0,2563,1
1,1379,1
2,2765,1
3,2765,1
4,629,1
...,...,...
185448,365,1
185449,1577,1
185450,1577,1
185451,1577,1


In [23]:
time_start = time.time()

# Training labels: pair every label with a constant indicator of 1
train_lb = pd.DataFrame(train_lb).assign(indicator=1)
file = r'./data/slice/train_lb.txt'
to_file_matrix(file=file, ds=train_lb, col_num=max_ec, stype='label')

# Test labels, same layout
test_lb = pd.DataFrame(test_lb).assign(indicator=1)
file = r'./data/slice/test_lb.txt'
to_file_matrix(file=file, ds=test_lb, col_num=max_ec, stype='label')

time_end = time.time()
print('time cost', time_end - time_start, 's')

time cost 0.6416594982147217 s


## 4. 训练模型

In [22]:
# Train the slice model on the training feature and label files.
# NOTE(review): ./model is presumably the model output directory, and the flag
# meanings below are inferred from the parameter summary the tool prints
# (-c/-s and -o/-t share values, so their mapping is a guess); confirm against
# the slice documentation:
#   -m M, -c efConstruction, -s efSearch, -k number of nearest neighbors,
#   -o I/O threads, -t train/test threads, -C classifier cost coefficient,
#   -f classifier threshold, -siter max classifier iterations,
#   -stype separator type, -q quiet
! ./slice_train ./data/slice/train.txt ./data/slice/train_lb.txt ./model -m 100 -c 300 -s 300 -k 700 -o 32 -t 32 -C 1 -f 0.000001 -siter 20 -stype 0 -q 0

Parameter Setting
-------------------------------------------------
Number of training examples=185453
Number of features=1900
Number of labels=4767
Number of train/test threads=32
Quiet=0
M=100
efConstruction=300
efSearch=300
Number of nearest neighbors=700
Number of threads for I/O=32
Cost co-efficient for discriminative classifier=1
Threshold for discriminative classifier=1e-06
Maximum number of iterations for the discriminative classifier=20
Separator Type=0
b_gen=0
-------------------------------------------------
Number of labels removed = 1403
Training generative model ...
Training time of ANNS datastructure = 0.699523
Finding the most confusing negatives ...
Time taken to find approx nearest neighbors = 17.476056
Training discriminative classifiers ...
Total training time: 629.28 s


## 5. 测试模型

In [24]:
# Run slice prediction on the test features with the model in ./model;
# the scores go to ./results/slice_results.txt, which a later cell reads back.
! ./slice_predict ./data/slice/test.txt ./model ./results/slice_results.txt -o 32 -b 0 -t 32 -q 0

Time taken to find approx nearest neighbors = 1.567295
Total prediction time: 5.17639 s
Prediction time per point: 0.274348 ms


In [29]:
# Report Precision@1 through Precision@20 of the slice scores against the test labels
! ./precision_k ./results/slice_results.txt ./data/slice/test_lb.txt 20

score file read ./results/slice_results.txt
lbl file read ./data/slice/test_lb.txt
num_inst=18868 num_lbl=4767
Precision@1 = 0.369197
Precision@2 = 0.219525
Precision@3 = 0.157003
Precision@4 = 0.122840
Precision@5 = 0.100922
Precision@6 = 0.085992
Precision@7 = 0.074866
Precision@8 = 0.066561
Precision@9 = 0.059843
Precision@10 = 0.054431
Precision@11 = 0.049955
Precision@12 = 0.046167
Precision@13 = 0.042926
Precision@14 = 0.040109
Precision@15 = 0.037697
Precision@16 = 0.035526
Precision@17 = 0.033589
Precision@18 = 0.031859
Precision@19 = 0.030321
Precision@20 = 0.028935


In [30]:
# Slice scores: skip the first line of the file and split cells on spaces
result_slice = pd.read_csv('./results/slice_results.txt', sep=' ', skiprows=1, header=None)
test_res = test[['id', 'name', 'ec_number', 'date_integraged','ec_specific_level','ec_appears', 'ec_label']]

# Get the alignment results of test against train
result_diamond = funclib.getblast(train, test)

# Attach the EC number of the matched training sequence to each alignment hit
id_ec_dict = dict(zip(train.id, train.ec_number))
result_diamond['diamond_ec_pred'] = result_diamond.sseqid.apply(id_ec_dict.get)

# Translate the predicted EC number into its integer label.
# Despite its name, label_ec_dict maps EC number -> label.
ec_label = pd.concat([train[['ec_number', 'ec_label']], test[['ec_number', 'ec_label']]])
label_ec_dict = dict(zip(ec_label.ec_number, ec_label.ec_label))
result_diamond['diamond_lb_pred'] = result_diamond.diamond_ec_pred.apply(lambda ec: int(label_ec_dict.get(ec)))

result_diamond_fed = result_diamond[['id', 'diamond_lb_pred', 'diamond_ec_pred']]

Write finished
Write finished
diamond makedb --in /tmp/train.fasta -d /tmp/train.dmnd
diamond blastp -d /tmp/train.dmnd  -q  /tmp/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1


In [89]:
# Rank the slice predictions, then report the hit rates of slice alone
# and of the slice + alignment ensemble
pred_top, pred_top_pb = sort_results(result_slice)
cal_precision_k_single(pred_top=pred_top, test_res=test_res)
cal_precision_k_integrated(pred_top=pred_top, result_diamond=result_diamond_fed, test_res=test_res)



slice 单独模型 

Top1 		 Top2 		 Top3 		 Top4 		 Top5
0.369197 	 0.439050 	 0.471009 	 0.491308 	 0.504611


slice + 同源比对集成模型 

Top1 		 Top2 		 Top3 		 Top4 		 Top5 		 Macro_Precision 	 Macro_Recall 	 Macro_F1
0.661649 	 0.676542 	 0.685128 	 0.691912 	 0.696046 	 0.689518 		 0.513355 	 0.281952


In [92]:
# Display columns 1 through 9 of the training frame as the cell output
train.iloc[:,1:10]

Unnamed: 0,id,name,isemzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update
0,P00693,AMY1_HORVU,True,False,1,3.2.1.1,4,1986-07-21,1986-07-21
1,P0A7A7,PLSB_ECOLI,True,False,1,2.3.1.15,4,1986-07-21,2007-01-23
2,P00829,ATPB_BOVIN,True,False,1,7.1.2.2,4,1986-07-21,1990-01-01
3,P00826,ATPB_TOBAC,True,False,1,7.1.2.2,4,1986-07-21,1986-07-21
4,P36369,K1B26_MOUSE,True,False,1,3.4.21.35,4,1986-07-21,1994-06-01
...,...,...,...,...,...,...,...,...,...
185448,Q7Z1V1,CP51_TRYCC,True,False,1,1.14.14.154,4,2009-11-24,2003-10-01
185449,O04086,Y1105_ARATH,True,False,1,2.7.11.1,4,2009-11-24,1997-07-01
185450,O82318,SKM1_ARATH,True,False,1,2.7.11.1,4,2009-11-24,1998-11-01
185451,Q9LT96,Y5977_ARATH,True,False,1,2.7.11.1,4,2009-11-24,2000-10-01
