##  EC预测结果分析
### 1. 导入必要的包

In [1]:
import pandas as pd
import numpy as np
import config as cfg
from tkinter import _flatten
import tools.funclib as funclib
import benchmark_evaluation as eva

### 2. 读入训练测试数据

In [2]:
# Training split for task 3, read from the Feather file configured in `cfg`.
data_task3_train = pd.read_feather(cfg.FILE_TASK3_TRAIN)

# The two test sets (the config constants are suffixed 2020 and 2022).
task3_test_20 = pd.read_feather(cfg.FILE_TASK3_TEST_2020)
task3_test_22 = pd.read_feather(cfg.FILE_TASK3_TEST_2022)

# Every individual EC number that occurs in the training labels. One label
# may hold several comma-separated EC numbers, so each distinct label is
# split and the pieces are flattened. Pieces are stripped so that they
# compare equal to the stripped values checked in isecpredictable().
# A tuple is kept because that is what the previous flattening produced.
ec_task3_train = tuple(
    ec.strip()
    for label in set(data_task3_train.ec_number)
    for ec in label.split(',')
)

### 3. 定义操作函数

In [3]:
def cacl_t20_t22(seqlist_20, seqlist_22, seq):
    """Build a tag telling which test set(s) contain *seq*.

    Args:
        seqlist_20: container of sequences tagged ``'_20'``.
        seqlist_22: container of sequences tagged ``'_22'``.
        seq: the sequence to look up.

    Returns:
        ``'_20'``, ``'_22'``, ``'_20_22'`` or ``''`` when *seq* is in
        neither container.
    """
    tagged_pools = (('_20', seqlist_20), ('_22', seqlist_22))
    return ''.join(tag for tag, pool in tagged_pools if seq in pool)

def isecpredictable(ec_str, ecset):
    """Return True when every EC number in *ec_str* is present in *ecset*.

    Args:
        ec_str: one or more EC numbers separated by commas; whitespace
            around each number is ignored.
        ecset: the known EC numbers (a set; any iterable is accepted).

    Returns:
        bool: True only if all EC numbers listed in *ec_str* are known.
    """
    wanted = {item.strip() for item in ec_str.split(',')}
    # Subset test instead of comparing lengths: a label that repeats the
    # same EC number must not be reported as unpredictable.
    return wanted.issubset(ecset)

def make_rank_array(rank_str):
    """Strip list brackets and single quotes from a ranking string.

    Despite the name, the result is still a string: every ``[``, ``]``
    and ``'`` character is removed and everything else is kept as is.

    Args:
        rank_str: text form of a ranked prediction list.

    Returns:
        str: *rank_str* without square brackets and single quotes.
    """
    unwanted_chars = str.maketrans('', '', "[]'")
    return rank_str.translate(unwanted_chars)

def is_ec_pred_correct(ground, predict):
    """Decide whether a predicted EC number counts as correct.

    A prediction is correct when it equals the ground truth, or when the
    ground truth is incomplete (``-`` at some level) and the prediction
    fills that level while agreeing on all levels before it.

    Strings that do not have exactly four dot-separated parts (for
    example a comma-separated list of EC numbers, or ``'-'``) can only
    match through exact equality.

    Args:
        ground: ground-truth EC number, e.g. ``'3.1.3.-'``.
        predict: predicted EC number, e.g. ``'3.1.3.5'``.

    Returns:
        bool: True if the prediction is accepted, False otherwise.
    """
    if ground == predict:
        return True

    truth_parts = ground.split('.')
    if len(truth_parts) != 4:
        return False
    guess_parts = predict.split('.')
    if len(guess_parts) != 4:
        return False

    # Check the 4th level first, then the 3rd, then the 2nd: the truth
    # must be open ('-') there, the prediction must not be, and the
    # levels in front of it must be the same.
    for level in (3, 2, 1):
        truth_is_open = truth_parts[level] == '-'
        guess_is_filled = guess_parts[level] != '-'
        same_prefix = '_'.join(truth_parts[:level]) == '_'.join(guess_parts[:level])
        if truth_is_open and guess_is_filled and same_prefix:
            return True
    return False


### 4. 定义输出表格

In [4]:
# Union of both test sets: all rows of the 2022 set, followed by the rows
# of the 2020 set whose sequence does not already occur in the 2022 set.
test_res = pd.concat([task3_test_22, task3_test_20[~task3_test_20.seq.isin(task3_test_22.seq)]], axis=0).reset_index(drop=True)

# Build the lookup sets once; rebuilding them (or scanning the raw arrays)
# for every row makes this cell needlessly quadratic.
seqs_test_20 = set(task3_test_20.seq)
seqs_test_22 = set(task3_test_22.seq)
train_ec_set = set(ec_task3_train)

# '_20', '_22' or '_20_22' depending on which test set(s) hold the sequence.
test_res['ds_in'] = test_res.seq.apply(lambda x: cacl_t20_t22(seqs_test_20, seqs_test_22, x))
# True when every EC number of the row occurs in the training labels.
test_res['ec_predictable'] = test_res.ec_number.apply(lambda x: isecpredictable(x, train_ec_set))
test_res

Unnamed: 0,id,seq,ec_number,ds_in,ec_predictable
0,P9WG69,MTTSVIVAGARTPIGKLMGSLKDFSASELGAIAIKGALEKANVPAS...,2.3.1.9,_22,True
1,P9WI52,MAERAPRGEVAVMVAVQSALVDRPGMLATARGLSHFGEHCIGWLIL...,3.1.3.-,_22,True
2,P9WQ38,MRALHVPAGSATALLLPALQRVLGGSDPALVAVPTQHESLLGALRV...,6.2.1.26,_22,True
3,P9WIA9,DAGVSWKVYRNKTLGPISSVLTYGSLVTSFKQSADPRSDLVRFGVA...,3.1.4.3,_20_22,True
4,D8GR70,MASYLTLFISAVVVNNYVLTRFLGLCIFFGVSKNLNASVGMGMAVT...,7.1.1.-,_20_22,False
...,...,...,...,...,...
5114,F1LXF1,MQMERKKSQQSAGQGLGEAPRPHYRGRSSESSCGLDGDYEDAELNP...,2.7.11.1,_20,True
5115,A0A0S4IJL0,MASSVLSSAAVATRSNVAQANMVAPFTGLKSAASFPVSRKQNLDIT...,4.1.1.39,_20,True
5116,W6QRN8,MRVEAGGDMRDKLMWIRLYILGNVGQTFGDMKRYIGMWSGMLFPIS...,1.-.-.-,_20,True
5117,C0HLR0,MRLTQGAFSYLPDLTDAQII,4.1.1.39,_20,True


### 5. 序列比对结果

In [5]:
# Align the test sequences against the training sequences. According to the
# recorded cell output this runs `diamond blastp ... -k 1`.
diamond_task3 = funclib.getblast(
    train=data_task3_train[['id', 'seq']],
    test=test_res[['id', 'seq']],
)

# Attach the EC number of the matched training protein (`sseqid`) to each
# hit; after the merge `id_x` is the query id coming from the hit table.
res_task3_diamond = diamond_task3[['id', 'sseqid', 'pident']].merge(
    data_task3_train[['id', 'ec_number']],
    how='left',
    left_on='sseqid',
    right_on='id',
)
res_task3_diamond = res_task3_diamond[['id_x', 'ec_number', 'pident']]
res_task3_diamond = res_task3_diamond.rename(columns={'id_x': 'id', 'ec_number': 'ec_diamond'})

# Start the integrated table: ground truth plus the alignment-based EC.
f_merge_res = (
    test_res[['id', 'ec_number', 'ds_in', 'ec_predictable']]
    .merge(res_task3_diamond, on='id', how='left')
    .drop_duplicates()
    .reset_index(drop=True)
)
# Proteins without a hit get the placeholder '-'.
f_merge_res['ec_diamond'] = f_merge_res['ec_diamond'].fillna('-')

Write finished
Write finished
diamond makedb --in /tmp/train.fasta -d /tmp/train.dmnd --quiet
diamond blastp -d /tmp/train.dmnd  -q  /tmp/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1 --quiet


### 6. ECPred 结果

In [6]:
# Load the tab-separated ECPred output and keep only protein id and EC.
res_ecpred = (
    pd.read_csv(f'{cfg.RESULTSDIR}ecpred/ecpred.txt', sep='\t', header=0)
    .rename(columns={'Protein ID': 'id', 'EC Number': 'ec_ecpred'})
    [['id', 'ec_ecpred']]
)

# Both "no Prediction" and "non Enzyme" are mapped to the placeholder '-'.
res_ecpred['ec_ecpred'] = np.where(
    (res_ecpred['ec_ecpred'] == 'no Prediction') | (res_ecpred['ec_ecpred'] == 'non Enzyme'),
    '-',
    res_ecpred['ec_ecpred'],
)

# Add the ECPred column to the integrated table; absent proteins get '-'.
f_merge_res = f_merge_res.merge(res_ecpred, on='id', how='left')
f_merge_res['ec_ecpred'] = f_merge_res['ec_ecpred'].fillna('-')


### 7. DeepEC 结果

In [7]:
# Load the tab-separated DeepEC output; the file's own header row is
# replaced by the two column names given here.
res_deepec = pd.read_csv(cfg.FILE_DEEPEC_RESULTS, sep='\t', names=['id', 'ec_number'], header=0)
# Drop the literal "EC:" prefix from every prediction.
res_deepec['ec_number'] = res_deepec['ec_number'].str.replace('EC:', '', regex=False)
res_deepec.columns = ['id', 'ec_deepec']

# One row per protein: several predictions for the same id are joined into
# a single comma-separated string (rows without a prediction are skipped).
res_deepec = (
    res_deepec.dropna(subset=['ec_deepec'])
    .groupby('id', as_index=False)['ec_deepec']
    .agg(','.join)
)

# Add the DeepEC column to the integrated table; absent proteins get '-'.
f_merge_res = f_merge_res.merge(res_deepec, how='left', on='id')
f_merge_res['ec_deepec'] = f_merge_res['ec_deepec'].fillna('-')

### 8. CatFam 结果

In [8]:
# Load the tab-separated, header-less CatFam output.
res_catfam = pd.read_csv(cfg.RESULTSDIR+'catfam/catfam.txt', sep='\t', names=['id', 'ec_catfam'])

# One row per protein: the predictions of an id are joined into a single
# comma-separated string. Missing predictions are ignored while joining, and
# an id with no prediction at all gets the placeholder '-'.
res_catfam = (
    res_catfam.groupby('id', as_index=False)['ec_catfam']
    .agg(lambda ecs: ','.join(ecs.dropna()) or '-')
)

# Add the CatFam column to the integrated table; absent proteins get '-'.
f_merge_res = f_merge_res.merge(res_catfam, on='id', how='left')
f_merge_res['ec_catfam'] = f_merge_res['ec_catfam'].fillna('-')

### 9. PRIAM 结果

In [9]:
# Parse the PRIAM annotation file with the project's loader. The merge below
# relies on the result having the columns `id` and `ec_priam`.
priam_annotation_file = cfg.RESULTSDIR + 'priam/PRIAM_20221011103347/ANNOTATION/sequenceECs.txt'
res_priam = eva.load_praim_res(resfile=priam_annotation_file)

# Add the PRIAM column to the integrated table; absent proteins get '-'.
f_merge_res = f_merge_res.merge(res_priam, how='left', on='id')
f_merge_res['ec_priam'] = f_merge_res['ec_priam'].fillna('-')

### 10. DMLF-ours

In [10]:
# Load our own predictions: the top prediction (`pred`) and the ranked
# candidate list (`listpred_20`) per protein id.
res_dmlf = pd.read_csv(cfg.RESULTSDIR+'case2210/case_5122_results_pred.csv', sep=',', index_col=0)[['id', 'pred', 'listpred_20']]
res_dmlf = res_dmlf.rename(columns={'pred': 'ec_ours'})
# `pred` is stored as text; keep what precedes the first comma and remove
# '(' and single quotes (presumably a stringified (ec, score) tuple - verify
# against the file).
res_dmlf['ec_ours'] = res_dmlf.ec_ours.apply(lambda x: x.split(',')[0].replace('(', '').replace('\'', ''))
res_dmlf['listpred_20'] = res_dmlf.listpred_20.apply(make_rank_array)

f_merge_res = f_merge_res.merge(res_dmlf, on='id', how='left')
# Same convention as for the other tools: a protein without a prediction
# gets '-', so the later correctness check never receives NaN.
f_merge_res['ec_ours'] = f_merge_res['ec_ours'].fillna('-')
# True when the ground-truth label contains no '-' placeholder.
f_merge_res['is_full_level_ec'] = f_merge_res.ec_number.apply(lambda x: '-' not in x)
f_merge_res = f_merge_res.drop_duplicates().reset_index(drop=True)

### 11. 正确计算

In [11]:
# One boolean column per baseline tool: is that tool's EC accepted for the
# ground-truth EC of the row?
for tool in ('diamond', 'ecpred', 'deepec', 'catfam', 'priam'):
    pred_col = f'ec_{tool}'
    f_merge_res[f'right_ec_{tool}'] = f_merge_res.apply(
        lambda row: is_ec_pred_correct(ground=row['ec_number'], predict=row[pred_col]),
        axis=1,
    )

# Our final answer: take the alignment-based EC whenever the hit identity
# is above 20, otherwise keep our own prediction.
f_merge_res['ec_ours'] = f_merge_res.apply(
    lambda row: row['ec_diamond'] if row['pident'] > 20 else row['ec_ours'],
    axis=1,
)
f_merge_res['right_ec_ours'] = f_merge_res.apply(
    lambda row: is_ec_pred_correct(ground=row['ec_number'], predict=row['ec_ours']),
    axis=1,
)
f_merge_res.head(3)

In [14]:
# Write the integrated comparison table to Excel without the row index
# (`index` is documented as a bool, so pass False rather than None).
f_merge_res.to_excel(cfg.RESULTSDIR+'case2210/case_5121_integrated_results.xlsx', index=False)