##  EC预测结果分析
### 1. 导入必要的包

In [1]:
import gzip
import urllib
import urllib.error
import urllib.request
from io import StringIO
from tkinter import _flatten

import numpy as np
import pandas as pd
from retrying import retry
from tqdm import tqdm

import config as cfg
import tools.funclib as funclib
import benchmark_evaluation as eva

### 2. 读入训练测试数据

In [2]:
# Load the training table and the two test tables from the feather files
# configured in cfg.
data_task3_train = pd.read_feather(cfg.FILE_TASK3_TRAIN)

task3_test_20 = pd.read_feather(cfg.FILE_TASK3_TEST_2020)
task3_test_22 = pd.read_feather(cfg.FILE_TASK3_TEST_2022)

# Every individual EC number that occurs in the training labels. A label may
# hold several comma-separated EC numbers, so each distinct label string is
# split and the pieces are collected into one flat tuple.
# This is done with a plain generator instead of the private tkinter helper
# `_flatten`, and each piece is stripped so that it compares equal to the
# stripped tokens `isecpredictable` builds from the test labels.
ec_task3_train = tuple(
    ec.strip()
    for ec_label in set(data_task3_train.ec_number)
    for ec in ec_label.split(',')
)

### 3. 定义操作函数

In [3]:
def cacl_t20_t22(seqlist_20, seqlist_22, seq):
    """Return a tag naming the collections that contain ``seq``.

    Args:
        seqlist_20: container checked first; contributes the suffix '_20'.
        seqlist_22: container checked second; contributes the suffix '_22'.
        seq: the value looked up with ``in``.

    Returns:
        str: '', '_20', '_22' or '_20_22'.
    """
    tagged_pools = (('_20', seqlist_20), ('_22', seqlist_22))
    return ''.join(tag for tag, pool in tagged_pools if seq in pool)

def isecpredictable(ec_str, ecset):
    """Tell whether every EC number in ``ec_str`` is present in ``ecset``.

    Args:
        ec_str (str): one or more EC numbers separated by commas; whitespace
            around each number is ignored.
        ecset: collection of known EC numbers (a set in the existing callers;
            any iterable is accepted).

    Returns:
        bool: True only if all EC numbers of ``ec_str`` occur in ``ecset``.
    """
    # Compare as sets: counting the intersection against the raw token list
    # would wrongly return False when the label repeats the same EC number.
    wanted = {item.strip() for item in ec_str.split(',')}
    return wanted.issubset(ecset)

def make_rank_array(rank_str):
    """Remove every '[', ']' and single-quote character from ``rank_str``.

    Args:
        rank_str (str): text form of a ranked prediction list.

    Returns:
        str: the same text without square brackets and single quotes.
    """
    unwanted = str.maketrans('', '', "[]'")
    return rank_str.translate(unwanted)

def is_ec_pred_correct(ground, predict):
    """Decide whether ``predict`` counts as a correct call for ``ground``.

    A prediction is correct when the two strings are identical. Otherwise both
    must be four-part dotted EC numbers, and the prediction is accepted when
    the ground truth is open ('-') at some level from the 4th down to the 2nd,
    the prediction is filled in at that level, and all levels before it agree.

    Args:
        ground (str): ground-truth EC string.
        predict (str): predicted EC string.

    Returns:
        bool: True if the prediction is accepted, False otherwise.
    """
    if ground == predict:
        return True

    truth = ground.split('.')
    if len(truth) != 4:
        return False
    # predict is only split once ground is known to have four parts.
    guess = predict.split('.')
    if len(guess) != 4:
        return False

    # Walk from the most specific level (index 3) to the least (index 1).
    for level in (3, 2, 1):
        truth_is_open = truth[level] == '-'
        guess_is_filled = guess[level] != '-'
        same_prefix = '_'.join(truth[0:level]) == '_'.join(guess[0:level])
        if truth_is_open and guess_is_filled and same_prefix:
            return True
    return False

@retry(stop_max_attempt_number=10, wait_random_min=10, wait_random_max=20)
def _download_uniprot_tsv(url):
    """Download ``url`` and return the response body as text.

    Exceptions are deliberately not caught here, so the ``retry`` decorator
    can issue the request again when it fails.
    """
    req = urllib.request.Request(url)
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(req) as response:
        payload = response.read()
        if response.info().get('Content-Encoding') == 'gzip':
            payload = gzip.decompress(payload)
    # Decode exactly once, with the same codec for both the gzip and the
    # plain branch.
    return payload.decode('utf-8')


def grab_Uniprot_info_by_ids(uniprot_id=None, cache=False, got_return=True):
    """Fetch UniProtKB annotation for a group of accessions via the REST API.

    Args:
        uniprot_id: iterable of UniProt accessions; a single string is
            treated as one accession. ``None`` or an empty iterable means
            "nothing to fetch".
        cache: unused; kept so existing calls keep working.
        got_return: unused; kept so existing calls keep working.

    Returns:
        pandas.DataFrame: the TSV response parsed into a frame (its columns
        are whatever the ``fields`` list in the URL yields). An empty frame
        is returned when there is nothing to fetch, when the response body
        is empty, or when the request still fails with ``URLError`` after
        the retries.
    """
    if uniprot_id is None:
        accessions = []
    elif isinstance(uniprot_id, str):
        accessions = [uniprot_id]
    else:
        accessions = [str(acc) for acc in uniprot_id]

    # An empty query would only produce a failing HTTP request.
    if not accessions:
        return pd.DataFrame()

    # NOTE(review): only the first accession is prefixed with "accession:";
    # the others are OR-ed as bare terms. Confirm against the UniProt query
    # syntax that this cannot match unrelated entries.
    str_uniprotids = '%20OR%20'.join(accessions)
    url = f'https://rest.uniprot.org/uniprotkb/search?query=accession:{str_uniprotids}&format=tsv&fields=accession,id,protein_name,gene_names,organism_name,annotation_score,lit_pubmed_id'

    try:
        data = _download_uniprot_tsv(url)
    except urllib.error.URLError as e:
        # Best effort, as before: report the problem and carry on.
        print(e.reason)
        return pd.DataFrame()

    if not data.strip():
        return pd.DataFrame()
    return pd.read_csv(StringIO(data), sep='\t')
        
def grab_Uniprot_info_by_ids_inbatch(uniprot_list, batch_size=5):
    """Fetch UniProt annotation for many accessions, ``batch_size`` at a time.

    Args:
        uniprot_list: sequence of accessions (list or numpy array); it must
            support ``len`` and slicing.
        batch_size (int): number of accessions sent per request; must be >= 1.

    Returns:
        pandas.DataFrame: all batch results stacked row-wise with a fresh
        0..n-1 index; an empty frame when nothing was requested or returned.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    total = len(uniprot_list)
    if total == 0:
        return pd.DataFrame()

    frames = []
    # Stepping by batch_size never yields an empty trailing slice, so no
    # request is sent with an empty query when total is a multiple of
    # batch_size.
    for start in tqdm(range(0, total, batch_size)):
        batch_df = grab_Uniprot_info_by_ids(
            uniprot_id=uniprot_list[start:start + batch_size],
            cache=False,
            got_return=True,
        )
        # A failed batch may come back as None or as an empty frame.
        if batch_df is not None and not batch_df.empty:
            frames.append(batch_df)

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames, axis=0).reset_index(drop=True)

### 4. 定义输出表格

In [4]:
# Combined test table: every row of the 2022 test set, followed by the 2020
# rows whose sequence does not already occur in the 2022 set.
only_in_test_20 = task3_test_20[~task3_test_20.seq.isin(task3_test_22.seq)]
test_res = pd.concat([task3_test_22, only_in_test_20], axis=0).reset_index(drop=True)

# Build the lookup sets once. Membership tests on a set are constant time,
# whereas `in` on a numpy array scans the whole array and the original code
# also rebuilt set(ec_task3_train) for every single row.
seqs_test_20 = set(task3_test_20.seq.values)
seqs_test_22 = set(task3_test_22.seq.values)
ecs_seen_in_train = set(ec_task3_train)

test_res['ds_in'] = test_res.seq.apply(lambda x: cacl_t20_t22(seqs_test_20, seqs_test_22, x))
test_res['ec_predictable'] = test_res.ec_number.apply(lambda x: isecpredictable(x, ecs_seen_in_train))
test_res['seq_length'] = test_res.seq.apply(len)


### 5. 序列比对结果

In [5]:
# Align the test sequences against the training sequences.
diamond_task3 = funclib.getblast(
    train=data_task3_train[['id', 'seq']],
    test=test_res[['id', 'seq']],
)

# Look up the EC number of each hit (sseqid) in the training table, then keep
# only the query id, the transferred EC number and the identity column.
res_task3_diamond = (
    diamond_task3[['id', 'sseqid', 'pident']]
    .merge(
        data_task3_train[['id', 'ec_number']],
        how='left',
        left_on='sseqid',
        right_on='id',
    )
)
res_task3_diamond = (
    res_task3_diamond[['id_x', 'ec_number', 'pident']]
    .rename(columns={'id_x': 'id', 'ec_number': 'ec_diamond'})
)

# Start the integrated result table; queries without a hit get '-'.
f_merge_res = (
    test_res[['id', 'ec_number', 'seq_length', 'ds_in', 'ec_predictable']]
    .merge(res_task3_diamond, on='id', how='left')
    .drop_duplicates()
    .reset_index(drop=True)
)
f_merge_res['ec_diamond'] = f_merge_res['ec_diamond'].fillna('-')

Write finished
Write finished
diamond makedb --in /tmp/train.fasta -d /tmp/train.dmnd --quiet
diamond blastp -d /tmp/train.dmnd  -q  /tmp/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1 --quiet


### 6. ECPred 结果

In [6]:
# Read the ECPred output and keep only the protein id and the predicted EC.
res_ecpred = pd.read_csv(f'{cfg.RESULTSDIR}ecpred/ecpred.txt', sep='\t', header=0)
res_ecpred = res_ecpred.rename(columns={'Protein ID': 'id', 'EC Number': 'ec_ecpred'})
res_ecpred = res_ecpred[['id', 'ec_ecpred']]

# Both textual "no result" markers are mapped to '-'.
res_ecpred['ec_ecpred'] = np.where(
    res_ecpred['ec_ecpred'].isin(['no Prediction', 'non Enzyme']),
    '-',
    res_ecpred['ec_ecpred'],
)

# Proteins missing from the ECPred output also end up as '-'.
f_merge_res = f_merge_res.merge(res_ecpred, how='left', on='id')
f_merge_res['ec_ecpred'] = f_merge_res['ec_ecpred'].fillna('-')


### 7. DeepEC 结果

In [7]:
# Read the DeepEC output (first line is a header and is replaced by our names)
# and remove the 'EC:' prefix from every prediction.
res_deepec = pd.read_csv(cfg.FILE_DEEPEC_RESULTS, sep='\t', names=['id', 'ec_deepec'], header=0)
res_deepec['ec_deepec'] = res_deepec['ec_deepec'].str.replace('EC:', '', regex=False)

# One row per protein: several predictions for the same id are joined with
# commas. Rows without a prediction are dropped first; those proteins receive
# '-' after the left merge below. A single groupby/agg replaces the loop that
# re-created the result list on every iteration.
res_deepec = (
    res_deepec
    .dropna(subset=['ec_deepec'])
    .groupby('id')['ec_deepec']
    .agg(','.join)
    .reset_index()
)

f_merge_res = f_merge_res.merge(res_deepec, how='left', on='id')
f_merge_res['ec_deepec'] = f_merge_res['ec_deepec'].fillna('-')

### 8. CatFam 结果

In [8]:
# Read the CatFam output (no header line in the file).
res_catfam = pd.read_csv(cfg.RESULTSDIR+'catfam/catfam.txt', sep='\t', names=['id', 'ec_catfam'])

# One row per protein: several predictions for the same id are joined with
# commas. Rows without a prediction are dropped first, so a missing value
# inside a multi-row group can no longer break the join; proteins left without
# any prediction receive '-' after the left merge below.
res_catfam = (
    res_catfam
    .dropna(subset=['ec_catfam'])
    .groupby('id')['ec_catfam']
    .agg(','.join)
    .reset_index()
)

f_merge_res = f_merge_res.merge(res_catfam, on='id', how='left')
f_merge_res['ec_catfam'] = f_merge_res['ec_catfam'].fillna('-')

### 9. PRIAM 结果

In [9]:
# Load the PRIAM annotation through the project helper and attach it to the
# result table; proteins without a PRIAM entry get '-'.
res_priam = eva.load_praim_res(
    resfile=cfg.RESULTSDIR+'priam/PRIAM_20221011103347/ANNOTATION/sequenceECs.txt'
)
f_merge_res = f_merge_res.merge(res_priam, how='left', on='id')
f_merge_res['ec_priam'] = f_merge_res['ec_priam'].fillna('-')

### 10. DMLF-ours

In [10]:
# Read our own predictions: the top prediction ('pred') and the ranked list
# ('listpred_20').
res_dmlf = pd.read_csv(cfg.RESULTSDIR+'case2210/case_5122_results_pred.csv', sep=',', index_col=0)
res_dmlf = res_dmlf[['id', 'pred', 'listpred_20']].rename(columns={'pred': 'ec_ours'})

# Keep only the text before the first comma of 'pred', without '(' and quotes.
res_dmlf['ec_ours'] = res_dmlf['ec_ours'].map(
    lambda pred: pred.split(',')[0].replace('(', '').replace('\'', '')
)
res_dmlf['listpred_20'] = res_dmlf['listpred_20'].map(make_rank_array)

f_merge_res = f_merge_res.merge(res_dmlf, how='left', on='id')
# A ground-truth EC is "full level" when it contains no '-' placeholder.
f_merge_res['is_full_level_ec'] = f_merge_res['ec_number'].map(lambda ec: '-' not in ec)
f_merge_res = f_merge_res.drop_duplicates().reset_index(drop=True)

### 11. 预测正确性计算

In [11]:
# Flag, for each baseline method, whether its prediction matches the ground
# truth according to is_ec_pred_correct.
for method in ('diamond', 'ecpred', 'deepec', 'catfam', 'priam'):
    f_merge_res[f'right_ec_{method}'] = [
        is_ec_pred_correct(ground=truth, predict=guess)
        for truth, guess in zip(f_merge_res['ec_number'], f_merge_res[f'ec_{method}'])
    ]

# Our final call: take the DIAMOND result when pident is above 20, otherwise
# keep the model prediction.
# NOTE(review): 20 is presumably a percent-identity cut-off - confirm.
f_merge_res['ec_ours'] = np.where(
    f_merge_res['pident'] > 20,
    f_merge_res['ec_diamond'],
    f_merge_res['ec_ours'],
)
f_merge_res['right_ec_ours'] = [
    is_ec_pred_correct(ground=truth, predict=guess)
    for truth, guess in zip(f_merge_res['ec_number'], f_merge_res['ec_ours'])
]
f_merge_res.head(3)

Unnamed: 0,id,ec_number,seq_length,ds_in,ec_predictable,ec_diamond,pident,ec_ecpred,ec_deepec,ec_catfam,ec_priam,ec_ours,listpred_20,is_full_level_ec,right_ec_diamond,right_ec_ecpred,right_ec_deepec,right_ec_catfam,right_ec_priam,right_ec_ours
0,P9WG69,2.3.1.9,393,_22,True,2.3.1.9,99.7,2.3.1.9,2.3.1.9,2.3.1.9,"2.3.1.9, 2.3.1.16, 2.3.1.174, 2.3.1.174, 2.3.1...",2.3.1.9,"(2.3.1.9, 0.9547642), (2.3.1.-, 0.04485531), (...",True,True,True,True,True,False,True
1,P9WI52,3.1.3.-,177,_22,True,3.1.3.-,100.0,3.1.3.-,-,-,-,3.1.3.-,"(3.1.3.-, 0.9928155), (2.7.13.3, 0.014787311),...",False,True,True,False,False,False,True
2,P9WQ38,6.2.1.26,383,_22,True,6.2.1.26,99.7,6.2.1.-,6.2.1.26,-,"6.2.1.26, 6.2.1.42, 6.2.1.41, 6.2.1.33, 6.2.1....",6.2.1.26,"(6.2.1.26, 0.9539515), (6.2.1.12, 0.06710984),...",True,True,False,True,False,False,True


In [12]:
# Download supplementary UniProt annotation for every protein in the result
# table (20 accessions per request) and align the column names with ours.
sup_info = (
    grab_Uniprot_info_by_ids_inbatch(uniprot_list=f_merge_res.id.values, batch_size=20)
    .rename(columns={'Entry': 'id', 'Annotation': 'Annotation_score'})
)
sup_info

100%|██████████| 257/257 [06:02<00:00,  1.41s/it]


Unnamed: 0,id,Entry Name,Protein names,Gene Names,Organism,Annotation_score,PubMed ID
0,Q9CZN7,GLYM_MOUSE,"Serine hydroxymethyltransferase, mitochondrial...",Shmt2,Mus musculus (Mouse),5.0,16141072; 19468303; 15489334; 21183079; 238063...
1,A0A0B4K7J2,RBP2_DROME,E3 SUMO-protein ligase RanBP2 (EC 2.3.2.-) (35...,Nup358 RanBP2 CG11856,Drosophila melanogaster (Fruit fly),5.0,10731132; 12537572; 12537569; 14729961; 176820...
2,Q6EZC2,SUBA_ECOLX,Subtilase cytotoxin subunit A (EC 3.4.21.-),subA,Escherichia coli,5.0,11598075; 17101670; 15226357; 18042253; 180052...
3,Q95QG8,FCP1_CAEEL,RNA polymerase II subunit A C-terminal domain ...,fcp-1 F36F2.6,Caenorhabditis elegans,4.0,9851916; 17291483; 23903194
4,P9WG69,FADA4_MYCTU,Probable acetyl-CoA acetyltransferase (EC 2.3....,fadA4 Rv1323 MTCY130.08,Mycobacterium tuberculosis (strain ATCC 25618 ...,4.0,9634230; 34915127; 21969609
...,...,...,...,...,...,...,...
5114,Q5RF96,SPCS1_PONAB,Signal peptidase complex subunit 1 (Microsomal...,SPCS1,Pongo abelii (Sumatran orangutan) (Pongo pygma...,3.0,
5115,V5NAL9,TLR4_PINIB,Toll-like receptor 4 (PmTLR4),,Pinctada imbricata (Atlantic pearl-oyster) (Pi...,3.0,28893645
5116,A0A0S2DN66,Y3070_LYSEN,Probable ATPase FE772_23070 (EC 3.6.4.-),FE772_23070 Ga0399710_4916 GLE_4746,Lysobacter enzymogenes,2.0,26597042; 31540995; 35025633
5117,C0HLR0,RBS_CHAMQ,Ribulose bisphosphate carboxylase small subuni...,rbcS,Chattonella marina var. antiqua (Red tide flag...,2.0,23291769


In [13]:
# Attach the UniProt annotation. A left join keeps every evaluated protein:
# with the default inner join, any protein whose annotation could not be
# downloaded would silently vanish from the exported results. Duplicate
# accessions in sup_info are removed first so no result row is multiplied.
f_merge_res = f_merge_res.merge(sup_info.drop_duplicates(subset='id'), on='id', how='left')

# make id clickable
f_merge_res['id'] = f_merge_res['id'].apply(lambda x: f'=HYPERLINK("https://www.uniprot.org/uniprotkb/{x}/entry", "{x}")')

f_merge_res.to_excel(cfg.RESULTSDIR+'case2210/case_5121_integrated_results.xlsx', index=None)