In [18]:
import pandas as pd
import numpy as np
import joblib
import os
import benchmark_common as bcommon
import config as cfg
import benchmark_test as btest
import argparse
import tools.funclib as funclib
import tools.embedding_esm as esmebd
import time
import benchmark_evaluation as eva
from pandarallel import pandarallel #  import pandaralle
pandarallel.initialize() 

INFO: Pandarallel will run on 80 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [5]:
#region Integrate output
def integrate_out_put(existing_table, blast_table, isEnzyme_pred_table, how_many_table, ec_table, mode='p', topnum=1):
    """[Integrate output]

    Args:
        existing_table ([DataFrame]): [db search results table]
        blast_table ([DataFrame]): [sequence alignment results table]
        isEnzyme_pred_table (DataFrame): [isEnzyme prediction results table]
        how_many_table ([DataFrame]): [function counts prediction results table]
        ec_table ([DataFrame]): [ec prediction table]

    Returns:
        [DataFrame]: [final results]
    """
    existing_table['res_type'] = 'db_match'
    blast_table['res_type']='blast_match'
    results_df = ec_table.merge(blast_table, on='id', how='left')

    function_df = how_many_table.copy()
    function_df = function_df.merge(isEnzyme_pred_table, on='id', how='left')
    function_df = function_df.merge(blast_table[['id', 'ec_number']], on='id', how='left')
    function_df['pred_function_counts']=function_df.parallel_apply(lambda x :integrate_enzyme_functioncounts(x.ec_number, x.isEnzyme_pred, x.pred_s, x.pred_m), axis=1)
    results_df = results_df.merge(function_df[['id','pred_function_counts']],on='id',how='left')

    results_df.loc[results_df[results_df.res_type.isnull()].index,'res_type']='dmlf_pred'
    results_df['pred_ec']=results_df.parallel_apply(lambda x: gather_ec_by_fc(x.iloc[3:23],x.ec_number, x.pred_function_counts), axis=1)
    results_df = results_df.iloc[:,np.r_[0,23,1,2,32,27:31]].rename(columns={'seq_x':'seq','seqlength_x':'seqlength'})

    

    if mode=='p':
        existing_table['pred_ec']=''
        result_set = pd.concat([existing_table, results_df], axis=0)
        result_set = result_set.drop_duplicates(subset=['id'], keep='first').sort_values(by='res_type')
        result_set['ec_number'] = result_set.apply(lambda x: x.pred_ec if str(x.ec_number)=='nan' else x.ec_number, axis=1)
        result_set.reset_index(drop=True, inplace=True)
        result_set = result_set.iloc[:,0:9]
    if mode =='r':
        result_set= results_df.merge(ec_table, on=['id'], how='left')
        result_set=result_set.iloc[:,np.r_[0:3,30,5:9, 4,10:30]]
        result_set = result_set.rename(columns=dict({'seq_x': 'seq','pred_ec': 'top0','top0_y': 'top1' },  **{'top'+str(i) : 'top'+str(i+1) for i in range(0, 20)}))
        result_set = result_set.iloc[:,0:(8+topnum)]
        result_set.loc[result_set[result_set.id.isin(existing_table.id)].index.values,'res_type']= 'db_match'

    return result_set
#endregion

#region Predict Function Counts
def predict_function_counts(test_data):
    """[Predict Function Counts]

    Args:
        test_data ([DataFrame]): [DF contain protein ID and Seq]

    Returns:
        [DataFrame]: [col1:id, col2: single or multi; col3: multi counts]
    """
    res=pd.DataFrame()
    res['id']=test_data.id
    model_s = joblib.load(cfg.MODELDIR+'/single_multi.model')
    model_m = joblib.load(cfg.MODELDIR+'/multi_many.model')
    pred_s=model_s.predict(np.array(test_data.iloc[:,1:]))
    pred_m=model_m.predict(np.array(test_data.iloc[:,1:]))
    res['pred_s']=1-pred_s
    res['pred_m']=pred_m+2

    return res
#endregion

#region Integrate function counts by blast, single and multi
def integrate_enzyme_functioncounts(blast, isEnzyme, single, multi):
    """[Integrate function counts by blast, single and multi]

    Args:
        blast ([type]): [blast results]
        s ([type]): [single prediction]
        m ([type]): [multi prediction]

    Returns:
        [type]: [description]
    """
    if str(blast)!='nan':
        if str(blast)=='-':
            return 0
        else:
            return len(blast.split(','))
    if isEnzyme == 0:
        return 0
    if single ==1:
        return 1
    return multi
#endregion

#region format finnal ec by function counts
def gather_ec_by_fc(toplist, ec_blast ,counts):
    """[format finnal ec by function counts]

    Args:
        toplist ([list]): [top 20 predicted EC]
        ec_blast ([string]): [blast results]
        counts ([int]): [function counts]

    Returns:
        [string]: [comma sepreated ec string]
    """
    if counts==0:
        return '-'
    elif str(ec_blast)!='nan':
        return str(ec_blast)
    else:
        return ','.join(toplist[0:counts])
#endregion

#region GOT EC PREDICTION BY SLICE
def predict_ec_slice(test_data):
    """[GOT EC PREDICTION BY SLICE]

    Args:
        test_data ([DataFrame]): [esm32 format DataFrame]
    """
    pr_X = test_data.iloc[:,1:]
    timestr = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
    xfile = cfg.TEMPDIR+'ptest_'+timestr+'.txt'
    xpred = cfg.TEMPDIR+'ptest_'+timestr+'.tsv'
    cfg.FEATURE_NUM = 1280
    bcommon.prepare_slice_file_onlyx(x_data=pr_X,  x_file=xfile)
    dict_ec_label = np.load(cfg.FILE_EC_LABEL_DICT, allow_pickle=True).item()
    slice_pred_ec = btest.get_slice_res(slice_query_file=xfile, 
                                        model_path= cfg.MODELDIR+'/slice_esm32', 
                                        dict_ec_label=dict_ec_label, 
                                        test_set=test_data,  
                                        res_file=xpred)
    
    return slice_pred_ec
#endregion

In [19]:
input_fasta='./data/sample_sequences_0118.fasta' 
output_tsv='results/sample_sequences_0118.tsv'
mode='r'
topnum=15

In [20]:
start = time.process_time()
if mode =='p':
    print('run in annoation mode')
if mode =='r':
    print('run in recommendation mode')

# 1. 读入数据
print('step 1: loading data')
input_df = funclib.load_fasta_to_table(input_fasta) # test fasta
latest_sprot = pd.read_feather(cfg.FILE_LATEST_SPROT_FEATHER) #sprot db

# 2. 查找数据
print('step 2: find existing data')
find_data =input_df.merge(latest_sprot, on='seq', how='left')
exist_data= find_data[~find_data.id_y.isnull()].iloc[:,np.r_[0,2,1,12,7,9:12]].rename(columns={'id_x':'id','id_y':'id_uniprot'})
noExist_data = find_data[find_data.name.isnull()]
noExist_data.reset_index(drop=True, inplace=True)
noExist_data = noExist_data.iloc[:,np.r_[0,2,1,12,7,9:12]].rename(columns={'id_x':'id','id_y':'id_uniprot'})

if len(noExist_data) == 0:
    exist_data.to_csv(output_tsv, sep='\t')
    end = time.process_time()
    print('All done running time: %s Seconds'%(end-start))
    

# 3. EMBedding
print('step 3: Embedding')
if mode =='p':
    rep0, rep32, rep33 = esmebd.get_rep_multi_sequence(sequences=noExist_data, model='esm1b_t33_650M_UR50S',seqthres=1022)
if mode == 'r':
    rep0, rep32, rep33 = esmebd.get_rep_multi_sequence(sequences=input_df, model='esm1b_t33_650M_UR50S',seqthres=1022)

# 4. sequence alignment
print('step 4: sequence alignment')
if ~os.path.exists(cfg.FILE_BLAST_PRODUCTION_DB):
    funclib.table2fasta(latest_sprot, cfg.FILE_BLAST_PRODUCTION_FASTA)
    cmd = r'diamond makedb --in {0} -d {1}'.format(cfg.FILE_BLAST_PRODUCTION_FASTA, cfg.FILE_BLAST_PRODUCTION_DB)
    os.system(cmd)
if mode =='p':
    blast_res = funclib.getblast_usedb(db=cfg.FILE_BLAST_PRODUCTION_DB, test=noExist_data)
if mode == 'r':
    blast_res = funclib.getblast_usedb(db=cfg.FILE_BLAST_PRODUCTION_DB, test=input_df)
blast_res = blast_res[['id', 'sseqid']].merge(latest_sprot, left_on='sseqid', right_on='id', how='left').iloc[:,np.r_[0,2:14]]
blast_res = blast_res.iloc[:,np.r_[0,1,11,12,6,8:11]].rename(columns={'id_x':'id','id_y':'id_uniprot'})

# 5. isEnzyme Prediction
print('step 5: predict isEnzyme')
model_isEnzyme = joblib.load(cfg.ISENZYME_MODEL)
pred_isEnzyme = pd.DataFrame()
pred_isEnzyme['id']=rep32.id
pred_isEnzyme['isEnzyme_pred'] = model_isEnzyme.predict(rep32.iloc[:,1:])

# 6. How many Prediction
print('step 6: predict function counts')
pred_howmany = predict_function_counts(rep32)


# 7. EC Prediction
print('step 7: predict EC')
pred_ec = predict_ec_slice(test_data=rep32)
if mode=='p':
    pred_ec = noExist_data[['id','seq']].merge(pred_ec, on='id', how='left')
if mode == 'r':
    pred_ec = input_df[['id', 'seq']].merge(pred_ec, on='id', how='left')

pred_ec['seqlength']=pred_ec.seq.parallel_apply(lambda x: len(x) )

run in recommendation mode
step 1: loading data
step 2: find existing data
step 3: Embedding
Transferred model to GPU


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  8.58it/s]


step 4: sequence alignment
Write finished


diamond v2.0.8.146 (C) Max Planck Society for the Advancement of Science
Documentation, support and updates available at http://www.diamondsearch.org

#CPU threads: 80
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Database input file: /home/shizhenkun/codebase/DMLF/data/production_blast.fasta
Opening the database file...  [0.099s]
Loading sequences...  [0.944s]
Masking sequences...  [0.493s]
Writing sequences...  [0.174s]
Hashing sequences...  [0.065s]
Loading sequences...  [0s]
Writing trailer...  [0.003s]
Closing the input file...  [0.004s]
Closing the database file...  [0.24s]
Database hash = f54aa5d51d1b3cf829f9cee4cd719a7e
Processed 477917 sequences, 181218071 letters.
Total time = 2.024s


Write finished
diamond blastp -d /home/shizhenkun/codebase/DMLF/data/production_blast.dmnd  -q  /tmp/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1 --quiet
step 5: predict isEnzyme
step 6: predict function counts
step 7: predict EC
slice files prepared success
./slice_predict /home/shizhenkun/codebase/DMLF/tmp/ptest_2022_01_19_03_23_55.txt /home/shizhenkun/codebase/DMLF/model/slice_esm32 /home/shizhenkun/codebase/DMLF/tmp/ptest_2022_01_19_03_23_55.tsv -o 32 -b 0 -t 32 -q 0


Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


Time taken to find approx nearest neighbors = 0.005458
Total prediction time: 0.464663 s
Prediction time per point: 46.4663 ms


In [10]:
output_df = integrate_out_put(existing_table=exist_data,
                              blast_table=blast_res,
                              isEnzyme_pred_table = pred_isEnzyme, 
                              how_many_table = pred_howmany, 
                              ec_table = pred_ec,
                              mode=mode,
                              topnum=topnum
                            )

In [4]:

print('step 8: integrate results')

output_df = integrate_out_put(existing_table=exist_data,
                              blast_table=blast_res,
                              isEnzyme_pred_table = pred_isEnzyme, 
                              how_many_table = pred_howmany, 
                              ec_table = pred_ec,
                              mode=mode,
                              topnum=topnum
                            )
print('step 9: writting results')                
output_df.to_csv(output_tsv, sep='\t', index=False)

end = time.process_time()
print('All done running time: %s Seconds'%(end-start))


In [11]:
output_df

Unnamed: 0,id,id_uniprot,seq,seqlength,ec_number,date_integraged,date_sequence_update,date_annotation_update,res_type
0,S1,A0A2Z5XAL7,MTSSEPIAIIGSACRFPGGADTPSKLWELLKEPRDLLQKVPEKRRW...,3961.0,"2.3.1.-,6.3.2.-",2021-09-29,2018-10-10,2021-09-29,db_match
1,S3,A0A0U5GJ41,MTILDKRQIQRFASDTDIETLSKAIDDDGVAIVRSVVSRDVIQRLQ...,297.0,1.14.-.-,2021-09-29,2016-03-16,2021-09-29,db_match
2,S4,Q4WAW9,MTVDSKPQLQRLAADADVDRMCRLLEEDGAFILKGLLPFDVVESFN...,291.0,1.14.11.38,2013-10-16,2005-07-05,2021-09-29,db_match
3,S5,A2R1P9,MSTNRRFDPNFTPYVVNSMGPKTPERARVVLGALIRHIHDFAREVE...,315.0,1.13.11.3,2021-09-29,2007-03-06,2021-09-29,db_match
4,S6,A0A7T1FRB0,MNLEKFVDELPIPEVAEPVKKNPRQTYYEIAMEEVFLKVHRDLPPT...,510.0,1.10.3.2,2021-09-29,2021-06-02,2021-09-29,db_match
5,S7,Q7V2C8,MSTKTSREIALERRKAMSDGGKKAALHSSSTKDRVRSSQDINSTGA...,765.0,-,2021-02-10,2003-10-01,2021-06-02,db_match
6,S8,Q31HD5,MKIYKVDKTLVSTNRIAMMEHKPLLVVREKDGGTPQVAVDPVGCKP...,90.0,-,2021-02-10,2005-12-06,2021-06-02,db_match
7,S2,,PIRGFCSTRIWENVPWGGPETYGTGKVAWDFPWNTCAIVKWWLCDD...,,-,NaT,NaT,NaT,dmlf_pred
8,S9,,CWNKMYIGTQYGGFHYSCIMEDVYMDARNMLAFGQCQVIEHFWCVW...,,-,NaT,NaT,NaT,dmlf_pred
9,S10,,VKQRQASLCVDMDNGWSVMDLHITWPVWLIVLLEDFKCTDGMWHRP...,,-,NaT,NaT,NaT,dmlf_pred


In [40]:
#region Integrate output
def integrate_out_put(existing_table, blast_table, isEnzyme_pred_table, how_many_table, ec_table, mode='p', topnum=1):
    """[Integrate output]

    Args:
        existing_table ([DataFrame]): [db search results table]
        blast_table ([DataFrame]): [sequence alignment results table]
        isEnzyme_pred_table (DataFrame): [isEnzyme prediction results table]
        how_many_table ([DataFrame]): [function counts prediction results table]
        ec_table ([DataFrame]): [ec prediction table]

    Returns:
        [DataFrame]: [final results]
    """
    existing_table['res_type'] = 'db_match'
    blast_table['res_type']='blast_match'
    results_df = ec_table.merge(blast_table, on='id', how='left')

    function_df = how_many_table.copy()
    function_df = function_df.merge(isEnzyme_pred_table, on='id', how='left')
    function_df = function_df.merge(blast_table[['id', 'ec_number']], on='id', how='left')
    function_df['pred_function_counts']=function_df.parallel_apply(lambda x :integrate_enzyme_functioncounts(x.ec_number, x.isEnzyme_pred, x.pred_s, x.pred_m), axis=1)
    results_df = results_df.merge(function_df[['id','pred_function_counts']],on='id',how='left')

    results_df.loc[results_df[results_df.res_type.isnull()].index,'res_type']='dmlf_pred'
    results_df['pred_ec']=results_df.parallel_apply(lambda x: gather_ec_by_fc(x.iloc[3:23],x.ec_number, x.pred_function_counts), axis=1)
    results_df = results_df.iloc[:,np.r_[0,23,1,2,32,27:31]].rename(columns={'seq_x':'seq','seqlength_x':'seqlength'})

    

    if mode=='p':
        existing_table['pred_ec']=''
        result_set = pd.concat([existing_table, results_df], axis=0)
        result_set = result_set.drop_duplicates(subset=['id'], keep='first').sort_values(by='res_type')
        result_set['ec_number'] = result_set.apply(lambda x: x.pred_ec if str(x.ec_number)=='nan' else x.ec_number, axis=1)
        result_set.reset_index(drop=True, inplace=True)
        result_set = result_set.iloc[:,0:9]
        
        result_set['seqlength'] = result_set.seq.apply(lambda x: len(x))
        result_set['ec_number'] = result_set.ec_number.apply(lambda x: 'Non-Enzyme' if len(x)==1 else x)
        result_set = result_set.rename(columns={'ec_number':'ecrecer_pred_ec_number'})
        
        result_set = result_set[['id','ecrecer_pred_ec_number','seq','seqlength']]
    
    if mode =='r':
        result_set= results_df.merge(ec_table, on=['id'], how='left')
        result_set=result_set.iloc[:,np.r_[0:3,30,5:9, 4,10:30]]
        result_set = result_set.rename(columns=dict({'seq_x': 'seq','pred_ec': 'top0','top0_y': 'top1' },  **{'top'+str(i) : 'top'+str(i+1) for i in range(0, 20)}))
#         result_set = result_set.iloc[:,0:(8+topnum)]
#         result_set.loc[result_set[result_set.id.isin(existing_table.id)].index.values,'res_type']= 'db_match'
      
        result_set = result_set.iloc[:,np.r_[0, 2:4,8:(8+topnum)]]
    
    return result_set

#endregion

output_df = integrate_out_put(existing_table=exist_data,
                              blast_table=blast_res,
                              isEnzyme_pred_table = pred_isEnzyme, 
                              how_many_table = pred_howmany, 
                              ec_table = pred_ec,
                              mode=mode,
                              topnum=topnum
                            )

output_df

Unnamed: 0,id,seq,seqlength,top0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10,top11,top12,top13,top14
0,S1,MTSSEPIAIIGSACRFPGGADTPSKLWELLKEPRDLLQKVPEKRRW...,3961,"2.3.1.-,6.3.2.-",2.1.1.375,1.-.-.-,5.4.3.-,2.3.1.201,5.3.1.4,1.8.1.19,2.3.1.232,2.7.8.40,2.1.1.88,3.2.1.18,2.3.1.198,2.3.1.199,1.21.98.3,1.2.99.-
1,S2,PIRGFCSTRIWENVPWGGPETYGTGKVAWDFPWNTCAIVKWWLCDD...,195,-,3.1.3.21,2.8.2.14,2.7.7.68,3.2.1.18,2.8.2.23,1.8.1.19,3.2.1.97,2.1.1.302,2.1.1.303,2.7.1.8,3.1.2.1,3.3.2.14,3.1.4.56,2.4.1.328
2,S3,MTILDKRQIQRFASDTDIETLSKAIDDDGVAIVRSVVSRDVIQRLQ...,297,1.14.-.-,1.13.12.13,1.-.-.-,1.13.12.23,1.14.11.42,1.14.11.25,3.5.4.19,2.4.1.337,1.8.1.16,1.14.11.20,1.14.11.-,1.2.1.91,1.14.14.152,1.14.11.2,1.13.11.75
3,S4,MTVDSKPQLQRLAADADVDRMCRLLEEDGAFILKGLLPFDVVESFN...,291,1.14.11.38,1.13.12.13,1.-.-.-,1.14.11.42,1.13.12.23,1.14.11.25,3.5.4.19,2.4.1.337,1.2.1.91,1.8.1.16,3.4.21.76,1.14.11.20,1.14.11.-,1.14.11.2,1.14.14.152
4,S5,MSTNRRFDPNFTPYVVNSMGPKTPERARVVLGALIRHIHDFAREVE...,315,1.13.11.3,1.12.99.6,3.4.23.-,1.11.2.5,1.11.2.4,1.13.11.29,3.4.22.37,1.13.11.11,2.3.1.13,1.5.1.6,2.7.8.41,1.97.1.4,2.4.1.337,4.1.1.112,1.5.3.10
5,S6,MNLEKFVDELPIPEVAEPVKKNPRQTYYEIAMEEVFLKVHRDLPPT...,510,1.10.3.2,1.1.99.36,1.-.-.-,5.3.1.26,3.4.22.37,1.7.6.1,1.5.3.16,1.14.15.15,1.14.19.67,2.1.1.102,3.9.1.-,3.1.3.53,1.1.99.3,1.1.99.38,1.1.99.29
6,S7,MSTKTSREIALERRKAMSDGGKKAALHSSSTKDRVRSSQDINSTGA...,765,-,2.7.7.58,2.5.1.23,2.1.1.375,3.1.3.21,3.2.1.18,2.7.1.30,3.2.2.20,3.1.7.11,4.2.3.66,3.4.21.39,3.1.4.12,5.3.1.8,5.3.1.24,2.1.1.288
7,S8,MKIYKVDKTLVSTNRIAMMEHKPLLVVREKDGGTPQVAVDPVGCKP...,90,-,3.4.24.72,2.1.1.50,3.5.3.15,3.4.24.-,3.2.1.93,3.5.1.48,3.4.24.61,2.7.1.43,2.4.2.54,2.7.7.27,3.5.4.19,5.4.99.57,1.3.8.1,2.7.1.231
8,S9,CWNKMYIGTQYGGFHYSCIMEDVYMDARNMLAFGQCQVIEHFWCVW...,76,-,3.2.1.18,3.1.3.21,3.2.2.20,2.3.1.207,2.8.2.14,3.5.1.46,3.4.11.5,3.3.2.14,3.2.1.31,3.1.1.64,3.2.1.75,3.2.1.55,2.1.1.303,1.1.1.56
9,S10,VKQRQASLCVDMDNGWSVMDLHITWPVWLIVLLEDFKCTDGMWHRP...,221,-,2.1.1.302,2.7.1.33,2.7.1.130,3.1.3.21,3.3.2.13,3.5.1.46,3.5.4.4,3.2.1.18,2.1.1.303,2.7.1.169,2.1.1.327,2.8.2.14,2.7.7.68,2.3.1.85


### task2 几功能酶数据集

In [2]:
train = pd.read_feather(cfg.DATADIR+'task2/train.feather')
test = pd.read_feather(cfg.DATADIR+'task2/test.feather')
print('train size: {0}\ntest size: {1}'.format(len(train), len(test)))

train size: 222567
test size: 3304


In [3]:
funclib.table2fasta(table=test, file_out=cfg.DATADIR+'task2/test.fasta')

Write finished


In [4]:
cfg.DATADIR

'/home/shizhenkun/codebase/DMLF/data/'

In [9]:
file_deepec = './results/task2/deepec/DeepEC_Result.txt'
res_deepec = pd.read_csv(file_deepec, sep='\t',names=['id', 'ec_number'], header=0 )
res_deepec.ec_number=res_deepec.apply(lambda x: x['ec_number'].replace('EC:',''), axis=1)
res_deepec.columns = ['id','ec_deepec']
res_deepec['isemzyme_deepec']=res_deepec.ec_deepec.apply(lambda x: True if str(x)!='nan' else False)
res_deepec['functionCounts_deepec'] = res_deepec.ec_deepec.apply(lambda x :len(str(x).split(',')))
# big_res = big_res.merge(res_deepec, on='id', how='left').drop_duplicates(subset='id')

In [None]:
#  java -jar ECPred.jar weighted /home/shizhenkun/codebase/uniprot/data/sprot_with_ec_query.fasta /home/shizhenkun/codebase/uniprot/deeppred/ECPred/ /home/shizhenkun/codebase/uniprot/temp/ /home/shizhenkun/codebase/uniprot/results/ecpred/sprot_with_ec_query_ecpred_results.tsv

In [27]:
res_data.head

<bound method NDFrame.head of               id                                                seq  \
0         Q5RF96  MARGGDTGCTGPSETSASGAVAIAFPGLEGPPADAQYQTLALTVPK...   
1         P9WIA9  DAGVSWKVYRNKTLGPISSVLTYGSLVTSFKQSADPRSDLVRFGVA...   
2         H2E7Q7  MARTPWLPNAYPPARRSDHVDIYKSALRGDVRVQDPYQWLEEYTDE...   
3     A0A0D4BSN8  MKLDDKRILIIGAGEVGTAVAEDLVNRSDPTEIIIHTSRQQTMDMR...   
4         E2JFG2  MPPTPWAPHSYPPTRRSDHVDVYQSASRGEVPVPDPYQWLEENSNE...   
...          ...                                                ...   
3353  A0A2R6Q324  MASFPPSLVFTVRRKEPTLVLPSKPTPRELKQLSDIDDQEGLRFQV...   
3354      Q6VE93  MGNVCVGGSRMSHQVYSPDRADTPPRSERNTPDRRQRAAGDAERTQ...   
3355  A0A509AKI1  MVLLNGKLKYIAVVAIFYNLIILLVKEKFPYICTKKKFHAISNRIL...   
3356      Q6NRV0  MPIRAYCTICSDFFDNARDVAAITCGHTFHQECLLQWFHSAPHRTC...   
3357      C5DLH0  MGLSADLFVQKRSAASSLKQPKELGFYSKTQEGQFLVNDSSKLSYY...   

      functionCounts  functionCounts_deepec  
0                  1                    0.0  
1                  1     

In [11]:
res_data=res_deepec[res_deepec.functionCounts_deepec>1]

Unnamed: 0,id,ec_deepec,isemzyme_deepec,functionCounts_deepec


In [19]:
res_data=test.merge(res_deepec[['id', 'functionCounts_deepec']], on='id', how='left').fillna(value=0)

In [21]:
eva.caculateMetrix(groundtruth=res_data.functionCounts, predict=res_data.functionCounts_deepec, baselineName='deepec', type='multi')

      deepec  		0.221560  	0.857020 		0.155097 	0.046982


In [37]:
# CATFAM

file_catfam='./results/task2/catfam/catfam.out'

res_catfam = pd.read_csv(file_catfam, sep='\t', names=['id', 'ec_catfam'])
res_catfam['isenzyme_catfam']=res_catfam.ec_catfam.apply(lambda x: True if str(x)!='nan' else False)
res_catfam['functionCounts_catfam'] = res_catfam.ec_catfam.apply(lambda x :len(str(x).split(',')))

In [45]:
res_data=test.merge(res_catfam[['id', 'functionCounts_catfam']], on='id', how='left').fillna(value=0)

In [53]:
res_cat_data = res_data.drop_duplicates()

In [89]:
res_cat_data[res_cat_data.functionCounts_catfam>1]

Unnamed: 0,id,seq,functionCounts,functionCounts_catfam


In [60]:
res_cat_data

Unnamed: 0,id,seq,functionCounts,functionCounts_catfam
0,Q5RF96,MARGGDTGCTGPSETSASGAVAIAFPGLEGPPADAQYQTLALTVPK...,1,1
1,P9WIA9,DAGVSWKVYRNKTLGPISSVLTYGSLVTSFKQSADPRSDLVRFGVA...,1,1
2,H2E7Q7,MARTPWLPNAYPPARRSDHVDIYKSALRGDVRVQDPYQWLEEYTDE...,1,1
3,A0A0D4BSN8,MKLDDKRILIIGAGEVGTAVAEDLVNRSDPTEIIIHTSRQQTMDMR...,1,1
4,E2JFG2,MPPTPWAPHSYPPTRRSDHVDVYQSASRGEVPVPDPYQWLEENSNE...,1,1
...,...,...,...,...
3374,A0A2R6Q324,MASFPPSLVFTVRRKEPTLVLPSKPTPRELKQLSDIDDQEGLRFQV...,1,1
3375,Q6VE93,MGNVCVGGSRMSHQVYSPDRADTPPRSERNTPDRRQRAAGDAERTQ...,1,1
3376,A0A509AKI1,MVLLNGKLKYIAVVAIFYNLIILLVKEKFPYICTKKKFHAISNRIL...,1,1
3377,Q6NRV0,MPIRAYCTICSDFFDNARDVAAITCGHTFHQECLLQWFHSAPHRTC...,1,1


In [54]:
eva.caculateMetrix(groundtruth=res_cat_data.functionCounts, predict=res_cat_data.functionCounts_catfam, baselineName='catfam', type='multi')

      catfam  		0.923729  	0.989104 		0.142857 	0.137193


In [81]:

file_priam='/home/shizhenkun/codebase/DMLF/results/priam/PRIAM_20210819134344/ANNOTATION/sequenceECs.txt'

#PRIAM
res_priam = eva.load_praim_res(resfile=file_priam)
res_priam['functionCounts_priam'] = res_priam.ec_priam.apply(lambda x :len(str(x).split(',')))
# big_res = big_res.merge(res_priam, on='id', how='left')
# big_res['isenzyme_priam'] = big_res.ec_priam.apply(lambda x: True if str(x)!='nan' else False)
# big_res['functionCounts_priam'] = big_res.ec_priam.apply(lambda x :len(str(x).split(',')))

In [82]:
res_data_priam=test.merge(res_priam[['id', 'functionCounts_priam']], on='id', how='left').fillna(value=0)

In [88]:
res_data_priam[res_data_priam.functionCounts_priam>1]

Unnamed: 0,id,seq,functionCounts,functionCounts_priam
0,Q5RF96,MARGGDTGCTGPSETSASGAVAIAFPGLEGPPADAQYQTLALTVPK...,1,2.0
1,P9WIA9,DAGVSWKVYRNKTLGPISSVLTYGSLVTSFKQSADPRSDLVRFGVA...,1,2.0
3,A0A0D4BSN8,MKLDDKRILIIGAGEVGTAVAEDLVNRSDPTEIIIHTSRQQTMDMR...,1,3.0
4,E2JFG2,MPPTPWAPHSYPPTRRSDHVDVYQSASRGEVPVPDPYQWLEENSNE...,1,2.0
5,Q6EZC2,MLKILWTYILFLLFISASARAEKPWYFDAIGLTETTMSLTDKNTPV...,1,2.0
...,...,...,...,...
3293,I6X8D2,MADVAESQENAPAERAELTVPEMRQWLRNWVGKAVGKAPDSIDESV...,1,21.0
3296,A0A059TC02,MRSVSGQVVCVTGAGGFIASWLVKILLEKGYTVRGTVRNPDDPKNG...,1,10.0
3297,Q753P9,MVFESELLLQRRLATTALKQPKELGYYSTNVGGELKVMDESNLSYY...,2,2.0
3298,A0A068BGA5,MASFPPSLVFTVRRKEPILVLPSKPTPRELKQLSDIDDQEGLRFQV...,1,17.0


In [65]:
eva.caculateMetrix(groundtruth=res_data_priam.functionCounts, predict=res_data_priam.functionCounts_priam, baselineName='priam', type='multi')

       priam  		0.131053  	0.011062 		0.928857 	0.003553


In [62]:
res_data[]

Unnamed: 0,id,seq,functionCounts,functionCounts_priam
0,Q5RF96,MARGGDTGCTGPSETSASGAVAIAFPGLEGPPADAQYQTLALTVPK...,1,2.0
1,P9WIA9,DAGVSWKVYRNKTLGPISSVLTYGSLVTSFKQSADPRSDLVRFGVA...,1,2.0
2,H2E7Q7,MARTPWLPNAYPPARRSDHVDIYKSALRGDVRVQDPYQWLEEYTDE...,1,1.0
3,A0A0D4BSN8,MKLDDKRILIIGAGEVGTAVAEDLVNRSDPTEIIIHTSRQQTMDMR...,1,3.0
4,E2JFG2,MPPTPWAPHSYPPTRRSDHVDVYQSASRGEVPVPDPYQWLEENSNE...,1,2.0
...,...,...,...,...
3299,A0A2R6Q324,MASFPPSLVFTVRRKEPTLVLPSKPTPRELKQLSDIDDQEGLRFQV...,1,17.0
3300,Q6VE93,MGNVCVGGSRMSHQVYSPDRADTPPRSERNTPDRRQRAAGDAERTQ...,1,0.0
3301,A0A509AKI1,MVLLNGKLKYIAVVAIFYNLIILLVKEKFPYICTKKKFHAISNRIL...,1,0.0
3302,Q6NRV0,MPIRAYCTICSDFFDNARDVAAITCGHTFHQECLLQWFHSAPHRTC...,1,1.0


In [66]:
file_ecpred= '/home/shizhenkun/codebase/BioUniprot/data/benchmark/results/ecpred/ecpred.tsv'

# ECpred
res_ecpred = pd.read_csv(file_ecpred, sep='\t', header=0)
res_ecpred['isemzyme_ecpred'] = ''
with pd.option_context('mode.chained_assignment', None):
    res_ecpred.isemzyme_ecpred[res_ecpred['EC Number']=='non Enzyme'] = False
    res_ecpred.isemzyme_ecpred[res_ecpred['EC Number']!='non Enzyme'] = True

res_ecpred.columns = ['id','ec_ecpred', 'conf', 'isemzyme_ecpred']
res_ecpred = res_ecpred.iloc[:,np.r_[0,1,3]]
res_ecpred['functionCounts_ecpred'] = res_ecpred.ec_ecpred.apply(lambda x :len(str(x).split(',')))
# big_res = big_res.merge(res_ecpred, on='id', how='left').drop_duplicates(subset='id')

In [80]:
res_ecpred[res_ecpred.functionCounts_ecpred==2]

Unnamed: 0,id,ec_ecpred,isemzyme_ecpred,functionCounts_ecpred


In [68]:
res_data_ecpred=test.merge(res_ecpred[['id', 'functionCounts_ecpred']], on='id', how='left').fillna(value=0)

In [73]:
res_data_ecpred

Unnamed: 0,id,seq,functionCounts,functionCounts_ecpred
0,Q5RF96,MARGGDTGCTGPSETSASGAVAIAFPGLEGPPADAQYQTLALTVPK...,1,1.0
1,P9WIA9,DAGVSWKVYRNKTLGPISSVLTYGSLVTSFKQSADPRSDLVRFGVA...,1,1.0
2,H2E7Q7,MARTPWLPNAYPPARRSDHVDIYKSALRGDVRVQDPYQWLEEYTDE...,1,1.0
3,A0A0D4BSN8,MKLDDKRILIIGAGEVGTAVAEDLVNRSDPTEIIIHTSRQQTMDMR...,1,1.0
4,E2JFG2,MPPTPWAPHSYPPTRRSDHVDVYQSASRGEVPVPDPYQWLEENSNE...,1,1.0
...,...,...,...,...
3300,A0A2R6Q324,MASFPPSLVFTVRRKEPTLVLPSKPTPRELKQLSDIDDQEGLRFQV...,1,1.0
3301,Q6VE93,MGNVCVGGSRMSHQVYSPDRADTPPRSERNTPDRRQRAAGDAERTQ...,1,1.0
3302,A0A509AKI1,MVLLNGKLKYIAVVAIFYNLIILLVKEKFPYICTKKKFHAISNRIL...,1,1.0
3303,Q6NRV0,MPIRAYCTICSDFFDNARDVAAITCGHTFHQECLLQWFHSAPHRTC...,1,1.0


In [74]:
3037/3305

0.918910741301059

In [70]:
eva.caculateMetrix(groundtruth=res_data_ecpred.functionCounts, predict=res_data_ecpred.functionCounts_ecpred, baselineName='ecpred', type='multi')

      ecpred  		0.918911  	0.865423 		0.249345 	0.119718


In [78]:
res_data_ecpred[(res_data_ecpred.functionCounts == res_data_ecpred.functionCounts_ecpred) & (res_data_ecpred.functionCounts ==2)]

Unnamed: 0,id,seq,functionCounts,functionCounts_ecpred


In [79]:
res_data_ecpred[(res_data_ecpred.functionCounts == res_data_ecpred.functionCounts_ecpred)]

Unnamed: 0,id,seq,functionCounts,functionCounts_ecpred
0,Q5RF96,MARGGDTGCTGPSETSASGAVAIAFPGLEGPPADAQYQTLALTVPK...,1,1.0
1,P9WIA9,DAGVSWKVYRNKTLGPISSVLTYGSLVTSFKQSADPRSDLVRFGVA...,1,1.0
2,H2E7Q7,MARTPWLPNAYPPARRSDHVDIYKSALRGDVRVQDPYQWLEEYTDE...,1,1.0
3,A0A0D4BSN8,MKLDDKRILIIGAGEVGTAVAEDLVNRSDPTEIIIHTSRQQTMDMR...,1,1.0
4,E2JFG2,MPPTPWAPHSYPPTRRSDHVDVYQSASRGEVPVPDPYQWLEENSNE...,1,1.0
...,...,...,...,...
3300,A0A2R6Q324,MASFPPSLVFTVRRKEPTLVLPSKPTPRELKQLSDIDDQEGLRFQV...,1,1.0
3301,Q6VE93,MGNVCVGGSRMSHQVYSPDRADTPPRSERNTPDRRQRAAGDAERTQ...,1,1.0
3302,A0A509AKI1,MVLLNGKLKYIAVVAIFYNLIILLVKEKFPYICTKKKFHAISNRIL...,1,1.0
3303,Q6NRV0,MPIRAYCTICSDFFDNARDVAAITCGHTFHQECLLQWFHSAPHRTC...,1,1.0


In [76]:
test[test.functionCounts==2]

Unnamed: 0,id,seq,functionCounts
10,L8EUQ6,MRYDVVIAGAGPTGLMLACELRLAGARTLVLERLAEPVDFSKALGV...,2
102,Q6LY38,MSETDFEYMFFLGCIAPNRYPGIESATYKALDKLGIQLHPFEQASC...,2
106,Q6LYD7,MMKAEELNKGFVNEIIEAGTPVPGEKEVASLKSCYQCGTCTGSCPS...,2
107,Q6LYD8,MKYAFFLGCIMPNRYAGVESATRTVMEKLGVELVDMPGASCCPAPG...,2
109,Q6LY39,MVLKSSEFNPDFPKQIIESGEWIFGDHASSFQKCYQCGTCTGACPS...,2
...,...,...,...
3226,Q84UB4,MEVVEVLHMNGGNGDSSYANNSLVQQKVILMTKPITEQAMIDLYSS...,2
3238,Q84UB5,MEVVEVLHMNGGNGDSSYANNSLVQQKVILMTKPITEQAMIDLYSS...,2
3273,Q6E593,MDSKQSSELVFTVRRQEPELIAPAKPTPRETKFLSDIDDQEGLRFQ...,2
3283,A0A167LUQ4,MSPTTQHEPGSPRVPEPIAIVGSACRFPGGSSSPSKLWDLLREPRD...,2
