# npyblast baseline

For the first baselines we want to generate accuracy, precision, recall and F1 score for each of the EC numbers.

The first dataset is the price dataset.

Requires: npysearch, sciutil

In [39]:
import os

import numpy as np
import pandas as pd
import seaborn as sns

In [75]:
from sciutil import SciUtil

# Shared SciUtil instance; its dp() method is used by the functions below to print
# status messages.
u = SciUtil()

# Root directory of the CARE project data. It defaults to the original location and can
# be overridden through the CARE_BASE_DIR environment variable so the notebook can run
# on another machine. The functions below use it as a plain string prefix, so a
# trailing '/' is enforced.
base_dir = os.environ.get('CARE_BASE_DIR', '/disk1/ariane/pycharm/CARE/')
if not base_dir.endswith('/'):
    base_dir += '/'

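# Intended input: a rank-ordered list of EC numbers, from best to worst.
# NOTE(review): compute_accuracy_baseline below scores a single predicted EC string
# per query rather than a ranked list.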
    
def _split_ec_field(ec_field):
    """Parse a ';'-separated EC field into a list of tuples of dot-separated parts.

    Non-string values (e.g. NaN left behind by an unmapped ID) yield an empty list,
    so they can never match anything.
    """
    if not isinstance(ec_field, str):
        return []
    return [tuple(part.strip() for part in ec.split('.'))
            for ec in ec_field.split(';') if ec.strip()]


def compute_accuracy_baseline(predicted_ecs, true_ecs):
    """Compute hierarchical EC-number accuracy at each of the four EC levels.

    A prediction is correct at level k when its first k dot-separated fields equal
    the first k fields of at least one true EC number of the same query. Matching
    whole prefixes (rather than each field independently) prevents a prediction
    from borrowing level 1 from one true EC and level 2 from another.

    Args:
        predicted_ecs: sequence of predicted EC strings, one per query
            (e.g. '4.2.1.7'). If an entry holds several ';'-separated ECs, any of
            them may produce the match. Non-string entries count as incorrect.
        true_ecs: sequence of true EC strings aligned with ``predicted_ecs``;
            multiple true ECs for one query are separated by ';'.

    Returns:
        Tuple of four values: the fraction of queries correct at levels 1, 2, 3
        and 4. The same values (as percentages) are printed through ``u.dp``.

    Raises:
        ValueError: if the two inputs do not have the same length.
    """
    if len(predicted_ecs) != len(true_ecs):
        raise ValueError('predicted_ecs and true_ecs must have the same length '
                         f'({len(predicted_ecs)} != {len(true_ecs)})')

    n_levels = 4
    # correct[k - 1, i] == 1 when query i is predicted correctly at EC level k.
    correct = np.zeros((n_levels, len(predicted_ecs)))

    # Basically given we have a highly multiclass problem doing precision recall probably doesn't make sense
    for i, (predicted_ec, true_ec) in enumerate(zip(predicted_ecs, true_ecs)):
        predicted = _split_ec_field(predicted_ec)
        truth = _split_ec_field(true_ec)
        for level in range(1, n_levels + 1):
            true_prefixes = {ec[:level] for ec in truth if len(ec) >= level}
            matched = any(len(ec) >= level and ec[:level] in true_prefixes
                          for ec in predicted)
            if not matched:
                # A longer prefix cannot match once a shorter one has failed.
                break
            correct[level - 1, i] = 1

    acc1, acc2, acc3, acc4 = (np.mean(correct[level]) for level in range(n_levels))

    # Print out the accuracy
    u.dp(['Acc level 1:', round(acc1*100, 2), 
          '\nAcc level 2:', round(acc2*100, 2), 
          '\nAcc level 3:', round(acc3*100, 2), 
          '\nAcc level 4:', round(acc4*100, 2)])

    return acc1, acc2, acc3, acc4


def make_default_training_fasta():
    """Export the task1 training split of protein2EC.csv as a FASTA file.

    Rows of ``processed_data/protein2EC.csv`` are selected by position using the
    indices stored in ``splits/task1/protein2EC_train_indices.txt``; each selected
    row becomes one FASTA record whose header is the 'Entry' value and whose body
    is the 'Sequence' value. The file is written to ``protein2EC_train.fasta``
    directly under ``base_dir`` and a summary is printed through ``u.dp``.
    """
    csv_path = f'{base_dir}processed_data/protein2EC.csv'
    indices_path = f'{base_dir}splits/task1/protein2EC_train_indices.txt'
    fasta_path = f'{base_dir}protein2EC_train.fasta'

    all_proteins = pd.read_csv(csv_path)
    train_rows = all_proteins.iloc[np.loadtxt(indices_path, dtype=int)]
    # Pull the columns out before the output file is opened (and truncated).
    headers = train_rows['Entry'].values
    bodies = train_rows['Sequence'].values

    record = '>{}\n{}\n'
    with open(fasta_path, 'w') as fasta:
        fasta.writelines(record.format(header, body)
                         for header, body in zip(headers, bodies))

    u.dp(['Default training dataset built using: ', csv_path,
          '\nIncides:', indices_path,
          '\nOutput:', fasta_path])

def make_default_price_fasta():
    """Convert the price test-set table into a FASTA file.

    Reads the tab-separated ``splits/task1/price_protein_test.csv`` and writes one
    FASTA record per row to ``splits/task1/price_protein_test.fasta``: the header
    is the 'Entry' value and the body is the 'Sequence' value. A summary is
    printed through ``u.dp``.
    """
    csv_path = f'{base_dir}splits/task1/price_protein_test.csv'
    fasta_path = f'{base_dir}splits/task1/price_protein_test.fasta'

    df = pd.read_csv(csv_path, sep='\t')
    # These two arrays were previously read from undefined globals, so the function
    # failed with NameError on a fresh kernel.
    entries = df['Entry'].values
    # NOTE(review): the sequence column is assumed to be named 'Sequence', as in
    # protein2EC.csv -- confirm against the header of the price CSV.
    seqs = df['Sequence'].values

    #write seqs to fasta format
    with open(fasta_path, 'w') as f:
        for entry, seq in zip(entries, seqs):
            f.write('>{}\n{}\n'.format(entry, seq))
    u.dp(['Default price dataset built using: ', csv_path,
          '\nOutput:', fasta_path])

def get_uniprot2ec():
    """Build a dict mapping each training-set 'Entry' to its 'EC number' value.

    The training set is the subset of ``processed_data/protein2EC.csv`` selected
    by position with the indices in ``splits/task1/protein2EC_train_indices.txt``.
    """
    all_proteins = pd.read_csv(f'{base_dir}processed_data/protein2EC.csv')
    train_rows = np.loadtxt(f'{base_dir}splits/task1/protein2EC_train_indices.txt', dtype=int)
    ec_by_entry = all_proteins.iloc[train_rows].set_index('Entry')['EC number']
    return ec_by_entry.to_dict()

def get_price2ec():
    """Build a dict mapping each price test-set 'Entry' to its 'EC number' value.

    The values come from the tab-separated ``splits/task1/price_protein_test.csv``.
    """
    price_table = pd.read_csv(f'{base_dir}splits/task1/price_protein_test.csv', sep='\t')
    ec_by_entry = price_table.set_index('Entry')['EC number']
    return ec_by_entry.to_dict()
    
def get_default_training_fasta_path():
    """Return the location of the training FASTA: 'protein2EC_train.fasta' under base_dir."""
    fasta_name = 'protein2EC_train.fasta'
    return '{}{}'.format(base_dir, fasta_name)
    
def get_default_price_fasta_path():
    """Return the location of the price test-set FASTA file.

    The path points at ``splits/task1/price_protein_test.fasta`` under ``base_dir``,
    which is where make_default_price_fasta() writes its output. The previous
    value omitted the ``splits/task1/`` part, so it did not point at the file that
    function creates.
    """
    return f'{base_dir}splits/task1/price_protein_test.fasta'

## Make the default datasets

In [41]:
# Write the two FASTA files used by the search below: the training proteins
# (the BLAST database) and the price test proteins (the BLAST queries).
make_default_training_fasta()
make_default_price_fasta()

[94m--------------------------------------------------------------------------------[0m
[94mDefault training dataset built using: 	/disk1/ariane/pycharm/CARE/processed_data/protein2EC.csv	
Incides:	/disk1/ariane/pycharm/CARE/splits/task1/protein2EC_train_indices.txt	
Output:	/disk1/ariane/pycharm/CARE/protein2EC_train.fasta	[0m
[94m--------------------------------------------------------------------------------[0m
[94m--------------------------------------------------------------------------------[0m
[94mDefault price dataset built using: 	/disk1/ariane/pycharm/CARE/splits/task1/price_protein_test.csv	
Output:	/disk1/ariane/pycharm/CARE/splits/task1/price_protein_test.fasta	[0m
[94m--------------------------------------------------------------------------------[0m


## Perform npysearch

In [55]:
import npysearch as npy

# Search every price test protein (query) against the training proteins (database).
# NOTE(review): minIdentity=0.1 and maxAccepts=1 presumably mean "a hit needs at least
# 10% identity" and "keep a single hit per query" -- confirm against the npysearch docs.
results_prot = npy.blast(query=get_default_price_fasta_path(),
                         database=get_default_training_fasta_path(),
                         minIdentity=0.1,
                         maxAccepts=1,
                         alphabet="protein")
results = pd.DataFrame(results_prot)  # Convert this into a dataframe so that we can see it more easily





   Read database: 100.0% (47 MB)                    
Analyze database: 100.0% (149k)                    
  Index database: 100.0% (149k)                    
    Read queries: 100.0% (58 kB)                      
 Search database: 100.0% (148.0)                    
      Write hits: 100.0% (147.0)                    

Unnamed: 0,QueryId,TargetId,QueryMatchStart,QueryMatchEnd,TargetMatchStart,TargetMatchEnd,QueryMatchSeq,TargetMatchSeq,NumColumns,NumMatches,NumMismatches,NumGaps,Identity,Alignment
111,WP_063324031,Q4ZSC0,1,318,1,318,MSFITDIKTFAALGSGVIGSGWISRALAHGLDVIAWDPAPGAEAAL...,MSFITEIKTFAALGSGVIGSGWVSRALAHGLDVVAWDPAPGAEAAL...,318,284,34,0,0.893,5=1X16=1X10=1X13=1X3=2X5=1X3=1X5=1X7=1X22=1X2=...
66,WP_063343423,Q888H1,1,270,5,274,MTNTSPFNRLLLTGAAGGLGKVLRERLRPYANVLRLSDIANMAPAI...,HTTQTPFNRLLLTGAAGGLGKVLRETLRPYSHILRLSDIAEMAPAV...,270,234,36,0,0.867,1X1=3X20=1X4=3X7=1X4=2X1=1X3=2X6=1X3=1X60=2X1=...
90,WP_060739768,Q88RB9,1,425,1,424,MSKTNADLMARRTAAVPRGVGQIHPIFAESAKNATVTDVEGREFID...,MSKTNESLMQRRVAAVPRGVGQIHPIFVDTAKNSTVIDVEGRELID...,425,367,57,1,0.864,5=2X2=1X2=1X14=3X3=1X2=1X6=1X14=1X3=2X3=2X2=1X...
13,WP_010208316,Q885K0,1,405,1,405,MSVEQAPVQRADFDQVMVPNYAPAAFIPVRGEGSRVWDQAGRELID...,MSVEHAAVQRADFDQVMVPNYAPAGFIPVRGAGSRVWDQAGRELVD...,405,344,61,0,0.849,4=1X1=1X17=1X6=1X12=1X2=1X10=1X12=2X12=1X4=1X2...
10,WP_010209382,Q888H1,1,274,1,274,MTTTPTTPVPFNRLLLTGAAGGLGKVLRERLRPYAQVLRLSDIANM...,MASAHTTQTPFNRLLLTGAAGGLGKVLRETLRPYSHILRLSDIAEM...,274,227,47,0,0.828,1=4X2=2X20=1X4=3X7=1X4=4X4=1X6=1X3=1X18=1X3=1X...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,WP_063324030,P43848,1,158,168,338,MPTLTTYQTRILPEWVDYNGHLRDAFYLLIFSYATDALMDRLGMDS...,IIDGTAVKTGDTLIALGSSGAHSNGYSLIRKVLEVSGANPTDLLEG...,173,20,136,17,0.116,4X1=3X1=10X1=7X1=1X2D10X1=1X1=20X1=9X1=14X1=11...
145,NP_384733,B8ZNS6,81,430,1,363,LEDCAVIQQLTRATPAVSLHIPWDKVSDLGALKEKGSALGLSFDAM...,MQYSEIMIRYGELSTKGKNRMRFINKLRNNISDVLSIYPQVKVTAD...,363,42,308,13,0.116,44X1=10X1=8X2=18X2=2X1=13X1=2X1=3X1=3X1D2X2=24...
137,NP_437227,Q0AIP9,1,109,121,235,MQRMGMVIGLEPSKIAEYKRLHAAVWPEILALISECNITNYSIFLK...,KHVLLMGAGGAASGVILPLLQQKPGLLAIANRTPDKAIALQQQFVN...,115,13,96,6,0.113,5X1=2X1=3X1=15X1=8X1=5X1=10X1=4X1=2X1=8X1=3X1=...
74,WP_053163627,A8G9Y3,1,450,16,476,MSVPPRAVQLNEANAFLKEHPEVLYVDLLIADMNGVVRGKRIERTS...,LVVGGGEVAARKVDLLLRAGAEIRIVAQSLSPILEQLSQQGQIHWL...,467,51,393,23,0.109,2X1=4X1=8X1=4X1=3X1=12X8D1=7X1=2X1=9X1=15X1=5X...


## Map the TargetId (the predicted best hit) to its predicted EC number

In [56]:
# TargetId is looked up in the training Entry -> EC dict, which gives the predicted EC;
# QueryId is looked up in the price Entry -> EC dict, which gives the true EC.
# Series.map leaves NaN for any ID that is missing from the lookup dict.
results['predicted_ecs'] = results['TargetId'].map(get_uniprot2ec())
results['true_ecs'] = results['QueryId'].map(get_price2ec())

results

Unnamed: 0,QueryId,TargetId,QueryMatchStart,QueryMatchEnd,TargetMatchStart,TargetMatchEnd,QueryMatchSeq,TargetMatchSeq,NumColumns,NumMatches,NumMismatches,NumGaps,Identity,Alignment,predicted_ecs,true_ecs
0,NP_384124,P42604,4,501,1,495,PSSILLSPDDNVVVATAAIAPGDRLAGGVSAVARIEPGHKAAIRRI...,MQYIKIHALDNVAVALADLAEGTEVSVDNQTVTLRQDVARGHKFAL...,502,171,320,11,0.341,3X1=5X3=1X2=1X1=2X1=1X1=9X1=1X1D1=2D3X3=1X1=3X...,4.2.1.7,4.2.1.5
1,NP_384306,P21633,164,294,1,146,AYDGIHQGSGVTDAVLAEFNDDPRDIVDFARLAKPGEFGRYAPRVF...,MSAPIVHGGGITEAAARYGGRPEDWLDLSTGINPCPVALPAVPERA...,147,27,103,17,0.184,4X1=2X1=1X1=1X1=1X1=1X1I1=5X1=1X1=2X1=6X1=6X1=...,4.1.1.81,2.7.1.59
2,NP_384741,Q16D10,114,462,1,367,PPFTETGSARLPMGLNVGAQLFWQQRMFPEHFARVATILTYAQYWS...,MSAPKKVVLAYSGGLDTSIILKWLQTEYGCEVVTFTADLGQGEELE...,367,47,302,18,0.128,13X2=5X1=1X1=1X1=13X1=37X1=8X1D1=1X1D5X1=7X1=3...,6.3.4.5,2.7.1.5
3,WP_010211220,Q47N85,94,578,1,470,TAMLTRNLASMDVEEAIRGNPIDGVVLLTGCDKTTPALLMGAASCD...,MAHEPDSQTVRLWGGRFSGGPSEALARLSQSTHFDWRLARYDIAGS...,486,59,410,17,0.121,1X1=16X1=1X1=6X1=9X1=1X1D3X1=5X1=1X1=6X1=6X1=4...,4.3.2.1,4.2.1.25
4,WP_010211217,Q56694,14,524,1,510,RSANGSVTLQSVDATSGEALPQHFYQATPQEVDAAAKAAAQAYPAY...,MNPQTDNVFYATNAFTGEALPLAFPVHTEVEVNQAATAAAKVARDF...,513,225,283,5,0.439,13X1=2X5=2X1=3X1=2X2=2X2=1X3=6X1=1X1=4X2=2X1=3...,1.2.1.4,1.2.1.26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,NP_384884,Q7VUQ3,1,261,88,357,MIVHFEPASEEGIASVVRSAAAERVTLAVVGGGTRAGLGNPVRADR...,RLADIRPDVVLGMGGYVAFPGGVMAALRRTPLVVHEQNAVAGTANR...,276,48,207,21,0.174,6X1=4X1=4X1=9X1=16X1=1X1=1X1=1X1I1=9X1=9X1=1X2...,2.4.1.227,1.1.3.15
143,NP_384885,P52074,24,438,1,407,ADPHVAESETILRKCVHCGFCTATCPTYVVLGDELDSPRGRIYLIK...,MQTQLTEEMRQNARALEADSILRACVHCGFCTATCPTYQLLGDELD...,419,122,281,16,0.291,6X1=17X1=1I5X1I5X1=2I3X1=2X1=1X1=3X1=1X3I1X1I3...,1.1.99.14,1.1.3.15
144,NP_384521,Q652Q8,1,314,17,329,MSSARKIIIDTDPGQDDAAAIMLALGSPEEIEVLGITAVAGNVPLT...,PPTEEKVIIDTDPGIDDSVAIMMAFEAPGVKVVGLTTIFGNCTTSH...,315,104,208,3,0.330,5X1=1X7=1X2=2X3=1X1=3X1=1X1I2X1=1X1=1X1=3X2=7X...,3.2.2.3,3.2.2.3
145,NP_384733,B8ZNS6,81,430,1,363,LEDCAVIQQLTRATPAVSLHIPWDKVSDLGALKEKGSALGLSFDAM...,MQYSEIMIRYGELSTKGKNRMRFINKLRNNISDVLSIYPQVKVTAD...,363,42,308,13,0.116,44X1=10X1=8X2=18X2=2X1=13X1=2X1=3X1=3X1D2X2=24...,2.8.1.4,5.3.1.14


## Compute accuracy for the prediction vs the true values for each level

In [74]:
# Score the EC number of each query's BLAST hit against the true EC number(s).
# Uses the function and DataFrame defined above; the earlier names
# (compute_accuracy_baseline1, blast_results) are not defined anywhere in this notebook.
compute_accuracy_baseline(results['predicted_ecs'].values, results['true_ecs'].values)

[94m--------------------------------------------------------------------------------[0m
[94mAcc level 1:	64.63	
Acc level 2:	60.54	
Acc level 3:	50.34	
Acc level 4:	27.89	 [0m
[94m--------------------------------------------------------------------------------[0m


(0.6462585034013606,
 0.6054421768707483,
 0.5034013605442177,
 0.2789115646258503)