# npyblast baseline

For the first baselines we want to generate accuracy, precision, recall and F1 score for each of the EC numbers.

The first dataset is the price dataset.

Requires: npysearch, sciutil

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
from sciutil import SciUtil
u = SciUtil()

base_dir = '/disk1/ariane/pycharm/CARE/'
output_folder = f'{base_dir}task1_baselines/results_summary/BLAST/'

def make_fastas():
    filenames = [f'{base_dir}splits/task1/30-50_protein_test.csv', 
                 f'{base_dir}splits/task1/30_protein_test.csv', 
                 f'{base_dir}splits/task1/50-70_protein_test.csv',
                 f'{base_dir}splits/task1/70-90_protein_test.csv',
                 f'{base_dir}splits/task1/promiscuous_protein_test.csv',
                 f'{base_dir}splits/task1/protein_train.csv',
                 f'{base_dir}splits/task1/price_protein_test.csv']

    for filename in filenames:
        with open(filename.replace('.csv', '.fasta'), 'w') as f:
            df = pd.read_csv(filename)
            for entry, seq in df[['Entry', 'Sequence']].values:
                f.write('>{}\n{}\n'.format(entry, seq))
                     
def get_uniprot2ec():
    swissprot = pd.read_csv(f'{base_dir}processed_data/protein2EC.csv')
    id2ec = swissprot.set_index('Entry')['EC number'].to_dict()
    return id2ec

def get_price2ec():
    df = pd.read_csv(f'{base_dir}splits/task1/price_protein_test.csv')
    id2ec = df.set_index('Entry')['EC number'].to_dict()
    return id2ec
    
def get_default_training_fasta_path():
    return f'{base_dir}splits/task1/protein_train.fasta'
    
def get_default_price_fasta_path():
    return f'{base_dir}splits/task1/price_protein_test.fasta'

def get_validation30():
    return f'{base_dir}splits/task1/30_protein_test.fasta'

def get_validation50():
    return f'{base_dir}splits/task1/30-50_protein_test.fasta'
    
def get_validation70():
    return f'{base_dir}splits/task1/50-70_protein_test.fasta'

def get_validation90():
    return f'{base_dir}splits/task1/70-90_protein_test.fasta'

def get_promisc():
    return f'{base_dir}splits/task1/promiscuous_protein_test.fasta'

In [4]:
pd.read_csv(f'{base_dir}splits/task1/promiscuous_protein_test.csv')

Unnamed: 0,index,Entry,Sequence,EC number,Surprise Level,Number of ECs,Duplicated EC,Duplicated Sequence
0,6834,Q7TS56,MDKVCAVFGGSRGIGKAVAQLMAQKGYRLAIVARNLEVAKATASEL...,1.1.1.100;1.6.5.10,3,2,True,False
1,4267,Q09851,MEAEKFVLITGCSEGGIGNALALKFHQEGFQVLATARQVERMDNLT...,1.1.1.101;3.1.1.3,4,2,True,False
2,7784,Q95JH7,MDSKHQCVKLNDGHFMPVLGFGTYAPAEVPKNKALEATKLAIEAGF...,1.1.1.112;1.1.1.149;1.1.1.209;1.1.1.210;1.1.1....,3,9,True,False
3,3391,P22071,MPGWSCLVTGAGGFVGQRIIRMLVQEKELQEVRALDKVFRPETKEE...,1.1.1.145;1.1.1.210;1.1.1.270;5.3.3.1,4,4,True,False
4,8380,Q9XWF0,MSIKRLSMRLKKGIHRSWNRMTSLEAGLEEEKEIKIVEEPEPRPWK...,1.1.1.145;5.3.3.1,4,2,True,False
...,...,...,...,...,...,...,...,...
174,4717,Q1LRV9,MPHAHPADIDGHHLTPDTVAAIARGQRAAIVPEPVLGKVADARARF...,4.3.1.23;5.4.3.6,4,2,True,False
175,3506,P30904,MPMFIVNTNVPRASVPEGFLSELTQQLAQATGKPAQYIAVHVVPDQ...,5.3.2.1;5.3.3.12,2,2,True,False
176,3236,P10378,MSIPFTRWPEEFARRYREKGYWQDLPLTDILTRHAASDSIAVIDGE...,6.2.1.71;6.3.2.14,3,2,True,False
177,6973,Q80WS1,MCSSVTGKLWFLTDRRIREDYPQKEILRALKAKCCEEELDFRAVVM...,6.3.1.17;6.3.2.41,2,2,True,False


## Make the default datasets

In [56]:

make_fastas()

## Perform npysearch

In [6]:
import npysearch as npy

# task1_splits = ['30', '30-50', 'price', 'promiscuous']

def get_test_fasta(label):
    if label == 'price':
        return get_default_price_fasta_path()
    elif label == '30':
        return get_validation30()
    elif label == '30-50':
        return get_validation50()
    elif label == 'promiscuous':
        return get_promisc()
    else:
        print(f'{label} not a valid dataset select one of ' + ' '.join(['30', '30-50', 'price', 'promiscuous']))

def get_test_df(label):
    return pd.read_csv(f'{base_dir}splits/task1/{label}_protein_test.csv')

def get_blast(test_label, num_ecs=1, min_identity=0.1, save=False):
    """
    Gets the results for blast for a series of ECs and formats it correctly for the paper
    """
    # Lets also look at the protein our query is the query genome and our database is going to be ecoli.
    
    results_prot = npy.blast(query=get_test_fasta(test_label),
                             database=get_default_training_fasta_path(),
                             minIdentity=min_identity,
                             maxAccepts=num_ecs,
                             alphabet="protein")
    results = pd.DataFrame(results_prot)  # Convert this into a dataframe so that we can see it more easily
    results['predicted_ecs'] = results['TargetId'].map(get_uniprot2ec())
    
    if test_label == 'price':
        results['true_ecs'] = results['QueryId'].map(get_price2ec())
    else:
        results['true_ecs'] = results['QueryId'].map(get_uniprot2ec())
    grped = results.groupby('QueryId')
    rows = []
    # Get the raw test set and then build the new dataset based on that! 
    test_df = get_test_df(test_label)
    # Now we want to iterate through and get the predicted EC numbers
    entry_to_ec = dict(zip(test_df['Entry'], test_df['EC number']))
    entry_to_seq = dict(zip(test_df['Entry'], test_df['Sequence']))

    for query in test_df['Entry'].values:
            try:
                grp = grped.get_group(query)
                # Get all the ECs for all the seqs and join them!
                true_ec = ';'.join(set([ec for ec in grp['true_ecs'].values]))
                rows.append([query, true_ec, grp['QueryMatchSeq'].values[0]] + list(grp['predicted_ecs'].values))
            except:
                u.warn_p([query, f'Had no sequences within {min_identity}.'])
                rows.append([query, entry_to_ec[query], entry_to_seq[query]])
    new_df = pd.DataFrame(rows)
    new_df.columns = ['Entry', 'EC number', 'Sequence'] + list(range(0, num_ecs))

    # Since we may have no similar ones we'll add in these as a dummy
    new_df = new_df.fillna('0.0.0.0')
    
    # Save to a file in the default location
    if save:
        new_df.to_csv(f'{output_folder}{test_label}_protein_test_results_df.csv', index=False)
    return new_df

## Map the targetID which is the prediction to the the predicted EC number

In [7]:
# Save in the required format
for split in ['30', '30-50', 'price', 'promiscuous']:
    get_blast(split, 10, save=True)


   Read database: 100.0% (58 MB)                    
Analyze database: 100.0% (168k)                    
  Index database: 100.0% (168k)                    
    Read queries: 100.0% (59 kB)                      
 Search database: 100.0% (175.0)                    
      Write hits: 100.0% (1595.0)                    


[93m--------------------------------------------------------------------------------[0m
[93m                      Q8U0Z7	Had no sequences within 0.1.	                      [0m
[93m--------------------------------------------------------------------------------[0m




   Read database: 100.0% (58 MB)                    
Analyze database: 100.0% (168k)                    
  Index database: 100.0% (168k)                    
    Read queries: 100.0% (74 kB)                      
 Search database: 100.0% (196.0)                    
      Write hits: 100.0% (1911.0)                    
   Read database: 100.0% (58 MB)                    
Analyze database: 100.0% (168k)                    
  Index database: 100.0% (168k)                    
    Read queries: 100.0% (57 kB)                      
 Search database: 100.0% (146.0)                    
      Write hits: 100.0% (1415.0)                    






   Read database: 100.0% (58 MB)                    
Analyze database: 100.0% (168k)                    
  Index database: 100.0% (168k)                    
    Read queries: 100.0% (71 kB)                      
 Search database: 100.0% (179.0)                    
      Write hits: 100.0% (1764.0)                    

In [14]:
pd.read_csv(f'{output_folder}promiscuous_protein_test_results_df.csv')

Unnamed: 0,Entry,EC number,Sequence,0,1,2,3,4,5,6,7,8,9
0,Q7TS56,1.1.1.100;1.6.5.10,MDKVCAVFGGSRGIGKAVAQLMAQKGYRLAIVARNLEVAKATASEL...,1.6.5.10,1.6.5.10,1.6.5.10,1.6.5.10,1.6.5.10,1.6.5.10,1.1.1.100,1.1.1.100,1.1.1.100,1.1.1.100
1,Q09851,1.1.1.101;3.1.1.3,MEAEKFVLITGCSEGGIGNALALKFHQEGFQVLATARQVERMDNLT...,6.1.1.23,1.1.1.53,2.7.7.24,1.1.1.62,1.1.1.146,4.2.1.33,1.1.1.53,1.1.1.53,6.1.1.23,1.1.1.276
2,Q95JH7,1.1.1.112;1.1.1.149;1.1.1.209;1.1.1.210;1.1.1....,MDSKHQCVKLNDGHFMPVLGFGTYAPAEVPKNKALEATKLAIEAGF...,1.3.1.20,1.3.1.20,1.3.1.20,1.3.1.20,1.3.1.20,1.3.1.20,1.3.1.20,1.3.1.20,1.3.1.20,1.3.1.20
3,P22071,1.1.1.145;1.1.1.210;1.1.1.270;5.3.3.1,MPGWSCLVTGAGGFVGQRIIRMLVQEKELQEVRALDKVFRPETKEE...,5.3.3.1,5.3.3.1,5.3.3.1,5.3.3.1,5.3.3.1,5.3.3.1,5.3.3.1,5.3.3.1,5.3.3.1,5.3.3.1
4,Q9XWF0,1.1.1.145;5.3.3.1,KLEEMTRDSIRPKIREMLEKEMPAVISTKVDKEVEKRLPMYIQIVL...,5.3.3.1,5.3.3.1,1.1.1.51,3.5.4.31,3.5.4.31,5.3.3.1,5.3.3.1,5.3.3.1,5.3.3.1,5.3.3.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,Q1LRV9,4.3.1.23;5.4.3.6,MPHAHPADIDGHHLTPDTVAAIARGQRAAIVPEPVLGKVADARARF...,5.4.3.6,5.4.3.6,5.4.3.6,5.4.3.6,5.4.3.6,5.4.3.6,4.3.1.3,4.3.1.3,4.3.1.3,4.3.1.3
175,P30904,5.3.2.1;5.3.3.12,MPMFIVNTNVPRASVPEGFLSELTQQLAQATGKPAQYIAVHVVPDQ...,5.3.3.12,5.3.3.12,5.3.3.12,5.3.3.12,5.3.3.12,5.3.3.12,5.3.3.12,5.3.3.12,5.3.3.12,5.3.3.12
176,P10378,6.2.1.71;6.3.2.14,MSIPFTRWPEEFARRYREKGYWQDLPLTDILTRHAASDSIAVIDGE...,6.3.2.14,6.3.2.14,6.2.1.61,6.2.1.71,6.2.1.61,6.2.1.61,6.2.1.71,6.2.1.65,6.2.1.43,1.2.1.41
177,Q80WS1,6.3.1.17;6.3.2.41,MCSSVTGKLWFLTDRRIREDYPQKEILRALKAKCCEEELDFRAVVM...,6.3.2.41,6.3.2.41,6.3.2.41,6.3.2.41,6.3.2.41,6.3.2.41,6.3.2.42,6.3.2.42,6.3.2.42,6.3.2.42
