# npyblast baseline

For the first baselines we want to generate accuracy, precision, recall and F1 score for each of the EC numbers.

The first dataset is the price dataset.

Requires: npysearch, sciutil

In [34]:
import numpy as np
import pandas as pd
import seaborn as sns

In [55]:
from sciutil import SciUtil
# Shared SciUtil instance; later cells call u.dp([...]) to print section headers.
u = SciUtil()

# Root directory that every data path below is built from.
# NOTE: absolute, machine-specific path - change it before running elsewhere.
base_dir = '/disk1/ariane/pycharm/CARE/'
# Folder that get_blast(..., save=True) writes its per-split result CSVs into.
output_folder = f'{base_dir}task1_baselines/results_summary/BLAST/'

def make_fastas():
    """
    Write a FASTA file next to each task1 split CSV.

    For every split listed below, ``{base_dir}splits/task1/<name>.csv`` is read and
    ``{base_dir}splits/task1/<name>.fasta`` is (re)written with one record per CSV row:
    a ``>Entry`` header line followed by the value of the 'Sequence' column.
    """
    split_names = ['30-50_protein_test',
                   '30_protein_test',
                   '50-70_protein_test',
                   '70-90_protein_test',
                   'promiscuous_protein_test',
                   'protein_train',
                   'price_protein_test']

    for name in split_names:
        csv_path = f'{base_dir}splits/task1/{name}.csv'
        # Build the output path from the split name rather than str.replace('.csv', ...),
        # which would also rewrite any '.csv' occurring earlier in the path.
        fasta_path = f'{base_dir}splits/task1/{name}.fasta'

        # Read the CSV *before* opening the output: opening with 'w' truncates the file, so a
        # missing or unreadable CSV would otherwise leave an empty FASTA behind.
        df = pd.read_csv(csv_path)
        records = df[['Entry', 'Sequence']].values
        with open(fasta_path, 'w') as f:
            f.writelines(f'>{entry}\n{seq}\n' for entry, seq in records)
                     
def get_uniprot2ec():
    """
    Load the protein-to-EC table and return it as a dict keyed by 'Entry'.

    Reads ``processed_data/protein2EC.csv`` under ``base_dir`` on every call and maps each
    'Entry' value to the value in that row's 'EC number' column.
    """
    ec_table = pd.read_csv(f'{base_dir}processed_data/protein2EC.csv')
    ec_by_entry = ec_table.set_index('Entry')
    return ec_by_entry['EC number'].to_dict()

def get_price2ec():
    """
    Return a dict mapping each 'Entry' of the price test split to its 'EC number'.

    The mapping is read from ``splits/task1/price_protein_test.csv`` under ``base_dir``
    each time the function is called.
    """
    price_table = pd.read_csv(f'{base_dir}splits/task1/price_protein_test.csv')
    ec_by_entry = price_table.set_index('Entry')
    return ec_by_entry['EC number'].to_dict()
    
def get_default_training_fasta_path():
    """Return the path of the training-set FASTA (protein_train.fasta) under ``base_dir``."""
    fasta_path = f'{base_dir}splits/task1/protein_train.fasta'
    return fasta_path
    
def get_default_price_fasta_path():
    """Return the path of the price test FASTA (price_protein_test.fasta) under ``base_dir``."""
    fasta_path = f'{base_dir}splits/task1/price_protein_test.fasta'
    return fasta_path

def get_validation30():
    """Return the path of the '30' test split FASTA (30_protein_test.fasta) under ``base_dir``."""
    fasta_path = f'{base_dir}splits/task1/30_protein_test.fasta'
    return fasta_path

def get_validation50():
    """
    Return the path of the '30-50' test split FASTA (30-50_protein_test.fasta) under ``base_dir``.

    The function name carries only the upper bound of the split's range.
    """
    fasta_path = f'{base_dir}splits/task1/30-50_protein_test.fasta'
    return fasta_path
    
def get_validation70():
    """
    Return the path of the '50-70' test split FASTA (50-70_protein_test.fasta) under ``base_dir``.

    The function name carries only the upper bound of the split's range.
    """
    fasta_path = f'{base_dir}splits/task1/50-70_protein_test.fasta'
    return fasta_path

def get_validation90():
    """
    Return the path of the '70-90' test split FASTA (70-90_protein_test.fasta) under ``base_dir``.

    The function name carries only the upper bound of the split's range.
    """
    fasta_path = f'{base_dir}splits/task1/70-90_protein_test.fasta'
    return fasta_path

def get_promisc():
    """Return the path of the promiscuous test FASTA (promiscuous_protein_test.fasta) under ``base_dir``."""
    fasta_path = f'{base_dir}splits/task1/promiscuous_protein_test.fasta'
    return fasta_path

## Make the default datasets

In [56]:

# Generate the .fasta file for every split CSV (existing .fasta files are rewritten on each run).
make_fastas()

## Perform npysearch

In [60]:
import npysearch as npy

# task1_splits = ['30', '30-50', 'price', 'promiscuous']

def get_test_fasta(label):
    """
    Return the FASTA path of the test split named ``label``.

    Supported labels are '30', '30-50', '50-70', '70-90', 'price' and 'promiscuous'.

    Raises:
        ValueError: if ``label`` is not one of the supported labels. (Previously a message was
            printed and None was returned, which was then handed to the search as the query.)
    """
    valid_labels = ('30', '30-50', '50-70', '70-90', 'price', 'promiscuous')
    # Validate first so an unknown label fails here, with a clear message, instead of
    # surfacing later as an obscure error inside the search call.
    if label not in valid_labels:
        raise ValueError(f'{label} not a valid dataset select one of ' + ' '.join(valid_labels))

    path_getters = {
        'price': get_default_price_fasta_path,
        '30': get_validation30,
        '30-50': get_validation50,
        '50-70': get_validation70,
        '70-90': get_validation90,
        'promiscuous': get_promisc,
    }
    return path_getters[label]()


def get_blast(test_label, num_ecs=1, min_identity=0.1, save=False):
    """
    Search a test split against the training set and tabulate the EC numbers of the hits.

    Parameters:
        test_label: name of the test split, resolved to a FASTA path by ``get_test_fasta``.
        num_ecs: number of prediction columns in the output; also passed to the search as
            ``maxAccepts``.
        min_identity: passed to the search as ``minIdentity``.
        save: when True, the table is also written to
            ``{output_folder}{test_label}_protein_test_results_df.csv``.

    Returns:
        A DataFrame with one row per query that has at least one hit, with columns
        'Entry', 'EC number' (true EC), 'Sequence' and the integer columns ``0 .. num_ecs - 1``
        holding the EC numbers of the hits. Missing values are replaced by '0.0.0.0'.

    NOTE(review): rows are built only from the reported hits, so queries without any hit appear
    to be absent from the table rather than filled with '0.0.0.0' - confirm this is intended
    before computing accuracy over the saved file.
    NOTE(review): 'Sequence' is taken from the 'QueryMatchSeq' field of the first hit, which
    presumably is the aligned part of the query rather than the full sequence - confirm.
    """
    # The test split is the query; the training set is the database that is searched.
    results_prot = npy.blast(query=get_test_fasta(test_label),
                             database=get_default_training_fasta_path(),
                             minIdentity=min_identity,
                             maxAccepts=num_ecs,
                             alphabet="protein")
    results = pd.DataFrame(results_prot)  # Convert this into a dataframe so that we can see it more easily

    # Load the Entry -> EC lookup once; it is needed for the hits and (except for 'price')
    # for the queries too.
    uniprot2ec = get_uniprot2ec()
    results['predicted_ecs'] = results['TargetId'].map(uniprot2ec)
    true_ec_lookup = get_price2ec() if test_label == 'price' else uniprot2ec
    results['true_ecs'] = results['QueryId'].map(true_ec_lookup)

    rows = []
    for query, grp in results.groupby('QueryId'):
        predictions = list(grp['predicted_ecs'].values)[:num_ecs]
        # Pad to exactly num_ecs predictions so every row has the same width; otherwise the
        # column assignment fails whenever no query reaches num_ecs hits.
        predictions += [None] * (num_ecs - len(predictions))
        # true_ecs / QueryMatchSeq: the first row of the group is used
        rows.append([query, grp['true_ecs'].values[0], grp['QueryMatchSeq'].values[0]] + predictions)

    # Passing the columns to the constructor also gives a well-formed (empty) table when
    # there are no rows at all.
    columns = ['Entry', 'EC number', 'Sequence'] + list(range(0, num_ecs))
    new_df = pd.DataFrame(rows, columns=columns)

    # Since we may have no similar ones we'll add in these as a dummy
    new_df = new_df.fillna('0.0.0.0')
    # Save to a file in the default location
    if save:
        import os
        # to_csv does not create missing directories, so make sure the folder exists first
        os.makedirs(output_folder, exist_ok=True)
        new_df.to_csv(f'{output_folder}{test_label}_protein_test_results_df.csv', index=False)
    return new_df

## Map each TargetId (the predicted match) to its predicted EC number

In [61]:
# Run the search for each split with ten prediction columns and write each table to disk
for split in ('30', '30-50', 'price', 'promiscuous'):
    get_blast(test_label=split, num_ecs=10, save=True)


   Read database: 100.0% (58 MB)                    
Analyze database: 100.0% (168k)                    
  Index database: 100.0% (168k)                    
    Read queries: 100.0% (59 kB)                      
 Search database: 100.0% (175.0)                    
      Write hits: 100.0% (1595.0)                    





   Read database: 100.0% (58 MB)                    
Analyze database: 100.0% (168k)                    
  Index database: 100.0% (168k)                    
    Read queries: 100.0% (74 kB)                      
 Search database: 100.0% (196.0)                    
      Write hits: 100.0% (1911.0)                    





   Read database: 100.0% (58 MB)                    
Analyze database: 100.0% (168k)                    
  Index database: 100.0% (168k)                    
    Read queries: 100.0% (57 kB)                      
 Search database: 100.0% (146.0)                    
      Write hits: 100.0% (1415.0)                    






   Read database: 100.0% (58 MB)                    
Analyze database: 100.0% (168k)                    
  Index database: 100.0% (168k)                    
    Read queries: 100.0% (71 kB)                      
 Search database: 100.0% (179.0)                    
      Write hits: 100.0% (1764.0)                    

## Compute accuracy for the prediction vs the true values for each level

In [28]:
# Print a section header, then score the predictions held in `results`.
# NOTE(review): neither `results` nor `compute_accuracy_baseline1` is defined by any earlier
# visible cell of this notebook (`results` only exists as a local inside get_blast), so this
# cell presumably relies on kernel state from an earlier session - confirm before re-running.
u.dp(['Price dataset'])
compute_accuracy_baseline1(results['predicted_ecs'].values, results['true_ecs'].values)


[94m--------------------------------------------------------------------------------[0m
[94m                                 Price dataset	                                 [0m
[94m--------------------------------------------------------------------------------[0m
[94m--------------------------------------------------------------------------------[0m
[94mAcc level 1:	73.97	
Acc level 2:	71.23	
Acc level 3:	62.33	
Acc level 4:	35.62	 [0m
[94m--------------------------------------------------------------------------------[0m


(0.7397260273972602,
 0.7123287671232876,
 0.6232876712328768,
 0.3561643835616438)

## Do the same for each of the percentage splits

In [29]:
import npysearch as npy

# Search the '30' test split (30_protein_test.fasta) against the training set, accepting at
# most one hit per query (maxAccepts=1).
results_prot = npy.blast(query=get_validation30(),
                         database=get_default_training_fasta_path(),
                         minIdentity=0.1,
                         maxAccepts=1,
                         alphabet="protein")
results = pd.DataFrame(results_prot)  # Convert this into a dataframe so that we can see it more easily

# Load the Entry -> EC lookup once and reuse it for both the hits and the queries
uniprot2ec = get_uniprot2ec()
results['predicted_ecs'] = results['TargetId'].map(uniprot2ec)
results['true_ecs'] = results['QueryId'].map(uniprot2ec)
u.dp(['30% dataset'])

# NOTE(review): compute_accuracy_baseline1 is not defined in the visible part of this notebook.
compute_accuracy_baseline1(results['predicted_ecs'].values, results['true_ecs'].values)





   Read database: 100.0% (58 MB)                    
Analyze database: 100.0% (168k)                    
  Index database: 100.0% (168k)                    
    Read queries: 100.0% (59 kB)                      
 Search database: 100.0% (175.0)                    
      Write hits: 100.0% (174.0)                    

[94m--------------------------------------------------------------------------------[0m
[94m                                  30% dataset	                                  [0m
[94m--------------------------------------------------------------------------------[0m
[94m--------------------------------------------------------------------------------[0m
[94mAcc level 1:	63.22	
Acc level 2:	55.75	
Acc level 3:	53.45	
Acc level 4:	50.57	 [0m
[94m--------------------------------------------------------------------------------[0m


(0.632183908045977, 0.5574712643678161, 0.5344827586206896, 0.5057471264367817)

In [30]:
import npysearch as npy

# Search the '30-50' test split (30-50_protein_test.fasta) against the training set, accepting
# at most one hit per query (maxAccepts=1).
results_prot = npy.blast(query=get_validation50(),
                         database=get_default_training_fasta_path(),
                         minIdentity=0.1,
                         maxAccepts=1,
                         alphabet="protein")
results = pd.DataFrame(results_prot)  # Convert this into a dataframe so that we can see it more easily

# Load the Entry -> EC lookup once and reuse it for both the hits and the queries
uniprot2ec = get_uniprot2ec()
results['predicted_ecs'] = results['TargetId'].map(uniprot2ec)
results['true_ecs'] = results['QueryId'].map(uniprot2ec)
u.dp(['50% dataset'])

# NOTE(review): compute_accuracy_baseline1 is not defined in the visible part of this notebook.
compute_accuracy_baseline1(results['predicted_ecs'].values, results['true_ecs'].values)





   Read database: 100.0% (58 MB)                    
Analyze database: 100.0% (168k)                    
  Index database: 100.0% (168k)                    
    Read queries: 100.0% (74 kB)                      
 Search database: 100.0% (196.0)                    
      Write hits: 100.0% (196.0)                    

[94m--------------------------------------------------------------------------------[0m
[94m                                  50% dataset	                                  [0m
[94m--------------------------------------------------------------------------------[0m
[94m--------------------------------------------------------------------------------[0m
[94mAcc level 1:	93.88	
Acc level 2:	90.31	
Acc level 3:	87.24	
Acc level 4:	84.18	 [0m
[94m--------------------------------------------------------------------------------[0m


(0.9387755102040817,
 0.9030612244897959,
 0.8724489795918368,
 0.8418367346938775)

In [31]:

# Search the '50-70' test split (50-70_protein_test.fasta) against the training set, accepting
# at most one hit per query (maxAccepts=1).
results_prot = npy.blast(query=get_validation70(),
                         database=get_default_training_fasta_path(),
                         minIdentity=0.1,
                         maxAccepts=1,
                         alphabet="protein")
results = pd.DataFrame(results_prot)  # Convert this into a dataframe so that we can see it more easily
# The header used to read '30-70% dataset', but get_validation70() returns the 50-70 split.
u.dp(['50-70% dataset'])

# Load the Entry -> EC lookup once and reuse it for both the hits and the queries
uniprot2ec = get_uniprot2ec()
results['predicted_ecs'] = results['TargetId'].map(uniprot2ec)
results['true_ecs'] = results['QueryId'].map(uniprot2ec)
# NOTE(review): compute_accuracy_baseline1 is not defined in the visible part of this notebook.
compute_accuracy_baseline1(results['predicted_ecs'].values, results['true_ecs'].values)


[94m--------------------------------------------------------------------------------[0m
[94m                                30-70% dataset	                                 [0m
[94m--------------------------------------------------------------------------------[0m



   Read database: 100.0% (58 MB)                    
Analyze database: 100.0% (168k)                    
  Index database: 100.0% (168k)                    
    Read queries: 100.0% (74 kB)                      
 Search database: 100.0% (204.0)                    
      Write hits: 100.0% (204.0)                    

[94m--------------------------------------------------------------------------------[0m
[94m Acc level 1:	99.02	
Acc level 2:	96.08	
Acc level 3:	94.61	
Acc level 4:	90.2	 [0m
[94m--------------------------------------------------------------------------------[0m


(0.9901960784313726, 0.9607843137254902, 0.946078431372549, 0.9019607843137255)

In [32]:

# Search the '70-90' test split (70-90_protein_test.fasta) against the training set, accepting
# at most one hit per query (maxAccepts=1).
results_prot = npy.blast(query=get_validation90(),
                         database=get_default_training_fasta_path(),
                         minIdentity=0.1,
                         maxAccepts=1,
                         alphabet="protein")
results = pd.DataFrame(results_prot)  # Convert this into a dataframe so that we can see it more easily
u.dp(['70-90% dataset'])

# Load the Entry -> EC lookup once and reuse it for both the hits and the queries
uniprot2ec = get_uniprot2ec()
results['predicted_ecs'] = results['TargetId'].map(uniprot2ec)
results['true_ecs'] = results['QueryId'].map(uniprot2ec)
# NOTE(review): compute_accuracy_baseline1 is not defined in the visible part of this notebook.
compute_accuracy_baseline1(results['predicted_ecs'].values, results['true_ecs'].values)


[94m--------------------------------------------------------------------------------[0m
[94m                                70-90% dataset	                                 [0m
[94m--------------------------------------------------------------------------------[0m



   Read database: 100.0% (58 MB)                    
Analyze database: 100.0% (168k)                    
  Index database: 100.0% (168k)                    
    Read queries: 100.0% (75 kB)                      
 Search database: 100.0% (206.0)                    
      Write hits: 100.0% (206.0)                    

[94m--------------------------------------------------------------------------------[0m
[94mAcc level 1:	99.03	
Acc level 2:	99.03	
Acc level 3:	99.03	
Acc level 4:	97.57	 [0m
[94m--------------------------------------------------------------------------------[0m


(0.9902912621359223,
 0.9902912621359223,
 0.9902912621359223,
 0.9757281553398058)

In [33]:

# Search the promiscuous test split (promiscuous_protein_test.fasta) against the training set,
# accepting at most one hit per query (maxAccepts=1).
results_prot = npy.blast(query=get_promisc(),
                         database=get_default_training_fasta_path(),
                         minIdentity=0.1,
                         maxAccepts=1,
                         alphabet="protein")
results = pd.DataFrame(results_prot)  # Convert this into a dataframe so that we can see it more easily
# NOTE(review): the header text is misspelled ('Promisuous'); left unchanged so it still
# matches the recorded output.
u.dp(['Promisuous dataset'])

# Load the Entry -> EC lookup once and reuse it for both the hits and the queries
uniprot2ec = get_uniprot2ec()
results['predicted_ecs'] = results['TargetId'].map(uniprot2ec)
results['true_ecs'] = results['QueryId'].map(uniprot2ec)
# NOTE(review): compute_accuracy_baseline1 is not defined in the visible part of this notebook.
compute_accuracy_baseline1(results['predicted_ecs'].values, results['true_ecs'].values)


[94m--------------------------------------------------------------------------------[0m
[94m                              Promisuous dataset	                               [0m
[94m--------------------------------------------------------------------------------[0m



   Read database: 100.0% (58 MB)                    
Analyze database: 100.0% (168k)                    
  Index database: 100.0% (168k)                    
    Read queries: 100.0% (71 kB)                      
 Search database: 100.0% (179.0)                    
      Write hits: 100.0% (179.0)                    

[94m--------------------------------------------------------------------------------[0m
[94mAcc level 1:	94.97	
Acc level 2:	92.74	
Acc level 3:	92.18	
Acc level 4:	91.62	 [0m
[94m--------------------------------------------------------------------------------[0m


(0.9497206703910615,
 0.9273743016759777,
 0.9217877094972067,
 0.9162011173184358)