# Install ProteInfer

```
git clone https://github.com/google-research/proteinfer
cd ~/proteinfer
pip3 install -r requirements.txt
python3 install_models.py
```

## Train the model
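Unfortunately, insufficient data is provided to train the full model. The command below only uses the repository's test data and the small test hyperparameter set: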
```
python train.py --data_base_path=./testdata/ \
--label_vocab_path=./data/vocabs/EC.tsv \
--hparams_set=small_test_model \
--output_dir=trained_baseline1
```
Next, run prediction on a FASTA file with the following command:

```
python3 proteinfer.py -i /disk1/ariane/pycharm/CARE/CLEAN/app/data/price.fasta -o ~/hemoglobin_predictions.tsv
```


In [2]:
from sciutil import SciUtil
import numpy as np
import pandas as pd
import seaborn as sns
# Shared SciUtil instance; its dp() method is used below to print messages.
u = SciUtil()

# Root directory from which every data path in this notebook is built.
# NOTE(review): machine-specific absolute path — adjust for your environment.
base_dir = '/disk1/ariane/pycharm/CARE/'

    
def compute_accuracy_baseline1(predicted_ecs, true_ecs):
    """Compute hierarchical EC-number accuracy at each of the four EC levels.

    A prediction counts as correct at level ``k`` when its first ``k``
    dot-separated fields are identical to the first ``k`` fields of at least
    one of the true EC numbers of the same sequence. Matching is done per
    true EC number, so a prediction cannot be credited by combining the first
    field of one true EC with the second field of another.

    Given the highly multiclass nature of the problem, plain accuracy is
    reported instead of precision/recall.

    Args:
        predicted_ecs: indexable sequence of predicted EC strings
            (e.g. ``'4.2.1.43'``), one per sequence.
        true_ecs: indexable sequence of true EC strings aligned by position
            with ``predicted_ecs``; several ECs for one sequence are
            separated by ``';'``.

    Returns:
        Tuple ``(acc1, acc2, acc3, acc4)`` with the fraction of predictions
        that are correct at EC levels 1 to 4. The same values are printed as
        percentages through ``u.dp``.
    """
    n_levels = 4
    # correct[k - 1, i] is 1 when prediction i is right at EC level k.
    correct = np.zeros((n_levels, len(predicted_ecs)))

    for i, predicted_ec in enumerate(predicted_ecs):
        pred_fields = predicted_ec.split('.')
        true_fields = [ec.split('.') for ec in true_ecs[i].split(';')]
        for level in range(1, n_levels + 1):
            pred_prefix = pred_fields[:level]
            # A prediction with fewer than `level` fields cannot be correct
            # at this depth (previously this raised IndexError).
            if len(pred_prefix) < level:
                break
            # Deeper levels require every shallower level to be correct for
            # the *same* true EC, so stop at the first prefix mismatch.
            if not any(fields[:level] == pred_prefix for fields in true_fields):
                break
            correct[level - 1, i] = 1

    accuracies = correct.mean(axis=1)
    # Print out the accuracy
    u.dp(['Acc level 1:', round(accuracies[0]*100, 2),
          '\nAcc level 2:', round(accuracies[1]*100, 2),
          '\nAcc level 3:', round(accuracies[2]*100, 2),
          '\nAcc level 4:', round(accuracies[3]*100, 2)])

    return accuracies[0], accuracies[1], accuracies[2], accuracies[3]


def make_default_training_fasta():
    """Write the task-1 training sequences to a FASTA file.

    Reads the protein-to-EC table, keeps the rows listed in the training
    indices file, and writes one ``>Entry`` / ``Sequence`` record per kept
    row. All paths are built from the module-level ``base_dir``; a summary of
    the paths used is printed through ``u.dp``.
    """
    table = pd.read_csv(f'{base_dir}processed_data/protein2EC.csv')
    row_positions = np.loadtxt(f'{base_dir}splits/task1/protein2EC_train_indices.txt', dtype=int)
    # Positional selection: the indices file holds row positions, not labels.
    train_rows = table.iloc[row_positions]
    names = train_rows['Entry'].values
    residues = train_rows['Sequence'].values

    # Save sequences to FASTA: a header line followed by the sequence line.
    with open(f'{base_dir}protein2EC_train.fasta', 'w') as handle:
        handle.writelines(
            '>{}\n{}\n'.format(name, residue_string)
            for name, residue_string in zip(names, residues)
        )

    summary = [
        'Default training dataset built using: ', f'{base_dir}processed_data/protein2EC.csv',
        '\nIncides:', f'{base_dir}splits/task1/protein2EC_train_indices.txt',
        '\nOutput:', f'{base_dir}protein2EC_train.fasta',
    ]
    u.dp(summary)

def make_default_price_fasta():
    """Write the Price test-set sequences to a FASTA file.

    Reads the tab-separated Price test table and writes one ``>Entry`` /
    sequence record per row. All paths are built from the module-level
    ``base_dir``; a summary of the paths used is printed through ``u.dp``.
    """
    df = pd.read_csv(f'{base_dir}splits/task1/price_protein_test.csv', sep='\t')
    # The loop below previously used `entries` and `seqs` without defining
    # them (NameError); take them from the table that was just read, before
    # the output file is opened so a missing column does not leave an empty
    # FASTA behind.
    # NOTE(review): the 'Sequence' column name is assumed to match
    # protein2EC.csv — confirm against price_protein_test.csv.
    entries = df['Entry'].values
    seqs = df['Sequence'].values
    # Write seqs to fasta format
    with open(f'{base_dir}splits/task1/price_protein_test.fasta', 'w') as f:
        for entry, seq in zip(entries, seqs):
            f.write('>{}\n{}\n'.format(entry, seq))
    u.dp(['Default price dataset built using: ', f'{base_dir}splits/task1/price_protein_test.csv',
          '\nOutput:', f'{base_dir}splits/task1/price_protein_test.fasta'])

def get_uniprot2ec():
    """Return a dict mapping each training ``Entry`` to its ``EC number``.

    Only the rows of the protein-to-EC table selected by the task-1 training
    indices file are included. Paths are built from the module-level
    ``base_dir``.
    """
    table = pd.read_csv(f'{base_dir}processed_data/protein2EC.csv')
    row_positions = np.loadtxt(f'{base_dir}splits/task1/protein2EC_train_indices.txt', dtype=int)
    # Positional selection of the training rows.
    train_rows = table.iloc[row_positions]
    ec_by_entry = train_rows.set_index('Entry')['EC number']
    return ec_by_entry.to_dict()

def get_price2ec():
    """Return a dict mapping each Price test ``Entry`` to its ``EC number``.

    The mapping is read from the tab-separated Price test table located under
    the module-level ``base_dir``.
    """
    price_table = pd.read_csv(f'{base_dir}splits/task1/price_protein_test.csv', sep='\t')
    ec_by_entry = price_table.set_index('Entry')['EC number']
    return ec_by_entry.to_dict()
    
def get_default_training_fasta_path():
    """Return the path of the training FASTA file under ``base_dir``."""
    fasta_path = f'{base_dir}protein2EC_train.fasta'
    return fasta_path
    
def get_default_price_fasta_path():
    """Return the path of the Price test FASTA file under ``base_dir``."""
    fasta_path = f'{base_dir}splits/task1/price_protein_test.fasta'
    return fasta_path


## Run ProteInfer

Note: ProteInfer was not re-trained, so the results may be slightly biased.

ProteInfer needs to be installed as per the installation guidelines; the command printed below can then be run from the proteinfer folder:

In [8]:
import os

# Path of the ProteInfer predictions file for the Price test set; it is passed
# as the -o argument below and read back later with pd.read_csv.
protinfer_price = f'{base_dir}output/protinfer_price.tsv'
# Only prints the command line; ProteInfer itself is not executed here.
print(f'python3 proteinfer.py -i {get_default_price_fasta_path()} -o {protinfer_price}')

python3 proteinfer.py -i /disk1/ariane/pycharm/CARE/splits/task1/price_protein_test.fasta -o /disk1/ariane/pycharm/CARE/output/protinfer_price.tsv


## Compute accuracy for the prediction vs the true values for each level

In [17]:
# Load the tab-separated predictions file at `protinfer_price`.
results = pd.read_csv(protinfer_price, sep='\t')
# Labels containing 'EC:' keep the segment following the first ':'
# (e.g. 'EC:4.2.1.43' -> '4.2.1.43'); every other label becomes the
# placeholder string 'None'.
results['predicted_ecs'] = [ec.split(':')[1] if 'EC:' in ec else 'None' for ec in results['predicted_label'].values]
# Remove rows without a predicted EC (placeholder 'None')
results = results[results['predicted_ecs'] != 'None']

# Add the true EC(s) by looking up each sequence name in the Price test mapping.
# NOTE(review): names missing from the mapping would become NaN and break the
# later string handling — confirm every sequence_name is present.
results['true_ecs'] = results['sequence_name'].map(get_price2ec())
# Keep one row per sequence: the last one in the current row order.
# NOTE(review): presumably the last row is the most specific prediction —
# confirm against the ordering of ProteInfer's output.
results = results.drop_duplicates(subset='sequence_name', keep='last')
# Bare expression so the notebook displays the table.
results

Unnamed: 0,sequence_name,predicted_label,confidence,description,predicted_ecs,true_ecs
276,WP_063460136,EC:3.2.1.-,1.00,and S-glycosyl compounds.,3.2.1.-,5.3.1.7
278,WP_063462980,EC:4.2.1.43,0.93,2-dehydro-3-deoxy-L-arabinonate dehydratase.,4.2.1.43,4.2.1.43
281,WP_063462990,EC:1.1.1.-,1.00,With NAD(+) or NADP(+) as acceptor.,1.1.1.-,1.1.1.48
285,WP_041412631,EC:4.2.1.25,0.86,L-arabinonate dehydratase.,4.2.1.25,4.2.1.25
287,WP_011717048,EC:5.1.3.3,0.98,Aldose 1-epimerase.,5.1.3.3,5.1.3.3
...,...,...,...,...,...,...
562,WP_038453710,EC:3.-.-.-,1.00,Hydrolases.,3.-.-.-,5.3.1.7
565,WP_010207013,EC:1.3.8.-,0.99,With a flavin as acceptor.,1.3.8.-,1.3.8.7
567,WP_010207016,EC:1.3.8.-,0.96,With a flavin as acceptor.,1.3.8.-,1.3.8.7
571,WP_010207340,EC:2.6.1.113,1.00,Putrescine--pyruvate transaminase.,2.6.1.113,2.6.1.19


In [18]:
# Print and return the accuracy at EC levels 1-4 for the predictions kept above.
compute_accuracy_baseline1(results['predicted_ecs'].values, results['true_ecs'].values)

[94m--------------------------------------------------------------------------------[0m
[94mAcc level 1:	90.74	
Acc level 2:	75.93	
Acc level 3:	63.89	
Acc level 4:	18.52	 [0m
[94m--------------------------------------------------------------------------------[0m


(0.9074074074074074,
 0.7592592592592593,
 0.6388888888888888,
 0.18518518518518517)