# Install protinfer 

```
git clone https://github.com/google-research/proteinfer
cd ~/proteinfer
pip3 install -r requirements.txt
python3 install_models.py
```

## Train the model
Unfortunatly insufficient data is provided to train the model.
```
python train.py --data_base_path=./testdata/ \
--label_vocab_path=./data/vocabs/EC.tsv \
--hparams_set=small_test_model \
--output_dir=trained_baseline1
```
Next run the following:

```
python3 proteinfer.py -i /disk1/ariane/pycharm/CARE/CLEAN/app/data/price.fasta -o ~/hemoglobin_predictions.tsv
```


In [33]:
from sciutil import SciUtil
u = SciUtil()

base_dir = '/disk1/ariane/pycharm/CARE/'
output_folder = f'{base_dir}task1_baselines/results_summary/ProteInfer/'
task1_splits = ['30', '30-50', 'price', 'promiscuous']

def get_test_fasta(label):
    if label == 'price':
        return get_default_price_fasta_path()
    elif label == '30':
        return get_validation30()
    elif label == '30-50':
        return get_validation50()
    elif label == 'promiscuous':
        return get_promisc()
    else:
        print(f'{label} not a valid dataset select one of ' + ' '.join(['30', '30-50', 'price', 'promiscuous']))
    
def compute_accuracy_baseline1(predicted_ecs, true_ecs):
    pred_level1 = np.zeros(len(predicted_ecs))
    pred_level2 = np.zeros(len(predicted_ecs))
    pred_level3 = np.zeros(len(predicted_ecs))
    pred_level4 = np.zeros(len(predicted_ecs))

    # Basically given we have a highly multiclass problem doing precision recall probably doesn't make sense
    for i, predicted_ec in enumerate(predicted_ecs):
        true_ec = true_ecs[i]
        # First check level 1
        pred1 = predicted_ec.split('.')[0]
        true_ecs1 = [ec.split('.')[0] for ec in true_ec.split(';')]
        if pred1 in true_ecs1:
            pred_level1[i] = 1

        # Do the same for each other level
        pred2 = predicted_ec.split('.')[1]
        true_ecs2 = [ec.split('.')[1] for ec in true_ec.split(';')]
        if pred2 in true_ecs2:
            # Check also that pred1 was correct
            if pred_level1[i] == 1:
                pred_level2[i] = 1
        
        # Do the same for each other level
        pred3 = predicted_ec.split('.')[2]
        true_ecs3 = [ec.split('.')[2] for ec in true_ec.split(';')]
        if pred3 in true_ecs3:
            # Check previous levels were correct
            if pred_level1[i] == 1 and pred_level2[i] == 1:
                pred_level3[i] = 1
        
        # Do the same for each other level
        pred4 = predicted_ec.split('.')[3]
        true_ecs4 = [ec.split('.')[3] for ec in true_ec.split(';')]
        if pred4 in true_ecs4:
            if pred_level1[i] == 1 and pred_level2[i] == 1 and pred_level3[i] == 1:
                pred_level4[i] = 1
    # Print out the accuracy
    u.dp(['Acc level 1:', round(np.mean(pred_level1)*100, 2), 
          '\nAcc level 2:', round(np.mean(pred_level2)*100, 2), 
          '\nAcc level 3:', round(np.mean(pred_level3)*100, 2), 
          '\nAcc level 4:', round(np.mean(pred_level4)*100, 2)])
    
    return np.mean(pred_level1), np.mean(pred_level2), np.mean(pred_level3), np.mean(pred_level4)

def make_fastas():
    filenames = [f'{base_dir}splits/task1/30-50_protein_test.csv', 
                 f'{base_dir}splits/task1/30_protein_test.csv', 
                 f'{base_dir}splits/task1/50-70_protein_test.csv',
                 f'{base_dir}splits/task1/70-90_protein_test.csv',
                 f'{base_dir}splits/task1/promiscuous_protein_test.csv',
                 f'{base_dir}splits/task1/protein_train.csv',
                 f'{base_dir}splits/task1/price_protein_test.csv']

    for filename in filenames:
        with open(filename.replace('.csv', '.fasta'), 'w') as f:
            df = pd.read_csv(filename)
            for entry, seq in df[['Entry', 'Sequence']].values:
                f.write('>{}\n{}\n'.format(entry, seq))
                     
def get_uniprot2ec():
    swissprot = pd.read_csv(f'{base_dir}processed_data/protein2EC.csv')
    id2ec = swissprot.set_index('Entry')['EC number'].to_dict()
    return id2ec

def get_price2ec():
    df = pd.read_csv(f'{base_dir}splits/task1/price_protein_test.csv')
    id2ec = df.set_index('Entry')['EC number'].to_dict()
    return id2ec
    
def get_default_training_fasta_path():
    return f'{base_dir}splits/task1/protein_train.fasta'
    
def get_default_price_fasta_path():
    return f'{base_dir}splits/task1/price_protein_test.fasta'

def get_validation30():
    return f'{base_dir}splits/task1/30_protein_test.fasta'

def get_validation50():
    return f'{base_dir}splits/task1/30-50_protein_test.fasta'
    
def get_validation70():
    return f'{base_dir}splits/task1/50-70_protein_test.fasta'

def get_validation90():
    return f'{base_dir}splits/task1/70-90_protein_test.fasta'

def get_promisc():
    return f'{base_dir}splits/task1/promiscuous_protein_test.fasta'

## Run protinfer

Note protinfer was not re-trained so may be slightly biased.

Protinfer needs to be installed as per the guidelines and then the following command can be run from the protinfer folder:

In [22]:
import os

filenames = [f'{base_dir}splits/task1/30-50_protein_test.csv', 
             f'{base_dir}splits/task1/30_protein_test.csv', 
             f'{base_dir}splits/task1/50-70_protein_test.csv',
             f'{base_dir}splits/task1/70-90_protein_test.csv',
             f'{base_dir}splits/task1/promiscuous_protein_test.csv',
             f'{base_dir}splits/task1/protein_train.csv',
             f'{base_dir}splits/task1/price_protein_test.csv']
for f in filenames:
    print(f'python3 proteinfer.py -i {f.replace(".csv", ".fasta")} -o /disk1/ariane/pycharm/CARE/output/protinfer/{f.split("/")[-1].replace(".csv", ".tsv")}')

python3 proteinfer.py -i /disk1/ariane/pycharm/CARE/splits/task1/30-50_protein_test.fasta -o /disk1/ariane/pycharm/CARE/output/protinfer/30-50_protein_test.tsv
python3 proteinfer.py -i /disk1/ariane/pycharm/CARE/splits/task1/30_protein_test.fasta -o /disk1/ariane/pycharm/CARE/output/protinfer/30_protein_test.tsv
python3 proteinfer.py -i /disk1/ariane/pycharm/CARE/splits/task1/50-70_protein_test.fasta -o /disk1/ariane/pycharm/CARE/output/protinfer/50-70_protein_test.tsv
python3 proteinfer.py -i /disk1/ariane/pycharm/CARE/splits/task1/70-90_protein_test.fasta -o /disk1/ariane/pycharm/CARE/output/protinfer/70-90_protein_test.tsv
python3 proteinfer.py -i /disk1/ariane/pycharm/CARE/splits/task1/promiscuous_protein_test.fasta -o /disk1/ariane/pycharm/CARE/output/protinfer/promiscuous_protein_test.tsv
python3 proteinfer.py -i /disk1/ariane/pycharm/CARE/splits/task1/protein_train.fasta -o /disk1/ariane/pycharm/CARE/output/protinfer/protein_train.tsv
python3 proteinfer.py -i /disk1/ariane/pycha

In [32]:
pd.read_csv('/disk1/ariane/pycharm/CARE/output/protinfer/70-90_protein_test.tsv', sep='\t')

Unnamed: 0,sequence_name,predicted_label,confidence,description
0,Q9HJ16,Pfam:CL0224,1.00,DHQS
1,Q9HJ16,Pfam:PF13685,1.00,Iron-containing alcohol dehydrogenase
2,Q9Z4J7,Pfam:CL0186,1.00,Beta_propeller
3,Q9Z4J7,Pfam:PF01011,1.00,PQQ enzyme repeat
4,Q9Z4J7,Pfam:PF13360,1.00,PQQ-like domain
...,...,...,...,...
10822,Q8XZX8,GO:0022857,1.00,transmembrane transporter activity
10823,Q8XZX8,GO:0006810,1.00,transport
10824,Q8XZX8,GO:0005215,1.00,transporter activity
10825,Q8XZX8,GO:0016020,0.83,membrane


## Format the dataframe

In [47]:

def get_proteinfer(test_label, save=False):
    """
    Gets the results for a series of ECs and formats it correctly for the paper
    """
    # Lets also look at the protein our query is the query genome and our database is going to be ecoli.
    # ToDo: make more modula.
    results = pd.read_csv(f'/disk1/ariane/pycharm/CARE/output/protinfer/{test_label}_protein_test.tsv', sep='\t')
    results['predicted_ecs'] = [ec.split(':')[1] if 'EC:' in ec else 'None' for ec in results['predicted_label'].values]
    if test_label == 'price':
        results['true_ecs'] = results['sequence_name'].map(get_price2ec())
    else:
        results['true_ecs'] = results['sequence_name'].map(get_uniprot2ec())

    grped = results.groupby('sequence_name')
    max_ecs = 0
    rows = []
    for query, grp in grped:
        # Always will be the same for the grouped 
        true_ec = grp['true_ecs'].values[0]
        # Filter to only include rows which were not null
        grp = grp[~grp['predicted_ecs'].isna()]
        grp = grp[grp['predicted_ecs'] != 'None']
        grp = grp.sort_values(by='predicted_ecs', ascending=False)

        if len(list(grp['predicted_ecs'].values)) > max_ecs:
            max_ecs = len(list(grp['predicted_ecs'].values))
        if len(list(grp['predicted_ecs'].values)) == 0:
            rows.append([query, true_ec, '', None])
        else:
            rows.append([query, true_ec, ''] + list(grp['predicted_ecs'].values))
    new_df = pd.DataFrame(rows)
    new_df.columns = ['Entry', 'EC number', 'Sequence'] + list(range(0, max_ecs))

    # Since we may have no similar ones we'll add in these as a dummy
    new_df = new_df.fillna('0.0.0.0')
    # Save to a file in the default location
    if save:
        new_df.to_csv(f'{output_folder}{test_label}_protein_test_results_df.csv', index=False)
    return new_df

In [48]:
# Save in the required format
for split in ['30', '30-50', 'price', 'promiscuous']:
    get_proteinfer(split, save=True)

## Compute accuracy for the prediction vs the true values for each level

In [29]:
filenames = [f'{base_dir}splits/task1/30-50_protein_test.csv', 
             f'{base_dir}splits/task1/30_protein_test.csv', 
             f'{base_dir}splits/task1/50-70_protein_test.csv',
             f'{base_dir}splits/task1/70-90_protein_test.csv',
             f'{base_dir}splits/task1/promiscuous_protein_test.csv',
             f'{base_dir}splits/task1/price_protein_test.csv']


for f in filenames:
    u.dp(['Test set:', f.split('/')[-1]])
    results = pd.read_csv(f'/disk1/ariane/pycharm/CARE/output/protinfer/{f.split("/")[-1].replace(".csv", ".tsv")}', sep='\t')
    results['predicted_ecs'] = [ec.split(':')[1] if 'EC:' in ec else 'None' for ec in results['predicted_label'].values]
    # Remove ones without a predicted ec
    results = results[results['predicted_ecs'] != 'None']
    
    # Add in the actual ec
    if 'price' in f:
        results['true_ecs'] = results['sequence_name'].map(get_price2ec())
    else:
        results['true_ecs'] = results['sequence_name'].map(get_uniprot2ec())
        
    results = results.drop_duplicates(subset='sequence_name', keep='last')
    compute_accuracy_baseline1(results['predicted_ecs'].values, results['true_ecs'].values)
    

[94m--------------------------------------------------------------------------------[0m
[94m                       Test set:	30-50_protein_test.csv	                        [0m
[94m--------------------------------------------------------------------------------[0m
[94m--------------------------------------------------------------------------------[0m
[94mAcc level 1:	96.15	
Acc level 2:	92.86	
Acc level 3:	90.11	
Acc level 4:	84.07	 [0m
[94m--------------------------------------------------------------------------------[0m
[94m--------------------------------------------------------------------------------[0m
[94m                         Test set:	30_protein_test.csv	                         [0m
[94m--------------------------------------------------------------------------------[0m
[94m--------------------------------------------------------------------------------[0m
[94m Acc level 1:	96.95	
Acc level 2:	91.6	
Acc level 3:	87.79	
Acc level 4:	81.68	 [0m
[94m-----

In [26]:
compute_accuracy_baseline1(results['predicted_ecs'].values, results['true_ecs'].values)

[94m--------------------------------------------------------------------------------[0m
[94mAcc level 1:	96.15	
Acc level 2:	92.86	
Acc level 3:	90.11	
Acc level 4:	84.07	 [0m
[94m--------------------------------------------------------------------------------[0m


(0.9615384615384616,
 0.9285714285714286,
 0.9010989010989011,
 0.8406593406593407)