# Install ProteInfer

```
git clone https://github.com/google-research/proteinfer
cd proteinfer
pip3 install -r requirements.txt
python3 install_models.py
```

## Train the model
Unfortunately, insufficient data is provided to train the model.
```
python train.py --data_base_path=./testdata/ \
--label_vocab_path=./data/vocabs/EC.tsv \
--hparams_set=small_test_model \
--output_dir=trained_baseline1
```
Next run the following:

```
python3 proteinfer.py -i /disk1/ariane/pycharm/CARE/CLEAN/app/data/price.fasta -o ~/hemoglobin_predictions.tsv
```


In [2]:
import pandas as pd
from sciutil import SciUtil

# Instantiate the sciutil helper; `u` is not referenced again in the cells shown here.
u = SciUtil()

# Root of the CARE working directory. Paths elsewhere are built by appending
# directly to this string, so the trailing slash must stay.
base_dir = '/disk1/ariane/pycharm/CARE/'

# Folder that receives the formatted ProteInfer result tables.
output_folder = base_dir + 'task1_baselines/results_summary/ProteInfer/'

# Labels of the task 1 test splits.
task1_splits = [
    '30',
    '30-50',
    'price',
    'promiscuous',
]

## Run ProteInfer

Note: ProteInfer was not re-trained, so the results may be slightly biased.

ProteInfer needs to be installed as per the guidelines above; the following commands can then be run from the proteinfer folder:

In [7]:
import os

# CSV files of the task 1 splits (test splits plus the training set). The
# ProteInfer input for each one is the FASTA file with the same stem.
filenames = [f'{base_dir}splits/task1/30-50_protein_test.csv',
             f'{base_dir}splits/task1/30_protein_test.csv',
             f'{base_dir}splits/task1/50-70_protein_test.csv',
             f'{base_dir}splits/task1/70-90_protein_test.csv',
             f'{base_dir}splits/task1/promiscuous_protein_test.csv',
             f'{base_dir}splits/task1/protein_train.csv',
             f'{base_dir}splits/task1/price_protein_test.csv']

# Derive the prediction folder from base_dir instead of repeating the absolute path.
proteinfer_output_dir = f'{base_dir}output/protinfer/'

# Print one ProteInfer command per split; the commands are run manually from
# the proteinfer checkout.
for csv_path in filenames:
    # splitext only touches the extension, unlike str.replace, which would
    # rewrite every ".csv" occurring anywhere in the path.
    fasta_path = os.path.splitext(csv_path)[0] + '.fasta'
    stem = os.path.splitext(os.path.basename(csv_path))[0]
    print(f'python3 proteinfer.py -i {fasta_path} -o {proteinfer_output_dir}{stem}.tsv')

python3 proteinfer.py -i /disk1/ariane/pycharm/CARE/splits/task1/30-50_protein_test.fasta -o /disk1/ariane/pycharm/CARE/output/protinfer/30-50_protein_test.tsv
python3 proteinfer.py -i /disk1/ariane/pycharm/CARE/splits/task1/30_protein_test.fasta -o /disk1/ariane/pycharm/CARE/output/protinfer/30_protein_test.tsv
python3 proteinfer.py -i /disk1/ariane/pycharm/CARE/splits/task1/50-70_protein_test.fasta -o /disk1/ariane/pycharm/CARE/output/protinfer/50-70_protein_test.tsv
python3 proteinfer.py -i /disk1/ariane/pycharm/CARE/splits/task1/70-90_protein_test.fasta -o /disk1/ariane/pycharm/CARE/output/protinfer/70-90_protein_test.tsv
python3 proteinfer.py -i /disk1/ariane/pycharm/CARE/splits/task1/promiscuous_protein_test.fasta -o /disk1/ariane/pycharm/CARE/output/protinfer/promiscuous_protein_test.tsv
python3 proteinfer.py -i /disk1/ariane/pycharm/CARE/splits/task1/protein_train.fasta -o /disk1/ariane/pycharm/CARE/output/protinfer/protein_train.tsv
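python3 proteinfer.py -i /disk1/ariane/pycharm/CARE/splits/task1/price_protein_test.fasta -o /disk1/ariane/pycharm/CARE/output/protinfer/price_protein_test.tsv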

## Format the dataframe

In [3]:
def get_uniprot2ec():
    """
    Return a dict mapping each 'Entry' in processed_data/protein2EC.csv to its 'EC number'.
    """
    csv_path = f'{base_dir}processed_data/protein2EC.csv'
    table = pd.read_csv(csv_path)
    ec_by_entry = table.set_index('Entry')['EC number']
    return ec_by_entry.to_dict()

def get_price2ec():
    """
    Return a dict mapping each 'Entry' in the price test split to its 'EC number'.
    """
    csv_path = f'{base_dir}splits/task1/price_protein_test.csv'
    table = pd.read_csv(csv_path)
    ec_by_entry = table.set_index('Entry')['EC number']
    return ec_by_entry.to_dict()

def get_test_df(label):
    """
    Load the task 1 test split CSV for the given split label as a DataFrame.
    """
    split_path = f'{base_dir}splits/task1/{label}_protein_test.csv'
    return pd.read_csv(split_path)


def get_proteinfer(test_label, save=False):
    """
    Build the per-entry table of ProteInfer EC predictions for one task 1 test split.

    Reads the ProteInfer TSV for ``test_label`` from ``{base_dir}output/protinfer/``
    (columns used: ``sequence_name`` and ``predicted_label``) together with the
    matching test split CSV, and returns one row per entry of the test split.

    Parameters
    ----------
    test_label : str
        Split label such as ``'30'`` or ``'price'``. It selects both
        ``{test_label}_protein_test.tsv`` (predictions) and
        ``{test_label}_protein_test.csv`` (test split).
    save : bool, default False
        When True, the table is also written to
        ``{output_folder}{test_label}_protein_test_results_df.csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Entry``, ``EC number`` and ``Sequence``, followed by integer
        columns ``0 .. max_ecs - 1`` holding the predicted ECs of each entry.
        Every missing value in the table is replaced by the dummy EC ``'0.0.0.0'``.
    """
    results = pd.read_csv(f'{base_dir}output/protinfer/{test_label}_protein_test.tsv', sep='\t')
    # Keep only the part after "EC:"; any other label (including a missing,
    # non-string one) is marked with the string 'None' and filtered out below.
    results['predicted_ecs'] = [
        label.split(':')[1] if isinstance(label, str) and 'EC:' in label else 'None'
        for label in results['predicted_label'].values
    ]
    id2ec = get_price2ec() if test_label == 'price' else get_uniprot2ec()
    results['true_ecs'] = results['sequence_name'].map(id2ec)

    grped = results.groupby('sequence_name')

    # The raw test split drives the output: one row per test entry, in split order.
    test_df = get_test_df(test_label)
    entry_to_ec = dict(zip(test_df['Entry'], test_df['EC number']))
    entry_to_seq = dict(zip(test_df['Entry'], test_df['Sequence']))

    max_ecs = 0
    rows = []
    for query in test_df['Entry'].values:
        try:
            grp = grped.get_group(query)
        except KeyError:
            # No ProteInfer output at all for this entry.
            rows.append([query, entry_to_ec[query], entry_to_seq[query]])
            continue

        # The true EC is the same on every row of the group. If the lookup
        # table has no entry (NaN), fall back to the test split's own EC
        # rather than discarding the predictions.
        true_labels = sorted({ec for ec in grp['true_ecs'].values if isinstance(ec, str)})
        true_ec = ';'.join(true_labels) if true_labels else entry_to_ec[query]

        grp = grp[grp['predicted_ecs'] != 'None']
        # Descending lexicographic order on the EC string, as before. '-' sorts
        # before digits, so if partial labels use '-' placeholders the fully
        # specified EC comes first among labels sharing a prefix.
        grp = grp.sort_values(by='predicted_ecs', ascending=False)
        predicted = list(grp['predicted_ecs'].values)
        max_ecs = max(max_ecs, len(predicted))

        # NOTE(review): Sequence is left empty for entries that have ProteInfer
        # output but is filled in for entries that have none - confirm intent.
        rows.append([query, true_ec, ''] + predicted)

    # Pad every row to the full width so the frame always matches its column
    # list, even when no entry has a prediction or the split is empty.
    columns = ['Entry', 'EC number', 'Sequence'] + list(range(max_ecs))
    padded_rows = [row + [None] * (len(columns) - len(row)) for row in rows]
    new_df = pd.DataFrame(padded_rows, columns=columns)

    # Since we may have no predictions for an entry we add these as a dummy.
    new_df = new_df.fillna('0.0.0.0')
    # Save to a file in the default location
    if save:
        new_df.to_csv(f'{output_folder}{test_label}_protein_test_results_df.csv', index=False)
    return new_df

In [4]:
# Save every task 1 split in the required format. The labels come from
# task1_splits so the list is not repeated here.
for split in task1_splits:
    get_proteinfer(split, save=True)

NameError: name 'get_uniprot2ec' is not defined