## Test ChatGPT's ability to predict EC numbers

In [11]:
from openai import OpenAI
import pandas as pd

# User-specific settings -- update both values as needed before running.
# `base_dir` is prepended directly to the relative paths used below, so it must end with '/'.
base_dir = "../"
# Key passed to the OpenAI client; intentionally left blank here.
api_key = ""

In [20]:

# Maps each test-split label to its CSV under `{base_dir}splits/task1/`.
filenames = {
    label: f'{base_dir}splits/task1/{label}_protein_test.csv'
    for label in ('30', '30-50', 'price', 'promiscuous')
}

def get_ChatGPT(test_label, n=10, save=False, model="gpt-4"):
    """
    Ask a ChatGPT model for EC number predictions on one test split and format
    the results for the paper.

    Every sequence in the split's CSV is sent to the chat completions API in its
    own request. The comma separated reply is split into individual EC numbers,
    which are laid out one per column in the order the model returned them, so
    column ``0`` holds the model's first (most likely) prediction.

    Parameters
    ----------
    test_label : str
        Key of the module-level ``filenames`` dict selecting the split to run.
    n : int, default 10
        Number of EC numbers requested from the model for each sequence. The
        model is not forced to honour it, so a reply may hold more or fewer.
    save : bool, default False
        If True, write the result to
        ``{output_folder}{test_label}_protein_test_results_df.csv``.
    model : str, default "gpt-4"
        Name of the OpenAI chat model to query.

    Returns
    -------
    pandas.DataFrame
        One row per entry (ordered by entry, as produced by ``groupby``) with the
        columns ``Entry``, ``EC number``, ``Sequence`` followed by integer
        columns ``0 .. k-1`` holding the predictions, where ``k`` is the largest
        number of predictions obtained for any entry. Missing values are filled
        with the dummy EC number ``'0.0.0.0'``.

    Raises
    ------
    ValueError
        If ``test_label`` is not a key of ``filenames``.
    """
    # ToDo: make more modular.
    if test_label not in filenames:
        raise ValueError(
            f"Unknown test_label {test_label!r}; expected one of {sorted(filenames)}"
        )

    # The system prompt is identical for every request, so build it once.
    # The text is kept exactly as it was (typos included) so that results
    # remain comparable with earlier runs.
    system_prompt = (
        "You are protein engineer capable of predicting EC numbers from a protein seqeunce alone."
        + "You are also a skilled programmer and able to execute the code necessary to predict an EC number when you can't use reason alone."
        + "Given a protein sequence you are able to determine the most likely enzyme class for a seqeunce."
        + "You don't give up when faced with a sequence you don't know, you will use tools to resolve the most likely enzyme sequence."
        + "You only return enzyme commission numbers in a comma separated list, no other text is returned, you have failed if you do "
        + " not return the EC numbers. You only return the exact number of EC numbers that a user has provided requested, ordered by their likelihood of being correct."
    )

    client = OpenAI(api_key=api_key)
    df = pd.read_csv(filenames[test_label])

    rows = []
    for entry, true_ec, seq in df[['Entry', 'EC number', 'Sequence']].values:
        text = f"Return the top {n} most likely EC numbers as a comma separated list for this enzyme sequence: {seq}"

        # Costs you ~1c per query
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text},
            ],
        )
        # The message content can be None; treat that like an empty reply
        # instead of failing on `.replace`.
        content = completion.choices[0].message.content or ""
        preds = [p.strip() for p in content.replace(" ", "").split(',')]
        # Drop blank tokens (empty reply, trailing comma, stray newlines) and
        # the literal 'None' so they are not recorded as predictions.
        preds = [p for p in preds if p and p != 'None']
        # A placeholder row keeps entries without any usable prediction in the output.
        for p in preds or [None]:
            rows.append([entry, true_ec, p, seq])

    # Passing `columns` to the constructor also works when `rows` is empty.
    results = pd.DataFrame(rows, columns=['entry', 'true_ecs', 'predicted_ecs', 'seq'])

    max_ecs = 0
    per_entry = []
    for query, grp in results.groupby('entry'):
        # The true EC and the sequence are the same for every row of an entry.
        meta = [query, grp['true_ecs'].values[0], grp['seq'].values[0]]
        # groupby keeps the original row order inside each group, which is the
        # order the model ranked its predictions in. Do NOT sort here: sorting
        # the EC strings would throw that ranking away.
        ecs = grp['predicted_ecs'].dropna().tolist()
        max_ecs = max(max_ecs, len(ecs))
        per_entry.append((meta, ecs))

    # Pad every row to the same width with the dummy EC number so the row
    # length always matches the column list below.
    table = [meta + ecs + ['0.0.0.0'] * (max_ecs - len(ecs)) for meta, ecs in per_entry]
    new_df = pd.DataFrame(table, columns=['Entry', 'EC number', 'Sequence'] + list(range(max_ecs)))

    # Any remaining missing values (e.g. a missing true EC number in the input)
    # are replaced with the same dummy.
    new_df = new_df.fillna('0.0.0.0')
    # Save to a file in the default location
    if save:
        new_df.to_csv(f'{output_folder}{test_label}_protein_test_results_df.csv', index=False)
    return new_df


# Folder that get_ChatGPT(..., save=True) writes each split's results CSV into.
output_folder = f'{base_dir}task1_baselines/results_summary/ChatGPT/'

# Run the baseline on each listed split and save the output in the required format.
# NOTE(review): the '30' split defined in `filenames` is not run here -- confirm this is intended.
for split in ('30-50', 'price', 'promiscuous'):
    get_ChatGPT(test_label=split, save=True)
    print("Done split")

Done split
Done split
Done split
