## Test ChatGPT's ability to predict EC number from reaction and EC term

In [5]:
# Imports and configuration for the ChatGPT baseline
import os

import numpy as np
import pandas as pd
from openai import OpenAI
from tqdm import tqdm
# Read the OpenAI key from the environment rather than hard-coding it in the
# notebook, so the secret is never committed; falls back to '' as before.
api_key = os.environ.get('OPENAI_API_KEY', '')
# Root of the CARE checkout; the split files and result folders are built from it.
base_dir = '/disk1/ariane/pycharm/CARE/'

In [8]:
# Load the EC-number -> descriptive-text table and turn it into a plain dict
# lookup (if an EC number is listed more than once, the last row wins).
ec_to_text = pd.read_csv('../../processed_data/text2EC.csv')
ec_to_text = {
    ec_number: description
    for ec_number, description in zip(ec_to_text['EC number'], ec_to_text['Text'])
}
ec_to_text

{'1.1.1.1': 'oxidoreductase; oxidoreductase, acting on CH-OH group of donors; oxidoreductase, acting on the CH-OH group of donors, NAD or NADP as acceptor; alcohol dehydrogenase (NAD+)',
 '1.1.1.10': 'oxidoreductase; oxidoreductase, acting on CH-OH group of donors; oxidoreductase, acting on the CH-OH group of donors, NAD or NADP as acceptor; L-xylulose reductase (NADPH)',
 '1.1.1.100': 'oxidoreductase; oxidoreductase, acting on CH-OH group of donors; oxidoreductase, acting on the CH-OH group of donors, NAD or NADP as acceptor; 3-oxoacyl-[acyl-carrier-protein] reductase (NADPH)',
 '1.1.1.101': 'oxidoreductase; oxidoreductase, acting on CH-OH group of donors; oxidoreductase, acting on the CH-OH group of donors, NAD or NADP as acceptor; 1-acyl dihydroxyacetone phosphate reductase',
 '1.1.1.102': 'oxidoreductase; oxidoreductase, acting on CH-OH group of donors; oxidoreductase, acting on the CH-OH group of donors, NAD or NADP as acceptor; 3-dehydrosphinganine reductase',
 '1.1.1.103': 'oxid

In [29]:
# Test-split label -> path of the task2 reaction test CSV for that split.
filenames = {
    label: f'{base_dir}splits/task2/{label}_reaction_test.csv'
    for label in ('easy', 'medium', 'hard')
}

def get_ChatGPT(test_label, n=10, save=False, model='gpt-4'):
    """
    Gets the results for a series of ECs and formats it correctly for the paper.

    Every row of the chosen test split is sent to the chat model together with
    the text annotation of its true EC number, asking for the top ``n`` EC
    numbers. The replies are reshaped into one row per distinct reaction.

    Parameters
    ----------
    test_label : str
        Key of the module-level ``filenames`` dict selecting the split CSV.
    n : int
        Number of EC numbers requested from the model in the prompt.
    save : bool
        If True, also write the result to
        ``{output_folder}{test_label}_reaction_test_results_df.csv``.
    model : str
        Chat model name passed to the OpenAI client (default ``'gpt-4'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``Entry`` (the ``Reaction`` value), ``EC number`` (true EC),
        ``Text`` (annotation), followed by integer columns ``0..k-1`` holding
        the predictions in the order the model returned them. Missing cells
        are filled with ``'0.0.0.0'``; a reaction with no usable prediction
        gets ``''`` in column ``0``.

    Raises
    ------
    ValueError
        If ``test_label`` is not a key of ``filenames``.
    """
    path = filenames.get(test_label)
    if path is None:
        raise ValueError(
            f"Unknown test_label {test_label!r}; expected one of {sorted(filenames)}"
        )

    client = OpenAI(api_key=api_key)
    df = pd.read_csv(path)
    df['Text'] = [ec_to_text.get(ec) for ec in df['EC number'].values]

    # NOTE: the sentences are concatenated without separating spaces. The prompt
    # is kept byte-identical so new runs stay comparable with earlier results.
    system_prompt = (
        "You are protein engineer capable of predicting EC numbers from a combination of textual information and a reaction that corresponds to a specific protein."
        "You are also a skilled programmer and able to execute the code necessary to predict an EC number when you can't use reason alone."
        "Given a reaction and text information of an EC you are able to determine the most likely enzyme class for a reaction."
        "You don't give up when faced with a reaction you don't know, you will use tools to resolve the most likely enzyme number."
        "You only return enzyme commission numbers in a comma separated list, no other text is returned, you have failed if you do "
        " not return the EC numbers. You only return the exact number of EC numbers that a user has provided requested, ordered by their likelihood of being correct."
    )

    rows = []
    for entry, true_ec, text_annot, reaction in tqdm(df[['Reaction Text', 'EC number', 'Text', 'Reaction']].values):
        text = f"Return the top {n} most likely EC numbers as a comma separated list for this reaction: {entry}, which associates with the following text: {text_annot}."
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text},
            ],
        )  # Costs you ~1c per query
        # The API may return no content at all; treat that as "no predictions".
        content = completion.choices[0].message.content or ''
        # Split on commas and on any whitespace (spaces, newlines, tabs) and
        # drop empty tokens, so stray separators never become predictions.
        preds = content.replace(',', ' ').split()
        if not preds:
            # Placeholder keeps the query in the output; it is filtered out below.
            preds = [None]
        for p in preds:
            rows.append([reaction, true_ec, p, text_annot])

    # Passing the columns to the constructor also works when there are no rows.
    results = pd.DataFrame(rows, columns=['entry', 'true_ecs', 'predicted_ecs', 'seq'])

    max_ecs = 0
    rows = []
    for query, grp in results.groupby('entry'):
        # Always will be the same for the grouped
        true_ec = grp['true_ecs'].values[0]
        seq = grp['seq'].values[0]
        # Filter to only include predictions which were not null
        ranked = grp['predicted_ecs']
        ranked = ranked[ranked.notna() & (ranked != 'None')]
        # Rows inside a group keep their insertion order, i.e. the model's
        # likelihood ranking. Do NOT sort here: sorting the EC strings would
        # scramble the ranking that top-k evaluation relies on.
        predicted = list(ranked.values) or ['']
        max_ecs = max(max_ecs, len(predicted))
        rows.append([query, true_ec, seq] + predicted)

    # Pad every row to the same width so the frame is rectangular by construction.
    width = 3 + max_ecs
    rows = [row + [None] * (width - len(row)) for row in rows]
    new_df = pd.DataFrame(rows, columns=['Entry', 'EC number', 'Text'] + list(range(0, max_ecs)))

    # Since we may have no similar ones we'll add in these as a dummy
    new_df = new_df.fillna('0.0.0.0')
    # Save to a file in the default location
    if save:
        new_df.to_csv(f'{output_folder}{test_label}_reaction_test_results_df.csv', index=False)
    return new_df


output_folder = f'{base_dir}task2_baselines/results_summary/ChatGPT/'
# Create the results folder up front: otherwise a missing directory only
# surfaces when the CSV is written, after all the paid API calls have run.
os.makedirs(output_folder, exist_ok=True)
# Save in the required format
for split in ['easy', 'medium', 'hard']:
    get_ChatGPT(split, save=True)
    print("done")

  0%|          | 0/177 [00:00<?, ?it/s]

100%|██████████| 177/177 [15:08<00:00,  5.13s/it]


done


100%|██████████| 177/177 [15:23<00:00,  5.22s/it]


done


100%|██████████| 163/163 [13:39<00:00,  5.03s/it]

done





In [30]:
# Also do chatGPT with only the reaction information

# Test-split label -> path of the task2 reaction test CSV for that split.
filenames = {
    label: f'{base_dir}splits/task2/{label}_reaction_test.csv'
    for label in ('easy', 'medium', 'hard')
}

def get_ChatGPT(test_label, n=10, save=False, model='gpt-4'):
    """
    Gets the results for a series of ECs and formats it correctly for the paper.

    Reaction-only variant: every row of the chosen test split is sent to the
    chat model with just its reaction text, asking for the top ``n`` EC
    numbers. The replies are reshaped into one row per distinct reaction.

    Parameters
    ----------
    test_label : str
        Key of the module-level ``filenames`` dict selecting the split CSV.
    n : int
        Number of EC numbers requested from the model in the prompt.
    save : bool
        If True, also write the result to
        ``{output_folder}{test_label}_reaction_test_results_df.csv``.
    model : str
        Chat model name passed to the OpenAI client (default ``'gpt-4'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``Entry`` (the ``Reaction`` value), ``EC number`` (true EC),
        ``Text`` (annotation of the true EC; stored but not sent to the
        model), followed by integer columns ``0..k-1`` holding the predictions
        in the order the model returned them. Missing cells are filled with
        ``'0.0.0.0'``; a reaction with no usable prediction gets ``''`` in
        column ``0``.

    Raises
    ------
    ValueError
        If ``test_label`` is not a key of ``filenames``.
    """
    path = filenames.get(test_label)
    if path is None:
        raise ValueError(
            f"Unknown test_label {test_label!r}; expected one of {sorted(filenames)}"
        )

    client = OpenAI(api_key=api_key)
    df = pd.read_csv(path)
    df['Text'] = [ec_to_text.get(ec) for ec in df['EC number'].values]

    # NOTE: the sentences are concatenated without separating spaces. The prompt
    # is kept byte-identical so new runs stay comparable with earlier results.
    system_prompt = (
        "You are protein engineer capable of predicting EC numbers from a reaction that corresponds to a specific enzyme."
        "You are also a skilled programmer and able to execute the code necessary to predict an EC number when you can't use reason alone."
        "Given a reaction you are able to determine the most likely enzyme class for a reaction."
        "You don't give up when faced with a reaction you don't know, you will use tools to resolve the most likely enzyme number."
        "You only return enzyme commission numbers in a comma separated list, no other text is returned, you have failed if you do "
        " not return the EC numbers. You only return the exact number of EC numbers that a user has provided requested, ordered by their likelihood of being correct."
    )

    rows = []
    for entry, true_ec, text_annot, reaction in tqdm(df[['Reaction Text', 'EC number', 'Text', 'Reaction']].values):
        text = f"Return the top {n} most likely EC numbers as a comma separated list for this reaction: {entry}."
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text},
            ],
        )  # Costs you ~1c per query
        # The API may return no content at all; treat that as "no predictions".
        content = completion.choices[0].message.content or ''
        # Split on commas and on any whitespace (spaces, newlines, tabs) and
        # drop empty tokens, so stray separators never become predictions.
        preds = content.replace(',', ' ').split()
        if not preds:
            # Placeholder keeps the query in the output; it is filtered out below.
            preds = [None]
        for p in preds:
            rows.append([reaction, true_ec, p, text_annot])

    # Passing the columns to the constructor also works when there are no rows.
    results = pd.DataFrame(rows, columns=['entry', 'true_ecs', 'predicted_ecs', 'seq'])

    max_ecs = 0
    rows = []
    for query, grp in results.groupby('entry'):
        # Always will be the same for the grouped
        true_ec = grp['true_ecs'].values[0]
        seq = grp['seq'].values[0]
        # Filter to only include predictions which were not null
        ranked = grp['predicted_ecs']
        ranked = ranked[ranked.notna() & (ranked != 'None')]
        # Rows inside a group keep their insertion order, i.e. the model's
        # likelihood ranking. Do NOT sort here: sorting the EC strings would
        # scramble the ranking that top-k evaluation relies on.
        predicted = list(ranked.values) or ['']
        max_ecs = max(max_ecs, len(predicted))
        rows.append([query, true_ec, seq] + predicted)

    # Pad every row to the same width so the frame is rectangular by construction.
    width = 3 + max_ecs
    rows = [row + [None] * (width - len(row)) for row in rows]
    new_df = pd.DataFrame(rows, columns=['Entry', 'EC number', 'Text'] + list(range(0, max_ecs)))

    # Since we may have no similar ones we'll add in these as a dummy
    new_df = new_df.fillna('0.0.0.0')
    # Save to a file in the default location
    if save:
        new_df.to_csv(f'{output_folder}{test_label}_reaction_test_results_df.csv', index=False)
    return new_df


output_folder = f'{base_dir}task2_baselines/results_summary/ChatGPT_reaction/'
# Create the results folder up front: otherwise a missing directory only
# surfaces when the CSV is written, after all the paid API calls have run.
os.makedirs(output_folder, exist_ok=True)
# Save in the required format
for split in ['easy', 'medium', 'hard']:
    get_ChatGPT(split, save=True)
    print("done")

100%|██████████| 177/177 [14:58<00:00,  5.07s/it]


done


100%|██████████| 177/177 [15:16<00:00,  5.18s/it]


done


100%|██████████| 163/163 [13:27<00:00,  4.95s/it]

done



