## Test ChatGPT's ability to predict the EC number from the reaction and EC term

In [2]:
import os

import numpy as np
import pandas as pd
from openai import OpenAI
from tqdm import tqdm

# Never hard-code credentials in the notebook: take the OpenAI key from the
# OPENAI_API_KEY environment variable, falling back to the empty string.
api_key = os.environ.get('OPENAI_API_KEY', '')
# Root of the repository relative to this notebook; all data paths hang off it.
base_dir = '../../'

## Get the text for the EC numbers

In [3]:
# Load the EC-number -> text table from under base_dir (same location as the
# previously hard-coded '../../processed_data/text2EC.csv').
ec_text_df = pd.read_csv(f'{base_dir}processed_data/text2EC.csv')
# Lookup used later to attach a text annotation to every test reaction.
ec_to_text = dict(zip(ec_text_df['EC number'], ec_text_df['Text']))

## Data input to ChatGPT


Since ChatGPT works best with text, we use the textual form of the reaction (the `Reaction Text` column) rather than the SMILES representation.

In [40]:
import os 

def _chatgpt_messages(query_type, entry, text_annot):
    """Build the system/user chat messages for a single reaction query.

    The prompt fragments are reproduced byte-for-byte (including the missing
    spaces between concatenated sentences) so that re-runs send the same
    prompt as before.
    """
    if query_type == 'reaction':
        intro = "You are protein engineer capable of predicting EC numbers from a reaction that corresponds to a specific enzyme."
        task = "Given a reaction you are able to determine the most likely enzyme class for a reaction."
        user_text = f"Return the most likely EC number for this reaction: {entry}."
    else:
        intro = "You are protein engineer capable of predicting EC numbers from a combination of textual information and a reaction that corresponds to a specific protein."
        task = "Given a reaction and text information of an EC you are able to determine the most likely enzyme class for a reaction."
        user_text = f"Return the most likely EC number annotation for this reaction: {entry}, which associates with the following text: {text_annot}."

    system_text = (
        intro
        + "You are also a skilled programmer and able to execute the code necessary to predict an EC number when you can't use reason alone."
        + task
        + "You don't give up when faced with a reaction you don't know, you will use tools to resolve the most likely enzyme number."
        + "You only return enzyme commission numbers in a comma separated list, no other text is returned, you have failed if you do "
        + " not return the EC numbers. You only return the most likely EC number."
    )
    return [
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ]


def get_ChatGPT(df, split, query_type='reaction', output_folder='.', save=True, api_key=None, subsample=None):
    """
    Query ChatGPT for the most likely EC number of every reaction in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Reaction Text' column; a 'Text' column is additionally
        required when ``query_type == 'reaction+text'``.
    split : str
        Name of the data split; only used to build the output file name.
    query_type : {'reaction', 'reaction+text'}
        'reaction' prompts with the reaction text only, 'reaction+text' also
        includes the text annotation from the 'Text' column.
    output_folder : str
        Directory the results CSV is written to (created if missing).
    save : bool
        When True, write ``{split}_reaction_test_results_df.csv`` to
        ``output_folder``.
    api_key : str or None
        OpenAI API key. ``None`` or an empty string lets the OpenAI client
        fall back to its own default (the OPENAI_API_KEY environment variable).
    subsample : optional
        Accepted for interface compatibility; currently ignored.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with a new column '0' that
        holds the first comma-separated prediction returned for each row.

    Raises
    ------
    ValueError
        If ``query_type`` is not one of the supported values.
    KeyError
        If a required column is missing from ``df``.
    """
    # Fail fast: previously an unknown query_type left `completion` unbound.
    if query_type not in ('reaction', 'reaction+text'):
        raise ValueError(
            f"Unsupported query_type {query_type!r}; expected 'reaction' or 'reaction+text'."
        )

    # Resolve the inputs before creating the client so missing columns surface
    # immediately and no API call is made on malformed input.
    reactions = df['Reaction Text'].tolist()
    if query_type == 'reaction+text':
        annotations = df['Text'].tolist()
    else:
        annotations = [None] * len(reactions)

    # An empty key can never authenticate; pass None so the client can use its
    # environment-variable fallback instead.
    client = OpenAI(api_key=api_key or None)

    rows = []
    for entry, text_annot in tqdm(zip(reactions, annotations), total=len(reactions)):
        completion = client.chat.completions.create(
            model='gpt-4o-mini',
            messages=_chatgpt_messages(query_type, entry, text_annot),
        )
        # The message content may be None (e.g. a refusal); treat it as empty.
        content = completion.choices[0].message.content or ''
        # Only ever take the first one
        rows.append(content.replace(" ", "").split(',')[0])
    df['0'] = rows

    # Save to a file in the default location
    if save:
        if output_folder:
            os.makedirs(output_folder, exist_ok=True)
        df.to_csv(os.path.join(output_folder, f'{split}_reaction_test_results_df.csv'), index=False)
    return df

In [41]:
# Baseline run: prompt with the reaction text plus the EC text annotation.
splits = ['easy', 'medium', 'hard']
filenames = {name: f'{base_dir}splits/task2/{name}_reaction_test.csv' for name in splits}
# Save in the required format
output_folder = f'{base_dir}task2_baselines/results_summary/ChatGPT_reaction/'
for split in splits:
    df = pd.read_csv(filenames[split])
    # ECs without an entry in ec_to_text get None as their annotation.
    df['Text'] = [ec_to_text.get(ec) for ec in df['EC number'].values]
    # Use the key from the config cell rather than a literal in this cell.
    gpt_df = get_ChatGPT(df, split=split, query_type='reaction+text',
                         api_key=api_key,
                         output_folder=output_folder)

100%|██████████| 393/393 [03:15<00:00,  2.01it/s]
100%|██████████| 393/393 [03:24<00:00,  1.92it/s]
100%|██████████| 460/460 [03:53<00:00,  1.97it/s]


In [38]:
# Baseline run: prompt with the reaction text only.
splits = ['easy', 'medium', 'hard']
filenames = {name: f'{base_dir}splits/task2/{name}_reaction_test.csv' for name in splits}
# Save in the required format
output_folder = f'{base_dir}task2_baselines/results_summary/ChatGPT/'
for split in splits:
    df = pd.read_csv(filenames[split])
    # ECs without an entry in ec_to_text get None as their annotation.
    df['Text'] = [ec_to_text.get(ec) for ec in df['EC number'].values]
    # Use the key from the config cell rather than a literal in this cell.
    gpt_df = get_ChatGPT(df, split=split, query_type='reaction',
                         api_key=api_key,
                         output_folder=output_folder)

100%|██████████| 393/393 [03:29<00:00,  1.87it/s]
100%|██████████| 393/393 [03:25<00:00,  1.92it/s]
100%|██████████| 460/460 [04:04<00:00,  1.88it/s]
