## Test chatGPTs ability to predict EC number from reaction and EC term

In [11]:
# Ensure NLTK Wordnet is downloaded
from openai import OpenAI
import numpy as np
import pandas as pd
from tqdm import tqdm
api_key = ''
base_dir = '/disk1/ariane/pycharm/CARE/'

In [12]:
ec_to_text = pd.read_csv('../../processed_data/text2EC.csv')
ec_to_text = dict(zip(ec_to_text['EC number'], ec_to_text['Text']))

In [13]:
filenames = {'easy': f'{base_dir}splits/task2/easy_reaction_test.csv',
             'medium': f'{base_dir}splits/task2/medium_reaction_test.csv',  
             'hard': f'{base_dir}splits/task2/hard_reaction_test.csv',
            }

def get_ChatGPT(test_label, n=10, save=False):
    """
    Gets the results for a series of ECs and formats it correctly for the paper
    """
    # Lets also look at the protein our query is the query genome and our database is going to be ecoli.
    # ToDo: make more modula.
    client = OpenAI(api_key=api_key)
    df = pd.read_csv(filenames.get(test_label))
    df['Text'] = [ec_to_text.get(ec) for ec in df['EC number'].values]
    
    rows = []
    for entry, true_ec, text_annot, reaction in tqdm(df[['Reaction Text', 'EC number', 'Text', 'Reaction']].values):
        text = f"Return the top {n} most likely EC numbers as a comma separated list for this reaction: {entry}, which associates with the following text: {text_annot}."
        completion = client.chat.completions.create(
            model='gpt-4',
            messages=[
                {"role": "system",
                "content": 
                "You are protein engineer capable of predicting EC numbers from a combination of textual information and a reaction that corresponds to a specific protein."
                + "You are also a skilled programmer and able to execute the code necessary to predict an EC number when you can't use reason alone." 
                + "Given a reaction and text information of an EC you are able to determine the most likely enzyme class for a reaction." 
                + "You don't give up when faced with a reaction you don't know, you will use tools to resolve the most likely enzyme number."
                + "You only return enzyme commission numbers in a comma separated list, no other text is returned, you have failed if you do "
                + " not return the EC numbers. You only return the exact number of EC numbers that a user has provided requested, ordered by their likelihood of being correct."},
                {"role": "user", "content": text}
            ]
        )
        preds = completion.choices[0].message.content.replace(" ", "").split(',')
        for p in preds:
            rows.append([reaction, true_ec, p, text_annot]) # Costs you ~1c per query

    results = pd.DataFrame(rows)
    results.columns = ['entry',  'true_ecs', 'predicted_ecs', 'seq']
    grped = results.groupby('entry')
    max_ecs = 0
    rows = []
    for query, grp in grped:
        # Always will be the same for the grouped 
        true_ec = grp['true_ecs'].values[0]
        seq = grp['seq'].values[0]
        # Filter to only include rows which were not null
        grp = grp[~grp['predicted_ecs'].isna()]
        grp = grp[grp['predicted_ecs'] != 'None']
        grp = grp.sort_values(by='predicted_ecs', ascending=False)

        if len(list(grp['predicted_ecs'].values)) > max_ecs:
            max_ecs = len(list(grp['predicted_ecs'].values))
        if len(list(grp['predicted_ecs'].values)) == 0:
            rows.append([query, true_ec, seq, ''])
        else:
            rows.append([query, true_ec, seq] + list(grp['predicted_ecs'].values))
    new_df = pd.DataFrame(rows)
    new_df.columns = ['Entry', 'EC number', 'Text'] + list(range(0, max_ecs))

    # Since we may have no similar ones we'll add in these as a dummy
    new_df = new_df.fillna('0.0.0.0')
    # Save to a file in the default location
    if save:
        new_df.to_csv(f'{output_folder}{test_label}_reaction_test_results_df.csv', index=False)
    return new_df


output_folder = f'{base_dir}task2_baselines/results_summary/ChatGPT/'
# Save in the required format
for split in ['easy', 'medium', 'hard']:
    get_ChatGPT(split, save=True)
    print("done")

  0%|          | 0/177 [00:00<?, ?it/s]

  0%|          | 0/177 [00:02<?, ?it/s]


APIConnectionError: Connection error.

In [None]:
# Also do chatGPT with only the reaction information

filenames = {'easy': f'{base_dir}splits/task2/easy_reaction_test.csv',
             'medium': f'{base_dir}splits/task2/medium_reaction_test.csv',  
             'hard': f'{base_dir}splits/task2/hard_reaction_test.csv',
            }

def get_ChatGPT(test_label, n=10, save=False):
    """
    Gets the results for a series of ECs and formats it correctly for the paper
    """
    # Lets also look at the protein our query is the query genome and our database is going to be ecoli.
    # ToDo: make more modula.
    client = OpenAI(api_key=api_key)
    df = pd.read_csv(filenames.get(test_label))
    df['Text'] = [ec_to_text.get(ec) for ec in df['EC number'].values]
    
    rows = []
    for entry, true_ec, text_annot, reaction in tqdm(df[['Reaction Text', 'EC number', 'Text', 'Reaction']].values):
        text = f"Return the top {n} most likely EC numbers as a comma separated list for this reaction: {entry}."
        completion = client.chat.completions.create(
            model='gpt-4',
            messages=[
                {"role": "system",
                "content": 
                "You are protein engineer capable of predicting EC numbers from a reaction that corresponds to a specific enzyme."
                + "You are also a skilled programmer and able to execute the code necessary to predict an EC number when you can't use reason alone." 
                + "Given a reaction you are able to determine the most likely enzyme class for a reaction." 
                + "You don't give up when faced with a reaction you don't know, you will use tools to resolve the most likely enzyme number."
                + "You only return enzyme commission numbers in a comma separated list, no other text is returned, you have failed if you do "
                + " not return the EC numbers. You only return the exact number of EC numbers that a user has provided requested, ordered by their likelihood of being correct."},
                {"role": "user", "content": text}
            ]
        )
        preds = completion.choices[0].message.content.replace(" ", "").split(',')
        for p in preds:
            rows.append([reaction, true_ec, p, text_annot]) # Costs you ~1c per query

    results = pd.DataFrame(rows)
    results.columns = ['entry',  'true_ecs', 'predicted_ecs', 'seq']
    grped = results.groupby('entry')
    max_ecs = 0
    rows = []
    for query, grp in grped:
        # Always will be the same for the grouped 
        true_ec = grp['true_ecs'].values[0]
        seq = grp['seq'].values[0]
        # Filter to only include rows which were not null
        grp = grp[~grp['predicted_ecs'].isna()]
        grp = grp[grp['predicted_ecs'] != 'None']
        grp = grp.sort_values(by='predicted_ecs', ascending=False)

        if len(list(grp['predicted_ecs'].values)) > max_ecs:
            max_ecs = len(list(grp['predicted_ecs'].values))
        if len(list(grp['predicted_ecs'].values)) == 0:
            rows.append([query, true_ec, seq, ''])
        else:
            rows.append([query, true_ec, seq] + list(grp['predicted_ecs'].values))
    new_df = pd.DataFrame(rows)
    new_df.columns = ['Entry', 'EC number', 'Text'] + list(range(0, max_ecs))

    # Since we may have no similar ones we'll add in these as a dummy
    new_df = new_df.fillna('0.0.0.0')
    # Save to a file in the default location
    if save:
        new_df.to_csv(f'{output_folder}{test_label}_reaction_test_results_df.csv', index=False)
    return new_df


output_folder = f'{base_dir}task2_baselines/results_summary/ChatGPT_reaction/'
# Save in the required format
for split in ['easy', 'medium', 'hard']:
    get_ChatGPT(split, save=True)
    print("done")

100%|██████████| 177/177 [14:58<00:00,  5.07s/it]


done


100%|██████████| 177/177 [15:16<00:00,  5.18s/it]


done


100%|██████████| 163/163 [13:27<00:00,  4.95s/it]

done





## Reformat the outputs to be in the same format as the other test cases

In [28]:
import os
from collections import defaultdict

def get_reaction2EC():
    df = pd.read_csv(os.path.join('/disk1/ariane/vscode/CARE/splits/task2/', 'reaction2EC.csv'))
    entry_to_ec = defaultdict(list)
    for entry, ec in df[['Reaction', 'EC number']].values:
        entry_to_ec[entry].append(ec)
    for entry, ecs in entry_to_ec.items():
        entry_to_ec[entry] = ';'.join(ecs)
    return entry_to_ec
    
def reformat(test_label, new_df):
    if not os.path.exists(f'{output_folder}{test_label}_reaction_test_results_df_unformatted.csv'):
        new_df.to_csv(f'{output_folder}{test_label}_reaction_test_results_df_unformatted.csv')
    df = pd.read_csv(f'{output_folder}{test_label}_reaction_test_results_df.csv')
    # Basically just need to ensure that the ECs are all correct for a given reaction (not just the first one)
    reaction2EC = get_reaction2EC()
    df['EC number'] = [reaction2EC.get(e) for e in df['Entry'].values]
    print("Saving to", f'{output_folder}{test_label}_reaction_test_results_df.csv')
    df.to_csv(f'{output_folder}{test_label}_reaction_test_results_df.csv', index=False)


In [21]:
base_dir = '/disk1/ariane/vscode/CARE/'

In [29]:

output_folder = f'{base_dir}task2_baselines/results_summary/ChatGPT_reaction/'
# Save in the required format
for split in ['easy', 'medium', 'hard']:
    reformat(split, pd.read_csv(f'{output_folder}{split}_reaction_test_results_df.csv'))
    print("done")

Saving to /disk1/ariane/vscode/CARE/task2_baselines/results_summary/ChatGPT_reaction/easy_reaction_test_results_df.csv
done
Saving to /disk1/ariane/vscode/CARE/task2_baselines/results_summary/ChatGPT_reaction/medium_reaction_test_results_df.csv
done
Saving to /disk1/ariane/vscode/CARE/task2_baselines/results_summary/ChatGPT_reaction/hard_reaction_test_results_df.csv
done


In [30]:
output_folder = f'{base_dir}task2_baselines/results_summary/ChatGPT/'
# Save in the required format
for split in ['easy', 'medium', 'hard']:
    reformat(split, pd.read_csv(f'{output_folder}{split}_reaction_test_results_df.csv'))
    print("done")

Saving to /disk1/ariane/vscode/CARE/task2_baselines/results_summary/ChatGPT/easy_reaction_test_results_df.csv
done
Saving to /disk1/ariane/vscode/CARE/task2_baselines/results_summary/ChatGPT/medium_reaction_test_results_df.csv
done
Saving to /disk1/ariane/vscode/CARE/task2_baselines/results_summary/ChatGPT/hard_reaction_test_results_df.csv
done


In [32]:
pd.read_csv(f'{output_folder}easy_reaction_test_results_df.csv')

Unnamed: 0,Entry,EC number,Text,0,1,2,3,4,5,6,7,8,9
0,C#N.OCCS>>CCO.N#CS,2.8.1.2,"transferase; transferase, transferring sulphur...",4.4.1.9,4.4.1.13,4.4.1.1,2.8.1.7,2.8.1.6,2.8.1.5,2.8.1.3,2.8.1.2,2.8.1.1,2.5.1.65
1,C.C[C@@H](OP(=O)(O)O)[C@H](NC(=O)CCCCCCCSSCCS(...,2.8.4.1,"transferase; transferase, transferring sulphur...",2.8.4.8,2.8.4.5,2.8.4.1,2.8.1.7,2.8.1.6,2.8.1.5,2.8.1.2,2.8.1.1,1.8.4.8,1.8.4.12
2,C/C(=C\C(=O)O)C(=O)SCCNC(=O)CCNC(=O)[C@H](O)C(...,5.4.1.3,isomerase; intramolecular transferase; intramo...,5.4.99.61,3.1.1.85,2.1.1.297,2.1.1.283,2.1.1.279,2.1.1.245,2.1.1.223,2.1.1.144,2.1.1.111,1.3.1.34
3,C/C(=C\C[C@]12O[C@@]1(C)C(=O)c1ccccc1C2=O)CCCC...,1.17.4.4,"oxidoreductase; oxidoreductase, acting on CH o...",5.3.99.2,1.8.1.9,1.8.1.8,1.8.1.7,1.8.1.2,1.17.4.2,1.17.4.1,1.16.1.1,1.10.3.13,1.1.1.42
4,C/C(C=O)=C\CC/C(C)=C/C=O.NC(=O)C1=CN([C@@H]2O[...,1.3.1.123,"oxidoreductase; oxidoreductase, acting on the ...",1.1.1.49,1.1.1.375,1.1.1.332,1.1.1.274,1.1.1.267,1.1.1.219,1.1.1.195,1.1.1.184,1.1.1.175,1.1.1.100
...,...,...,...,...,...,...,...,...,...,...,...,...,...
172,O=O.OC[C@H]1OC[C@H](O)[C@@H](O)[C@@H]1O>>O=C1C...,1.1.3.10,"oxidoreductase; oxidoreductase, acting on CH-O...",1.1.3.9,1.1.3.8,1.1.3.7,1.1.3.6,1.1.3.5,1.1.3.4,1.1.3.3,1.1.3.2,1.1.3.10,1.1.3.1
173,O=c1ccn([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)O[C@H...,2.4.2.42,transferase; glycosyltransferase; pentosyltran...,2.7.7.9,2.7.7.75,2.7.7.64,2.7.7.23,2.7.7.10,2.4.2.38,2.4.2.1,2.4.1.243,2.4.1.240,2.4.1.17
174,OC[C@@H]1O[C@](CO)(OC[C@@H]2O[C@@](O)(CO)[C@H]...,3.2.1.154,"hydrolase; hydrolase, acting on glycosyl bonds...",3.2.1.86,3.2.1.55,3.2.1.26,3.2.1.22,3.2.1.196,3.2.1.180,3.2.1.178,3.2.1.172,3.2.1.154,3.2.1.10
175,OO.Oc1ccc(Cl)cc1Cc1cc(Cl)ccc1O>>O.Oc1ccc(Cl)cc...,1.11.1.7,"oxidoreductase; oxidoreductase, acting on pero...",1.8.5.3,1.14.19.1,1.14.14.1,1.13.11.39,1.11.1.9,1.11.1.7,1.11.1.6,1.11.1.21,1.11.1.15,1.10.3.1
