## Test ChatGPT's ability to predict EC numbers

In [18]:
import os

import pandas as pd
from openai import OpenAI

# Update as needed
base_dir = '/disk1/ariane/pycharm/CARE/'
# Read the key from the OPENAI_API_KEY environment variable so it never has to be
# typed into (and saved with) the notebook. Falls back to the empty placeholder
# when the variable is unset.
api_key = os.environ.get('OPENAI_API_KEY', '')
# Folder that the result CSVs are written to and read back from
output_folder = f'{base_dir}task1_baselines/results_summary/ChatGPT/'


In [20]:

# Maps each test-split label to the path of its `<label>_protein_test.csv` file
filenames = {
    label: f'{base_dir}splits/task1/{label}_protein_test.csv'
    for label in ('30', '30-50', 'price', 'promiscuous')
}

def _parse_ec_predictions(content):
    """
    Splits a raw chat reply into a list of EC number strings, keeping the reply's order.

    All whitespace (spaces, newlines, tabs) is removed before splitting on commas and
    empty tokens are dropped. A missing or empty reply gives an empty list.
    """
    if not content:
        return []
    compact = "".join(content.split())
    return [ec for ec in compact.split(',') if ec]


def get_ChatGPT(test_label, n=10, save=False, model="gpt-4"):
    """
    Gets the results for a series of ECs and formats it correctly for the paper

    For every row of the test split CSV registered under `test_label` in `filenames`,
    the chat model is asked for the top `n` EC numbers for the sequence. The replies are
    reshaped into one row per entry.

    Parameters
    ----------
    test_label : str
        Key of `filenames` selecting the split to run.
    n : int
        Number of EC numbers requested in the prompt.
    save : bool
        When True, the result is also written to
        `{output_folder}{test_label}_protein_test_results_df.csv`.
    model : str
        Chat model name passed to the OpenAI client.

    Returns
    -------
    pandas.DataFrame
        Columns 'Entry', 'EC number', 'Sequence', followed by integer columns 0..k-1
        holding the predictions in the order the model returned them (k is the largest
        number of predictions for any entry). Missing cells are filled with '0.0.0.0';
        an entry with no usable prediction has '' in column 0.

    Raises
    ------
    ValueError
        If `test_label` is not a key of `filenames`.
    """
    if test_label not in filenames:
        raise ValueError(f"Unknown test label {test_label!r}; expected one of {sorted(filenames)}")

    # The system prompt is the same for every query, so build it once
    system_prompt = (
        "You are protein engineer capable of predicting EC numbers from a protein seqeunce alone."
        "You are also a skilled programmer and able to execute the code necessary to predict an EC number when you can't use reason alone."
        "Given a protein sequence you are able to determine the most likely enzyme class for a seqeunce."
        "You don't give up when faced with a sequence you don't know, you will use tools to resolve the most likely enzyme sequence."
        "You only return enzyme commission numbers in a comma separated list, no other text is returned, you have failed if you do "
        " not return the EC numbers. You only return the exact number of EC numbers that a user has provided requested, ordered by their likelihood of being correct."
    )
    client = OpenAI(api_key=api_key)
    df = pd.read_csv(filenames[test_label])
    rows = []
    for entry, true_ec, seq in df[['Entry', 'EC number', 'Sequence']].values:
        text = f"Return the top {n} most likely EC numbers as a comma separated list for this enzyme sequence: {seq}"

        # Costs you ~1c per query
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text}
            ]
        )
        # Keep one null placeholder row when nothing usable came back, so the entry
        # still shows up in the grouped output below
        preds = _parse_ec_predictions(completion.choices[0].message.content) or [None]
        for p in preds:
            rows.append([entry, true_ec, p, seq])
    # Passing the columns to the constructor also works when there are no rows at all
    results = pd.DataFrame(rows, columns=['entry', 'true_ecs', 'predicted_ecs', 'seq'])

    rows = []
    for query, grp in results.groupby('entry'):
        # Always will be the same for the grouped
        true_ec = grp['true_ecs'].values[0]
        seq = grp['seq'].values[0]
        # Filter to only include rows which were not null
        grp = grp[~grp['predicted_ecs'].isna()]
        grp = grp[grp['predicted_ecs'] != 'None']
        # Rows are deliberately NOT re-sorted: they are already in the order the model
        # returned them (requested as most likely first), so column 0 is the top
        # prediction. Sorting the EC strings would discard that ranking.
        predicted = list(grp['predicted_ecs'].values)
        rows.append([query, true_ec, seq] + (predicted if predicted else ['']))

    # Width of the widest row decides how many prediction columns are needed; this also
    # covers the case where every entry only has the '' placeholder
    max_ecs = max((len(row) for row in rows), default=3) - 3
    new_df = pd.DataFrame(rows, columns=['Entry', 'EC number', 'Sequence'] + list(range(0, max_ecs)))

    # Since we may have no similar ones we'll add in these as a dummy
    new_df = new_df.fillna('0.0.0.0')
    # Save to a file in the default location
    if save:
        new_df.to_csv(f'{output_folder}{test_label}_protein_test_results_df.csv', index=False)
    return new_df


# Save in the required format
# NOTE(review): '30' is a key of `filenames` but is not queried here - confirm this is intended
splits_to_query = ('30-50', 'price', 'promiscuous')
for split in splits_to_query:
    get_ChatGPT(split, save=True)
    print("Done split")

Done split
Done split
Done split


# Make the datasets the same format

In [32]:
def get_test_df(label):
    """Reads `{label}_protein_test.csv` from the task1 splits folder under `base_dir`."""
    split_path = f'{base_dir}splits/task1/{label}_protein_test.csv'
    return pd.read_csv(split_path)

def reformat(test_label, df):
    """
    Rewrites a split's results file as the test split table plus a '0' column.

    The test split for `test_label` is loaded and each row's 'Entry' is looked up in
    `df` to fetch the matching value of the '0' column. Entries that do not occur in
    `df` get None. The combined table overwrites
    `{output_folder}{test_label}_protein_test_results_df.csv`. Nothing is returned.

    `df` must contain the columns 'Entry' and '0'.
    """
    split_df = get_test_df(test_label)

    # Entry -> value of column '0' in the supplied results
    top_by_entry = {entry: pred for entry, pred in zip(df['Entry'].values, df['0'].values)}
    split_df['0'] = list(map(top_by_entry.get, split_df['Entry'].values))

    split_df.to_csv(f'{output_folder}{test_label}_protein_test_results_df.csv', index=False)


In [33]:
# Reformat each split from its `*_PREV.csv` copy of the results.
# NOTE(review): no visible cell writes the `*_PREV.csv` files - confirm they exist
# before running this cell on a fresh kernel.
for split in ('30', '30-50', 'price', 'promiscuous'):
    prev_results = pd.read_csv(f'{output_folder}{split}_protein_test_results_df_PREV.csv')
    reformat(split, prev_results)

In [34]:
# Load the promiscuous split's results file back in and display it
promiscuous_results_path = f'{output_folder}promiscuous_protein_test_results_df.csv'
df = pd.read_csv(promiscuous_results_path)
df

Unnamed: 0,index,Entry,Sequence,EC number,Surprise Level,Number of ECs,Duplicated EC,Duplicated Sequence,0
0,6834,Q7TS56,MDKVCAVFGGSRGIGKAVAQLMAQKGYRLAIVARNLEVAKATASEL...,1.1.1.100;1.6.5.10,3,2,True,False,1.1.1.100
1,4267,Q09851,MEAEKFVLITGCSEGGIGNALALKFHQEGFQVLATARQVERMDNLT...,1.1.1.101;3.1.1.3,4,2,True,False,2.4.2.22
2,7784,Q95JH7,MDSKHQCVKLNDGHFMPVLGFGTYAPAEVPKNKALEATKLAIEAGF...,1.1.1.112;1.1.1.149;1.1.1.209;1.1.1.210;1.1.1....,3,9,True,False,1.1.1.112
3,3391,P22071,MPGWSCLVTGAGGFVGQRIIRMLVQEKELQEVRALDKVFRPETKEE...,1.1.1.145;1.1.1.210;1.1.1.270;5.3.3.1,4,4,True,False,2.1.1.196
4,8380,Q9XWF0,MSIKRLSMRLKKGIHRSWNRMTSLEAGLEEEKEIKIVEEPEPRPWK...,1.1.1.145;5.3.3.1,4,2,True,False,3.5.4.13
...,...,...,...,...,...,...,...,...,...
174,4717,Q1LRV9,MPHAHPADIDGHHLTPDTVAAIARGQRAAIVPEPVLGKVADARARF...,4.3.1.23;5.4.3.6,4,2,True,False,3.6.1.66
175,3506,P30904,MPMFIVNTNVPRASVPEGFLSELTQQLAQATGKPAQYIAVHVVPDQ...,5.3.2.1;5.3.3.12,2,2,True,False,5.3.2.1
176,3236,P10378,MSIPFTRWPEEFARRYREKGYWQDLPLTDILTRHAASDSIAVIDGE...,6.2.1.71;6.3.2.14,3,2,True,False,6.2.1.71
177,6973,Q80WS1,MCSSVTGKLWFLTDRRIREDYPQKEILRALKAKCCEEELDFRAVVM...,6.3.1.17;6.3.2.41,2,2,True,False,6.3.1.17


In [10]:
# NOTE(review): this cell re-assigns `output_folder` and its execution count (10) is
# lower than the cells above; the file read here has no split label in its name and is
# not written by any visible cell - confirm it is still needed.
output_folder = '/disk1/ariane/pycharm/CARE/task1_baselines/results_summary/ChatGPT/'

pd.read_csv(output_folder + 'protein_test_results_df.csv')

Unnamed: 0,Entry,EC number,Sequence,0
0,A0A2H4HHY6,1.2.3.1,,2.7.1.71
1,A0A4V2JTK3,4.6.1.2,,1.11.1.26
2,A0LBW3,2.3.3.13,,3.6.1.66
3,A2Q9N7,1.14.13.9,,2.7.4.9
4,A4VND8,1.8.2.2,,3.6.1.9
...,...,...,...,...
191,Q9WZJ0,1.17.99.6,,3.1.26.5
192,Q9X9Q7,5.99.1.4,,2.7.7.2
193,Q9YBX7,1.2.7.11,,4.2.1.55
194,Q9YFI6,1.3.98.1,,5.3.1.24
