## Query GPT-4 for name and analysis using a toy example

#### This uses an improved version of the original prompt that includes instructions to generate an LLM Confidence Score.

#### The prompt also includes an example analysis to help the LLM in its task.

#### The LLM Score has its own column in the output TSV file.

#### The JSON config file is updated to use "GPT-4_1106-preview" build.

In [14]:
import pandas as pd
import json 
from utils.openai_query import openai_chat
from utils.prompt_factory import make_user_prompt_with_score
from tqdm import tqdm
import openai
import os
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
## user-editable run settings
# Adjust each value below to match your own data and configuration
config_file = './jsonFiles/GOLLMrun_config.json'  # path to the JSON run configuration
input_file = 'data/GO_term_analysis/toy_example.csv'  # table holding the gene sets
input_sep = ','  # field delimiter of the input table
set_index = 'GO'  # name of the column to set as index, or None
gene_column = 'Genes'  # name of the column holding the gene list
gene_sep = ' '  # delimiter between the genes of one gene list
gene_features = None  # path to the gene features, or None to leave them out of the prompt
direct = False  # True: direct sentence asking for a name and analysis; False: default or customized prompt
out_file = 'data/GO_term_analysis/LLM_processed_toy_example_with_score'  # output file name
customized_prompt = False  # True: use the custom prompt file named in the config; False: default prompt

# parse the run configuration
with open(config_file) as json_file:
    config = json.load(json_file)

# Turn the customized_prompt flag into the prompt text itself;
# it ends up as None whenever no custom prompt is read.
if not customized_prompt:
    customized_prompt = None
elif not os.path.isfile(config['CUSTOM_PROMPT_FILE']):
    print("Customized prompt file does not exist")
    customized_prompt = None
else:
    with open(config['CUSTOM_PROMPT_FILE'], 'r') as f:
        customized_prompt = f.read()
    assert len(customized_prompt) > 1, "Customized prompt is empty"

# The API key comes from the environment, every other setting from the config
openai.api_key = os.environ["OPENAI_API_KEY"]

context = config['CONTEXT']
model = config['MODEL']
temperature = config['TEMP']
max_tokens = config['MAX_TOKENS']
rate_per_token = config['RATE_PER_TOKEN']
LOG_FILE = config['LOG_NAME']+'_log.json'  # log file name derived from the configured log name
DOLLAR_LIMIT = config['DOLLAR_LIMIT']

SEED = 123  # passed to openai_chat by the query loop

In [21]:
# Load the gene sets; each row is one gene set to be named and analyzed.
df = pd.read_csv(input_file, sep=input_sep)
if set_index is not None:  # the settings allow None, which df.set_index() would reject
    df.set_index(set_index, inplace=True)

# Result columns, filled in row by row below.
df['LLM Name'] = None
df['LLM Analysis'] = None
df['LLM Score'] = None

llm_response_dict = {}  # per-row record of prompt, raw response and status, dumped as JSON
i = 0  # counts rows that completed without raising; drives the periodic save
for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    term_genes = row[gene_column]
    # A whitespace separator (the default ' ') keeps the tolerant str.split()
    # behaviour; any other separator is split on explicitly.
    if gene_sep and gene_sep.strip():
        genes = [g.strip() for g in term_genes.split(gene_sep) if g.strip()]
    else:
        genes = term_genes.split()

    if len(genes) > 1000:
        print(f'Gene set {idx} has more than 1000 genes, skipping')
        continue

    prompt = None  # reset so the error handler never sees an unbound or stale prompt
    try:
        prompt = make_user_prompt_with_score(genes)
        analysis, finger_print = openai_chat(context, prompt, model, temperature, max_tokens, rate_per_token, LOG_FILE, DOLLAR_LIMIT, SEED)

        if analysis:
            # The first line is expected to look like "Process: <name> (<score>)":
            # the last space-separated token is the score, the rest is the name.
            llm_process = analysis.split("\n")[0].replace("Process: ", "")
            llm_score = llm_process.split(" ")[-1].strip("()")
            llm_name = llm_process.rsplit(" ", 1)[0]
            # Everything from the third line onwards is kept as the analysis text;
            # a shorter response raises IndexError and is recorded as an error below.
            llm_analysis = analysis.split('\n', 2)[2]

            df.loc[idx, 'LLM Name'] = llm_name
            df.loc[idx, 'LLM Analysis'] = llm_analysis
            df.loc[idx, 'LLM Score'] = llm_score

            llm_response_dict[idx] = {'prompt': prompt,
                                      'responses': analysis,
                                      'finger_print': finger_print,
                                      'status': 'SUCCESS'}
        else:
            print(f'No analysis for {idx}')
            df.loc[idx, 'LLM Name'] = None
            df.loc[idx, 'LLM Analysis'] = None
            df.loc[idx, 'LLM Score'] = None
            llm_response_dict[idx] = {'prompt': prompt,
                                      'responses': None,
                                      'finger_print': None,
                                      'status': 'NO RESPONSE'}
        i += 1
    except Exception as e:
        # Best effort: record the failure and move on to the next gene set.
        print(f'Error for {idx}: {e}')
        df.loc[idx, 'LLM Name'] = None
        df.loc[idx, 'LLM Analysis'] = None
        df.loc[idx, 'LLM Score'] = None
        llm_response_dict[idx] = {'prompt': prompt,
                                  'responses': None,
                                  'finger_print': None,
                                  'status': 'ERROR: '+str(e)}
        continue

    # Checkpoint both outputs every 10 completed rows.
    if i % 10 == 0:
        with open(f'{out_file}.json', 'w') as fp:
            json.dump(llm_response_dict, fp)
        df.to_csv(f'{out_file}.tsv', sep='\t', index=True)

# Final save of BOTH outputs, so the table is complete even when the last
# row did not land on a checkpoint (or ended in the error branch).
with open(f'{out_file}.json', 'w') as fp:
    json.dump(llm_response_dict, fp)
df.to_csv(f'{out_file}.tsv', sep='\t', index=True)


 10%|█         | 1/10 [00:24<03:44, 24.94s/it]

1333


 20%|██        | 2/10 [01:00<04:10, 31.26s/it]

1610


 30%|███       | 3/10 [01:36<03:54, 33.46s/it]

1380


 40%|████      | 4/10 [02:21<03:46, 37.76s/it]

1719


 50%|█████     | 5/10 [02:51<02:56, 35.22s/it]

1560


 60%|██████    | 6/10 [03:14<02:03, 30.95s/it]

1367


 70%|███████   | 7/10 [03:52<01:40, 33.37s/it]

1389


 80%|████████  | 8/10 [04:16<01:00, 30.43s/it]

1358


 90%|█████████ | 9/10 [04:34<00:26, 26.46s/it]

1269


100%|██████████| 10/10 [04:53<00:00, 29.33s/it]

1366





In [22]:
def get_llm_response(genes, model):
    """Query the LLM for one gene set and split the reply into name, score and analysis.

    Args:
        genes: Gene list handed to ``make_user_prompt_with_score``.
        model: Model identifier; only names starting with ``'gpt'`` are handled.

    Returns:
        A ``(llm_name, llm_score, llm_analysis)`` tuple. All three are ``None``
        when the model returns an empty response. ``llm_score`` is the raw text
        token taken from the first line, not a number.

    Raises:
        ValueError: If ``model`` does not start with ``'gpt'``.
        IndexError: If the response has fewer than three lines.
    """
    if not model.startswith('gpt'):
        # No other backend is wired up here; fail loudly rather than continue
        # with an undefined response.
        raise ValueError(f'Unsupported model: {model}')

    prompt = make_user_prompt_with_score(genes)
    # print(prompt)
    response = openai_chat(context, prompt, model, temperature, max_tokens, rate_per_token, LOG_FILE, DOLLAR_LIMIT)
    # The query loop in this notebook unpacks openai_chat() as
    # (analysis, finger_print); accept either a tuple or a bare string.
    # NOTE(review): confirm the actual return shape of openai_chat.
    analysis = response[0] if isinstance(response, tuple) else response

    if not analysis:
        return None, None, None

    # The first line is expected to look like "Process: <name> (<score>)":
    # the last space-separated token is the score, the rest is the name.
    llm_process = analysis.split("\n")[0].replace("Process: ", "")
    llm_score = llm_process.split(" ")[-1].strip("()")
    llm_name = llm_process.rsplit(" ", 1)[0]
    # Everything from the third line onwards is the analysis text.
    llm_analysis = analysis.split('\n', 2)[2]

    return llm_name, llm_score, llm_analysis

In [14]:
# new pipeline for running toy example and query for random and contaminated sets
import pandas as pd
import json 
from utils.openai_query import openai_chat
from utils.prompt_factory import make_user_prompt_with_score
from tqdm import tqdm
import openai
import os


## check example_config.json for the format of the config file
with open('./jsonFiles/GOLLMrun_config.json') as json_file:
    config = json.load(json_file)

context = config['CONTEXT']
# NOTE(review): the first configuration cell of this notebook reads
# config['MODEL'] from the same file, while this cell reads config['GPT_MODEL']
# -- confirm the config file defines the key used here.
gpt_model = config['GPT_MODEL']
temperature = config['TEMP']
max_tokens = config['MAX_TOKENS']
rate_per_token = config['RATE_PER_TOKEN']
LOG_FILE = config['LOG_NAME'] + '_log.json'
DOLLAR_LIMIT = config['DOLLAR_LIMIT']
openai.api_key = os.environ["OPENAI_API_KEY"] # set your openai api key in the environment variable or set in config
# Generate list of genes from file (file: data/go_terms_sample.csv) check notebook 0.[Prep GO] Download_and_parse_GO.ipynb
df = pd.read_csv('data/GO_term_analysis/toy_example_contaminated.csv', sep = ',',index_col=0)

# Every column whose name ends in 'Genes' holds one gene set per row.
columns = df.columns
geneset_columns = columns[columns.str.endswith('Genes')]

# Create one empty Name/Analysis/Score column triple per gene-set column.
# This is done once per column, with plain string column names: assigning to
# df[row_label, name] would create a column keyed by the tuple
# (row_label, name) instead of the intended result column.
for geneset in geneset_columns:
    if 'contaminated' in geneset:
        # e.g. '50perc_contaminated_Genes' -> '50perc_contaminated_'
        prefix = '_'.join(geneset.split('_')[:2]) + '_'
    else:
        prefix = ''
    for result_field in ('LLM Name', 'LLM Analysis', 'LLM Score'):
        df[prefix + result_field] = None

# Last expression of the cell, so the notebook displays the prepared table.
df.head()
# for i, row in tqdm(df.iterrows(), total=df.shape[0]):
#     for geneset in df.columns.endswith('Genes'):
#         if geneset.contains('contaminated'):
#             prefix = '_'.join(geneset.split('_')[:1])
#             # initialize columns  
#             df[i, prefix + '_LLM Name'] = None
#             df[i, prefix + '_LLM Analysis'] = None
#             df[i, prefix + '_LLM Score'] = None
#         else:
#             df['LLM Name'] = None
#             df['LLM Analysis'] = None
#             df['LLM Score'] = None
#         if col.endswith('contaminated_genes'):
#             # Process each contaminated gene set
#             contaminated_genes = row[col].split()
#             prompt = make_user_prompt_with_score(contaminated_genes)
            
#             analysis = openai_chat(context, prompt, gpt_model, temperature, max_tokens, rate_per_token, LOG_FILE, DOLLAR_LIMIT)
#             llm_process = analysis.split("\n")[0].replace("Process: ", "")
#             llm_score = llm_process.split(" ")[-1].strip("()")
#             llm_name = llm_process.rsplit(" ", 1)[0]
#             llm_analysis = analysis.split('\n', 2)[2]
            
#             # Update DataFrame with results
#             prefix = col.split('_')[0]
#             df.loc[i, prefix + '_LLM Score'] = llm_score
#             df.loc[i, prefix + '_LLM Name'] = llm_name
#             df.loc[i, prefix + '_LLM Analysis'] = llm_analysis

# # df.to_csv('data/GO_term_analysis/LLM_processed_toy_example_with_score.tsv', index=True, sep='\t')


Unnamed: 0_level_0,Genes,Gene_Count,Term_Description,50perc_contaminated_Genes,100perc_contaminated_Genes,LLM Name,LLM Analysis,LLM Score,50perc_contaminated_LLM Name,50perc_contaminated_LLM Analysis,50perc_contaminated_LLM Score,100perc_contaminated_LLM Name,100perc_contaminated_LLM Analysis,100perc_contaminated_LLM Score
GO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
GO:0032385,LDLRAP1 SCP2D1 ANXA2 SCP2,4,positive regulation of intracellular cholester...,LDLRAP1 SCP2 TRIM45 NME5,HMGA2 MID2 HSFX2 FOXP4,,,,,,,,,
GO:0002468,NOD1 HLA-DRA CLEC4A HLA-DRB1 CCL21 NOD2 CCL19 ...,15,dendritic cell antigen processing and presenta...,CD68 HLA-DRB3 CCL19 CCL21 HLA-DRA NOD2 THBS1 T...,JAG1 LTK ARL17A SLCO4A1 PLEKHO2 NDUFS5 ZC3H12D...,,,,,,,,,
GO:0033683,OGG1 ERCC5 XPA ERCC4 NTHL1,5,"nucleotide-excision repair, DNA incision",XPA NTHL1 NAA11 SCD5 CDCA8,MBTPS2 PRCD BUB3 SLC13A1 FADS2,,,,,,,,,
GO:0035672,SLC7A11 SLC25A39 SLC26A6 ABCB9 SLC15A4 ABCC5 C...,15,oligopeptide transmembrane transport,GJA1 SLC15A4 SLC15A1 CDH17 SLC25A39 SLC26A6 SL...,DEFB113 GLMN CELA2B SIGLEC7 RIGI CCL3L3 DEFB11...,,,,,,,,,
GO:0048023,OPN3 CDH3 ATP7A APPL1 ASIP RAB38 ZEB2 TYRP1 GIPC1,9,positive regulation of melanin biosynthetic pr...,TYRP1 CDH3 OPN3 RAB38 FGFRL1 ZNF429 DUS3L CTSK...,WEE2 STIM1 EXOC4 MYO15A GLIPR1L1 ATAD3A CDCA5 ...,,,,,,,,,


In [None]:
# Smoke-test the batch script: run query_llm_for_analysis.py on the toy
# example input with --start 0 --end 1.
# NOTE: `config` is rebound to a path string here; earlier cells bind the same
# name to the parsed config dict.

input_file = 'data/GO_term_analysis/toy_example.csv'
config = './jsonFiles/GOLLMrun_config.json'
# IPython expands $input_file and $config to the values of the variables above.
%run query_llm_for_analysis.py --input $input_file --start 0 --end 1 --config $config

## Check and combine the outputs from the batch run

In [None]:
from glob import glob
import pandas as pd
import json

### sanity check code along the way
import os  # local import: this cell only imports glob, pandas and json

processed_files = glob('data/GO_term_analysis/LLM_processed_selected_go_terms*.tsv')
if not processed_files:
    # pd.concat([]) would otherwise fail later with an opaque error.
    raise FileNotFoundError(
        'No files match data/GO_term_analysis/LLM_processed_selected_go_terms*.tsv')

frames = []  # one table per batch file, reused for the final concat (no second read)
for file in processed_files:
    file_name = os.path.basename(file)  # also correct with OS-native path separators
    df = pd.read_csv(file, sep='\t')
    frames.append(df)
    indexed_df = df.set_index('GO')

    # Tokens 5 and 6 of the '_'-separated file stem identify the batch
    # (presumably its start and end positions -- verify against the batch script).
    ranges = file_name.split('.')[0].split('_')[5:7]
    with open(f'data/GO_term_analysis/LLM_response_go_terms_{ranges[0]}_{ranges[1]}.json') as fp:
        llm_response_dict = json.load(fp)

    # Cross-check the stored analysis text against the raw LLM response.
    for go_term in indexed_df.index:
        if llm_response_dict[go_term] == 'NO ANALYSIS':
            print(file_name)
            print(f'No analysis for {go_term}')
            continue
        # The analysis text is everything from the third line of the response.
        llm_analysis = llm_response_dict[go_term].split('\n', 2)[2]
        if indexed_df.loc[go_term, 'LLM Analysis'] != llm_analysis:
            print(f'LLM analysis for {go_term} is different')

#     # print(ranges)
    print(df.shape)


combined_df = pd.concat(frames)
print(combined_df.shape)
print('Any duplicated GO: ',combined_df['GO'].duplicated().sum())
print('Any NAs in the LLM res: ', combined_df['LLM Name'].isna().sum())
print('Any duplicated LLM analysis: ', combined_df['LLM Analysis'].duplicated(keep=False).sum())

combined_df.to_csv('data/GO_term_analysis/LLM_processed_selected_1000_go_terms.tsv', index=False, sep='\t')