## Query GPT-4 for name and analysis using a toy example

#### This uses an improved version of the original prompt that includes instructions to generate an LLM Confidence Score.

#### The prompt also includes an example analysis to help the LLM in its task.

#### The LLM Score has its own column in the output TSV file.

#### The JSON config file is updated to use "GPT-4_1106-preview" build.

Update 12-21-2023

new available models through api: https://api.llm.ideker.ucsd.edu/api/chat

available models:

| NAME           | ID           | SIZE   |
|----------------|--------------|--------|
| llama2:70b     | c3a7af098300 | 38 GB  |
| llama2:7b      | fe938a131f40 | 3.8 GB |
| llama2:latest  | fe938a131f40 | 3.8 GB |
| mistral:7b     | 4d9f4b269c33 | 4.1 GB |
| mixtral:latest | 99a9202f8a7a | 26 GB  |

In [41]:
import pandas as pd
import json 
from utils.openai_query import openai_chat
from utils.prompt_factory import make_user_prompt_with_score
from utils.server_model_query import server_model_chat
from utils.llm_analysis_utils import process_analysis, save_progress
from utils.genai_query import query_genai_model
from tqdm import tqdm
import constant
import openai
import os
import logging
import re
%load_ext autoreload

%autoreload 2


**Default run is using GPT4**

In [51]:
## load variables
initialize = True # if True, then initialize the input table with llm names, analysis and score to None 
# Replace with your actual values
config_file = './jsonFiles/toyexample.json'  # replace with your actual config file 
input_file = 'data/GO_term_analysis/toy_example_w_contaminated.csv' # replace with your actual input file
input_sep = ','  # replace with the separator
set_index = 'GO'  # replace with your column name that you want to set as index or None
gene_column = 'Genes'  # replace with your actual column name for the gene list
gene_sep = ' '  # replace with your actual separator
gene_features = None  # replace with your path to the gene features or None if you don't want to include in the prompt
direct = False # if True, then the prompt will be a direct sentence asking for a name and analysis from the gene set, otherwise default or customized prompt
out_file = 'data/GO_term_analysis/LLM_processed_toy_example_w_contamination_gpt_4'  # replace with your actual output file name

customized_prompt = False # if True, then the prompt will be the custom prompt, if False, then the prompt will use default

# load the config file
with open(config_file) as json_file:
    config = json.load(json_file)

if customized_prompt:
    # make sure the file exist 
    if os.path.isfile(config['CUSTOM_PROMPT_FILE']):
        with open(config['CUSTOM_PROMPT_FILE'], 'r') as f: # replace with your actual customized prompt file
            customized_prompt = f.read()
            assert len(customized_prompt) > 1, "Customized prompt is empty"
    else:
        print("Customized prompt file does not exist")
        customized_prompt = None
else:
    customized_prompt = None

# Load OpenAI key, context, and model used 
openai.api_key = os.environ["OPENAI_API_KEY"]

context = config['CONTEXT']
model = config['MODEL']
temperature = config['TEMP']
max_tokens = config['MAX_TOKENS']
if model.startswith('gpt'):
    rate_per_token = config['RATE_PER_TOKEN']
    DOLLAR_LIMIT = config['DOLLAR_LIMIT']
LOG_FILE = config['LOG_NAME']+'_log.json'

SEED = constant.SEED
column_prefix = model.split('-')[0]

In [52]:
# handle the logger so it create a new one for each model run
def get_logger(filename):
    logger = logging.getLogger(filename)
    logger.setLevel(logging.INFO)
    if not logger.handlers:
        file_handler = logging.FileHandler(filename)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    return logger


def main(df):
    analysis_dict  = {}

    logger = get_logger(f'{out_file}.log')

    i = 0 #used for track progress and saving the file
    for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
        #only process None rows 
        if pd.notna(row[f'{column_prefix} Analysis']):
            continue
        
        gene_data = row[gene_column]
        # if gene_data is not a string, then skip
        if type(gene_data) != str:
            
            logger.warning(f'Gene set {idx} is not a string, skipping')
            continue
        genes = gene_data.split(gene_sep)
        
        if len(genes) >1000:
            logger.warning(f'Gene set {idx} is too big, skipping')
            continue

        try:
            prompt = make_user_prompt_with_score(genes)
            # print(prompt)
            finger_print = None
            if model.startswith('gpt'):
                print("Accessing OpenAI API")
                analysis, finger_print = openai_chat(context, prompt, model, temperature, max_tokens, rate_per_token, LOG_FILE, DOLLAR_LIMIT, SEED)
            elif model.startswith('gemini'):
                print("Using Google Gemini API")
                analysis, error_message = query_genai_model(f"{context}\n{prompt}", model, temperature, max_tokens, LOG_FILE) 
            else:
                print("Using server model")
                analysis, error_message= server_model_chat(context, prompt, model, temperature, max_tokens,LOG_FILE, SEED)

            
            if analysis:
                # print(analysis)
                llm_name, llm_score, llm_analysis = process_analysis(analysis)
                # clean up the score and return float
                try:
                    llm_score_value =  float(re.sub("[^0-9.-]", "", llm_score))
                except ValueError:
                    llm_score_value = llm_score
            
                
                df.loc[idx, f'{column_prefix} Name'] = llm_name
                df.loc[idx, f'{column_prefix} Analysis'] = llm_analysis
                df.loc[idx, f'{column_prefix} Score'] = llm_score_value
                analysis_dict[f'{idx}_{column_prefix}'] = analysis
                # Log success with fingerprint
                logger.info(f'Success for {idx} {column_prefix}.')
                if finger_print:
                    logger.info(f'GPT_Fingerprint for {idx}: {finger_print}')
                    
            else:
                logger.error(f'Error for query gene set {idx}: {error_message}')

        except Exception as e:
            logger.error(f'Error for {idx}: {e}')
            continue
        i += 1
        if i % 10 == 0:
            save_progress(df, analysis_dict, out_file)
            # df.to_csv(f'{out_file}.tsv', sep='\t', index=True)
            print(f"Saved progress for {i} genesets")
    # save the final file
    save_progress(df, analysis_dict, out_file)
    

In [5]:
#Define your own loop for running the pipeline
## 12-18-2023: this loop is for run the default gene set and the contaminated gene sets 
## can modify this loop for different models or only run on default gene set

##12-27-23: edited the prompt 
if __name__ == "__main__":
    
    df = pd.read_csv(input_file, sep=input_sep, index_col=set_index)
    
    if 'gpt' in model:
        name_fix = '_'.join(model.split('-')[:2])
    else:
        name_fix = model.replace(':', '_')
    # column_prefix = name_fix + '_default'
    
    # if initialize:
    #     # initialize the input file with llm names, analysis and score to None
    #     df[f'{column_prefix} Name'] = None
    #     df[f'{column_prefix} Analysis'] = None
    #     df[f'{column_prefix} Score'] = None
    # main(df)  ## run with the real set 
    
    ## run the pipeline for contaiminated gene sets 
    contaminated_columns = [col for col in df.columns if col.endswith('contaminated_Genes')]
    # print(contaminated_columns)
    for col in contaminated_columns:
        gene_column = col ## Note need to change the gene_column to the contaminated column
        contam_prefix = '_'.join(col.split('_')[0:2])
        
        column_prefix = name_fix + '_' +contam_prefix
        print(column_prefix)

        if initialize:
            # initialize the input file with llm names, analysis and score to None
            df[f'{column_prefix} Name'] = None
            df[f'{column_prefix} Analysis'] = None
            df[f'{column_prefix} Score'] = None
        main(df)
    df.head()


gpt_4_50perc_contaminated


  0%|          | 0/10 [00:00<?, ?it/s]

Accessing OpenAI API


 10%|█         | 1/10 [00:46<06:58, 46.45s/it]

1710
Accessing OpenAI API


 20%|██        | 2/10 [02:15<09:33, 71.63s/it]

1749
Accessing OpenAI API


 30%|███       | 3/10 [03:16<07:47, 66.75s/it]

1615
Accessing OpenAI API


 40%|████      | 4/10 [04:14<06:19, 63.23s/it]

1690
Accessing OpenAI API


 50%|█████     | 5/10 [04:49<04:24, 53.00s/it]

1871
Accessing OpenAI API


 60%|██████    | 6/10 [05:50<03:42, 55.64s/it]

1782
Accessing OpenAI API


 70%|███████   | 7/10 [06:31<02:33, 51.05s/it]

1717
Accessing OpenAI API


 80%|████████  | 8/10 [07:21<01:41, 50.55s/it]

1433
Accessing OpenAI API


 90%|█████████ | 9/10 [07:54<00:45, 45.08s/it]

1479
Accessing OpenAI API


100%|██████████| 10/10 [08:18<00:00, 49.82s/it]


1264
Saved progress for 10 genesets
gpt_4_100perc_contaminated


  0%|          | 0/10 [00:00<?, ?it/s]

Accessing OpenAI API


 10%|█         | 1/10 [01:02<09:18, 62.07s/it]

1644
Accessing OpenAI API


 20%|██        | 2/10 [02:31<10:24, 78.09s/it]

1521
Accessing OpenAI API


 30%|███       | 3/10 [03:28<07:59, 68.52s/it]

1887
Accessing OpenAI API


 40%|████      | 4/10 [04:47<07:16, 72.82s/it]

1861
Accessing OpenAI API


 50%|█████     | 5/10 [05:17<04:46, 57.24s/it]

1670
Accessing OpenAI API


 60%|██████    | 6/10 [06:19<03:55, 58.79s/it]

1753
Accessing OpenAI API


 70%|███████   | 7/10 [06:37<02:16, 45.62s/it]

1339
Accessing OpenAI API


 80%|████████  | 8/10 [07:16<01:26, 43.33s/it]

1339
Accessing OpenAI API


 90%|█████████ | 9/10 [07:46<00:39, 39.37s/it]

1520
Accessing OpenAI API


100%|██████████| 10/10 [08:12<00:00, 49.25s/it]

1387
Saved progress for 10 genesets





In [16]:
df = pd.read_csv(input_file, sep=input_sep, index_col=set_index)
df = df.loc['GO:1990874', :]

genes = df['100perc_contaminated_Genes'].split(' ')
print(make_user_prompt_with_score(genes))


Write a critical analysis of the biological processes performed by this system of interacting proteins.
Base your analysis on prior knowledge available in your training data.
After completing your analysis, propose a brief and detailed name for the most prominent biological process performed by the system.
    
After completing your analysis, please also assign a confidence score to the process name you selected.
This score should follow the name in parentheses and range from 0.00 to 1.00. A score of 0.00 indicates the lowest confidence,
while 1.00 reflects the highest confidence. This score helps gauge how accurately the chosen name represents the functions and activities
within the system of interacting proteins. When determining your score, consider the proportion of genes in the protein system that participate
in the identified biological process. For instance, if you select "Ribosome biogenesis" as the process name but only a few genes in the system 
contribute to this process, t

In [43]:
df

Unnamed: 0_level_0,Genes,Gene_Count,Term_Description,50perc_contaminated_Genes,100perc_contaminated_Genes,mixtral_latest_default Name,mixtral_latest_default Analysis,mixtral_latest_default Score,mixtral_latest_50perc_contaminated Name,mixtral_latest_50perc_contaminated Analysis,mixtral_latest_50perc_contaminated Score,mixtral_latest_100perc_contaminated Name,mixtral_latest_100perc_contaminated Analysis,mixtral_latest_100perc_contaminated Score
GO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
GO:0045940,STAR WNT4 ADM APOE CES1 STARD4 NR1D1 TNF FSHB ...,25,positive regulation of steroid metabolic process,WNT4 IL1A FSHB DAB2 IFNG ABCG1 APOA1 STAR CGA ...,CALD1 VPS29 MTRF1 MVP GTF2I CC2D1B EPHA7 SPACA...,Cholesterol and lipid homeostasis,"1. STAR, SCAP, and SREBF1/2 are involved in th...",0.92,"Cholesterol and lipid metabolism, immune resp...",The system of interacting proteins includes se...,0.85,Regulation of gene expression and chromatin o...,1. GTF2I is a general transcription factor IIi...,0.85
GO:0010757,PLAU CTSZ THBS1 SERPINF2 SERPINE1 SERPINE2 CPB...,8,negative regulation of plasminogen activation,CPB2 SERPINE1 PLAU THBS1 ZNF737 SMDT1 XPNPEP2 ...,ADAMTS17 CEP63 LRRC18 SUPT3H FUBP1 KCNK4 RPL27...,Extracellular matrix organization and regulat...,"1. PLAU, also known as urokinase-type plasmino...",0.92,Blood coagulation and fibrinolysis,"1. CPB2, also known as Carboxypeptidase B2 or ...",0.92,Ribosome biogenesis and protein translation,The proteins in this set are primarily involve...,0.85
GO:2000136,GNG5 TBX5 ISL1 RBPJ CTNNB1 NOTCH1 SMAD4 EYA1 B...,18,regulation of cell proliferation involved in h...,MKS1 BMP10 EYA1 SMAD4 HAND2 GNG5 RBPJ SIX1 ENG...,MGMT GPHN BANK1 NDP IL1F10 IKBKG PARD3 INCENP ...,Signaling in pancreatic development and endod...,"1. GNG5, TBX5, ISL1, and SIX1 are involved in ...",0.92,Regulation of developmental processes and sig...,"1. MKS1, BMP10, SMAD4, RBPJ, and ENG are invol...",0.92,Regulation of cellular processes and signaling,1. MGMT is a DNA repair protein that removes a...,0.82
GO:0002433,PLPP4 LYN PRKCE APPL1 PRKCD FYN VAV1 YES1 MYO1...,22,immune response-regulating cell surface recept...,ABL1 VAV3 APPL2 LYN FGR SYK PRKCE PRKCD PLPP4 ...,JAML PRKCSH PIM1 EID2 EPO UBE4A MRPL9 ASB18 SE...,Regulation of cell signaling and actin dynamics,The proteins in this set are primarily involve...,0.92,Regulation of cellular signaling and cytoskel...,"1. ABL1, SRC, LYN, FGR, HCK are non-receptor t...",0.87,Regulation of protein synthesis and modification,"1. PRKCSH, also known as protein kinase C subs...",0.87
GO:1990874,DBH NF1 ERN1 MMP2 HPGD IGFBP5 TGFB3 DDIT3 MAP3...,61,vascular associated smooth muscle cell prolife...,MAP3K7 PDGFB HPGD CDKN1A IGFBP5 EFEMP2 FGF9 TG...,GARNL3 OR1J1 SPP2 USP17L2 ARMH3 ANKRD13A HELZ ...,"Regulation of cell growth, survival, and meta...",The proteins in this set are primarily involve...,0.92,"Regulation of cell growth, differentiation, a...",The system of interacting proteins consists of...,0.92,Regulation of gene expression and protein hom...,"1. GARNL3, USP17L2, and UBE2I are involved in ...",0.85
GO:0002792,ADRA2A KCNB1 SFRP1 IRS1 GHSR CRH LEP CD74 PFKL...,46,negative regulation of peptide secretion,ADRA2A PPP3CA FOXO1 CHGA SFRP1 SREBF1 FAM3D LE...,DCAF7 CIBAR2 GPNMB HAND1 ATP5MG MRI1 MOCOS CYP...,Metabolic regulation and cellular signaling i...,"1. ADRA2A, KCNB1, KCNJ11, and ABCC8 are involv...",0.92,Signal transduction and metabolic regulation,The system of interacting proteins consists of...,0.92,Regulation of transcription and translation,The system of interacting proteins includes se...,0.92
GO:1900368,PUM1 MAEL TIAL1 RIPK1 TP53 FXR1 BCDIN3D FMR1 A...,25,regulation of post-transcriptional gene silenc...,BMP4 ZMPSTE24 MAEL PUM2 ELAVL1 ZFP36 LIN28B IL...,GPR75-ASB3 ELOC ADORA2B RPL41 PFDN5 PPP1R12C C...,Regulation of transcription and RNA processing,1. PUM1 and PUM2 are highly similar in sequenc...,0.92,Regulation of gene expression and cellular re...,"1. BMP4, TGFB1, and IL6 are growth factors tha...",0.85,Regulation of transcription and protein synth...,The system of interacting proteins is primaril...,0.92
GO:2000524,LILRB2 LILRB4 CD160,3,negative regulation of T cell costimulation,LILRB4 RIPK1 RASSF2,ATOH7 DRAM1 TOGARAM2,Immune cell regulation and activation,1. LILRB2 and LILRB4 are both receptors of the...,0.92,Immune regulation and apoptosis,"1. LILRB4, also known as leukocyte immunoglobu...",0.92,Regulation of centriole biogenesis and ciliog...,"1. ATOH7, also known as Math6, is a basic heli...",0.92
GO:2000556,IL18 ARID5A TBX21 SLAMF1 IL1R1 IL1B XCL1 IL18R1,8,positive regulation of T-helper 1 cell cytokin...,IL18 XCL1 SLAMF1 IL1R1 RPS6KA3 OR9Q1 OR52M1 ZFP36,PTH2 MGAT4C POM121 KRTAP4-9 DEFA1 SRPRA U2AF1 F2,Interleukin-1 (IL-1) signaling and inflammato...,"1. IL18, IL18R1, IL1B, and IL1R1 are key compo...",0.98,Inflammatory response and immune cell regulation,"1. IL18, XCL1, SLAMF1, and IL1R1 are involved ...",0.87,Protein glycosylation and trafficking,1. MGAT4C is a gene that encodes for N-acetylg...,0.87
GO:0120253,CYP2F1 BCO1 BCO2,3,hydrocarbon catabolic process,CYP2F1 NUF2 BTBD1,MORC3 ZNF627 EHMT1,Fat-soluble vitamin metabolism,1. CYP2F1 is a member of the cytochrome P450 s...,0.95,Unclear biological function,1. CYP2F1 is a member of the cytochrome P450 s...,0.35,Chromatin modification and transcriptional re...,"1. MORC3, a member of the microrchidia family,...",0.85


In [42]:
# check if there is any None in the analysis column, then rerun the pipeline

initialize = False 

SEED = 42
# model_options = ['gemini-pro','mistral:7b', 'mixtral:latest', 'llama2:7b', 'llama2:70b']
model_options = ['mixtral:latest']  # llama2 7b has formatting issue, ingore and 70b is too big causing server issue

if __name__ == "__main__":
    for m in model_options:
        input_file
        model = m
        
        if '-' in model:
            name_fix = '_'.join(model.split('-')[:2])
        else:
            name_fix = model.replace(':', '_')
        input_file = f'data/GO_term_analysis/LLM_processed_toy_example_w_contamination_{name_fix}.tsv' # replace with your actual input file
        out_file = f'data/GO_term_analysis/LLM_processed_toy_example_w_contamination_{name_fix}'  # save to the same file name as the input file
        LOG_FILE = config['LOG_NAME']+f'_{name_fix}'+'_log.json'

        df = pd.read_csv(input_file, sep=input_sep, index_col=set_index)
        # print(df.head())
        column_prefix = name_fix + '_default' #this is default
        print(column_prefix)
        
        gene_column = constant.GO_GENE_COL
        print(gene_column)
        if initialize:
            # initialize the input file with llm names, analysis and score to None
            df[f'{column_prefix} Name'] = None
            df[f'{column_prefix} Analysis'] = None
            df[f'{column_prefix} Score'] = None
        main(df)  ## run with the real set 
        
        ## run the pipeline for contaiminated gene sets 
        contaminated_columns = [col for col in df.columns if col.endswith('contaminated_Genes')]
        # print(contaminated_columns)
        for col in contaminated_columns:
            gene_column = col ## Note need to change the gene_column to the contaminated column
            print(gene_column)
            contam_prefix = '_'.join(col.split('_')[0:2])
            column_prefix = name_fix + '_' +contam_prefix
            print(column_prefix)

            if initialize:
                # initialize the input file with llm names, analysis and score to None
                df[f'{column_prefix} Name'] = None
                df[f'{column_prefix} Analysis'] = None
                df[f'{column_prefix} Score'] = None
            main(df)
            
print("Done")

mixtral_latest_default
Genes


100%|██████████| 10/10 [00:00<00:00, 14614.30it/s]


50perc_contaminated_Genes
mixtral_latest_50perc_contaminated


100%|██████████| 10/10 [00:00<00:00, 14553.45it/s]


100perc_contaminated_Genes
mixtral_latest_100perc_contaminated


100%|██████████| 10/10 [00:00<00:00, 20126.22it/s]

Done





In [44]:
import os 
from glob import glob


initialize = True 
input_file = 'data/GO_term_analysis/toy_example_w_contaminated.csv'
input_sep = constant.GO_FILE_SEP
set_index = constant.GO_INDEX_COL  
gene_column = constant.GO_GENE_COL 
gene_sep = ' '

## create a param file 
configs = glob('./jsonFiles/toyexample_*.json')
params = []
for conf_file in configs:
    model_names = '_'.join(conf_file.split('/')[-1].split('.')[0].split('_')[1:])
    # print(model_names)
    out_file = f'data/GO_term_analysis/LLM_processed_toy_example_w_contamination_{model_names}'  
    param = f"--config {conf_file} \
        --initialize  {initialize}\
        --input {input_file} \
        --input_sep  '{input_sep}'\
        --set_index {set_index} \
        --gene_column {gene_column}\
        --gene_sep '{gene_sep}' \
        --start 0 \
        --end 10 \
        --output_file {out_file}"
    print(param)
    params.append(param)

with open('toy_example_params.txt', 'w') as f:
    for p in params:
        f.write(p+'\n')

--config ./jsonFiles/toyexample_mixtral_latest.json         --initialize  True        --input data/GO_term_analysis/toy_example_w_contaminated.csv         --input_sep  ','        --set_index GO         --gene_column Genes        --gene_sep ' '         --start 0         --end 10         --output_file data/GO_term_analysis/LLM_processed_toy_example_w_contamination_mixtral_latest
--config ./jsonFiles/toyexample_llama2_70b.json         --initialize  True        --input data/GO_term_analysis/toy_example_w_contaminated.csv         --input_sep  ','        --set_index GO         --gene_column Genes        --gene_sep ' '         --start 0         --end 10         --output_file data/GO_term_analysis/LLM_processed_toy_example_w_contamination_llama2_70b
--config ./jsonFiles/toyexample_llama2_7b.json         --initialize  True        --input data/GO_term_analysis/toy_example_w_contaminated.csv         --input_sep  ','        --set_index GO         --gene_column Genes        --gene_sep ' '         -

In [73]:
# add a gene set == 100 and run the analysis. add to the toy example file
selected_go_terms = pd.read_csv('data/GO_term_analysis/1000_selected_go_contaminated.csv')

# only one that has gene count == 100
gene_count100 = selected_go_terms[selected_go_terms['Gene_Count'] == 100].reset_index(drop=True)

model_options = ['gpt-4-1106-preview','gemini-pro','mistral:7b', 'mixtral:latest', 'llama2:7b']
contaminated_columns = [col for col in gene_count100.columns if col.endswith('contaminated_Genes')]
columns = [constant.GO_GENE_COL] + contaminated_columns
print(columns)
for model in model_options:
    df = gene_count100.copy()
    for col in columns:

        gene_column = col
        genes = gene_count100[gene_column].iloc[0].split(' ')
        prompt = make_user_prompt_with_score(genes)

        if '-' in model:
            name_fix = '_'.join(model.split('-')[:2])
        else:
            name_fix = model.replace(':', '_')
        LOG_FILE = config['LOG_NAME']+f'_{name_fix}'+'_log.json'
        if col == constant.GO_GENE_COL:
            column_prefix = name_fix + '_default'
        else:
            contam_prefix = '_'.join(col.split('_')[0:2])
            column_prefix = name_fix + '_' +contam_prefix
        print(column_prefix)
        finger_print = None
        if 'gpt' in model:
            print("Accessing OpenAI API")
            analysis, finger_print = openai_chat(context, prompt, model, temperature, max_tokens, rate_per_token, LOG_FILE, DOLLAR_LIMIT, SEED)
        elif model.startswith('gemini'):
            print("Using Google Gemini API")
            analysis, error_message = query_genai_model(f"{context}\n{prompt}", model, temperature, max_tokens, LOG_FILE)
        else: 
            print("Using server model")
            analysis, error_message= server_model_chat(context, prompt, model, temperature, max_tokens,LOG_FILE, SEED)
        if analysis:
            # print(analysis)
            llm_name, llm_score, llm_analysis = process_analysis(analysis)
            # clean up the score and return float
            try:
                llm_score_value =  float(re.sub("[^0-9.-]", "", llm_score))
            except ValueError:
                llm_score_value = llm_score
            df.loc[0,f'{column_prefix } Name'] = llm_name
            df.loc[0,f'{column_prefix } Analysis'] = llm_analysis
            df.loc[0,f'{column_prefix } Score'] = llm_score_value
            print(f'Success for {gene_column} {name_fix}.')
            if finger_print:
                print(f'GPT_Fingerprint for {gene_column}: {finger_print}')
        else:    
            print(f'Error for query gene set {gene_column}: {error_message}')
    df.to_csv(f'data/GO_term_analysis/add_on_LLM_processed_toy_example_w_contamination_{name_fix}.tsv', index=False,sep='\t')
    print('save file for ', model)

['Genes', '50perc_contaminated_Genes', '100perc_contaminated_Genes']
gpt_4_default
Accessing OpenAI API
2046
Success for Genes gpt_4.
GPT_Fingerprint for Genes: fp_3905aa4f79
gpt_4_50perc_contaminated
Accessing OpenAI API
1804
Success for 50perc_contaminated_Genes gpt_4.
GPT_Fingerprint for 50perc_contaminated_Genes: fp_3905aa4f79
gpt_4_100perc_contaminated
Accessing OpenAI API
1760
Success for 100perc_contaminated_Genes gpt_4.
GPT_Fingerprint for 100perc_contaminated_Genes: fp_3905aa4f79
save file for  gpt-4-1106-preview
gemini_pro_default
Using Google Gemini API
Success for Genes gemini_pro.
gemini_pro_50perc_contaminated
Using Google Gemini API
Success for 50perc_contaminated_Genes gemini_pro.
gemini_pro_100perc_contaminated
Using Google Gemini API
Success for 100perc_contaminated_Genes gemini_pro.
save file for  gemini-pro
mistral_7b_default
Using server model
Encountering server issue 502. Retrying in  10  seconds
Success for Genes mistral_7b.
mistral_7b_50perc_contaminated
Using 

In [77]:
import time
df = pd.read_csv('data/GO_term_analysis/add_on_LLM_processed_toy_example_w_contamination_mixtral_latest.tsv', sep='\t', index_col=constant.GO_INDEX_COL)
# rerun the pipeline for 'Genes'
genes = df['Genes'].iloc[0].split(' ')
prompt = make_user_prompt_with_score(genes)
model = 'mixtral:latest'
name_fix = '_'.join(model.split(':')[:2])
column_prefix = name_fix + '_default'
print(column_prefix)
LOG_FILE = config['LOG_NAME']+f'_{name_fix}'+'_log.json'
analysis, err  = server_model_chat(context, prompt, model, temperature, max_tokens,LOG_FILE, SEED)
if analysis:
    llm_name, llm_score, llm_analysis = process_analysis(analysis)
    llm_score_value =  float(re.sub("[^0-9.-]", "", llm_score))
    df.loc[0,f'{column_prefix } Name'] = llm_name
    df.loc[0,f'{column_prefix } Analysis'] = llm_analysis
    df.loc[0,f'{column_prefix } Score'] = llm_score_value
    print(f'Success for {gene_column} {name_fix}.')
else:
    time.sleep(20)
    #retry
    analysis, err  = server_model_chat(context, prompt, model, temperature, max_tokens,LOG_FILE, SEED)
    if analysis:
        llm_name, llm_score, llm_analysis = process_analysis(analysis)
        llm_score_value =  float(re.sub("[^0-9.-]", "", llm_score))
        df.loc[0,f'{column_prefix } Name'] = llm_name
        df.loc[0,f'{column_prefix } Analysis'] = llm_analysis
        df.loc[0,f'{column_prefix } Score'] = llm_score_value
        print(f'Success for {gene_column} {name_fix}.')
    else:
        print(f'Error for query gene set {gene_column}: {err}')
    
print(df.head())


mixtral_latest_default
Encountering server issue 502. Retrying in  10  seconds
Encountering server issue 502. Retrying in  20  seconds
Encountering server issue 502. Retrying in  40  seconds
Encountering server issue 502. Retrying in  80  seconds
Encountering server issue 502. Retrying in  160  seconds
Encountering server issue 502. Retrying in  10  seconds
Encountering server issue 502. Retrying in  20  seconds
Encountering server issue 502. Retrying in  40  seconds
Encountering server issue 502. Retrying in  80  seconds
Encountering server issue 502. Retrying in  160  seconds
Error for query gene set 100perc_contaminated_Genes: Error: Max retries exceeded, last response error was: 502
                                                        Genes  Gene_Count  \
GO                                                                          
GO:0090100  SOX11 GDF5 TGFB3 NOTCH1 AMH TNXB FERMT1 TGFBR3...         100   

                                             Term_Description  \
GO     

In [None]:
# Define start, step, and end values
start = 0
step = 50
end = 1000

# Create a range list
range_list = list(range(start, end + step, step))

# Create tuples for each consecutive pair in the list
tuple_list = [(range_list[i], range_list[i+1]) for i in range(len(range_list)-1)]

tuple_list

In [None]:
# test the script for batch run

input_file = 'data/GO_term_analysis/toy_example.csv'
config = './jsonFiles/GOLLMrun_config.json'
%run query_llm_for_analysis.py --input $input_file --start 0 --end 1 --config $config

## Checkout and combine the output from the batch run 

In [None]:
from glob import glob
import pandas as pd
import json

### sanity check code along the way
processed_files = glob('data/GO_term_analysis/LLM_processed_selected_go_terms*.tsv')

for file in processed_files:
    df = pd.read_csv(file, sep='\t')
    df.set_index('GO', inplace=True)
    ranges = file.split('/')[-1].split('.')[0].split('_')[5:7]
    with open(f'data/GO_term_analysis/LLM_response_go_terms_{ranges[0]}_{ranges[1]}.json') as fp:
        llm_response_dict = json.load(fp)
    for go_term, row in df.iterrows():
        if llm_response_dict[go_term] == 'NO ANALYSIS':
            print(file.split('/')[-1])
            print(f'No analysis for {go_term}')
            continue
        else:
            llm_analysis = llm_response_dict[go_term].split('\n', 2)[2]
            if df.loc[go_term, 'LLM Analysis'] != llm_analysis:
                print(f'LLM analysis for {go_term} is different')
            
    df.reset_index(inplace=True)
#     # print(ranges)
    print(df.shape)

    
combined_df = pd.concat([pd.read_csv(f, sep = '\t') for f in processed_files])
print(combined_df.shape)
print('Any duplicated GO: ',combined_df['GO'].duplicated().sum())
print('Any NAs in the LLM res: ', combined_df['LLM Name'].isna().sum())
print('Any duplicated LLM analysis: ', combined_df['LLM Analysis'].duplicated(keep=False).sum())

combined_df.to_csv('data/GO_term_analysis/LLM_processed_selected_1000_go_terms.tsv', index=False, sep='\t')