## Query GPT-4 for name and analysis using a toy example

#### This uses an improved version of the original prompt that includes instructions to generate an LLM Confidence Score.

#### The prompt also includes an example analysis to help the LLM in its task.

#### The LLM Score has its own column in the output TSV file.

#### The JSON config file is updated to use "GPT-4_1106-preview" build.

Update 12-21-2023

New models are available through the API: https://api.llm.ideker.ucsd.edu/api/chat

available models:

| NAME           | ID           | SIZE   |
|----------------|--------------|--------|
| llama2:70b     | c3a7af098300 | 38 GB  |
| llama2:7b      | fe938a131f40 | 3.8 GB |
| llama2:latest  | fe938a131f40 | 3.8 GB |
| mistral:7b     | 4d9f4b269c33 | 4.1 GB |
| mixtral:latest | 99a9202f8a7a | 26 GB  |

In [3]:
import pandas as pd
import json 
from utils.openai_query import openai_chat
from utils.prompt_factory import make_user_prompt_with_score
from utils.server_model_query import server_model_chat
from utils.llm_analysis_utils import process_analysis, save_progress
from utils.genai_query import query_genai_model
from tqdm import tqdm
import constant
import openai
import os
import logging
import re
%load_ext autoreload

%autoreload 2


**The default run uses GPT-4**

In [2]:
## load variables
# Notebook-level run settings. Every name defined in this cell is read as a
# global by later cells (notably main()), so the names must stay unchanged.
initialize = True # if True, then initialize the input table with llm names, analysis and score to None 
# Replace with your actual values
config_file = './jsonFiles/toyexample.json'  # replace with your actual config file 
input_file = 'data/GO_term_analysis/toy_example_w_contaminated.csv' # replace with your actual input file
input_sep = ','  # replace with the separator
set_index = 'GO'  # replace with your column name that you want to set as index or None
gene_column = 'Genes'  # replace with your actual column name for the gene list
gene_sep = ' '  # replace with your actual separator
gene_features = None  # replace with your path to the gene features or None if you don't want to include in the prompt
direct = False # if True, then the prompt will be a direct sentence asking for a name and analysis from the gene set, otherwise default or customized prompt
out_file = 'data/GO_term_analysis/0120_heavychangeprompt_LLM_processed_toy_example_w_contamination_gpt_4'  # replace with your actual output file name

customized_prompt = False # if True, then the prompt will be the custom prompt, if False, then the prompt will use default

# load the config file
with open(config_file) as json_file:
    config = json.load(json_file)

# `customized_prompt` starts as a boolean switch and ends up holding either
# the prompt text or None.
if customized_prompt:
    prompt_path = config['CUSTOM_PROMPT_FILE']  # replace with your actual customized prompt file
    # make sure the file exist 
    if os.path.isfile(prompt_path):
        with open(prompt_path, 'r') as f:
            customized_prompt = f.read()
        # Raise instead of assert: asserts are stripped when Python runs with -O.
        if len(customized_prompt) <= 1:
            raise ValueError("Customized prompt is empty")
    else:
        print("Customized prompt file does not exist")
        customized_prompt = None
else:
    customized_prompt = None

# Load context and model used 
context = config['CONTEXT']
model = config['MODEL']
temperature = config['TEMP']
max_tokens = config['MAX_TOKENS']
if model.startswith('gpt'):
    # The OpenAI key and cost settings are only needed for GPT models, so a
    # missing OPENAI_API_KEY no longer aborts runs that use other models.
    openai.api_key = os.environ["OPENAI_API_KEY"]
    rate_per_token = config['RATE_PER_TOKEN']
    DOLLAR_LIMIT = config['DOLLAR_LIMIT']
else:
    # Keep the names defined so later cells never hit a NameError.
    rate_per_token = None
    DOLLAR_LIMIT = None
LOG_FILE = config['LOG_NAME']+'_log.json'

SEED = constant.SEED
# Default column prefix: the part of the model name before the first '-'.
column_prefix = model.split('-')[0]

In [3]:
# handle the logger so it create a new one for each model run
def get_logger(filename):
    """Return the INFO-level logger registered under ``filename``.

    The logger name doubles as the path of its log file. A file handler is
    attached only the first time a given name is requested, so repeated calls
    with the same ``filename`` reuse one handler instead of duplicating output.
    """
    log = logging.getLogger(filename)
    log.setLevel(logging.INFO)

    # Already configured by an earlier call: nothing more to attach.
    if log.handlers:
        return log

    handler = logging.FileHandler(filename)
    handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    )
    log.addHandler(handler)
    return log


def main(df):
    """Query the configured LLM for every not-yet-processed gene set in ``df``.

    The function reads its settings from notebook-level globals at call time:
    ``column_prefix``, ``gene_column``, ``gene_sep``, ``model``, ``context``,
    ``temperature``, ``max_tokens``, ``LOG_FILE``, ``SEED``, ``out_file`` and,
    for GPT models, ``rate_per_token`` and ``DOLLAR_LIMIT``.

    For each row:
      * rows whose ``'{column_prefix} Analysis'`` cell is already filled are
        skipped;
      * rows whose gene cell is not a string, or that split into more than
        1000 genes, are skipped with a warning in the run log;
      * otherwise the prompt is sent to the backend chosen by the model name
        prefix ('gpt' -> OpenAI, 'gemini' -> Google, anything else -> server
        model) and the parsed name / analysis / score are written into
        ``df`` in place.

    Progress is written with ``save_progress`` every 10 queried rows and once
    more at the end.

    Args:
        df: table indexed by gene-set id that already contains the
            ``'{column_prefix} Name' / ' Analysis' / ' Score'`` columns.

    Returns:
        None. ``df`` is modified in place.
    """
    analysis_dict = {}

    logger = get_logger(f'{out_file}.log')

    i = 0  # number of queried rows; used to track progress and trigger saves
    for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
        # only process None rows
        if pd.notna(row[f'{column_prefix} Analysis']):
            continue

        gene_data = row[gene_column]
        # if gene_data is not a string (e.g. NaN for a missing cell), then skip
        if not isinstance(gene_data, str):
            logger.warning(f'Gene set {idx} is not a string, skipping')
            continue
        genes = gene_data.split(gene_sep)

        if len(genes) > 1000:
            logger.warning(f'Gene set {idx} is too big, skipping')
            continue

        try:
            prompt = make_user_prompt_with_score(genes)
            finger_print = None
            # Bound up front: the GPT branch does not return an error message,
            # so without this default an empty GPT response raised NameError
            # in the logging call below instead of reporting the failure.
            error_message = 'no analysis returned'
            if model.startswith('gpt'):
                print("Accessing OpenAI API")
                analysis, finger_print = openai_chat(context, prompt, model, temperature, max_tokens, rate_per_token, LOG_FILE, DOLLAR_LIMIT, SEED)
            elif model.startswith('gemini'):
                print("Using Google Gemini API")
                analysis, error_message = query_genai_model(f"{context}\n{prompt}", model, temperature, max_tokens, LOG_FILE) 
            else:
                print("Using server model")
                analysis, error_message = server_model_chat(context, prompt, model, temperature, max_tokens, LOG_FILE, SEED)

            if analysis:
                llm_name, llm_score, llm_analysis = process_analysis(analysis)
                # clean up the score and return float; keep the raw score when
                # the stripped text is not a valid number
                try:
                    llm_score_value = float(re.sub("[^0-9.-]", "", llm_score))
                except ValueError:
                    llm_score_value = llm_score

                df.loc[idx, f'{column_prefix} Name'] = llm_name
                df.loc[idx, f'{column_prefix} Analysis'] = llm_analysis
                df.loc[idx, f'{column_prefix} Score'] = llm_score_value
                # keep the raw response, keyed by gene set and run prefix
                analysis_dict[f'{idx}_{column_prefix}'] = analysis
                # Log success with fingerprint
                logger.info(f'Success for {idx} {column_prefix}.')
                if finger_print:
                    logger.info(f'GPT_Fingerprint for {idx}: {finger_print}')

            else:
                logger.error(f'Error for query gene set {idx}: {error_message}')

        except Exception as e:
            # Best effort: one failing gene set must not abort the whole run.
            logger.error(f'Error for {idx}: {e}')
            continue
        i += 1
        if i % 10 == 0:
            save_progress(df, analysis_dict, out_file)
            print(f"Saved progress for {i} genesets")
    # save the final file
    save_progress(df, analysis_dict, out_file)
    

In [4]:
#Define your own loop for running the pipeline
## 12-18-2023: this loop runs the default gene set and the contaminated gene sets
## it can be modified for different models or to run only the default gene set

##12-27-23: edited the prompt
if __name__ == "__main__":

    df = pd.read_csv(input_file, sep=input_sep, index_col=set_index)

    # Column-name stem derived from the model id: the first two '-'-separated
    # pieces for GPT models, otherwise the id with ':' replaced by '_'.
    name_fix = '_'.join(model.split('-')[:2]) if 'gpt' in model else model.replace(':', '_')

    # main() reads `column_prefix` and `gene_column` as globals, so they are
    # rebound here before every call.
    column_prefix = name_fix + '_default'
    print(column_prefix)

    if initialize:
        # initialize the input file with llm names, analysis and score to None
        for suffix in ('Name', 'Analysis', 'Score'):
            df[f'{column_prefix} {suffix}'] = None
    main(df)  ## run with the real set

    ## run the pipeline for contaminated gene sets
    contaminated_columns = [col for col in df.columns if col.endswith('contaminated_Genes')]
    for col in contaminated_columns:
        gene_column = col  ## Note: gene_column must point at the contaminated column
        # e.g. '50perc_contaminated_Genes' -> '50perc_contaminated'
        contam_prefix = '_'.join(col.split('_')[0:2])

        column_prefix = f'{name_fix}_{contam_prefix}'
        print(column_prefix)

        if initialize:
            # initialize the input file with llm names, analysis and score to None
            for suffix in ('Name', 'Analysis', 'Score'):
                df[f'{column_prefix} {suffix}'] = None
        main(df)
    df.head()


gpt_4_default


  0%|          | 0/11 [00:00<?, ?it/s]

Accessing OpenAI API


  9%|▉         | 1/11 [00:21<03:36, 21.65s/it]

1761
Accessing OpenAI API


 18%|█▊        | 2/11 [00:37<02:46, 18.47s/it]

1646
Accessing OpenAI API


 27%|██▋       | 3/11 [01:04<02:57, 22.21s/it]

1519
Accessing OpenAI API


 36%|███▋      | 4/11 [01:26<02:35, 22.20s/it]

1852
Accessing OpenAI API


 45%|████▌     | 5/11 [02:07<02:54, 29.01s/it]

1845
Accessing OpenAI API


 55%|█████▍    | 6/11 [02:54<02:55, 35.04s/it]

1886
Accessing OpenAI API


 64%|██████▎   | 7/11 [03:33<02:25, 36.41s/it]

1795
Accessing OpenAI API


 73%|███████▎  | 8/11 [03:44<01:25, 28.37s/it]

1319
Accessing OpenAI API


 82%|████████▏ | 9/11 [04:08<00:53, 26.76s/it]

1537
Accessing OpenAI API


 91%|█████████ | 10/11 [04:22<00:23, 23.07s/it]

1354
Saved progress for 10 genesets
Accessing OpenAI API


100%|██████████| 11/11 [05:09<00:00, 28.10s/it]


2096
gpt_4_50perc_contaminated


  0%|          | 0/11 [00:00<?, ?it/s]

Accessing OpenAI API


  9%|▉         | 1/11 [00:32<05:28, 32.87s/it]

1672
Accessing OpenAI API


 18%|█▊        | 2/11 [00:52<03:43, 24.82s/it]

1678
Accessing OpenAI API


 27%|██▋       | 3/11 [01:21<03:35, 26.98s/it]

1714
Accessing OpenAI API


 36%|███▋      | 4/11 [01:40<02:47, 23.97s/it]

1686
Accessing OpenAI API


 45%|████▌     | 5/11 [02:06<02:26, 24.39s/it]

1673
Accessing OpenAI API


 55%|█████▍    | 6/11 [02:24<01:51, 22.38s/it]

1786
Accessing OpenAI API


 64%|██████▎   | 7/11 [02:51<01:35, 23.86s/it]

1488
Accessing OpenAI API


 73%|███████▎  | 8/11 [03:16<01:12, 24.29s/it]

1442
Accessing OpenAI API


 82%|████████▏ | 9/11 [03:46<00:51, 25.96s/it]

1560
Accessing OpenAI API


 91%|█████████ | 10/11 [04:00<00:22, 22.42s/it]

1324
Saved progress for 10 genesets
Accessing OpenAI API


100%|██████████| 11/11 [04:20<00:00, 23.67s/it]


2076
gpt_4_100perc_contaminated


  0%|          | 0/11 [00:00<?, ?it/s]

Accessing OpenAI API


  9%|▉         | 1/11 [00:27<04:35, 27.60s/it]

2010
Accessing OpenAI API


 18%|█▊        | 2/11 [00:43<03:06, 20.70s/it]

1556
Accessing OpenAI API


 27%|██▋       | 3/11 [01:40<04:59, 37.40s/it]

1813
Accessing OpenAI API


 36%|███▋      | 4/11 [02:22<04:32, 39.00s/it]

1900
Accessing OpenAI API


 45%|████▌     | 5/11 [02:54<03:39, 36.65s/it]

1757
Accessing OpenAI API


 55%|█████▍    | 6/11 [03:24<02:51, 34.38s/it]

1625
Accessing OpenAI API


 64%|██████▎   | 7/11 [03:47<02:02, 30.54s/it]

1612
Accessing OpenAI API


 73%|███████▎  | 8/11 [04:04<01:18, 26.16s/it]

1343
Accessing OpenAI API


 82%|████████▏ | 9/11 [04:16<00:43, 21.75s/it]

1475
Accessing OpenAI API


 91%|█████████ | 10/11 [04:58<00:28, 28.00s/it]

1423
Saved progress for 10 genesets
Accessing OpenAI API


100%|██████████| 11/11 [05:21<00:00, 29.25s/it]

1802





In [4]:
# Preview the prompt that is generated for one gene set (GO:2000136).
# Note: `df` ends up holding that single row, not the full table.
df = pd.read_csv(input_file, sep=input_sep, index_col=set_index).loc['GO:2000136', :]

genes = df['Genes'].split(' ')
prompt_text = make_user_prompt_with_score(genes)
print(prompt_text)


Write a critical analysis of the biological processes performed by this system of interacting proteins.
Base your analysis on prior knowledge available in your training data.
After completing your analysis, propose a brief and detailed name for the most prominent biological process performed by the system.
    
After completing your analysis, please also assign a confidence score to the process name you selected.
This score should follow the name in parentheses and range from 0.00 to 1.00. A score of 0.00 indicates the lowest confidence,
while 1.00 reflects the highest confidence. This score helps gauge how accurately the chosen name represents the functions and activities
within the system of interacting proteins. When determining your score, consider the proportion of genes in the protein system that participate
in the identified biological process. For instance, if you select "Ribosome biogenesis" as the process name but only a few genes in the system 
contribute to this process, t

In [43]:
# Display the current contents of `df` (whatever the last executed cell left in it).
df

Unnamed: 0_level_0,Genes,Gene_Count,Term_Description,50perc_contaminated_Genes,100perc_contaminated_Genes,mixtral_latest_default Name,mixtral_latest_default Analysis,mixtral_latest_default Score,mixtral_latest_50perc_contaminated Name,mixtral_latest_50perc_contaminated Analysis,mixtral_latest_50perc_contaminated Score,mixtral_latest_100perc_contaminated Name,mixtral_latest_100perc_contaminated Analysis,mixtral_latest_100perc_contaminated Score
GO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
GO:0045940,STAR WNT4 ADM APOE CES1 STARD4 NR1D1 TNF FSHB ...,25,positive regulation of steroid metabolic process,WNT4 IL1A FSHB DAB2 IFNG ABCG1 APOA1 STAR CGA ...,CALD1 VPS29 MTRF1 MVP GTF2I CC2D1B EPHA7 SPACA...,Cholesterol and lipid homeostasis,"1. STAR, SCAP, and SREBF1/2 are involved in th...",0.92,"Cholesterol and lipid metabolism, immune resp...",The system of interacting proteins includes se...,0.85,Regulation of gene expression and chromatin o...,1. GTF2I is a general transcription factor IIi...,0.85
GO:0010757,PLAU CTSZ THBS1 SERPINF2 SERPINE1 SERPINE2 CPB...,8,negative regulation of plasminogen activation,CPB2 SERPINE1 PLAU THBS1 ZNF737 SMDT1 XPNPEP2 ...,ADAMTS17 CEP63 LRRC18 SUPT3H FUBP1 KCNK4 RPL27...,Extracellular matrix organization and regulat...,"1. PLAU, also known as urokinase-type plasmino...",0.92,Blood coagulation and fibrinolysis,"1. CPB2, also known as Carboxypeptidase B2 or ...",0.92,Ribosome biogenesis and protein translation,The proteins in this set are primarily involve...,0.85
GO:2000136,GNG5 TBX5 ISL1 RBPJ CTNNB1 NOTCH1 SMAD4 EYA1 B...,18,regulation of cell proliferation involved in h...,MKS1 BMP10 EYA1 SMAD4 HAND2 GNG5 RBPJ SIX1 ENG...,MGMT GPHN BANK1 NDP IL1F10 IKBKG PARD3 INCENP ...,Signaling in pancreatic development and endod...,"1. GNG5, TBX5, ISL1, and SIX1 are involved in ...",0.92,Regulation of developmental processes and sig...,"1. MKS1, BMP10, SMAD4, RBPJ, and ENG are invol...",0.92,Regulation of cellular processes and signaling,1. MGMT is a DNA repair protein that removes a...,0.82
GO:0002433,PLPP4 LYN PRKCE APPL1 PRKCD FYN VAV1 YES1 MYO1...,22,immune response-regulating cell surface recept...,ABL1 VAV3 APPL2 LYN FGR SYK PRKCE PRKCD PLPP4 ...,JAML PRKCSH PIM1 EID2 EPO UBE4A MRPL9 ASB18 SE...,Regulation of cell signaling and actin dynamics,The proteins in this set are primarily involve...,0.92,Regulation of cellular signaling and cytoskel...,"1. ABL1, SRC, LYN, FGR, HCK are non-receptor t...",0.87,Regulation of protein synthesis and modification,"1. PRKCSH, also known as protein kinase C subs...",0.87
GO:1990874,DBH NF1 ERN1 MMP2 HPGD IGFBP5 TGFB3 DDIT3 MAP3...,61,vascular associated smooth muscle cell prolife...,MAP3K7 PDGFB HPGD CDKN1A IGFBP5 EFEMP2 FGF9 TG...,GARNL3 OR1J1 SPP2 USP17L2 ARMH3 ANKRD13A HELZ ...,"Regulation of cell growth, survival, and meta...",The proteins in this set are primarily involve...,0.92,"Regulation of cell growth, differentiation, a...",The system of interacting proteins consists of...,0.92,Regulation of gene expression and protein hom...,"1. GARNL3, USP17L2, and UBE2I are involved in ...",0.85
GO:0002792,ADRA2A KCNB1 SFRP1 IRS1 GHSR CRH LEP CD74 PFKL...,46,negative regulation of peptide secretion,ADRA2A PPP3CA FOXO1 CHGA SFRP1 SREBF1 FAM3D LE...,DCAF7 CIBAR2 GPNMB HAND1 ATP5MG MRI1 MOCOS CYP...,Metabolic regulation and cellular signaling i...,"1. ADRA2A, KCNB1, KCNJ11, and ABCC8 are involv...",0.92,Signal transduction and metabolic regulation,The system of interacting proteins consists of...,0.92,Regulation of transcription and translation,The system of interacting proteins includes se...,0.92
GO:1900368,PUM1 MAEL TIAL1 RIPK1 TP53 FXR1 BCDIN3D FMR1 A...,25,regulation of post-transcriptional gene silenc...,BMP4 ZMPSTE24 MAEL PUM2 ELAVL1 ZFP36 LIN28B IL...,GPR75-ASB3 ELOC ADORA2B RPL41 PFDN5 PPP1R12C C...,Regulation of transcription and RNA processing,1. PUM1 and PUM2 are highly similar in sequenc...,0.92,Regulation of gene expression and cellular re...,"1. BMP4, TGFB1, and IL6 are growth factors tha...",0.85,Regulation of transcription and protein synth...,The system of interacting proteins is primaril...,0.92
GO:2000524,LILRB2 LILRB4 CD160,3,negative regulation of T cell costimulation,LILRB4 RIPK1 RASSF2,ATOH7 DRAM1 TOGARAM2,Immune cell regulation and activation,1. LILRB2 and LILRB4 are both receptors of the...,0.92,Immune regulation and apoptosis,"1. LILRB4, also known as leukocyte immunoglobu...",0.92,Regulation of centriole biogenesis and ciliog...,"1. ATOH7, also known as Math6, is a basic heli...",0.92
GO:2000556,IL18 ARID5A TBX21 SLAMF1 IL1R1 IL1B XCL1 IL18R1,8,positive regulation of T-helper 1 cell cytokin...,IL18 XCL1 SLAMF1 IL1R1 RPS6KA3 OR9Q1 OR52M1 ZFP36,PTH2 MGAT4C POM121 KRTAP4-9 DEFA1 SRPRA U2AF1 F2,Interleukin-1 (IL-1) signaling and inflammato...,"1. IL18, IL18R1, IL1B, and IL1R1 are key compo...",0.98,Inflammatory response and immune cell regulation,"1. IL18, XCL1, SLAMF1, and IL1R1 are involved ...",0.87,Protein glycosylation and trafficking,1. MGAT4C is a gene that encodes for N-acetylg...,0.87
GO:0120253,CYP2F1 BCO1 BCO2,3,hydrocarbon catabolic process,CYP2F1 NUF2 BTBD1,MORC3 ZNF627 EHMT1,Fat-soluble vitamin metabolism,1. CYP2F1 is a member of the cytochrome P450 s...,0.95,Unclear biological function,1. CYP2F1 is a member of the cytochrome P450 s...,0.35,Chromatin modification and transcriptional re...,"1. MORC3, a member of the microrchidia family,...",0.85


In [3]:
import pandas as pd

# All selected GO terms (with contaminated gene sets).
selected_go = pd.read_csv('data/GO_term_analysis/1000_selected_go_contaminated.csv')

# GO terms of the 100 sets that have already been run in the model comparison.
model_compare_df = pd.read_csv(
    'data/GO_term_analysis/model_compare/LLM_processed_model_compare_100set_gpt_4.tsv',
    sep='\t',
    index_col='GO',
)
model_compare_GO = model_compare_df.index.tolist()
print(len(model_compare_GO))

# Keep only the terms that were not part of that earlier run and save them.
already_run = selected_go['GO'].isin(model_compare_GO)
new = selected_go[~already_run]
print(new.shape)
new.to_csv('data/GO_term_analysis/900_selected_go_contaminated.csv', index=False)

100
(900, 6)


In [9]:
## set up parameters for running gpt4 pipeline for the 1000 gene sets
import os 
from glob import glob

# Chunking: rows [start, end) are processed in slices of `step` rows.
start = 0
step = 50
end = 900 #already ran 100 before

# Chunk boundaries, then consecutive (chunk_start, chunk_end) pairs.
range_list = list(range(start, end + step, step))
tuple_list = list(zip(range_list, range_list[1:]))


initialize = True 
input_file = './data/GO_term_analysis/900_selected_go_contaminated.csv'
input_sep = constant.GO_FILE_SEP
set_index = constant.GO_INDEX_COL  
gene_column = constant.GO_GENE_COL 
gene_sep = ' '

## create a param file: one line of command-line arguments per chunk
conf_file = './jsonFiles/thousandGOrunGPT4_config.json'
params = []
# NOTE: the loop rebinds the notebook-level `start` / `end` to each chunk.
for start, end in tuple_list:
    out_file = f'./data/GO_term_analysis/LLM_processed_gpt_4_{start}_{end}'
    # The backslash continuations keep everything on one logical line; the
    # leading spaces of each continued line are part of the resulting string.
    param = f"--config {conf_file} \
        --initialize \
        --input {input_file} \
        --input_sep  '{input_sep}'\
        --set_index {set_index} \
        --gene_column {gene_column}\
        --gene_sep '{gene_sep}' \
        --start {start} \
        --end {end} \
        --output_file {out_file}"
    print(param)
    params.append(param)
print('number of params: ', len(params))

with open('thousandGOsets_GPT4Run_params.txt', 'w') as f:
    f.writelines(line + '\n' for line in params)

--config ./jsonFiles/thousandGOrunGPT4_config.json         --initialize         --input ./data/GO_term_analysis/900_selected_go_contaminated.csv         --input_sep  ','        --set_index GO         --gene_column Genes        --gene_sep ' '         --start 0         --end 50         --output_file ./data/GO_term_analysis/LLM_processed_gpt_4_0_50
--config ./jsonFiles/thousandGOrunGPT4_config.json         --initialize         --input ./data/GO_term_analysis/900_selected_go_contaminated.csv         --input_sep  ','        --set_index GO         --gene_column Genes        --gene_sep ' '         --start 50         --end 100         --output_file ./data/GO_term_analysis/LLM_processed_gpt_4_50_100
--config ./jsonFiles/thousandGOrunGPT4_config.json         --initialize         --input ./data/GO_term_analysis/900_selected_go_contaminated.csv         --input_sep  ','        --set_index GO         --gene_column Genes        --gene_sep ' '         --start 100         --end 150         --output_fil

#### For the CC and MF branches

In [9]:
## set up parameters for running gpt4 pipeline for the 1000 gene sets for CC and MF
import os 
from glob import glob

# Chunking: rows [start, end) are processed in slices of `step` rows.
start = 0
step = 50
end = 1000 


# Chunk boundaries, then consecutive (chunk_start, chunk_end) pairs.
range_list = list(range(start, end + step, step))
tuple_list = list(zip(range_list, range_list[1:]))

input_sep = constant.GO_FILE_SEP
set_index = constant.GO_INDEX_COL  
gene_column = constant.GO_GENE_COL 
gene_sep = ' '

## create a param file: one line of command-line arguments per branch and chunk
conf_file = './jsonFiles/thousandGO_CC_MF_runGPT4_config.json'
params = []
for branch in ['CC', 'MF']:
    input_file = f'./data/GO_term_analysis/CC_MF_branch/{branch}_1000_selected_go_terms.csv'
    # NOTE: the inner loop rebinds the notebook-level `start` / `end`.
    for start, end in tuple_list:
        out_file = f'./data/GO_term_analysis/CC_MF_branch/LLM_processed_{branch}terms_gpt_4_{start}_{end}'
        # The backslash continuations keep everything on one logical line; the
        # leading spaces of each continued line are part of the resulting string.
        param = f"--config {conf_file} \
            --initialize \
            --input {input_file} \
            --input_sep  '{input_sep}'\
            --set_index {set_index} \
            --gene_column {gene_column}\
            --gene_sep '{gene_sep}' \
            --start {start} \
            --end {end} \
            --output_file {out_file}"
        print(param)
        params.append(param)
print('number of params: ', len(params))

with open('thousandGOsets_CC_MF_GPT4Run_params.txt', 'w') as f:
    f.writelines(line + '\n' for line in params)

--config ./jsonFiles/thousandGO_CC_MF_runGPT4_config.json             --initialize             --input ./data/GO_term_analysis/CC_MF_branch/CC_1000_selected_go_terms.csv             --input_sep  ','            --set_index GO             --gene_column Genes            --gene_sep ' '             --start 0             --end 50             --output_file ./data/GO_term_analysis/CC_MF_branch/LLM_processed_CCterms_gpt_4_0_50
--config ./jsonFiles/thousandGO_CC_MF_runGPT4_config.json             --initialize             --input ./data/GO_term_analysis/CC_MF_branch/CC_1000_selected_go_terms.csv             --input_sep  ','            --set_index GO             --gene_column Genes            --gene_sep ' '             --start 50             --end 100             --output_file ./data/GO_term_analysis/CC_MF_branch/LLM_processed_CCterms_gpt_4_50_100
--config ./jsonFiles/thousandGO_CC_MF_runGPT4_config.json             --initialize             --input ./data/GO_term_analysis/CC_MF_branch/CC_1000_sele

In [1]:
# test the cost and time usage 
# Sums the 'dollars_spent', 'runs' and 'time_taken_total' fields of every
# matching log file (each is parsed as one JSON document) and prints the
# totals and per-run averages.
import json 
from glob import glob

logs = glob('./logs/thousand_GO_run_gpt4*.log')
total_cost = 0
total_run = 0
time_total = 0
for log in logs:
    with open(log, 'r') as f:
        data = json.load(f)
    total_cost += data['dollars_spent']
    total_run += data['runs']
    time_total += data['time_taken_total']
print('total cost: {:.2f}'.format(total_cost))
if total_run:
    print('cost per run: {:.2f}'.format(total_cost/total_run))
    print('time per run: {:.2f}'.format(time_total/total_run))
else:
    # No matching logs (or logs with zero runs): the averages are undefined,
    # so report that instead of failing with ZeroDivisionError.
    print('no runs recorded in {} log file(s); per-run averages are undefined'.format(len(logs)))



total cost: 43.75
cost per run: 0.05
time per run: 38.11


In [None]:
# test the script for batch run
# NOTE: this cell rebinds the notebook-level names `input_file` and `config`
# (`config` held the parsed JSON settings after the config-loading cell);
# cells executed afterwards see these new values.

input_file = 'data/GO_term_analysis/toy_example.csv'
config = './jsonFiles/GOLLMrun_config.json'
# IPython substitutes $input_file and $config with the values of the Python
# variables above; only the first row (--start 0 --end 1) is processed.
%run query_llm_for_analysis.py --input $input_file --start 0 --end 1 --config $config

## Check and combine the output from the batch run

In [22]:
from glob import glob
import pandas as pd
import json

### sanity check code along the way
processed_files = glob('data/GO_term_analysis/LLM_processed_gpt_4*.tsv')
# column-name endings that identify the LLM result columns
result_suffixes = ('Analysis', 'Name', 'Score')

for file in processed_files:
    df = pd.read_csv(file, sep='\t').set_index('GO')
    # check if the Analysis, Name and Score are all filled
    columns = [col for col in df.columns if col.endswith(result_suffixes)]
    print(columns)
    for col in columns:
        missing = df[col].isna()
        n_na = missing.sum()
        if n_na > 0:
            print(f'Error in {file} for {col}, has {n_na} NAs')
            print(df[missing])
    # check if there is any duplicated GO terms
    print('Any duplicated GO: ',df.index.duplicated().sum())

    df = df.reset_index()
    print(df.shape)


# Re-read every chunk and stack them into one table.
combined_df = pd.concat([pd.read_csv(path, sep='\t') for path in processed_files])
print(combined_df.shape)
print('Any duplicated GO: ',combined_df['GO'].duplicated().sum())
analysis_columns = [col for col in combined_df.columns if col.endswith('Analysis')]
# keep=False counts every member of a duplicated group, not only the repeats
first_analysis = combined_df[analysis_columns[0]]
print('Any duplicated LLM analysis: ', first_analysis.duplicated(keep=False).sum())

combined_df.to_csv('data/GO_term_analysis/LLM_processed_selected_1000_go_terms.tsv', index=False, sep='\t')

['gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
Any duplicated GO:  0
(50, 9)
['gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
Any duplicated GO:  0
(50, 9)
['gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
Any duplicated GO:  0
(50, 9)
['gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
Any duplicated GO:  0
(50, 9)
['gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
Any duplicated GO:  0
(50, 9)
['gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
Any duplicated GO:  0
(50, 9)
['gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
Any duplicated GO:  0
(50, 9)
['gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
Any duplicated GO:  0
(50, 9)
['gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
Any duplicated GO:  0
(50, 9)
['gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
Any duplic

In [23]:
# combine with the 100 sets that were already run
model_compare_df = pd.read_csv(
    'data/GO_term_analysis/model_compare/LLM_processed_model_compare_100set_gpt_4.tsv',
    sep='\t',
)
# Restrict the 100-set table to the columns it shares with combined_df
# (in the 100-set table's own column order) before stacking the two.
common_cols = [col for col in model_compare_df.columns if col in combined_df.columns]
print(common_cols)
model_compare_df = model_compare_df[common_cols]
print(model_compare_df.shape)

combined_df = pd.concat([combined_df, model_compare_df])
print(combined_df.shape)
n_duplicated_go = combined_df['GO'].duplicated().sum()
print('Any duplicated GO: ',n_duplicated_go)


['GO', 'Genes', 'Gene_Count', 'Term_Description', '50perc_contaminated_Genes', '100perc_contaminated_Genes', 'gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
(100, 9)
(1000, 9)
Any duplicated GO:  0


In [24]:

# Write the current combined_df to the combined results file as a
# tab-separated table without the index (replaces any existing file at that path).
combined_df.to_csv('data/GO_term_analysis/LLM_processed_selected_1000_go_terms.tsv', index=False, sep='\t')

In [3]:
from glob import glob
import pandas as pd
import json

### sanity check code along the way

branches = ['CC', 'MF']
# column-name endings that identify the LLM result columns
result_suffixes = ('Analysis', 'Name', 'Score')

for branch in branches:
    branch_processed_files = glob(f'data/GO_term_analysis/CC_MF_branch/LLM_processed_{branch}terms_gpt_4*.tsv')
    print(len(branch_processed_files))
    for file in branch_processed_files:
        df = pd.read_csv(file, sep='\t').set_index('GO')
        # check if the Analysis, Name and Score are all filled
        columns = [col for col in df.columns if col.endswith(result_suffixes)]
        print(columns)
        for col in columns:
            missing = df[col].isna()
            n_na = missing.sum()
            if n_na > 0:
                print(f'Error in {file} for {col}, has {n_na} NAs')
                print(df[missing])
        # check if there is any duplicated GO terms
        n_duplicated = df.index.duplicated().sum()
        if n_duplicated > 0:
            print('Number of duplicated GO: ',n_duplicated)

        df = df.reset_index()
        print(df.shape)


    # Re-read every chunk of this branch and stack them into one table.
    combined_df = pd.concat([pd.read_csv(path, sep='\t') for path in branch_processed_files])
    print(combined_df.shape)
    print(f'Any duplicated GO in {branch} combined file: ',combined_df['GO'].duplicated().sum())
    analysis_columns = [col for col in combined_df.columns if col.endswith('Analysis')]
    # keep=False counts every member of a duplicated group, not only the repeats
    first_analysis = combined_df[analysis_columns[0]]
    print(f'Any duplicated {branch} LLM analysis: ', first_analysis.duplicated(keep=False).sum())

    combined_df.to_csv(f'data/GO_term_analysis/CC_MF_branch/LLM_processed_selected_1000_go_{branch}terms.tsv', index=False, sep='\t')

20
['gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
(50, 7)
['gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
(50, 7)
['gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
(50, 7)
['gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
(50, 7)
['gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
(50, 7)
['gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
(50, 7)
['gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
(50, 7)
['gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
(50, 7)
['gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
(50, 7)
['gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
(50, 7)
['gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
(50, 7)
['gpt_4_default Name', 'gpt_4_default Analysis', 'gpt_4_default Score']
(50, 7)
['gpt_4_default Name', 'gpt_4_default