## Set up model comparison

**gpt model**

gpt-4-1106-preview

**local model on server**

available models:

| NAME           | ID           | SIZE   |
|----------------|--------------|--------|
| llama2:70b     | c3a7af098300 | 38 GB  |
| llama2:7b      | fe938a131f40 | 3.8 GB |
| llama2:latest  | fe938a131f40 | 3.8 GB |
| mistral:7b     | 4d9f4b269c33 | 4.1 GB |
| mixtral:latest | 99a9202f8a7a | 26 GB  |


**API for calling Google Gemini pro**

Go to https://makersuite.google.com/app/apikey to get an API key for Gemini Pro, then export it as an environment variable:

export GOOGLEAI_KEY=xxxx

model = 'gemini-pro'

In [1]:
import pandas as pd
import numpy as np
import json 
from utils.openai_query import openai_chat
from utils.prompt_factory import make_user_prompt_with_score
from utils.server_model_query import server_model_chat
from utils.llm_analysis_utils import process_analysis, save_progress
from utils.genai_query import query_genai_model
from tqdm import tqdm
import constant
import openai
import os
import logging
import re
%load_ext autoreload

%autoreload 2


**Example for running in the jupyter notebook**

In [8]:
## load variables
initialize = True # if True, then initialize the input table with llm names, analysis and score to None 
# Replace with your actual values
config_file = './jsonFiles/toyexample_gpt35.json'  # replace with your actual config file 
input_file = './data/GO_term_analysis/100_selected_go_contaminated.csv' # replace with your actual input file
input_sep = ','  # replace with the separator
set_index = 'GO'  # replace with your column name that you want to set as index or None
gene_column = 'Genes'  # replace with your actual column name for the gene list
gene_sep = ' '  # replace with your actual separator
gene_features = None  # replace with your path to the gene features or None if you don't want to include in the prompt
direct = False # if True, then the prompt will be a direct sentence asking for a name and analysis from the gene set, otherwise default or customized prompt
out_file = 'data/GO_term_analysis/model_compare/LLM_processed_model_compare_gpt_35'  # replace with your actual output file name

customized_prompt = False # if True, then the prompt will be the custom prompt, if False, then the prompt will use default

# load the config file
with open(config_file) as json_file:
    config = json.load(json_file)

# After this section `customized_prompt` holds either the text of the custom
# prompt or None (default prompt); it is no longer a boolean flag.
if customized_prompt:
    custom_prompt_path = config['CUSTOM_PROMPT_FILE']  # replace with your actual customized prompt file
    # make sure the file exists
    if os.path.isfile(custom_prompt_path):
        with open(custom_prompt_path, 'r') as f:
            customized_prompt = f.read()
        # Explicit raise instead of `assert`, which is stripped under `python -O`.
        if len(customized_prompt) <= 1:
            raise ValueError("Customized prompt is empty")
    else:
        print("Customized prompt file does not exist")
        customized_prompt = None
else:
    customized_prompt = None

# Load context and model settings from the config
context = config['CONTEXT']
model = config['MODEL']
temperature = config['TEMP']
max_tokens = config['MAX_TOKENS']
if model.startswith('gpt'):
    # The OpenAI key and the cost settings are only needed for OpenAI models,
    # so Gemini / server-model runs do not fail when OPENAI_API_KEY is unset.
    openai.api_key = os.environ["OPENAI_API_KEY"]
    rate_per_token = config['RATE_PER_TOKEN']
    DOLLAR_LIMIT = config['DOLLAR_LIMIT']
LOG_FILE = config['LOG_NAME']+'_.log'

SEED = constant.SEED
# Provisional column prefix (text before the first '-' of the model name).
column_prefix = model.split('-')[0]

In [10]:
# Set up the logger so that a new one is created for each model run
def get_logger(filename):
    """Return an INFO-level logger named ``filename`` that writes to that file.

    The logger is looked up by name, so repeated calls with the same
    ``filename`` return the same logger object. A file handler is attached
    only when the logger has no handlers yet, which prevents duplicated log
    lines when this function is called more than once for the same file.
    """
    log = logging.getLogger(filename)
    log.setLevel(logging.INFO)

    # Already configured by an earlier call: reuse as is.
    if log.handlers:
        return log

    handler = logging.FileHandler(filename)
    handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    )
    log.addHandler(handler)
    return log


def main(df):
    """Query the configured model for every unprocessed gene set in ``df``.

    For each row whose ``'<column_prefix> Analysis'`` cell is still empty, the
    genes in ``gene_column`` are split on ``gene_sep``, turned into a prompt
    and sent to the model selected by the module-level ``model`` name
    (OpenAI for ``gpt*``, Google Gemini for ``gemini*``, the server model
    otherwise). The parsed name, analysis and score are written in place into
    the ``'<column_prefix> Name'``, ``'<column_prefix> Analysis'`` and
    ``'<column_prefix> Score'`` columns of ``df``. Progress is saved with
    ``save_progress`` after every 10 queried rows and once more at the end.

    The function reads these module-level settings at call time:
    ``out_file``, ``column_prefix``, ``gene_column``, ``gene_sep``, ``model``,
    ``context``, ``temperature``, ``max_tokens``, ``LOG_FILE``, ``SEED`` and,
    for OpenAI models, ``rate_per_token`` and ``DOLLAR_LIMIT``.

    Args:
        df: pandas DataFrame holding the gene sets and the three result
            columns for the current ``column_prefix``. It is modified in place.

    Returns:
        None.
    """
    analysis_dict = {}

    logger = get_logger(f'{out_file}.log')

    # bin scores into no score, low score, medium score, high score
    # 0 is no score (name not assigned), (0, 0.79] is low score,
    # (0.79, 0.86] is medium score, above 0.86 is high score
    bins = [-np.inf, 0, 0.79, 0.86, np.inf]
    labels = ['Name not assigned', 'Low Score', 'Medium Score', 'High Score']

    def refresh_score_bins():
        # A score that could not be parsed is stored as the raw string, which
        # makes the column non-numeric; coerce such entries to NaN for binning
        # only, so pd.cut does not fail and abort the run.
        scores = pd.to_numeric(df[f'{column_prefix} Score'], errors='coerce')
        df[f'{column_prefix} Score bins'] = pd.cut(scores, bins=bins, labels=labels)

    i = 0  # number of rows that reached the query step; drives periodic saving
    for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
        # only process rows that have no analysis yet
        if pd.notna(row[f'{column_prefix} Analysis']):
            continue

        gene_data = row[gene_column]
        # skip rows whose gene list is not a string (e.g. missing values)
        if not isinstance(gene_data, str):
            logger.warning(f'Gene set {idx} is not a string, skipping')
            continue
        genes = gene_data.split(gene_sep)

        if len(genes) > 1000:
            logger.warning(f'Gene set {idx} is too big, skipping')
            continue

        try:
            prompt = make_user_prompt_with_score(genes)
            finger_print = None
            # Bound up front: the OpenAI branch returns no error message, and
            # an unbound name here used to raise UnboundLocalError in the
            # failure branch below, hiding the real problem.
            error_message = 'no error message returned by the model query'
            if model.startswith('gpt'):
                print("Accessing OpenAI API")
                analysis, finger_print = openai_chat(context, prompt, model, temperature, max_tokens, rate_per_token, LOG_FILE, DOLLAR_LIMIT, SEED)
            elif model.startswith('gemini'):
                print("Using Google Gemini API")
                analysis, error_message = query_genai_model(f"{context}\n{prompt}", model, temperature, max_tokens, LOG_FILE)
            else:
                print("Using server model")
                analysis, error_message = server_model_chat(context, prompt, model, temperature, max_tokens, LOG_FILE, SEED)

            if analysis:
                llm_name, llm_score, llm_analysis = process_analysis(analysis)
                # clean up the score and return float; keep the raw value
                # when it does not contain a parsable number
                try:
                    llm_score_value = float(re.sub("[^0-9.-]", "", llm_score))
                except ValueError:
                    llm_score_value = llm_score

                df.loc[idx, f'{column_prefix} Name'] = llm_name
                df.loc[idx, f'{column_prefix} Analysis'] = llm_analysis
                df.loc[idx, f'{column_prefix} Score'] = llm_score_value

                analysis_dict[f'{idx}_{column_prefix}'] = analysis
                # Log success, with the fingerprint when one is available
                logger.info(f'Success for {idx} {column_prefix}.')
                if finger_print:
                    logger.info(f'GPT_Fingerprint for {idx}: {finger_print}')
            else:
                logger.error(f'Error for query gene set {idx}: {error_message}')

        except Exception as e:
            # Keep going with the remaining gene sets; the failure is logged.
            logger.error(f'Error for {idx}: {e}')
            continue
        i += 1
        if i % 10 == 0:
            refresh_score_bins()
            save_progress(df, analysis_dict, out_file)
            print(f"Saved progress for {i} genesets")

    # Rows queried since the last checkpoint have not been binned yet; bring
    # the bins column up to date so the final file is consistent.
    if i % 10:
        refresh_score_bins()
    # save the final file
    save_progress(df, analysis_dict, out_file)
    

In [None]:
import os 
from glob import glob


initialize = True 
input_file = 'data/GO_term_analysis/toy_example_w_contaminated.csv'
input_sep = constant.GO_FILE_SEP
set_index = constant.GO_INDEX_COL  
gene_column = constant.GO_GENE_COL 
gene_sep = ' '

## create a param file: one line of command-line arguments per config file
# sorted() makes the order of the lines reproducible (glob order is arbitrary)
configs = sorted(glob('./jsonFiles/toyexample_*.json'))
params = []
for conf_file in configs:
    # File name without directory and extension, e.g. 'toyexample_gpt35'.
    # os.path handles both path separators and keeps dots inside the name.
    config_stem = os.path.splitext(os.path.basename(conf_file))[0]
    # Everything after the leading 'toyexample' token names the model(s).
    model_names = '_'.join(config_stem.split('_')[1:])
    out_file = f'data/GO_term_analysis/LLM_processed_toy_example_w_contamination_{model_names}'  

    # Build the argument list piece by piece and join with single spaces, so
    # no source-code indentation ends up inside the parameter line.
    args = [f"--config {conf_file}"]
    if initialize:
        args.append("--initialize")
    args += [
        f"--input {input_file}",
        f"--input_sep '{input_sep}'",
        f"--set_index {set_index}",
        f"--gene_column {gene_column}",
        f"--gene_sep '{gene_sep}'",
        "--start 0",
        "--end 10",
        f"--output_file {out_file}",
    ]
    param = ' '.join(args)
    print(param)
    params.append(param)

with open('toy_example_params.txt', 'w') as f:
    f.writelines(p + '\n' for p in params)

In [13]:
#Define your own loop for running the pipeline
## 12-18-2023: this loop runs the default gene set and the contaminated gene sets
## can modify this loop for different models or only run on default gene set

##12-27-23: edited the prompt 

##01-26-2023: test with bin scores

if __name__ == "__main__":

    # Read the gene-set table; the column named by `set_index` becomes the index.
    df = pd.read_csv(input_file, sep=input_sep, index_col=set_index)

    # Short model tag used in the result column names: the first two
    # '-'-separated parts for gpt models, otherwise the model name with ':'
    # replaced by '_'.
    name_fix = '_'.join(model.split('-')[:2]) if 'gpt' in model else model.replace(':', '_')

    # ---- run with the real (default) gene sets ----
    column_prefix = name_fix + '_default'
    print(column_prefix)
    if initialize:
        # start from empty result columns: no name, no analysis, score -inf
        for suffix, blank in (('Name', None), ('Analysis', None), ('Score', -np.inf)):
            df[f'{column_prefix} {suffix}'] = blank
    main(df)

    # ---- run the pipeline for the contaminated gene sets ----
    contaminated_columns = [col for col in df.columns if col.endswith('contaminated_Genes')]
    for col in contaminated_columns:
        # main() reads `gene_column` and `column_prefix` from module scope, so
        # both have to be re-pointed at the current contaminated column.
        gene_column = col
        contam_prefix = '_'.join(col.split('_')[0:2])
        column_prefix = name_fix + '_' + contam_prefix
        print(column_prefix)

        if initialize:
            # start from empty result columns: no name, no analysis, score -inf
            for suffix, blank in (('Name', None), ('Analysis', None), ('Score', -np.inf)):
                df[f'{column_prefix} {suffix}'] = blank
        main(df)
    df.head()


gpt_3.5_default


  0%|          | 0/100 [00:00<?, ?it/s]

Accessing OpenAI API


  1%|          | 1/100 [00:03<05:24,  3.27s/it]

1192
Accessing OpenAI API


  2%|▏         | 2/100 [00:07<06:32,  4.01s/it]

1375
Accessing OpenAI API


  3%|▎         | 3/100 [00:13<08:03,  4.98s/it]

1471
Accessing OpenAI API


  4%|▍         | 4/100 [00:19<08:25,  5.27s/it]

1369
Accessing OpenAI API


  5%|▌         | 5/100 [00:28<10:33,  6.67s/it]

1836
Accessing OpenAI API


  6%|▌         | 6/100 [00:35<10:28,  6.68s/it]

1756
Accessing OpenAI API


  7%|▋         | 7/100 [00:42<10:24,  6.72s/it]

1297
Accessing OpenAI API


  8%|▊         | 8/100 [00:45<08:25,  5.50s/it]

1487
Accessing OpenAI API


  9%|▉         | 9/100 [00:50<08:24,  5.54s/it]

1399
Accessing OpenAI API


 10%|█         | 10/100 [00:54<07:32,  5.03s/it]

1198
Saved progress for 10 genesets
Accessing OpenAI API


 11%|█         | 11/100 [01:01<08:04,  5.45s/it]

1319
Accessing OpenAI API


 12%|█▏        | 12/100 [01:05<07:23,  5.04s/it]

1219
Accessing OpenAI API


 13%|█▎        | 13/100 [01:11<07:58,  5.50s/it]

1449
Accessing OpenAI API


 14%|█▍        | 14/100 [01:16<07:35,  5.29s/it]

1321
Accessing OpenAI API


 15%|█▌        | 15/100 [01:19<06:24,  4.53s/it]

1440
Accessing OpenAI API


 16%|█▌        | 16/100 [01:22<05:51,  4.19s/it]

1634
Accessing OpenAI API


 17%|█▋        | 17/100 [01:29<06:47,  4.91s/it]

1354
Accessing OpenAI API


 18%|█▊        | 18/100 [01:34<06:40,  4.88s/it]

1315
Accessing OpenAI API


 19%|█▉        | 19/100 [01:41<07:46,  5.76s/it]

1552
Accessing OpenAI API


 20%|██        | 20/100 [01:46<07:23,  5.55s/it]

1222
Saved progress for 20 genesets
Accessing OpenAI API


 21%|██        | 21/100 [01:54<08:02,  6.11s/it]

1397
Accessing OpenAI API


 22%|██▏       | 22/100 [01:59<07:37,  5.86s/it]

1228
Accessing OpenAI API


 23%|██▎       | 23/100 [02:09<09:01,  7.03s/it]

1577
Accessing OpenAI API


 24%|██▍       | 24/100 [02:15<08:22,  6.61s/it]

1596
Accessing OpenAI API


 25%|██▌       | 25/100 [02:19<07:23,  5.91s/it]

1497
Accessing OpenAI API


 26%|██▌       | 26/100 [02:24<07:04,  5.74s/it]

1303
Accessing OpenAI API


 27%|██▋       | 27/100 [02:30<06:51,  5.63s/it]

1311
Accessing OpenAI API


 28%|██▊       | 28/100 [02:34<06:11,  5.16s/it]

1607
Accessing OpenAI API


 29%|██▉       | 29/100 [02:38<05:53,  4.98s/it]

1120
Accessing OpenAI API


 30%|███       | 30/100 [02:45<06:25,  5.51s/it]

1478
Saved progress for 30 genesets
Accessing OpenAI API


 31%|███       | 31/100 [02:51<06:21,  5.53s/it]

1386
Accessing OpenAI API


 32%|███▏      | 32/100 [02:55<05:53,  5.20s/it]

1466
Accessing OpenAI API


 33%|███▎      | 33/100 [03:01<05:55,  5.31s/it]

1322
Accessing OpenAI API


 34%|███▍      | 34/100 [03:05<05:38,  5.13s/it]

1316
Accessing OpenAI API


 35%|███▌      | 35/100 [03:11<05:46,  5.34s/it]

1327
Accessing OpenAI API


 36%|███▌      | 36/100 [03:17<05:49,  5.46s/it]

1306
Accessing OpenAI API


 37%|███▋      | 37/100 [03:21<05:11,  4.94s/it]

1227
Accessing OpenAI API


 38%|███▊      | 38/100 [03:25<05:01,  4.86s/it]

1273
Accessing OpenAI API


 39%|███▉      | 39/100 [03:30<04:58,  4.90s/it]

1375
Accessing OpenAI API


 40%|████      | 40/100 [05:10<33:24, 33.41s/it]

1543
Saved progress for 40 genesets
Accessing OpenAI API


 41%|████      | 41/100 [05:16<24:38, 25.06s/it]

1301
Accessing OpenAI API


 42%|████▏     | 42/100 [05:22<18:51, 19.51s/it]

1425
Accessing OpenAI API


 43%|████▎     | 43/100 [05:30<15:02, 15.84s/it]

1506
Accessing OpenAI API


 44%|████▍     | 44/100 [05:37<12:27, 13.35s/it]

1380
Accessing OpenAI API


 45%|████▌     | 45/100 [05:44<10:34, 11.53s/it]

1395
Accessing OpenAI API


 46%|████▌     | 46/100 [05:50<08:44,  9.70s/it]

1358
Accessing OpenAI API


 47%|████▋     | 47/100 [05:53<06:58,  7.90s/it]

1586
Accessing OpenAI API


 48%|████▊     | 48/100 [05:59<06:09,  7.10s/it]

1269
Accessing OpenAI API


 49%|████▉     | 49/100 [06:04<05:34,  6.57s/it]

1316
Accessing OpenAI API


 50%|█████     | 50/100 [06:12<05:44,  6.90s/it]

1388
Saved progress for 50 genesets
Accessing OpenAI API


 51%|█████     | 51/100 [06:17<05:21,  6.56s/it]

1313
Accessing OpenAI API


 52%|█████▏    | 52/100 [06:23<04:57,  6.20s/it]

1294
Accessing OpenAI API


 53%|█████▎    | 53/100 [06:30<05:06,  6.53s/it]

1410
Accessing OpenAI API


 54%|█████▍    | 54/100 [06:36<04:52,  6.36s/it]

1384
Accessing OpenAI API


 55%|█████▌    | 55/100 [06:39<04:05,  5.46s/it]

1243
Accessing OpenAI API


 56%|█████▌    | 56/100 [06:45<04:02,  5.52s/it]

1258
Accessing OpenAI API


 57%|█████▋    | 57/100 [06:50<03:44,  5.23s/it]

1506
Accessing OpenAI API


 58%|█████▊    | 58/100 [06:57<04:08,  5.92s/it]

1487
Accessing OpenAI API


 59%|█████▉    | 59/100 [07:05<04:24,  6.45s/it]

1496
Accessing OpenAI API


 60%|██████    | 60/100 [07:09<03:52,  5.82s/it]

1472
Saved progress for 60 genesets
Accessing OpenAI API


 61%|██████    | 61/100 [07:16<03:55,  6.04s/it]

1513
Accessing OpenAI API


 62%|██████▏   | 62/100 [07:21<03:35,  5.68s/it]

1311
Accessing OpenAI API


 63%|██████▎   | 63/100 [07:26<03:24,  5.53s/it]

1378
Accessing OpenAI API


 64%|██████▍   | 64/100 [07:31<03:15,  5.44s/it]

1356
Accessing OpenAI API


 65%|██████▌   | 65/100 [07:38<03:23,  5.82s/it]

1326
Accessing OpenAI API


 66%|██████▌   | 66/100 [07:44<03:25,  6.04s/it]

1336
Accessing OpenAI API


 67%|██████▋   | 67/100 [07:50<03:13,  5.85s/it]

1476
Accessing OpenAI API


 68%|██████▊   | 68/100 [07:55<03:04,  5.77s/it]

1271
Accessing OpenAI API


 69%|██████▉   | 69/100 [07:58<02:26,  4.71s/it]

1462
Accessing OpenAI API


 70%|███████   | 70/100 [08:06<02:55,  5.84s/it]

1340
Saved progress for 70 genesets
Accessing OpenAI API


 71%|███████   | 71/100 [08:13<03:02,  6.28s/it]

1496
Accessing OpenAI API


 72%|███████▏  | 72/100 [08:19<02:47,  5.99s/it]

1284
Accessing OpenAI API


 73%|███████▎  | 73/100 [08:23<02:31,  5.63s/it]

1209
Accessing OpenAI API


 74%|███████▍  | 74/100 [08:28<02:18,  5.33s/it]

1309
Accessing OpenAI API


 75%|███████▌  | 75/100 [08:34<02:20,  5.61s/it]

1329
Accessing OpenAI API


 76%|███████▌  | 76/100 [08:38<02:03,  5.17s/it]

1287
Accessing OpenAI API


 77%|███████▋  | 77/100 [28:47<2:20:20, 366.12s/it]

1287
Accessing OpenAI API


 78%|███████▊  | 78/100 [28:53<1:34:41, 258.24s/it]

1792
Accessing OpenAI API


 79%|███████▉  | 79/100 [28:57<1:03:38, 181.83s/it]

1149
Accessing OpenAI API


 80%|████████  | 80/100 [29:04<43:06, 129.31s/it]  

1373
Saved progress for 80 genesets
Accessing OpenAI API


 81%|████████  | 81/100 [29:09<29:09, 92.07s/it] 

1442
Accessing OpenAI API


 82%|████████▏ | 82/100 [29:15<19:53, 66.32s/it]

1363
Accessing OpenAI API


 83%|████████▎ | 83/100 [29:22<13:43, 48.44s/it]

1488
Accessing OpenAI API


 84%|████████▍ | 84/100 [29:29<09:35, 35.99s/it]

1371
Accessing OpenAI API


 85%|████████▌ | 85/100 [29:35<06:46, 27.07s/it]

1428
Accessing OpenAI API


 86%|████████▌ | 86/100 [29:40<04:46, 20.45s/it]

1281
Accessing OpenAI API


 87%|████████▋ | 87/100 [29:47<03:33, 16.44s/it]

1394
Accessing OpenAI API


 88%|████████▊ | 88/100 [29:54<02:44, 13.67s/it]

1572
Accessing OpenAI API


 89%|████████▉ | 89/100 [30:06<02:22, 12.95s/it]

1749
Accessing OpenAI API


 90%|█████████ | 90/100 [30:10<01:43, 10.31s/it]

1382
Saved progress for 90 genesets
Accessing OpenAI API


 91%|█████████ | 91/100 [30:19<01:30, 10.01s/it]

1553
Accessing OpenAI API


 92%|█████████▏| 92/100 [30:26<01:12,  9.04s/it]

1464
Accessing OpenAI API


 93%|█████████▎| 93/100 [30:29<00:51,  7.35s/it]

1287
Accessing OpenAI API


 94%|█████████▍| 94/100 [30:36<00:43,  7.20s/it]

1552
Accessing OpenAI API


 95%|█████████▌| 95/100 [30:42<00:34,  6.92s/it]

1641
Accessing OpenAI API


 96%|█████████▌| 96/100 [30:48<00:25,  6.46s/it]

1338
Accessing OpenAI API


 97%|█████████▋| 97/100 [30:52<00:17,  5.72s/it]

1213
Accessing OpenAI API


 98%|█████████▊| 98/100 [30:57<00:11,  5.68s/it]

1337
Accessing OpenAI API


 99%|█████████▉| 99/100 [31:03<00:05,  5.56s/it]

1209
Accessing OpenAI API


100%|██████████| 100/100 [31:08<00:00, 18.69s/it]


1637
Saved progress for 100 genesets
gpt_3.5_50perc_contaminated


  0%|          | 0/100 [00:00<?, ?it/s]

Accessing OpenAI API


  1%|          | 1/100 [00:05<08:25,  5.10s/it]

1244
Accessing OpenAI API


  2%|▏         | 2/100 [00:14<12:38,  7.74s/it]

1559
Accessing OpenAI API


  3%|▎         | 3/100 [00:21<11:56,  7.39s/it]

1574
Accessing OpenAI API


  4%|▍         | 4/100 [00:26<10:16,  6.43s/it]

1357
Accessing OpenAI API


  5%|▌         | 5/100 [00:31<09:01,  5.70s/it]

1394
Accessing OpenAI API


  6%|▌         | 6/100 [00:36<08:39,  5.53s/it]

1679
Accessing OpenAI API


  7%|▋         | 7/100 [00:41<08:31,  5.51s/it]

1286
Accessing OpenAI API


  8%|▊         | 8/100 [00:45<07:28,  4.87s/it]

1501
Accessing OpenAI API


  9%|▉         | 9/100 [00:51<08:00,  5.27s/it]

1439
Accessing OpenAI API


 10%|█         | 10/100 [00:56<07:58,  5.32s/it]

1272
Saved progress for 10 genesets
Accessing OpenAI API


 11%|█         | 11/100 [01:02<08:11,  5.52s/it]

1366
Accessing OpenAI API


 12%|█▏        | 12/100 [01:07<07:50,  5.34s/it]

1199
Accessing OpenAI API


 13%|█▎        | 13/100 [01:15<08:53,  6.14s/it]

1379
Accessing OpenAI API


 14%|█▍        | 14/100 [01:23<09:31,  6.65s/it]

1424
Accessing OpenAI API


 15%|█▌        | 15/100 [01:27<08:13,  5.81s/it]

1438
Accessing OpenAI API


 16%|█▌        | 16/100 [01:32<07:48,  5.58s/it]

1760
Accessing OpenAI API


 17%|█▋        | 17/100 [01:37<07:36,  5.50s/it]

1339
Accessing OpenAI API


 18%|█▊        | 18/100 [01:43<07:48,  5.72s/it]

1416
Accessing OpenAI API


 19%|█▉        | 19/100 [01:49<07:46,  5.76s/it]

1250
Accessing OpenAI API


 20%|██        | 20/100 [01:54<07:19,  5.49s/it]

1264
Saved progress for 20 genesets
Accessing OpenAI API


 21%|██        | 21/100 [02:01<07:53,  6.00s/it]

1406
Accessing OpenAI API


 22%|██▏       | 22/100 [02:11<09:07,  7.02s/it]

1569
Accessing OpenAI API


 23%|██▎       | 23/100 [02:18<08:56,  6.97s/it]

1593
Accessing OpenAI API


 24%|██▍       | 24/100 [02:24<08:28,  6.69s/it]

1567
Accessing OpenAI API


 25%|██▌       | 25/100 [02:27<07:03,  5.65s/it]

1468
Accessing OpenAI API


 26%|██▌       | 26/100 [02:32<06:38,  5.39s/it]

1383
Accessing OpenAI API


 27%|██▋       | 27/100 [02:36<06:05,  5.01s/it]

1176
Accessing OpenAI API


 28%|██▊       | 28/100 [02:43<06:40,  5.56s/it]

1668
Accessing OpenAI API


 29%|██▉       | 29/100 [02:46<05:54,  4.99s/it]

1178
Accessing OpenAI API


 30%|███       | 30/100 [02:52<06:03,  5.20s/it]

1389
Saved progress for 30 genesets
Accessing OpenAI API


 31%|███       | 31/100 [02:59<06:34,  5.72s/it]

1400
Accessing OpenAI API


 32%|███▏      | 32/100 [03:04<06:14,  5.51s/it]

1438
Accessing OpenAI API


 33%|███▎      | 33/100 [03:11<06:41,  5.99s/it]

1576
Accessing OpenAI API


 34%|███▍      | 34/100 [03:17<06:44,  6.13s/it]

1397
Accessing OpenAI API


 35%|███▌      | 35/100 [03:23<06:30,  6.00s/it]

1229
Accessing OpenAI API


 36%|███▌      | 36/100 [03:29<06:28,  6.07s/it]

1359
Accessing OpenAI API


 37%|███▋      | 37/100 [03:36<06:31,  6.22s/it]

1378
Accessing OpenAI API


 38%|███▊      | 38/100 [03:45<07:25,  7.19s/it]

1450
Accessing OpenAI API


 39%|███▉      | 39/100 [03:50<06:22,  6.27s/it]

1313
Accessing OpenAI API


 40%|████      | 40/100 [03:56<06:27,  6.45s/it]

1305
Saved progress for 40 genesets
Accessing OpenAI API


 41%|████      | 41/100 [04:03<06:23,  6.49s/it]

1394
Accessing OpenAI API


 42%|████▏     | 42/100 [04:11<06:48,  7.04s/it]

1492
Accessing OpenAI API


 43%|████▎     | 43/100 [04:23<07:58,  8.40s/it]

1610
Accessing OpenAI API


 44%|████▍     | 44/100 [04:30<07:31,  8.06s/it]

1359
Accessing OpenAI API


 45%|████▌     | 45/100 [04:39<07:38,  8.34s/it]

1425
Accessing OpenAI API


 46%|████▌     | 46/100 [04:47<07:14,  8.05s/it]

1346
Accessing OpenAI API


 47%|████▋     | 47/100 [04:54<06:50,  7.75s/it]

1714
Accessing OpenAI API


 48%|████▊     | 48/100 [05:00<06:15,  7.22s/it]

1257
Accessing OpenAI API


 49%|████▉     | 49/100 [05:08<06:24,  7.54s/it]

1409
Accessing OpenAI API


 50%|█████     | 50/100 [05:18<06:53,  8.26s/it]

1487
Saved progress for 50 genesets
Accessing OpenAI API


 51%|█████     | 51/100 [05:23<06:00,  7.36s/it]

1326
Accessing OpenAI API


 52%|█████▏    | 52/100 [05:29<05:30,  6.90s/it]

1351
Accessing OpenAI API


 53%|█████▎    | 53/100 [05:35<05:09,  6.59s/it]

1452
Accessing OpenAI API


 54%|█████▍    | 54/100 [05:39<04:29,  5.87s/it]

1357
Accessing OpenAI API


 55%|█████▌    | 55/100 [05:46<04:38,  6.19s/it]

1250
Accessing OpenAI API


 56%|█████▌    | 56/100 [05:51<04:24,  6.01s/it]

1295
Accessing OpenAI API


 57%|█████▋    | 57/100 [05:59<04:41,  6.54s/it]

1603
Accessing OpenAI API


 58%|█████▊    | 58/100 [06:10<05:27,  7.80s/it]

1647
Accessing OpenAI API


 59%|█████▉    | 59/100 [06:17<05:07,  7.50s/it]

1363
Accessing OpenAI API


 60%|██████    | 60/100 [06:24<05:01,  7.54s/it]

1545
Saved progress for 60 genesets
Accessing OpenAI API


 61%|██████    | 61/100 [06:31<04:45,  7.32s/it]

1465
Accessing OpenAI API


 62%|██████▏   | 62/100 [06:38<04:33,  7.20s/it]

1351
Accessing OpenAI API


 63%|██████▎   | 63/100 [06:45<04:18,  6.98s/it]

1433
Accessing OpenAI API


 64%|██████▍   | 64/100 [06:55<04:45,  7.94s/it]

1531
Accessing OpenAI API


 65%|██████▌   | 65/100 [07:02<04:30,  7.72s/it]

1325
Accessing OpenAI API


 66%|██████▌   | 66/100 [07:08<04:09,  7.34s/it]

1217
Accessing OpenAI API


 67%|██████▋   | 67/100 [07:14<03:48,  6.93s/it]

1538
Accessing OpenAI API


 68%|██████▊   | 68/100 [07:23<04:01,  7.54s/it]

1364
Accessing OpenAI API


 69%|██████▉   | 69/100 [07:28<03:27,  6.69s/it]

1591
Accessing OpenAI API


 70%|███████   | 70/100 [07:33<03:05,  6.19s/it]

1241
Saved progress for 70 genesets
Accessing OpenAI API


 71%|███████   | 71/100 [07:40<03:06,  6.42s/it]

1404
Accessing OpenAI API


 72%|███████▏  | 72/100 [07:49<03:18,  7.08s/it]

1440
Accessing OpenAI API


 73%|███████▎  | 73/100 [07:52<02:43,  6.06s/it]

1215
Accessing OpenAI API


 74%|███████▍  | 74/100 [07:58<02:34,  5.95s/it]

1324
Accessing OpenAI API


 75%|███████▌  | 75/100 [08:02<02:16,  5.47s/it]

1285
Accessing OpenAI API


 76%|███████▌  | 76/100 [08:11<02:33,  6.40s/it]

1462
Accessing OpenAI API


 77%|███████▋  | 77/100 [08:18<02:28,  6.46s/it]

1357
Accessing OpenAI API


 78%|███████▊  | 78/100 [08:26<02:34,  7.04s/it]

1820
Accessing OpenAI API


 79%|███████▉  | 79/100 [08:32<02:23,  6.83s/it]

1345
Accessing OpenAI API


 80%|████████  | 80/100 [08:40<02:19,  6.99s/it]

1416
Saved progress for 80 genesets
Accessing OpenAI API


 81%|████████  | 81/100 [08:44<02:00,  6.35s/it]

1415
Accessing OpenAI API


 82%|████████▏ | 82/100 [08:53<02:07,  7.09s/it]

1492
Accessing OpenAI API


 83%|████████▎ | 83/100 [08:59<01:54,  6.71s/it]

1383
Accessing OpenAI API


 84%|████████▍ | 84/100 [09:09<02:03,  7.70s/it]

1449
Accessing OpenAI API


 85%|████████▌ | 85/100 [09:16<01:52,  7.50s/it]

1406
Accessing OpenAI API


 86%|████████▌ | 86/100 [09:21<01:33,  6.68s/it]

1335
Accessing OpenAI API


 87%|████████▋ | 87/100 [09:28<01:29,  6.87s/it]

1403
Accessing OpenAI API


 88%|████████▊ | 88/100 [09:33<01:13,  6.13s/it]

1432
Accessing OpenAI API


 89%|████████▉ | 89/100 [09:41<01:14,  6.75s/it]

1444
Accessing OpenAI API


 90%|█████████ | 90/100 [09:46<01:02,  6.21s/it]

1268
Saved progress for 90 genesets
Accessing OpenAI API


 91%|█████████ | 91/100 [09:52<00:55,  6.14s/it]

1419
Accessing OpenAI API


 92%|█████████▏| 92/100 [09:59<00:52,  6.59s/it]

1549
Accessing OpenAI API


 93%|█████████▎| 93/100 [10:04<00:42,  6.06s/it]

1282
Accessing OpenAI API


 94%|█████████▍| 94/100 [10:17<00:48,  8.01s/it]

2198
Accessing OpenAI API


 95%|█████████▌| 95/100 [10:19<00:31,  6.39s/it]

1305
Accessing OpenAI API


 96%|█████████▌| 96/100 [10:23<00:22,  5.60s/it]

1324
Accessing OpenAI API


 97%|█████████▋| 97/100 [10:30<00:17,  5.92s/it]

1221
Accessing OpenAI API


 98%|█████████▊| 98/100 [10:34<00:10,  5.49s/it]

1308
Accessing OpenAI API


 99%|█████████▉| 99/100 [10:37<00:04,  4.63s/it]

1147
Accessing OpenAI API


100%|██████████| 100/100 [10:41<00:00,  6.42s/it]


1603
Saved progress for 100 genesets
gpt_3.5_100perc_contaminated


  0%|          | 0/100 [00:00<?, ?it/s]

Accessing OpenAI API


  1%|          | 1/100 [00:04<07:33,  4.58s/it]

1284
Accessing OpenAI API


  2%|▏         | 2/100 [00:10<09:03,  5.55s/it]

1519
Accessing OpenAI API


  3%|▎         | 3/100 [00:14<07:24,  4.58s/it]

1340
Accessing OpenAI API


  4%|▍         | 4/100 [00:23<10:06,  6.32s/it]

1554
Accessing OpenAI API


  5%|▌         | 5/100 [00:29<09:43,  6.14s/it]

1393
Accessing OpenAI API


  6%|▌         | 6/100 [00:31<07:28,  4.77s/it]

1462
Accessing OpenAI API


  7%|▋         | 7/100 [00:38<08:32,  5.51s/it]

1352
Accessing OpenAI API


  8%|▊         | 8/100 [00:41<07:25,  4.84s/it]

1514
Accessing OpenAI API


  9%|▉         | 9/100 [00:48<08:21,  5.51s/it]

1420
Accessing OpenAI API


 10%|█         | 10/100 [00:55<08:50,  5.90s/it]

1316
Saved progress for 10 genesets
Accessing OpenAI API


 11%|█         | 11/100 [01:00<08:36,  5.81s/it]

1450
Accessing OpenAI API


 12%|█▏        | 12/100 [01:06<08:29,  5.79s/it]

1219
Accessing OpenAI API


 13%|█▎        | 13/100 [01:13<08:59,  6.20s/it]

1388
Accessing OpenAI API


 14%|█▍        | 14/100 [01:21<09:37,  6.72s/it]

1441
Accessing OpenAI API


 15%|█▌        | 15/100 [01:25<08:08,  5.74s/it]

1493
Accessing OpenAI API


 16%|█▌        | 16/100 [01:27<06:45,  4.83s/it]

1509
Accessing OpenAI API


 17%|█▋        | 17/100 [01:34<07:18,  5.28s/it]

1387
Accessing OpenAI API


 18%|█▊        | 18/100 [01:41<07:54,  5.78s/it]

1415
Accessing OpenAI API


 19%|█▉        | 19/100 [01:48<08:22,  6.20s/it]

1380
Accessing OpenAI API


 20%|██        | 20/100 [01:55<08:48,  6.60s/it]

1369
Saved progress for 20 genesets
Accessing OpenAI API


 21%|██        | 21/100 [02:08<11:07,  8.45s/it]

1730
Accessing OpenAI API


 22%|██▏       | 22/100 [02:16<10:48,  8.31s/it]

1467
Accessing OpenAI API


 23%|██▎       | 23/100 [02:25<10:51,  8.47s/it]

1491
Accessing OpenAI API


 24%|██▍       | 24/100 [02:29<09:09,  7.23s/it]

1527
Accessing OpenAI API


 25%|██▌       | 25/100 [02:32<07:27,  5.97s/it]

1465
Accessing OpenAI API


 26%|██▌       | 26/100 [02:41<08:28,  6.87s/it]

1379
Accessing OpenAI API


 27%|██▋       | 27/100 [02:45<07:11,  5.91s/it]

1156
Accessing OpenAI API


 28%|██▊       | 28/100 [02:51<07:10,  5.98s/it]

1662
Accessing OpenAI API


 29%|██▉       | 29/100 [02:55<06:25,  5.44s/it]

1262
Accessing OpenAI API


 30%|███       | 30/100 [03:01<06:26,  5.52s/it]

1551
Saved progress for 30 genesets
Accessing OpenAI API


 31%|███       | 31/100 [03:06<06:12,  5.40s/it]

1462
Accessing OpenAI API


 32%|███▏      | 32/100 [03:11<05:51,  5.17s/it]

1329
Accessing OpenAI API


 33%|███▎      | 33/100 [03:16<05:54,  5.29s/it]

1329
Accessing OpenAI API


 34%|███▍      | 34/100 [03:21<05:43,  5.20s/it]

1284
Accessing OpenAI API


 35%|███▌      | 35/100 [03:26<05:31,  5.10s/it]

1259
Accessing OpenAI API


 36%|███▌      | 36/100 [03:31<05:27,  5.11s/it]

1277
Accessing OpenAI API


 37%|███▋      | 37/100 [03:35<05:03,  4.82s/it]

1360
Accessing OpenAI API


 38%|███▊      | 38/100 [03:43<05:39,  5.48s/it]

1399
Accessing OpenAI API


 39%|███▉      | 39/100 [03:48<05:32,  5.45s/it]

1448
Accessing OpenAI API


 40%|████      | 40/100 [03:54<05:47,  5.79s/it]

1485
Saved progress for 40 genesets
Accessing OpenAI API


 41%|████      | 41/100 [04:02<06:06,  6.22s/it]

1379
Accessing OpenAI API


 42%|████▏     | 42/100 [04:11<07:01,  7.27s/it]

1432
Accessing OpenAI API


 43%|████▎     | 43/100 [04:21<07:39,  8.06s/it]

1571
Accessing OpenAI API


 44%|████▍     | 44/100 [04:28<07:00,  7.51s/it]

1254
Accessing OpenAI API


 45%|████▌     | 45/100 [04:33<06:19,  6.90s/it]

1298
Accessing OpenAI API


 46%|████▌     | 46/100 [04:38<05:36,  6.22s/it]

1306
Accessing OpenAI API


 47%|████▋     | 47/100 [04:41<04:51,  5.50s/it]

1628
Accessing OpenAI API


 48%|████▊     | 48/100 [04:47<04:43,  5.45s/it]

1258
Accessing OpenAI API


 49%|████▉     | 49/100 [04:51<04:25,  5.20s/it]

1304
Accessing OpenAI API


 50%|█████     | 50/100 [05:01<05:19,  6.40s/it]

1497
Saved progress for 50 genesets
Accessing OpenAI API


 51%|█████     | 51/100 [05:06<04:57,  6.07s/it]

1233
Accessing OpenAI API


 52%|█████▏    | 52/100 [05:13<05:08,  6.43s/it]

1416
Accessing OpenAI API


 53%|█████▎    | 53/100 [05:21<05:25,  6.93s/it]

1433
Accessing OpenAI API


 54%|█████▍    | 54/100 [05:27<05:07,  6.69s/it]

1519
Accessing OpenAI API


 55%|█████▌    | 55/100 [05:34<05:00,  6.68s/it]

1318
Accessing OpenAI API


 56%|█████▌    | 56/100 [05:41<04:53,  6.67s/it]

1284
Accessing OpenAI API


 57%|█████▋    | 57/100 [05:46<04:34,  6.37s/it]

1512
Accessing OpenAI API


 58%|█████▊    | 58/100 [05:52<04:17,  6.12s/it]

1261
Accessing OpenAI API


 59%|█████▉    | 59/100 [05:57<04:02,  5.92s/it]

1455
Accessing OpenAI API


 60%|██████    | 60/100 [06:00<03:17,  4.94s/it]

1415
Saved progress for 60 genesets
Accessing OpenAI API


 61%|██████    | 61/100 [06:06<03:20,  5.14s/it]

1596
Accessing OpenAI API


 62%|██████▏   | 62/100 [06:15<04:01,  6.34s/it]

1467
Accessing OpenAI API


 63%|██████▎   | 63/100 [06:21<03:48,  6.18s/it]

1316
Accessing OpenAI API


 64%|██████▍   | 64/100 [06:28<03:56,  6.58s/it]

1405
Accessing OpenAI API


 65%|██████▌   | 65/100 [06:33<03:27,  5.94s/it]

1210
Accessing OpenAI API


 66%|██████▌   | 66/100 [06:38<03:16,  5.78s/it]

1188
Accessing OpenAI API


 67%|██████▋   | 67/100 [06:44<03:16,  5.96s/it]

1512
Accessing OpenAI API


 68%|██████▊   | 68/100 [06:51<03:15,  6.11s/it]

1329
Accessing OpenAI API


 69%|██████▉   | 69/100 [06:53<02:33,  4.95s/it]

1484
Accessing OpenAI API


 70%|███████   | 70/100 [07:02<03:02,  6.08s/it]

1416
Saved progress for 70 genesets
Accessing OpenAI API


 71%|███████   | 71/100 [07:09<03:02,  6.31s/it]

1420
Accessing OpenAI API


 72%|███████▏  | 72/100 [07:14<02:50,  6.09s/it]

1284
Accessing OpenAI API


 73%|███████▎  | 73/100 [07:18<02:27,  5.46s/it]

1264
Accessing OpenAI API


 74%|███████▍  | 74/100 [07:28<02:56,  6.78s/it]

1778
Accessing OpenAI API


 75%|███████▌  | 75/100 [07:36<02:56,  7.07s/it]

1585
Accessing OpenAI API


 76%|███████▌  | 76/100 [07:39<02:24,  6.00s/it]

1287
Accessing OpenAI API


 77%|███████▋  | 77/100 [07:44<02:11,  5.72s/it]

1331
Accessing OpenAI API


 78%|███████▊  | 78/100 [07:49<02:02,  5.55s/it]

1648
Accessing OpenAI API


 79%|███████▉  | 79/100 [07:56<01:59,  5.71s/it]

1235
Accessing OpenAI API


 80%|████████  | 80/100 [08:03<02:06,  6.30s/it]

1609
Saved progress for 80 genesets
Accessing OpenAI API


 81%|████████  | 81/100 [08:06<01:36,  5.08s/it]

1257
Accessing OpenAI API


 82%|████████▏ | 82/100 [08:10<01:29,  4.95s/it]

1262
Accessing OpenAI API


 83%|████████▎ | 83/100 [08:13<01:14,  4.38s/it]

1377
Accessing OpenAI API


 84%|████████▍ | 84/100 [08:17<01:09,  4.32s/it]

1345
Accessing OpenAI API


 85%|████████▌ | 85/100 [08:21<01:00,  4.04s/it]

1252
Accessing OpenAI API


 86%|████████▌ | 86/100 [08:27<01:05,  4.67s/it]

1529
Accessing OpenAI API


 87%|████████▋ | 87/100 [08:33<01:08,  5.24s/it]

1368
Accessing OpenAI API


 88%|████████▊ | 88/100 [08:41<01:11,  5.96s/it]

1682
Accessing OpenAI API


 89%|████████▉ | 89/100 [08:45<00:58,  5.33s/it]

1286
Accessing OpenAI API


 90%|█████████ | 90/100 [08:50<00:52,  5.21s/it]

1358
Saved progress for 90 genesets
Accessing OpenAI API


 91%|█████████ | 91/100 [08:54<00:42,  4.77s/it]

1261
Accessing OpenAI API


 92%|█████████▏| 92/100 [09:06<00:55,  6.90s/it]

2013
Accessing OpenAI API


 93%|█████████▎| 93/100 [09:11<00:46,  6.58s/it]

1410
Accessing OpenAI API


 94%|█████████▍| 94/100 [09:14<00:31,  5.30s/it]

1382
Accessing OpenAI API


 95%|█████████▌| 95/100 [09:18<00:25,  5.15s/it]

1422
Accessing OpenAI API


 96%|█████████▌| 96/100 [09:26<00:23,  5.77s/it]

1454
Accessing OpenAI API


 97%|█████████▋| 97/100 [09:30<00:16,  5.46s/it]

1286
Accessing OpenAI API


 98%|█████████▊| 98/100 [09:34<00:09,  4.92s/it]

1195
Accessing OpenAI API


 99%|█████████▉| 99/100 [09:38<00:04,  4.72s/it]

1207
Accessing OpenAI API


100%|██████████| 100/100 [09:43<00:00,  5.83s/it]

1718
Saved progress for 100 genesets





In [20]:
# Re-run the pipeline on the already processed tables: if any entry in an
# analysis column is still None, running this cell again processes the table again.
# `initialize` stays False so existing result columns are NOT reset to None.

initialize = False 

SEED = 42
model_options = ['gemini-pro','mistral:7b', 'mixtral:latest', 'llama2:7b', 'llama2:70b']
# model_options = ['mixtral:latest']  # llama2 7b has formatting issue, ignore
input_sep = '\t'


def _reset_llm_columns(frame, prefix):
    """Set the '<prefix> Name', '<prefix> Analysis' and '<prefix> Score' columns of *frame* to None."""
    for suffix in ('Name', 'Analysis', 'Score'):
        frame[f'{prefix} {suffix}'] = None


# NOTE(review): main() is defined elsewhere in the notebook and is called with only
# `df`; presumably it reads model, gene_column, column_prefix, out_file, LOG_FILE,
# SEED, ... from the notebook globals -- keep these variable names unchanged.
if __name__ == "__main__":
    for model in model_options:
        # Turn the model id into a file/column friendly tag:
        # 'gemini-pro' -> 'gemini_pro', 'mixtral:latest' -> 'mixtral_latest'.
        if '-' in model:
            name_fix = '_'.join(model.split('-')[:2])
        else:
            name_fix = model.replace(':', '_')
        input_file = f'data/GO_term_analysis/LLM_processed_toy_example_w_contamination_{name_fix}.tsv' # replace with your actual input file
        out_file = f'data/GO_term_analysis/LLM_processed_toy_example_w_contamination_{name_fix}'  # save to the same file name as the input file
        LOG_FILE = config['LOG_NAME']+f'_{name_fix}'+'_log.json'

        df = pd.read_csv(input_file, sep=input_sep, index_col=set_index)
        column_prefix = name_fix + '_default' #this is default
        print(column_prefix)
        
        gene_column = constant.GO_GENE_COL
        print(gene_column)
        if initialize:
            # initialize the input file with llm names, analysis and score to None
            _reset_llm_columns(df, column_prefix)
        main(df)  ## run with the real set 
        
        ## run the pipeline for contaminated gene sets 
        contaminated_columns = [col for col in df.columns if col.endswith('contaminated_Genes')]
        for col in contaminated_columns:
            gene_column = col ## Note need to change the gene_column to the contaminated column
            print(gene_column)
            # '50perc_contaminated_Genes' -> '50perc_contaminated'
            contam_prefix = '_'.join(col.split('_')[0:2])
            column_prefix = name_fix + '_' +contam_prefix
            print(column_prefix)

            if initialize:
                # initialize the input file with llm names, analysis and score to None
                _reset_llm_columns(df, column_prefix)
            main(df)
            
print("Done")

mixtral_latest_default
Genes


  0%|          | 0/11 [00:00<?, ?it/s]

Using server model


100%|██████████| 11/11 [01:34<00:00,  8.58s/it]


50perc_contaminated_Genes
mixtral_latest_50perc_contaminated


100%|██████████| 11/11 [00:00<00:00, 15246.97it/s]


100perc_contaminated_Genes
mixtral_latest_100perc_contaminated


100%|██████████| 11/11 [00:00<00:00, 16098.17it/s]

Done





In [3]:
## set up parameters for running the pipeline in batches of `step` rows
import os 
from glob import glob
# Define start, step, and end values
start = 0
step = 50
end = 100

# Batch boundaries, e.g. [0, 50, 100]
range_list = list(range(start, end + step, step))

# Consecutive (start, end) pairs, e.g. [(0, 50), (50, 100)]
tuple_list = list(zip(range_list, range_list[1:]))


initialize = True 
input_file = 'data/GO_term_analysis/model_comparison_terms.csv'
input_sep = constant.GO_FILE_SEP
set_index = constant.GO_INDEX_COL  
gene_column = constant.GO_GENE_COL 
gene_sep = ' '

## create a param file: one line of command-line arguments per (batch, config) pair
configs = glob('./jsonFiles/model_comparison_*.json')
params = []
# Use dedicated loop names so the `start` / `end` settings above are not overwritten.
for batch_start, batch_end in tuple_list:
    for conf_file in configs:
        # './jsonFiles/model_comparison_gpt_4.json' -> 'comparison_gpt_4'
        model_names = '_'.join(conf_file.split('/')[-1].split('.')[0].split('_')[1:])
        print(model_names)
        
        out_file = f'data/GO_term_analysis/model_compare/LLM_processed_model_compare_{model_names}_{batch_start}_{batch_end}'  
        # Join the arguments with single spaces; a backslash-continued string literal
        # would embed the source indentation between every argument.
        param = ' '.join([
            f"--config {conf_file}",
            "--initialize",
            f"--input {input_file}",
            f"--input_sep '{input_sep}'",
            f"--set_index {set_index}",
            f"--gene_column {gene_column}",
            f"--gene_sep '{gene_sep}'",
            "--run_contaminated",
            f"--start {batch_start}",
            f"--end {batch_end}",
            f"--output_file {out_file}",
        ])
        print(param)
        params.append(param)
print('number of params: ', len(params))
    

with open('model_compare_params.txt', 'w') as f:
    for p in params:
        f.write(p+'\n')

comparison_gpt_4
--config ./jsonFiles/model_comparison_gpt_4.json             --initialize  True            --input data/GO_term_analysis/model_comparison_terms.csv             --input_sep  ','            --set_index GO             --gene_column Genes            --gene_sep ' '             --start 0             --end 50             --output_file data/GO_term_analysis/model_compare/LLM_processed_model_compare_comparison_gpt_4_0_50
comparison_mixtral_latest
--config ./jsonFiles/model_comparison_mixtral_latest.json             --initialize  True            --input data/GO_term_analysis/model_comparison_terms.csv             --input_sep  ','            --set_index GO             --gene_column Genes            --gene_sep ' '             --start 0             --end 50             --output_file data/GO_term_analysis/model_compare/LLM_processed_model_compare_comparison_mixtral_latest_0_50
comparison_gemini_pro
--config ./jsonFiles/model_comparison_gemini_pro.json             --initialize  True 

## Check and combine the outputs from the batch run 

In [2]:
from glob import glob
import pandas as pd
import json

# Every batch output table written by the model-comparison run.
processed_files = glob('data/GO_term_analysis/model_compare/LLM_processed_model_compare*.tsv')

# For each table, report per "...Analysis" column whether any entry is missing.
for file in processed_files:
    # Label = last four '_'-separated tokens of the file stem (e.g. 'llama2_70b_50_100').
    stem = file.split('/')[-1].split('.')[0]
    model_names = '_'.join(stem.split('_')[-4:])

    df = pd.read_csv(file, sep='\t')
    analysis_cols = [name for name in df.columns if name.endswith('Analysis')]
    for col in analysis_cols:
        n_none = df[col].isna().sum()  # number of missing analyses in this column
        if n_none > 0:
            print(f'{model_names} {col} has {n_none} None in the analysis column')
        else:
            print(f'{model_names} {col} pass')
        print('-----------------------')
    

    

compare_100set_mixtral_latest mixtral_latest_default Analysis pass
-----------------------
compare_100set_mixtral_latest mixtral_latest_50perc_contaminated Analysis pass
-----------------------
compare_100set_mixtral_latest mixtral_latest_100perc_contaminated Analysis pass
-----------------------
llama2_70b_50_100 llama2_70b_default Analysis pass
-----------------------
llama2_70b_50_100 llama2_70b_50perc_contaminated Analysis pass
-----------------------
llama2_70b_50_100 llama2_70b_100perc_contaminated Analysis pass
-----------------------
gemini_pro_50_100 gemini_pro_default Analysis pass
-----------------------
gemini_pro_50_100 gemini_pro_50perc_contaminated Analysis pass
-----------------------
gemini_pro_50_100 gemini_pro_100perc_contaminated Analysis pass
-----------------------
mixtral_latest_0_50 mixtral_latest_default Analysis pass
-----------------------
mixtral_latest_0_50 mixtral_latest_50perc_contaminated Analysis pass
-----------------------
mixtral_latest_0_50 mixtral_

In [5]:
## combine the 0-50 and 50-100 files together
from glob import glob
import pandas as pd
import json

processed_files = glob('data/GO_term_analysis/model_compare/LLM_processed_model_compare*.tsv')

# Group the per-range batch files by model name. Building the mapping here (instead of
# appending to a `model_names` list left over from another cell) keeps this cell
# runnable on a fresh kernel.
files_by_model = {}
for file in processed_files:
    stem = file.split('/')[-1].split('.')[0]
    # The glob also matches the combined '..._100set_<model>.tsv' tables written at the
    # end of this cell; skip them so a re-run does not merge them in again.
    if '_100set_' in stem:
        continue
    # '..._mixtral_instruct_0_50' -> 'mixtral_instruct'
    model_name = '_'.join(stem.split('_')[-4:-2])
    files_by_model.setdefault(model_name, []).append(file)
model_names = sorted(files_by_model)

for model in model_names:
    print(model)
    # Sorted so the row order of the combined table does not depend on glob order.
    files = sorted(files_by_model[model])
    print(files)
    df = pd.concat([pd.read_csv(file, sep='\t', index_col='GO') for file in files])
    
    # add the toy example in as well 
    toy_file = f'data/GO_term_analysis/LLM_processed_toy_example_w_contamination_{model}.tsv'
    
    df = pd.concat([df, pd.read_csv(toy_file, sep='\t', index_col='GO')])
    # check any with None in the analysis column
    analysis_columns = [col for col in df.columns if col.endswith('Analysis')]
    for col in analysis_columns:
        n_none = df[col].isna().sum()
        if n_none > 0:
            print(f'{model} {col} has {n_none} None in the analysis column')
    
    print(df.shape)
    df.to_csv(f'data/GO_term_analysis/model_compare/LLM_processed_model_compare_100set_{model}.tsv', sep='\t', index=True)
    print('------------saved--------------')

mixtral_instruct
['data/GO_term_analysis/model_compare/LLM_processed_model_compare_mixtral_instruct_0_50.tsv', 'data/GO_term_analysis/model_compare/LLM_processed_model_compare_mixtral_instruct_50_100.tsv']
(100, 14)
------------saved--------------


In [6]:
## For each combined 100-set file, count how many gene sets were named
## 'unrelated proteins' and how many received a score of 0
from glob import glob
import pandas as pd
import json

files = glob('data/GO_term_analysis/model_compare/LLM_processed_model_compare_100set*.tsv')
unnamed_dict = {}
model_names = []

for file in files:
    # Model tag = last two '_'-separated tokens of the file stem (e.g. 'gpt_4').
    stem_tokens = file.split('/')[-1].split('.')[0].split('_')
    model_name = '_'.join(stem_tokens[-2:])
    model_names.append(model_name)

    df = pd.read_csv(file, sep='\t', index_col='GO')
    n_total = df.shape[0]

    # Columns ending in 'Name': count names containing 'unrelated proteins'.
    name_columns = [header for header in df.columns if header.endswith('Name')]
    for col in name_columns:
        gene_set_type = col.split(' ')[0]  # column prefix before the first space
        n_unrelated = df[col].str.contains('unrelated proteins').sum()
        print(f'{gene_set_type} has {n_unrelated} gene sets named with unrelated proteins, {n_unrelated/n_total*100:.2f}%')
        unnamed_dict[f'{gene_set_type}'] = {'n_unrelated': n_unrelated, 'n_named': n_total-n_unrelated}

    # Columns ending in 'Score': count scores equal to 0.
    score_columns = [header for header in df.columns if header.endswith('Score')]
    for c in score_columns:
        gene_set_type = c.split(' ')[0]
        n_zero = df[c].eq(0).sum()
        print(f'{gene_set_type} has {n_zero} gene sets with score 0, {n_zero/n_total*100:.2f}%')
        
    print('------------------')

mixtral_latest_default has 0 gene sets named with unrelated proteins, 0.00%
mixtral_latest_50perc_contaminated has 0 gene sets named with unrelated proteins, 0.00%
mixtral_latest_100perc_contaminated has 0 gene sets named with unrelated proteins, 0.00%
mixtral_latest_default has 0 gene sets with score 0, 0.00%
mixtral_latest_50perc_contaminated has 0 gene sets with score 0, 0.00%
mixtral_latest_100perc_contaminated has 0 gene sets with score 0, 0.00%
------------------
gpt_4_default has 4 gene sets named with unrelated proteins, 4.00%
gpt_4_50perc_contaminated has 28 gene sets named with unrelated proteins, 28.00%
gpt_4_100perc_contaminated has 87 gene sets named with unrelated proteins, 87.00%
gpt_4_default has 4 gene sets with score 0, 4.00%
gpt_4_50perc_contaminated has 28 gene sets with score 0, 28.00%
gpt_4_100perc_contaminated has 87 gene sets with score 0, 87.00%
------------------
llama2_70b_default has 0 gene sets named with unrelated proteins, 0.00%
llama2_70b_50perc_contamin