In [1]:
import os
import pandas as pd
import json 
import openai

In [2]:
from utils.openai_query import openai_chat
from utils.prompt_factory import make_user_prompt_with_score
from utils.llm_analysis_utils import process_analysis, save_progress

In [3]:
# Read the OpenAI API key from the OPENAI_API_KEY environment variable so the
# secret never appears in the notebook; raises KeyError if the variable is unset.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [4]:
# --- Input table -----------------------------------------------------------
# Tab-separated file with one gene set per row.
inputFilePath = "data/omics.txt"
# Column holding each set's gene list as a single delimited string.
genesCol = "GeneList"
# Delimiter used to split that string into individual gene symbols.
geneSep = " "
# Column holding the gene set's name (used in progress messages).
nameCol = "GeneSetName"

# --- LLM run settings and output -------------------------------------------
# JSON file with the model / prompt-context / budget settings.
jsonFilePath = "jsonFiles/OmicsRunLLM.json"
# Tab-separated file the annotated table is written to.
outputFilePath = "data/omics_LLM_DF.tsv"

In [5]:
# Load the run settings from the JSON config file.
with open(jsonFilePath) as config_handle:
    config = json.load(config_handle)

# Unpack the settings that are handed to openai_chat for every query.
# A missing key raises KeyError here, before any request is made.
context = config['CONTEXT']
gpt_model = config['GPT_MODEL']
temperature = config['TEMP']
max_tokens = config['MAX_TOKENS']
# NOTE(review): presumably used by openai_chat for cost tracking against
# DOLLAR_LIMIT -- confirm in utils.openai_query.
rate_per_token = config['RATE_PER_TOKEN']
DOLLAR_LIMIT = config['DOLLAR_LIMIT']
# Log file name: configured prefix followed by a fixed run stamp and suffix.
LOG_FILE = config['LOG_NAME'] + '240109log.json'

In [6]:
# Seed passed as the last argument of every openai_chat call in the query loop.
# NOTE(review): presumably makes model sampling reproducible -- confirm in utils.openai_query.
SEED = 42

In [7]:
# Echo the model name read from the JSON config so the run records which model was used.
gpt_model

'gpt-4-1106-preview'

### Run GPT-4 query pipeline for NeST gene sets

In [8]:
# Read the tab-separated gene set table; each row is one set to be annotated.
df = pd.read_csv(inputFilePath, sep="\t")

In [9]:
# Add the result columns up front, filled with None, so rows that are never
# processed are distinguishable from rows with results.
for result_col in ('LLM Name', 'LLM Analysis', 'Score'):
    df[result_col] = None

In [10]:
# Query the LLM once per gene set and store its proposed name, analysis text
# and confidence score in df.
#
# The loop runs under try/finally so that everything collected so far is
# written to outputFilePath even when a request or a parsing step raises midway;
# otherwise up to ten rows of paid-for results would be lost.
try:
    for i, row in df.iterrows():
        # Fall back to the row index if the name column is absent.
        name = row.get(nameCol, i)
        term_genes = row[genesCol]

        # pandas reads a missing gene list as NaN (a float), which has no
        # .split(); treat it like a failed query instead of aborting the run.
        if isinstance(term_genes, str) and term_genes.strip():
            genes = term_genes.split(geneSep)
            prompt = make_user_prompt_with_score(genes)
            analysis, finger_print = openai_chat(
                context, prompt, gpt_model, temperature, max_tokens,
                rate_per_token, LOG_FILE, DOLLAR_LIMIT, SEED,
            )
        else:
            print(f'No genes for {name}')
            analysis = None

        if analysis:
            llm_name, llm_score, llm_analysis = process_analysis(analysis)
            # A malformed score must not discard the name/analysis or stop the
            # run; record it as missing and report it.
            try:
                score = float(llm_score)
            except (TypeError, ValueError):
                print(f'Could not parse score {llm_score!r} for {name}')
                score = None
            df.loc[i, 'LLM Name'] = llm_name
            df.loc[i, 'LLM Analysis'] = llm_analysis
            df.loc[i, 'Score'] = score
        else:
            print(f'No analysis for {name}')
            # Clear all three result fields so a re-run cannot leave a stale
            # score next to an empty analysis.
            df.loc[i, 'LLM Name'] = None
            df.loc[i, 'LLM Analysis'] = None
            df.loc[i, 'Score'] = None

        # Checkpoint every tenth row so as not to lose data if something happens.
        if (i % 10 == 1):
            print(i)
            df.to_csv(outputFilePath, sep="\t", index=False)
finally:
    # Always persist the current state: covers both the rows after the last
    # checkpoint on a clean finish and partial results on an error.
    df.to_csv(outputFilePath, sep="\t", index=False)
    

2439
2272
1
1988
1732
2494
1621
2216
2077
1806
2157
2240
2092
11
2502
2370
2196
2205
2138
2398
2208
1689
2328
2436
21
2418
1857
2242
2035
2418
2249
2247
2391
2495
2606
31
1756
2440
1757
2617
2391
1733
2071
2085
2103
1744
41
1990
1910
2632
2251
2012
2156
1906
2378
1371
1354
51
1566
2125
1382
1661
2349
1469
1556
1850
1749
2167
61
1724
1626
1382
1512
1579
1627
1494
1743
1576
1707
71
1548
1762
1684
1390
1586
1640
2020
1757
1391
1455
81
1472
1546
1473
1485
1618
1628
1513
1427
2045
1507
91
1776
1605
1905
1482
1915
1997
1681
1296
1512
1609
101
1639
1508
1898
1579
1437
1639
1598
1902
1985
1815
111
1691
1725
1864
1830
2136
1894
1993
1703
1824
1661
121
1597
1742
1892
1478
1556
1706
1505
1562
1560
1473
131
1426
1426
1301
1573
1358
1594
1369
1418
1383
1463
141
1352
1415
1459
1358
1539
1406
1680
1334
1454
1439
151
1522
1512
1412
1712
1713
1366
1559
1494
1512
1382
161
1622
1786
1511
1563
2063
1788
1560
1637
1759
1731
171
1299
2186
2052
1341
1394
1413
1541
2091
1342
1469
181
1420
1817
1459
1873
1425


In [12]:
# Write the fully annotated table (tab-separated, without the index column).
df.to_csv(outputFilePath, sep="\t", index=False)
