In [1]:
import os
import pandas as pd
import json 
import openai

In [2]:
import math

In [3]:
from utils.openai_query import openai_chat
from utils.prompt_factory import make_user_prompt_with_score
from utils.llm_analysis_utils import process_analysis, save_progress

In [4]:
openai.api_key = os.environ["OPENAI_API_KEY"] # Read the key from the OPENAI_API_KEY environment variable; raises KeyError if it is not set

In [5]:
# Run mode: "initial" creates empty result columns before querying;
# "additional" resumes and skips rows that already have an LLM name.
runVersion = "additional"

In [6]:
# --- Input table and its column layout ---
inputFilePath = "data/omics_revamped_LLM_DF.tsv"  # alternative input: "data/omics_revamped.txt"
genesCol = "GeneList"    # column whose value is split into individual genes
nameCol = "GeneSetName"  # column used to identify a gene set in messages
geneSep = " "            # delimiter between genes inside genesCol

# --- Run configuration and output ---
jsonFilePath = "jsonFiles/OmicsRunLLM.json"
# Same path as inputFilePath, so the table is read from and written back to one file.
outputFilePath = "data/omics_revamped_LLM_DF.tsv"

In [7]:
# Read the run settings from the JSON configuration file.
with open(jsonFilePath) as json_file:
    config = json.load(json_file)

# Prompt and model settings handed to the chat call.
context = config["CONTEXT"]
gpt_model = config["GPT_MODEL"]
temperature = config["TEMP"]
max_tokens = config["MAX_TOKENS"]

# Cost-related settings and the log file name
# (configured prefix + fixed date stamp + suffix).
rate_per_token = config["RATE_PER_TOKEN"]
LOG_FILE = config["LOG_NAME"] + '240129' + 'log.json'
DOLLAR_LIMIT = config["DOLLAR_LIMIT"]

In [8]:
# Fixed seed, passed as the last argument to openai_chat in the query loop.
# NOTE(review): presumably forwarded to the API for reproducible sampling — confirm in utils.openai_query.
SEED = 42

In [9]:
# Show which model name was loaded from the config (rendered as the cell output).
gpt_model

'gpt-4-1106-preview'

### Run GPT-4 query pipeline for NeST gene sets

In [10]:
# Load the tab-separated gene set table that the query loop iterates over.
df = pd.read_csv(inputFilePath, sep="\t")

In [11]:
# On a first ("initial") run, create the result columns, or blank them if present,
# so every row starts without an LLM name, analysis or score.
if runVersion == "initial":
    for result_col in ('LLM Name', 'LLM Analysis', 'Score'):
        df[result_col] = None

In [12]:
# Query the LLM once per gene set and write the returned name, analysis text
# and score back into df.
#
# The whole loop runs inside try/finally so that the table is written to
# outputFilePath even when a query, the response parsing, or a manual interrupt
# aborts the run. Without this, every result obtained since the last periodic
# checkpoint would be lost.
try:
    for i, row in df.iterrows():

        # Resume support: a string in 'LLM Name' means this row was already
        # analysed in an earlier run, so do no further work for it.
        if runVersion == "additional" and isinstance(row['LLM Name'], str):
            continue

        genes = row[genesCol].split(geneSep)
        prompt = make_user_prompt_with_score(genes)

        analysis, finger_print = openai_chat(context, prompt, gpt_model, temperature, max_tokens, rate_per_token, LOG_FILE, DOLLAR_LIMIT, SEED)

        if analysis:
            llm_name, llm_score, llm_analysis = process_analysis(analysis)
            # Convert the score before touching df: if the conversion fails, the
            # row stays untouched and is retried on the next "additional" run
            # instead of being saved half-filled and then skipped.
            score_value = float(llm_score)
            df.loc[i, 'LLM Name'] = llm_name
            df.loc[i, 'LLM Analysis'] = llm_analysis
            df.loc[i, 'Score'] = score_value
        else:
            name = row[nameCol]
            print(f'No analysis for {name}')
            df.loc[i, 'LLM Name'] = None
            df.loc[i, 'LLM Analysis'] = None

        # Periodic checkpoint so a long run does not lose data if something happens.
        if (i%10 == 1):
            print(i)
            df.to_csv(outputFilePath, sep = "\t",  index=False)
finally:
    # Always persist whatever has been collected so far, on success or failure.
    df.to_csv(outputFilePath, sep = "\t",  index=False)
    

1641
1564
1576
1328
1594
1565
1581
1550
1682
1726
71
1847
1644
1603
1544
1532
1471
1853
1649
1407
1694
81
1735
1781
1688
1628
1685
1456
1628
1685
1421
1523
91
1617
1618
1839
1598
1535
1357
1552
1896
1575
1729
101
1547
1779
1326
1383
1670
1617
1806
1679
1850
1767
111
1569
1570
1551
1774
1634
1643
1458
1458
1655
1438
121
1621
1513
1381
1908
1700
1767
1532
1550
1722
1736
131
1824
1753
1573
1660
1611
1863
1768
1559
1609
1888
141
1640
1662
1894
1610
1771
1781
1708
1538
1711
1720
151
1698
1794
1846
2007
1675
1757
1546
1737
1629
1630
161
1769
1643
1760
2009
1760
1782
1749
1621
1886
1606
171
1544
1632
1651
1638
1834
1645
1839
1747
1877
1936
181
1697
1654
1526
1742
1629
1735
1664
1542
1663
1702
191
1705
1909
1684
1561
1657
1744
1685
1701
1829
2070
201
1504
1586
1843
1623
1694
1805
1676
1604
1743
1708
211
1904
1810
1625
1983
1680
1597
1631
1553
1765
1795
221
1709
1685
1486
1612
1549
1837
1487
1874
1665
1648
231
1908
1877
1543
1743
1559
1647
1764
1723
1888
1937
241
1816
1876
1700
1606
1817
1688
1

In [15]:
# Final write of the annotated table as tab-separated text, without the index column.
df.to_csv(outputFilePath, sep="\t", index=False)
