In [1]:
import os
import pandas as pd
import json 
import openai

In [2]:
from utils.openai_query import openai_chat
from utils.prompt_factory import make_user_prompt

In [3]:
# Read the OpenAI API key from the OPENAI_API_KEY environment variable
# (raises KeyError if it is not set) so the key never appears in the notebook.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [8]:
# Run settings -- edit these between runs.
# dataType selects the data source; only "NeST" is handled by the path-selection cell.
dataType = "NeST"
# runVersion is compared against "test" (subset inputs/outputs), and further down
# against "initial" and "additional" in the MSigDB cells; other values such as "all"
# select the full NeST table.
runVersion = "test" # "all"

In [9]:
# Resolve the column names, separator and file locations for the selected data source.
if dataType == "NeST":
    geneSep = ','  # gene separator inside the genes column; differs between data sources
    jsonFilePath = 'jsonFiles/NeSTRunLLM.json'
    genesCol = 'Genes'
    nameCol = 'NeST ID'
    # The test run reads and writes the small subset files instead of the full table.
    if runVersion == "test":
        inputFilePath = "data/NeST_table_subset.txt"
        outputFilePath = 'data/NeST_table_subset_LLM_DF.tsv'
    else:
        inputFilePath = "data/NeST_table.txt"
        outputFilePath = 'data/NeST_table_LLM_DF.tsv'
else:
    # Fail here with a clear message rather than with a NameError in a later cell.
    raise ValueError(f"Unsupported dataType: {dataType!r} (expected 'NeST')")


In [13]:
# Parse the JSON run configuration selected for this data source.
with open(jsonFilePath) as config_handle:
    config = json.load(config_handle)

# Values handed to every openai_chat call below.
context, gpt_model = config['CONTEXT'], config['GPT_MODEL']
temperature, max_tokens = config['TEMP'], config['MAX_TOKENS']
rate_per_token = config['RATE_PER_TOKEN']
# The log file name is the configured prefix followed by 'log.json'.
LOG_FILE = config['LOG_NAME'] + 'log.json'
DOLLAR_LIMIT = config['DOLLAR_LIMIT']

### Run GPT-4 query pipeline for NeST gene sets

In [15]:
# Load the tab-separated gene-set table and add empty columns for the LLM results.
df = pd.read_csv(inputFilePath, sep="\t")

df['LLM Name'] = None
df['LLM Analysis'] = None

# Query the model once per gene set and store the parsed reply on the same row.
for i, row in df.iterrows():
    name = row[nameCol]
    term_genes = row[genesCol]

    # An empty cell is read by pandas as NaN (a float), which has no .split().
    if not isinstance(term_genes, str) or not term_genes.strip():
        print(f'No genes for {name}')
        continue

    genes = term_genes.split(geneSep)
    prompt = make_user_prompt(genes)

    analysis = openai_chat(context, prompt, gpt_model, temperature, max_tokens, rate_per_token, LOG_FILE, DOLLAR_LIMIT)
    if not analysis:
        # Both LLM columns were initialised to None above, so nothing needs writing.
        print(f'No analysis for {name}')
        continue

    # Expected reply layout: first line is the name (optionally prefixed "Process: "),
    # second line is skipped, everything after it is the analysis text.
    # A shorter reply must not abort the whole run, so fall back to whatever follows
    # the first line instead of indexing past the end of the split.
    parts = analysis.split('\n', 2)
    df.loc[i, 'LLM Name'] = parts[0].replace("Process: ", "")
    df.loc[i, 'LLM Analysis'] = parts[2] if len(parts) > 2 else '\n'.join(parts[1:])
    

658
585
716


In [18]:
# Save the NeST table, including the two LLM columns, as a TSV without the index column.
df.to_csv(outputFilePath, sep= '\t', index=False)


### Run GPT-4 query pipeline for MSigDB gene sets

In [4]:
import pandas as pd
import json 
import openai
from utils.openai_query import openai_chat
from utils.prompt_factory import make_user_prompt
## from here is loading yaml file
import yaml 
import pandas as pd
from glob import glob

In [5]:
# Parse the JSON run configuration for the MSigDB gene sets (replaces the NeST config).
with open('jsonFiles/MSigDBRunLLM.json') as config_handle:
    config = json.load(config_handle)

In [6]:
# Values handed to every openai_chat call in the MSigDB loop.
context, gpt_model = config['CONTEXT'], config['GPT_MODEL']
temperature, max_tokens = config['TEMP'], config['MAX_TOKENS']
rate_per_token = config['RATE_PER_TOKEN']
# The log file name is the configured prefix followed by 'log.json'.
LOG_FILE = config['LOG_NAME'] + 'log.json'
DOLLAR_LIMIT = config['DOLLAR_LIMIT']

In [9]:
# Glob pattern for the MSigDB gene-set YAML files; the test run uses the subset folder.
filePaths = (
    'data/human_geneSets_subset//*.yaml'
    if runVersion == "test"
    else 'data/human_geneSets/*.yaml'
)


In [10]:
# glob() returns matches in filesystem-dependent order. Rows in the results table are
# keyed by each file's position in this list, so sort it to keep positions stable
# between runs (needed when a later run resumes from a saved table).
yaml_files = sorted(glob(filePaths))


In [11]:
# Number of YAML files matched by the glob pattern.
len(yaml_files)

3

In [12]:
## Remove .yaml files with no gene symbols
# This cell only collects the offending paths; the list itself is filtered separately.
yaml_files_to_remove = []

for yaml_file in yaml_files:
    with open(yaml_file, 'r') as file:
        data = yaml.safe_load(file)
    # safe_load returns None for an empty document, and 'gene_symbols' may be
    # missing, null or an empty list -- all of these mean "no genes to analyse".
    gene_symbols = data.get('gene_symbols') if isinstance(data, dict) else None
    if not gene_symbols:
        yaml_files_to_remove.append(yaml_file)


In [13]:
# Drop the flagged files. Filtering (instead of calling list.remove for its side
# effect) makes this cell safe to re-run: remove() raises ValueError once the
# entry is already gone.
flagged_yaml_files = set(yaml_files_to_remove)
yaml_files = [yaml_file for yaml_file in yaml_files if yaml_file not in flagged_yaml_files]

[]

In [14]:
# Number of YAML files left after removing those without gene symbols.
len(yaml_files)

3

In [15]:
# Destination TSV for the MSigDB results; the test run writes to its own subset file.
outputFile = (
    'data/MSigDB_table_subset_LLM_DF.tsv'
    if runVersion == "test"
    else 'data/MSigDB_table_LLM_DF.tsv'
)

In [16]:
# Start from an empty results table on a first ("initial") or "test" run;
# for any other runVersion, resume from the table saved by a previous run.
if runVersion in ("initial", "test"):
    df = pd.DataFrame(columns=['Name', 'Genes', 'LLM Name', 'LLM Analysis'])
else:
    # The table is saved with its index as the first column, so read that column
    # back as the index. Otherwise it comes back as an extra 'Unnamed: 0' data
    # column, and one more is added every time the table is saved and reloaded.
    df = pd.read_csv(outputFile, sep="\t", index_col=0)

In [17]:
# Query the model once per gene-set YAML file; row i of df belongs to yaml_files[i].
for i, yaml_file in enumerate(yaml_files):
    print(i)

    # Resume support: skip entries that already have a result from an earlier run.
    # After a TSV round-trip missing values are NaN rather than None, so test with
    # pd.notna; also guard against positions that have no row in the table yet.
    if runVersion == "additional" and i in df.index and pd.notna(df.loc[i, 'LLM Name']):
        continue  # move on to next item; it was already run

    # Load the gene set definition
    with open(yaml_file, 'r') as file:
        data = yaml.safe_load(file)

    # Get the list of genes from yaml
    genes = data['gene_symbols']

    # Record the gene set itself before querying, so the row exists even on failure
    df.loc[i, ['Name', 'Genes']] = [data['name'], ' '.join(genes)]

    prompt = make_user_prompt(genes)
    analysis = openai_chat(context, prompt, gpt_model, temperature, max_tokens, rate_per_token, LOG_FILE, DOLLAR_LIMIT)

    if analysis:
        # Expected reply layout: first line is the name (optionally prefixed
        # "Process: "), second line is skipped, the rest is the analysis text.
        # A shorter reply must not abort the run, so fall back to whatever follows
        # the first line instead of indexing past the end of the split.
        parts = analysis.split('\n', 2)
        df.loc[i, 'LLM Name'] = parts[0].replace("Process: ", "")
        df.loc[i, 'LLM Analysis'] = parts[2] if len(parts) > 2 else '\n'.join(parts[1:])
    else:
        name = data['name']
        print(f'No analysis for {name}')
        df.loc[i, 'LLM Name'] = None
        df.loc[i, 'LLM Analysis'] = None

    # Save periodically so results are not lost if something goes wrong mid-run
    if (i % 10 == 1):
        df.to_csv(outputFile, sep="\t")

0
1536
1
1123
2
1533


In [39]:
# Final save of the MSigDB results as a TSV; the DataFrame index is written as the first column.
df.to_csv(outputFile, sep = "\t")

In [19]:
# Display the path the MSigDB results were written to.
outputFile

'data/MSigDB_table_subset_LLM_DF.tsv'