In [4]:
import pandas as pd
import json 
from utils.openai_query import openai_chat
from utils.prompt_factory import make_user_prompt
from tqdm import tqdm
import openai


In [None]:
## Run configuration
# Edit the values below to point the notebook at your own data and settings.
config_file = 'config.json'  # JSON file with the API key and the model settings read below
input_file = 'data/example_gene_sets.tsv'  # table of gene sets to annotate
input_sep = '\t'  # field separator used in input_file
set_index = 'term'  # column to use as the row index, or None to keep the default index
gene_column = 'genes'  # column holding each gene list as a single delimited string
gene_sep = ' '  # separator between the genes inside gene_column
gene_features = None  # path to the gene features, or None
direct = False  # True: the prompt is a direct sentence asking for a name and analysis of the gene set
out_file = 'data/example_output.tsv'  # output name; the processing cell appends '.json' and '.tsv' to it
default_prompt = True  # True: use the default prompt; False: use the customized prompt file

# Read the customized prompt text only when the default prompt is switched off.
if not default_prompt:
    # replace with your actual customized prompt file
    with open('data/example_customized_prompt.txt', 'r') as prompt_fh:
        customized_prompt = prompt_fh.read()
else:
    customized_prompt = None

# Parse the JSON configuration.
with open(config_file) as config_fh:
    config = json.load(config_fh)

# OpenAI credentials
openai.api_key = config['OPENAI_API_KEY']

# Query settings handed to openai_chat in the processing cell
context = config['CONTEXT']
gpt_model = config['GPT_MODEL']
temperature = config['TEMP']
max_tokens = config['MAX_TOKENS']
rate_per_token = config['RATE_PER_TOKEN']
LOG_FILE = config['LOG_NAME'] + '_log.json'
DOLLAR_LIMIT = config['DOLLAR_LIMIT']

In [5]:
# Query the LLM once per gene set and store the proposed name and analysis.
#
# Reads `input_file`, adds the columns 'LLM Name' and 'LLM Analysis', and writes
# two outputs: f'{out_file}.json' (prompt, raw response and status per gene set)
# and f'{out_file}.tsv' (the annotated table). Partial results are written
# periodically so an interrupted run keeps what has been processed so far.
df = pd.read_csv(input_file, sep=input_sep)

if set_index:
    df.set_index(set_index, inplace=True)

# Initialize columns for LLM name and analysis; skipped or failed rows stay None.
df['LLM Name'] = None
df['LLM Analysis'] = None

MAX_GENES = 1000  # gene sets with more genes than this are skipped
CHECKPOINT_EVERY = 10  # write partial results each time this many more gene sets were queried

llm_response_dict = {}


def _save_outputs():
    """Write the per-gene-set responses (JSON) and the annotated table (TSV)."""
    # NOTE(review): the suffix is appended to out_file verbatim, so an out_file
    # that already ends in '.tsv' yields '<name>.tsv.json' and '<name>.tsv.tsv'.
    # Left unchanged so existing output paths keep working.
    with open(f'{out_file}.json', 'w') as fp:
        json.dump(llm_response_dict, fp)
    df.to_csv(f'{out_file}.tsv', sep='\t', index=True)


i = 0  # number of gene sets for which the query returned (with or without a response)
for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    # Reset per row: the error handler below must never report the previous
    # row's prompt, nor hit a NameError when the first prompt cannot be built.
    prompt = None
    try:
        # Inside the try so that a non-string cell (e.g. a missing value) is
        # recorded as an error for this row instead of aborting the whole run.
        genes = row[gene_column].split(gene_sep)
        if len(genes) > MAX_GENES:
            print(f'Gene set {idx} has more than {MAX_GENES} genes, skipping')
            continue

        prompt = make_user_prompt(genes, feature_df=gene_features, direct=direct, customized_prompt=customized_prompt)
        analysis = openai_chat(context, prompt, gpt_model, temperature, max_tokens, rate_per_token, LOG_FILE, DOLLAR_LIMIT)

        if analysis:
            # Expected layout: a name line, a separator line, then the analysis text.
            parts = analysis.split('\n', 2)
            llm_name = parts[0].replace("Process: ", "")
            # Shorter responses fall back to whatever follows the name line
            # (possibly nothing) rather than discarding the response.
            if len(parts) > 2:
                llm_analysis = parts[2]
            elif len(parts) == 2:
                llm_analysis = parts[1]
            else:
                llm_analysis = ''
            df.loc[idx, 'LLM Name'] = llm_name
            df.loc[idx, 'LLM Analysis'] = llm_analysis

            llm_response_dict[idx] = {'prompt': prompt,
                                      'responses': analysis,
                                      'status': 'SUCCESS'}
        else:
            print(f'No analysis for {idx}')
            df.loc[idx, 'LLM Name'] = None
            df.loc[idx, 'LLM Analysis'] = None
            llm_response_dict[idx] = {'prompt': prompt,
                                      'responses': None,
                                      'status': 'NO RESPONSE'}
        i += 1
    except Exception as e:
        # Best effort: record the failure for this gene set and move on.
        print(f'Error for {idx}: {e}')
        df.loc[idx, 'LLM Name'] = None
        df.loc[idx, 'LLM Analysis'] = None
        llm_response_dict[idx] = {'prompt': prompt,
                                  'responses': None,
                                  'status': 'ERROR: ' + str(e)}
        continue

    # Reached only right after `i` was incremented (skips and errors `continue`).
    if i % CHECKPOINT_EVERY == 0:
        _save_outputs()

# Final write covering every processed gene set.
_save_outputs()


  3%|▎         | 1/31 [00:20<10:17, 20.60s/it]

4459


  6%|▋         | 2/31 [00:40<09:52, 20.43s/it]

3224


 10%|▉         | 3/31 [01:02<09:44, 20.87s/it]

2468


 13%|█▎        | 4/31 [01:27<10:09, 22.57s/it]

1390


 16%|█▌        | 5/31 [01:50<09:54, 22.85s/it]

1200


 19%|█▉        | 6/31 [02:16<09:52, 23.70s/it]

1070


 23%|██▎       | 7/31 [02:37<09:14, 23.09s/it]

841


 26%|██▌       | 8/31 [03:02<09:02, 23.60s/it]

807


 29%|██▉       | 9/31 [03:29<09:04, 24.75s/it]

784


 32%|███▏      | 10/31 [03:52<08:24, 24.01s/it]

665


 35%|███▌      | 11/31 [04:16<08:00, 24.02s/it]

765


 39%|███▊      | 12/31 [04:53<08:50, 27.93s/it]

902


 42%|████▏     | 13/31 [05:33<09:28, 31.56s/it]

846


 45%|████▌     | 14/31 [06:00<08:33, 30.21s/it]

745


 48%|████▊     | 15/31 [06:23<07:28, 28.06s/it]

662


 52%|█████▏    | 16/31 [06:48<06:45, 27.05s/it]

717


 55%|█████▍    | 17/31 [07:12<06:09, 26.41s/it]

706


 58%|█████▊    | 18/31 [07:37<05:36, 25.92s/it]

671


 61%|██████▏   | 19/31 [08:02<05:06, 25.51s/it]

677


 65%|██████▍   | 20/31 [08:26<04:35, 25.08s/it]

704


 68%|██████▊   | 21/31 [09:00<04:37, 27.74s/it]

708


 71%|███████   | 22/31 [09:23<03:58, 26.50s/it]

640


 74%|███████▍  | 23/31 [09:48<03:28, 26.08s/it]

648


 77%|███████▋  | 24/31 [10:14<03:01, 25.86s/it]

690


 81%|████████  | 25/31 [10:41<02:37, 26.24s/it]

691


 84%|████████▍ | 26/31 [11:03<02:04, 24.93s/it]

608


 87%|████████▋ | 27/31 [11:29<01:40, 25.20s/it]

649


 90%|█████████ | 28/31 [11:55<01:16, 25.53s/it]

693


 94%|█████████▎| 29/31 [12:21<00:51, 25.76s/it]

653


 97%|█████████▋| 30/31 [12:47<00:25, 25.78s/it]

624


100%|██████████| 31/31 [13:15<00:00, 25.65s/it]

656





In [7]:
# Preview the first five rows of the annotated table.
df.head(n=5)

Unnamed: 0_level_0,size,genes,stability,LLM Name,LLM Analysis
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Cluster2-2,897,TMEM51 CNST TBPL1 BAG6 EFNB2 XRCC2 PPP3CA ZMYM...,59,DNA Repair and Transcription Regulation,The system of interacting human proteins provi...
Cluster2-3,596,PRKACG CTSV PLA2G4D SCEL RAPSN ATP13A3 CUL5 GP...,33,Intracellular Signal Transduction and Cell Adh...,The system of interacting human proteins provi...
Cluster3-1,429,C3orf38 ZNF688 ABCB1 SPECC1 GPSM2 GNG2 FCHSD2 ...,19,Intracellular Signaling and Transport Regulation,The system of interacting human proteins prese...
Cluster3-5,175,ZMYM2 ARID4B JARID2 TFDP1 MIER2 ARID5B SET ZNF...,22,Chromatin Remodeling and Transcriptional Regul...,The system of interacting proteins provided is...
Cluster3-8,132,CEP112 CHEK1 HDAC5 CIC MAP3K3 SERF2 WEE1 HDAC6...,18,PI3K-AKT Signaling and Cell Cycle Regulation,The system of interacting proteins provided in...
