## Step 1: Query GPT-4 to generate an analysis and a proposed name for each gene set

In [1]:
import pandas as pd
import json 
from utils.openai_query import openai_chat
from utils.prompt_factory import make_user_prompt_with_score
from utils.server_model_query import server_model_chat
from utils.llm_analysis_utils import process_analysis, save_progress
from utils.genai_query import query_genai_model
from tqdm import tqdm
import constant
import openai
import os
import logging
import re
%load_ext autoreload

%autoreload 2


In [2]:
# Demo of the expected input format: a tab-separated table with one gene set per row.
# The preview below shows an ID column ('NEST ID'), a name column ('name_new'),
# and a comma-separated gene list ('Genes').
df = pd.read_csv('data/example_NeST_table_sub.tsv', sep='\t')
df.head()

Unnamed: 0,NEST ID,name_new,Genes
0,NEST:177,p53 regulation of cell cycle,"CDKN1A,CREBBP,EP300,RBL1,TP53"
1,NEST:169,Neg Regulation EGFR,"AKT1,CTNNB1,EGF,EGFR,NF2,PTEN"
2,NEST:3,Regulation of cellular homeostasis,"ABL1,ACVR1B,AKT1,AMER1,ANKRD11,AR,ARAF,ARID1A,..."
3,NEST:8,Immune system`,"AXL,B2M,BIRC3,BTK,CARD11,CD19,CD22,CD274,CD40,..."
4,NEST:97,Cell projection morphogenesis,"EPHA2,EPHA3,EPHA5,EPHA7,EPHB1,EPHB2,PTPRD,SLIT2"


Demo using the default prompt; a customized prompt can be supplied instead (see the `customized_prompt` setting in the next cell).

In [8]:
## Run settings for Step 1. Every name below is a notebook-level global that the
## later cells (get_logger / main / the runner loop) read directly, so edit the
## values here rather than inside those cells.
initialize = True # if True, then initialize the input table with llm names, analysis and score to None 
config_file = 'gpt4_config.json'  # path to the JSON config holding CONTEXT, MODEL, TEMP, MAX_TOKENS, LOG_NAME, ...
input_file = 'data/example_NeST_table_sub.tsv' # replace with your actual input file
input_sep = '\t'  # column separator of input_file
set_index = 'NEST ID'  # column used as the DataFrame index when the table is loaded, or None
gene_column = 'Genes'  # column that holds the gene list of each gene set
gene_sep = ','  # separator between genes inside gene_column
# NOTE(review): gene_features, direct and customized_prompt are configured here, but the
# visible main() builds its prompt with make_user_prompt_with_score(genes) only, so these
# three settings do not appear to reach the prompt — confirm whether that is intended.
gene_features = None  # path to the gene features, or None to leave them out of the prompt
direct = False # if True, then the prompt will be a direct sentence asking for a name and analysis from the gene set, otherwise default or customized prompt
out_file = 'data/example_NEST_GPT4_output'  # output file stem; main() also uses it for the '<out_file>.log' log file
customized_prompt = False # flag: True -> read the prompt text from config['CUSTOM_PROMPT_FILE']; False -> default prompt

# load the config file
with open(config_file) as json_file:
    config = json.load(json_file)

# From here on customized_prompt is rebound: it stops being a bool flag and becomes
# either the prompt text (str) or None.
if customized_prompt:
    # only read the prompt if the configured file actually exists
    if os.path.isfile(config['CUSTOM_PROMPT_FILE']):
        with open(config['CUSTOM_PROMPT_FILE'], 'r') as f: # replace with your actual customized prompt file
            customized_prompt = f.read()
            # a prompt of 0 or 1 characters is treated as empty
            assert len(customized_prompt) > 1, "Customized prompt is empty"
    else:
        # missing file: report it and fall back to the default prompt
        print("Customized prompt file does not exist")
        customized_prompt = None
else:
    customized_prompt = None

# Load OpenAI key, context, and model used 
# The key is read unconditionally, so OPENAI_API_KEY must be set (KeyError otherwise)
# even when MODEL is not a gpt model.
openai.api_key = os.environ["OPENAI_API_KEY"]

context = config['CONTEXT']
model = config['MODEL']
temperature = config['TEMP']
max_tokens = config['MAX_TOKENS']
# Cost settings are only defined for models whose name starts with 'gpt';
# main() only reads them inside its gpt branch.
if model.startswith('gpt'):
    rate_per_token = config['RATE_PER_TOKEN']
    DOLLAR_LIMIT = config['DOLLAR_LIMIT']
LOG_FILE = config['LOG_NAME']+'_log.json'

SEED = constant.SEED
# Initial result-column prefix (text before the first '-' in the model name);
# the runner cell rebinds column_prefix before each call to main().
column_prefix = model.split('-')[0]

In [9]:
# One named logger per log file: asking for the same filename again reuses the
# existing logger (and its single file handler) instead of stacking new handlers.
def get_logger(filename):
    """Return an INFO-level logger that writes to ``filename``.

    The logger is registered under ``filename`` itself. A file handler with a
    ``time - level - message`` layout is attached only when the logger has no
    handlers yet, so repeated calls do not duplicate log lines.
    """
    log = logging.getLogger(filename)
    log.setLevel(logging.INFO)
    if log.handlers:
        # already configured by an earlier call
        return log

    handler = logging.FileHandler(filename)
    handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    )
    log.addHandler(handler)
    return log


def main(df):
    """Query the configured LLM for every not-yet-analysed gene set in ``df``.

    Reads these notebook-level globals: ``out_file``, ``column_prefix``,
    ``gene_column``, ``gene_sep``, ``model``, ``context``, ``temperature``,
    ``max_tokens``, ``LOG_FILE``, ``SEED`` and, for gpt models,
    ``rate_per_token`` and ``DOLLAR_LIMIT``.

    For each row whose ``'<column_prefix> Analysis'`` cell is empty, the genes in
    ``gene_column`` are split on ``gene_sep``, a prompt is built, and the model
    chosen by the ``model`` name prefix ('gpt', 'gemini', anything else -> server
    model) is queried. A successful response is parsed with ``process_analysis``
    and written into the ``'<column_prefix> Name' / ' Analysis' / ' Score'``
    columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Gene-set table, modified in place. The three result columns for the
        current ``column_prefix`` must already exist.

    Returns
    -------
    None
        Results are handed to ``save_progress(df, analysis_dict, out_file)``
        after every 10 attempted gene sets and once more at the end.

    Notes
    -----
    Rows with a non-string gene cell or more than 1000 genes are skipped with a
    warning. Any exception raised while handling a row is logged and the row is
    skipped, so one failed query does not abort the run.
    """
    analysis_dict = {}  # raw model responses keyed by '<row index>_<column_prefix>'

    logger = get_logger(f'{out_file}.log')

    i = 0  # number of gene sets attempted; drives the periodic checkpoint
    for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
        # only process rows that do not have an analysis yet
        if pd.notna(row[f'{column_prefix} Analysis']):
            continue

        gene_data = row[gene_column]
        # a missing gene list is read as NaN (float), not str
        if not isinstance(gene_data, str):
            logger.warning(f'Gene set {idx} is not a string, skipping')
            continue
        genes = gene_data.split(gene_sep)

        if len(genes) > 1000:
            logger.warning(f'Gene set {idx} is too big, skipping')
            continue

        try:
            prompt = make_user_prompt_with_score(genes)
            finger_print = None
            # The gpt branch returns a fingerprint instead of an error message, so
            # give error_message a value up front; otherwise an empty gpt response
            # raised NameError in the error log call below.
            error_message = 'no analysis returned'
            if model.startswith('gpt'):
                print("Accessing OpenAI API")
                analysis, finger_print = openai_chat(context, prompt, model, temperature, max_tokens, rate_per_token, LOG_FILE, DOLLAR_LIMIT, SEED)
            elif model.startswith('gemini'):
                print("Using Google Gemini API")
                analysis, error_message = query_genai_model(f"{context}\n{prompt}", model, temperature, max_tokens, LOG_FILE)
            else:
                print("Using server model")
                analysis, error_message = server_model_chat(context, prompt, model, temperature, max_tokens, LOG_FILE, SEED)

            if analysis:
                llm_name, llm_score, llm_analysis = process_analysis(analysis)
                # Strip everything but digits, '.' and '-' and convert to float.
                # If that fails (unparseable text, or a non-string score) keep the
                # raw value rather than losing the whole row's result.
                try:
                    llm_score_value = float(re.sub("[^0-9.-]", "", llm_score))
                except (ValueError, TypeError):
                    llm_score_value = llm_score

                df.loc[idx, f'{column_prefix} Name'] = llm_name
                df.loc[idx, f'{column_prefix} Analysis'] = llm_analysis
                df.loc[idx, f'{column_prefix} Score'] = llm_score_value
                analysis_dict[f'{idx}_{column_prefix}'] = analysis
                # Log success with fingerprint
                logger.info(f'Success for {idx} {column_prefix}.')
                if finger_print:
                    logger.info(f'GPT_Fingerprint for {idx}: {finger_print}')

            else:
                logger.error(f'Error for query gene set {idx}: {error_message}')

        except Exception as e:
            # best effort: record the failure and move on to the next gene set
            logger.error(f'Error for {idx}: {e}')
            continue
        i += 1
        if i % 10 == 0:
            # checkpoint so a crash or interrupt loses at most 10 gene sets
            save_progress(df, analysis_dict, out_file)
            print(f"Saved progress for {i} genesets")
    # save the final file
    save_progress(df, analysis_dict, out_file)
    

In [10]:
# Define your own loop for running the pipeline.
# main() reads `column_prefix` and `gene_column` as notebook-level globals, so this
# cell rebinds them before each call: first for the real gene sets, then once per
# '*contaminated_Genes' column found in the input table.
if __name__ == "__main__":

    df = pd.read_csv(input_file, sep=input_sep, index_col=set_index)

    # Build a column-name-safe model tag: first two '-'-separated parts for gpt
    # models (e.g. 'gpt_4'), otherwise the model name with ':' replaced by '_'.
    if 'gpt' in model:
        name_fix = '_'.join(model.split('-')[:2])
    else:
        name_fix = model.replace(':', '_')

    # Remember the configured gene column: the contaminated loop below rebinds the
    # global, and without restoring it any later cell (or a re-run of this one)
    # would silently read genes from the last contaminated column.
    configured_gene_column = gene_column
    try:
        column_prefix = name_fix + '_default'
        print(column_prefix)

        if initialize:
            # initialize the input file with llm names, analysis and score to None
            for field in ('Name', 'Analysis', 'Score'):
                df[f'{column_prefix} {field}'] = None
        main(df)  ## run with the real set

        ## run the pipeline for contaminated gene sets
        contaminated_columns = [col for col in df.columns if col.endswith('contaminated_Genes')]
        for col in contaminated_columns:
            gene_column = col  ## main() must read genes from the contaminated column
            # prefix is the first two '_'-separated parts of the column name
            contam_prefix = '_'.join(col.split('_')[0:2])

            column_prefix = name_fix + '_' + contam_prefix
            print(column_prefix)

            if initialize:
                # initialize the input file with llm names, analysis and score to None
                for field in ('Name', 'Analysis', 'Score'):
                    df[f'{column_prefix} {field}'] = None
            main(df)
    finally:
        gene_column = configured_gene_column

# Bare last expression at cell level so the notebook actually renders the preview
# (inside the `if` block above it was evaluated but never displayed).
df.head()


gpt_4_default


  0%|          | 0/27 [00:00<?, ?it/s]

Accessing OpenAI API


  4%|▎         | 1/27 [00:14<06:08, 14.17s/it]

1443
Accessing OpenAI API


  7%|▋         | 2/27 [00:30<06:26, 15.47s/it]

1475
Accessing OpenAI API


 11%|█         | 3/27 [00:46<06:14, 15.58s/it]

2736
Accessing OpenAI API


In [None]:
### the pipeline can also be run from the command line using the following command:
# demo the first 2 row  
## used direct prompt
start = 0
end = 1
# run in the command line  
%run query_llm_for_analysis.py --config $config \
            --initialize \
            --input $input_file \
            --input_sep  ','\
            --set_index $set_index \
            --gene_column $gene_column\
            --gene_sep ' ' \
            --start $start \
            --end $end \
            --output_file 'data/demo_commandline.tsv

## MuSIC example

See `query_MuSIC_using_gpt4.sh` for the MuSIC run.


## Step 2: Reference checking

Demo run

In [4]:
# Settings for the reference-checking step; edit between runs.
dataType = "NeST"  # label for the dataset; not read by the visible cells below
# "initial": the References column is reset to None and every row is processed.
# "additional": only rows whose References cell is the empty string are processed.
runVersion = "initial" #"additional"; # "initial"
runOnlyExamples = False # True -> restrict the run to the rows listed in examplesTORun
examplesTORun = None  # collection of row labels (values of nameCol) to run when runOnlyExamples is True
LLM_analysisFilePath = 'data/example_NEST_GPT4_output.tsv'  # tab-separated LLM analysis table to read
toSaveFilePath = 'data/example_NEST_GPT4_ref_output.tsv'  # where the table with the added References column is written
jsonFilePath = 'gpt4_config_ref.json'  # JSON config for this step (must contain 'EMAIL')
nameCol = 'name_new'  # column used as the row index / gene-set label

In [5]:
import pandas as pd
import json 
from Bio import Entrez
import openai
from utils.reference_checker import get_references_for_paragraphs

# Load the reference-checking config. This rebinds the notebook-level `config`
# that Step 1 loaded from its own config file.
with open(jsonFilePath) as json_file:
    config = json.load(json_file)

# NOTE(review): `os` is not imported in this cell; it is only in scope because the
# first cell of the notebook imported it.
openai.api_key = os.environ['OPENAI_API_KEY']
email = config['EMAIL']  # passed to get_references_for_paragraphs as `email`

# Path stem handed to get_references_for_paragraphs as `saveto`: the paragraphs with
# their keywords and references are saved to a json file.
savejsonPath = 'data/paragraph_ref_data' #this is the default, change to your own json file name (no need to add '.json') 

In [6]:
# Collect literature references for every LLM analysis and store them in a new
# 'References' column, checkpointing the table to toSaveFilePath along the way.
MarkedParagraphs = []  # handed to get_references_for_paragraphs; dumped to JSON below if non-empty
## Read in the LLM analysis file
# keep_default_na=False keeps empty cells as '' (only the literal 'NaN' is missing);
# the string 'None' is then mapped to a real None.
df = pd.read_csv(LLM_analysisFilePath, sep='\t', keep_default_na=False, na_values=['NaN'])
df = df.replace({'None':None})
df.set_index(nameCol, inplace=True)
if runVersion == "initial":
    df['References'] = None
j = 1  # 1-based count of processed rows; drives the periodic save
for i, row in df.iterrows():
    # `i` is the row label (value of nameCol), which is what examplesTORun lists.
    # The original tested the whole row Series for membership, which cannot work.
    if runOnlyExamples and (examplesTORun is None or i not in examplesTORun):
        continue

    if runVersion == "initial" and row['References'] is not None:
        continue
    if runVersion == "additional" and not (row['References'] == ''):
        continue # skip this row because already done

    print(['dataframe row', i])
    # check out the llm analysis
    example_analysis = row['LLM Analysis']
    if not isinstance(example_analysis, str):
        # a missing analysis (None / NaN) has nothing to look up; skip instead of
        # crashing on .split
        print(['No LLM analysis text for row', i])
        continue
    # keep only lines with more than 5 words (drops headings and blank lines)
    paragraphs = [p for p in example_analysis.split("\n") if len(p.split()) > 5]

    try:
        references = get_references_for_paragraphs(paragraphs, email = email, config =config, n=3, verbose=True, MarkedParagraphs = MarkedParagraphs, saveto=savejsonPath)
    except Exception as e:
        # best effort: report the failure (this message used to be printed on
        # success instead) and leave the row's references empty
        print(['Cannot get references for row', i, repr(e)])
        references = ''

    # Newlines are deliberately kept in `references`: the per-paragraph reformat
    # cell that follows splits this text on "\n\n" and "\n". (The original
    # `references.replace('\n', '')` discarded its result, so it never removed them.)
    df.loc[i, 'References'] = references
    if j%5==0:
        # checkpoint every 5 processed rows
        df.to_csv(toSaveFilePath, sep = '\t')
    j+=1

df.to_csv(toSaveFilePath, sep = '\t')
if MarkedParagraphs:
    with open('data/ref_check_Marked_paragraph.json', 'w') as fp:
        json.dump(MarkedParagraphs, fp)

['dataframe row', 'p53 regulation of cell cycle']
Extracting keywords from paragraph
Paragraph:
Proteins: CDKN1A, CREBBP, EP300, RBL1, TP53
172
Query:
 
I have a paragraph
Paragraph:
Proteins: CDKN1A, CREBBP, EP300, RBL1, TP53

I would like to search PubMed to find supporting evidence for the statements in this paragraph. Give me a list of gene symbols from the paragraph. Please only include genes. Return the genes as a comma separated list without spacing, if there are no genes in the statements, please return "Unknown" 
Result:
CDKN1A,CREBBP,EP300,RBL1,TP53
260
Query:

I would like to search PubMed to find supporting evidence for the statements in a paragraph. Give me a maximum of 3 keywords related to the protein functions or biological processes in the statements. 

Example paragraph:  Involvement of pattern recognition receptors: TLR1, TLR2, and TLR3 are part of the Toll-like receptor family, which recognize pathogen-associated molecular patterns and initiate innate immune respons

In [10]:
# Reformat the reference-checked table into one row per (paragraph, cited reference).
import re
df = pd.read_csv(toSaveFilePath, sep = '\t')

# In each 'References' cell a line of 200 '=' separates the annotated paragraphs
# from the numbered reference list.
reference_separator = '=' * 200

records = []       # one dict per output row; turned into a DataFrame once at the end
skipped_rows = []  # gene sets whose 'References' cell holds no text

for _, row in df.iterrows():
    reference_cell = row['References']
    if not isinstance(reference_cell, str):
        # An empty cell is read back as NaN (float), e.g. when the reference
        # lookup failed for this gene set; there is nothing to split.
        skipped_rows.append(row[nameCol])
        continue

    paragraph_text, found_separator, reference_text = reference_cell.partition(reference_separator)
    # keep only paragraphs with more than 5 words
    paragraphs = [p for p in paragraph_text.split("\n\n") if len(p.split()) > 5]
    # without the separator there is no reference list to index into
    references = reference_text.strip().split('\n') if found_separator else []

    for paragraph in paragraphs:
        # citation markers look like [1], [2], ... and are 1-based
        cited_numbers = [int(n) for n in re.findall(r'\[(\d+)\]', paragraph)]
        if cited_numbers:
            cited_references = [
                # [0] or a number past the end of the list has no matching entry;
                # the original wrapped [0] to the last reference and raised
                # IndexError for numbers that were too large
                references[n - 1] if 1 <= n <= len(references) else None
                for n in cited_numbers
            ]
        else:
            # keep paragraphs without citations, with an empty reference
            cited_references = [None]

        for reference in cited_references:
            records.append({
                'name': row[nameCol],
                'geneset': row['Genes'],
                'paragraph': paragraph,
                'references': reference,
            })

if skipped_rows:
    print(['Rows without reference text, skipped', skipped_rows])

# explicit columns keep the schema stable even when no records were produced
split_paragraphs_df = pd.DataFrame(records, columns=['name', 'geneset', 'paragraph', 'references'])
print(split_paragraphs_df.head(10))

# save the dataframe
split_paragraphs_df.to_csv('data/example_NeST_gpt4_analysis_ref_per_paragraph.tsv', sep = '\t', index = False)


                           name                        geneset  \
0  p53 regulation of cell cycle  CDKN1A,CREBBP,EP300,RBL1,TP53   
1  p53 regulation of cell cycle  CDKN1A,CREBBP,EP300,RBL1,TP53   
2  p53 regulation of cell cycle  CDKN1A,CREBBP,EP300,RBL1,TP53   
3  p53 regulation of cell cycle  CDKN1A,CREBBP,EP300,RBL1,TP53   
4  p53 regulation of cell cycle  CDKN1A,CREBBP,EP300,RBL1,TP53   
5  p53 regulation of cell cycle  CDKN1A,CREBBP,EP300,RBL1,TP53   
6  p53 regulation of cell cycle  CDKN1A,CREBBP,EP300,RBL1,TP53   
7  p53 regulation of cell cycle  CDKN1A,CREBBP,EP300,RBL1,TP53   
8  p53 regulation of cell cycle  CDKN1A,CREBBP,EP300,RBL1,TP53   
9  p53 regulation of cell cycle  CDKN1A,CREBBP,EP300,RBL1,TP53   

                                           paragraph  \
0        Proteins: CDKN1A, CREBBP, EP300, RBL1, TP53   
1  The system of interacting proteins primarily f...   
2  The system of interacting proteins primarily f...   
3  The system of interacting proteins primarily f

In [15]:
# NOTE(review): `music` is not assigned in any visible cell of this notebook, so this
# cell depends on hidden kernel state and would raise NameError after Restart & Run All.
# Presumably it holds the MuSIC cluster table shown in the output — load it explicitly
# in a cell above before displaying it.
music

Unnamed: 0,term,size,genes,stability
23,Cluster4-36,5,HMG20B KDM1A PHF21A RCOR1 ZNF608,30
13,Cluster3-37,13,PPP1R21 TBL1XR1 NCOR2 FOXJ2 GPS2 NCOR1 KANSL1L...,11
10,Cluster3-35,19,FKBP9 P4HA1 CRYGS TGM2 COL11A1 APEH COL22A1 CO...,12
18,Cluster4-28,6,TRRAP KCTD15 ASH1L YEATS4 EPC2 EPC1,10
27,Cluster2-71,4,NACC1 TTC5 WDR81 ZNF281,12
25,Cluster3-65,5,POP4 C18orf21 POP1 POP5 RPP30,25
29,Cluster4-46,4,STAG2 CDCA5 SMC3 STAG1,18
26,Cluster2-79,4,PIK3CA BMPR1A IFNLR1 CALCOCO2,52
16,Cluster5-5,9,ARID4B JARID2 TFDP1 MIER2 ARID5B SUDS3 KDM5A S...,10
14,Cluster4-19,12,FKBP9 P4HA1 COL11A1 COL22A1 COL5A1 COL16A1 COL...,12
