# Notebook for running literature search and validation 

In [None]:
import os
import pandas as pd
import json 
from Bio import Entrez
import openai
from utils.reference_checker import get_references_for_paragraphs
import pickle

In [None]:
# Run configuration: adjust these switches before each run.
#   dataType        - which gene-set collection to process; the path-selection
#                     cell handles "NeST", "MSigDB" and "GO_sets".
#   runVersion      - run mode; later cells compare it against "test",
#                     "initial", "rerun" and "additional".
#   runOnlyExamples - when True, only the example rows are processed and the
#                     results table is not written to disk.
dataType, runVersion = "NeST", "test"
runOnlyExamples = True

In [None]:
# Choose the input table, output table, example gene sets and name column for
# the configured dataType / runVersion.
#
# Defines: LLM_analysisFilePath, toSaveFilePath, examplesTORun, nameCol
# (plus jsonFilePath for "GO_sets").
# Raises:  ValueError for a dataType that has no branch below, so that later
#          cells cannot run with undefined or stale paths.
if dataType == "NeST":
    if runVersion == "test":
        LLM_analysisFilePath = 'data/NeST_table_subset_LLM_Enrichr_simVals_DF.tsv'
        toSaveFilePath = 'data/NeST_table_subset_LLM_Enrichr_simVals_refs_DF.txt'
    else:
        LLM_analysisFilePath = 'data/NeST_table_LLM_Enrichr_simVals.tsv'
        toSaveFilePath = 'data/NeST_table_LLM_Enrichr_simVals_refs.txt'

    examplesTORun = ["Apoptosis Regulation", "Nucleus"]
    nameCol = 'name_new'

elif dataType == "MSigDB":
    if runVersion == "test":
        LLM_analysisFilePath = 'data/MSigDB_table_subset_LLM_Enrichr_simVals_DF.tsv'
        toSaveFilePath = 'data/MSigDB_table_subset_LLM_Enrichr_simVals_refs_DF.txt'
    else:
        LLM_analysisFilePath = 'data/MSigDB_table_LLM_Enrichr_simVals.tsv'
        toSaveFilePath = 'data/MSigDB_table_LLM_Enrichr_simVals_refs.tsv'

    examplesTORun = ["ALLOGRAFT REJECTION", "ADIPOGENESIS"]
    nameCol = 'Name'

elif dataType == "GO_sets":
    # GO_sets uses the same files regardless of runVersion.
    LLM_analysisFilePath = 'data/GO_term_analysis/simrank_LLM_processed_selected_1000_go_terms.tsv'
    toSaveFilePath = 'data/GO_term_analysis/simrank_LLM_processed_selected_1000_go_terms_refs.tsv'
    # NOTE(review): the following cell reassigns jsonFilePath unconditionally,
    # so this value is currently never used - confirm which config is intended.
    jsonFilePath = 'reference_checking_task1.json'
    examplesTORun = ["GO:0019433"]
    nameCol = 'GO'

else:
    # Fail here instead of printing and continuing: otherwise the next cells
    # hit a NameError, or silently reuse paths left over from an earlier run
    # in the same kernel.
    raise ValueError(
        "Not implemented for dataType: {!r} "
        "(expected 'NeST', 'MSigDB' or 'GO_sets')".format(dataType)
    )
    

In [None]:
# Path of the JSON config that the next cell loads. The assignment is
# unconditional, so it replaces any jsonFilePath chosen earlier.
jsonFilePath = 'jsonFiles/reference_checking.json'

# An "additional" run continues from the previously saved output: that table
# becomes the input so earlier queries are not wasted.
continuePreviousRun = (runVersion == "additional")
if continuePreviousRun:
    LLM_analysisFilePath = toSaveFilePath

In [None]:

# Load the reference-checking configuration from jsonFilePath.
with open(jsonFilePath) as json_file:
    config = json.load(json_file)

# The OpenAI key is read from the OPENAI_API_KEY environment variable; a
# missing variable raises KeyError here. It only needs to be set once.
openai.api_key = os.environ["OPENAI_API_KEY"]

# E-mail address from the config; passed to get_references_for_paragraphs.
email = config['EMAIL']

# save the paragraph with corresponding keywords and references to a json file
# this is the default, change to your own json file name (no need to add '.json')
savejsonPath = 'paragraph_ref_data'

In [None]:
# Accumulator passed to get_references_for_paragraphs (MarkedParagraphs=...);
# its contents are written to disk after the main loop.
MarkedParagraphs = list()

In [None]:
## Load the tab-separated LLM analysis table
# keep_default_na=False together with na_values=['NaN'] restricts missing-value
# parsing to the literal 'NaN'; the literal text 'None' is then mapped to a
# real None.
df = pd.read_csv(
    LLM_analysisFilePath,
    sep='\t',
    keep_default_na=False,
    na_values=['NaN'],
).replace({'None': None})

In [None]:
# "initial", "rerun" and "test" runs start from an empty References column;
# any other runVersion keeps whatever the loaded table already contains.
if runVersion in ("initial", "rerun", "test"):
    df['References'] = None

In [None]:
# Look up references for every selected row's LLM analysis and store them in
# df['References'].
#
# Reads:  df, nameCol, examplesTORun, runOnlyExamples, runVersion, email,
#         config, savejsonPath, toSaveFilePath, MarkedParagraphs
# Writes: df['References']; when runOnlyExamples is False also toSaveFilePath
#         (every row whose index is a multiple of 5, and once at the end) and
#         data/marked_paragraph_reference.json (if MarkedParagraphs is non-empty).
for i, row in df.iterrows():
    # `row` is used instead of df.iloc[i]: `i` is an index label, which only
    # coincides with the position for a default RangeIndex.
    if runOnlyExamples and row[nameCol] not in examplesTORun:
        continue  # Only run examples

    # "initial": skip rows that already hold a References value.
    if runVersion == "initial" and row['References'] is not None:
        continue
    # "additional": skip this row because already done (References not empty).
    if runVersion == "additional" and row['References'] != '':
        continue

    print(['dataframe row', i])

    # Split the LLM analysis into paragraphs, keeping only those with more
    # than five whitespace-separated words.
    # NOTE(review): a None analysis (e.g. a cell that contained 'None') raises
    # AttributeError here - confirm whether such rows can occur.
    example_analysis = row['LLM Analysis']
    paragraphs = [p for p in example_analysis.split("\n") if len(p.split()) > 5]

    try:
        references = get_references_for_paragraphs(
            paragraphs,
            email=email,
            config=config,
            n=3,
            verbose=True,
            MarkedParagraphs=MarkedParagraphs,
            saveto=savejsonPath,
        )
    except Exception as err:
        # Report the failure (only on failure) and leave the cell empty so an
        # "additional" run retries this row. `except Exception` lets
        # KeyboardInterrupt stop the loop.
        print(['Cannot get references for row', i, repr(err)])
        references = ''

    # str.replace returns a new string, so the result has to be kept for the
    # newlines to actually be removed before storing.
    references = references.replace('\n', '')

    df.loc[i, 'References'] = references

    # Periodic checkpoint so a crash does not lose completed queries.
    if i % 5 == 0 and not runOnlyExamples:
        df.to_csv(toSaveFilePath, sep='\t')

if not runOnlyExamples:
    if MarkedParagraphs:
        with open('data/marked_paragraph_reference.json', 'w') as fp:
            json.dump(MarkedParagraphs, fp)
    df.to_csv(toSaveFilePath, sep='\t')

In [None]:
# Persist MarkedParagraphs as a pickle. Unlike the JSON dump after the main
# loop, this runs regardless of runOnlyExamples.
with open('data/MarkedParagraphs.pickle', 'wb') as f:
    f.write(pickle.dumps(MarkedParagraphs))