# Notebook for running literature search and validation 

In [12]:
import os
import pandas as pd
import json 
from Bio import Entrez
import openai
from utils.reference_checker import get_references_for_paragraphs, iter_dataframe
import pickle

In [2]:
runVersion = 'initial'  # 'initial' = fresh run; 'additional' = re-read the saved output (.tsv) and only process rows still lacking references

In [4]:
dataType = "Omics"  # dataset to process; the path-selection cell handles "Omics" and "GO_sets"

In [13]:
# Select the input file, output path stem, config file and column names for the chosen dataset.
if dataType == "Omics":
    LLM_analysisFilePath  = 'data/omics_revamped_LLM_Enrichr_simVals_DF.tsv'
    toSaveFilePath = 'data/omics_revamped_LLM_Enrichr_simVals_refs_2_DF' # remove the .tsv, because output will be saved as a dataframe(.tsv) and a dictionary (.json) in the function
    jsonFilePath = 'jsonFiles/reference_checking_revision.json'
    nameCol = 'GeneSetName'
    LLM_analysisCol = 'LLM Analysis'

elif dataType == "GO_sets":
    LLM_analysisFilePath = 'data/GO_term_analysis/simrank_LLM_processed_selected_1000_go_terms.tsv'
    toSaveFilePath = 'data/GO_term_analysis/simrank_LLM_processed_selected_1000_go_terms_refs'
    jsonFilePath = 'reference_checking_task1.json'
    examplesTORun = ["GO:0019433"]
    nameCol = 'GO'
    # This branch previously left LLM_analysisCol undefined, so every later cell that
    # reads the analysis column failed with NameError.
    # NOTE(review): assumed to be the same column name as the Omics file - confirm against the GO .tsv header.
    LLM_analysisCol = 'LLM Analysis'

else:
    # Fail fast: merely printing would leave the variables above unset (or stale from an
    # earlier kernel run), and later cells would then fail or silently use the wrong files.
    raise NotImplementedError(f"Not implemented for dataType: {dataType!r}")
    

In [14]:
# For an 'additional' run, read the previously saved output (.tsv) instead of the raw LLM analysis file.
if runVersion == 'additional':
    LLM_analysisFilePath = toSaveFilePath + '.tsv'

In [15]:
# Sanity check: display the input path that will be read below.
LLM_analysisFilePath

'data/omics_revamped_LLM_Enrichr_simVals_DF.tsv'

In [17]:
# Load the run configuration from the JSON file selected for this dataset.
with open(jsonFilePath) as config_fh:
    config = json.load(config_fh)

# The OpenAI key comes from the environment (never from the config file);
# the contact e-mail comes from the config and is passed to the reference lookup calls.
openai.api_key = os.environ["OPENAI_API_KEY"]
email = config['EMAIL']


In [19]:
## Read in the LLM analysis file
# Only the literal 'NaN' is parsed as missing; cells holding the text 'None' are then mapped to a real None.
df = pd.read_csv(
    LLM_analysisFilePath,
    sep='\t',
    keep_default_na=False,
    na_values=['NaN'],
).replace({'None': None})

In [20]:
## UPDATES 02/13/2023
# Iterate through the dataframe with the packaged helper; results are kept as a
# dataframe and a dict, and save_path tells the helper where to write them.
res_df, res_dict = iter_dataframe(
    df,
    email,
    config,
    n=3,
    papers_query=20,
    verbose=False,
    return_paragraph_ref_data=False,
    id_col=nameCol,
    paragraph_col=LLM_analysisCol,
    runVersion=runVersion,
    save_path=toSaveFilePath,
)

=======================================

## Alternative: iterate over the dataframe with your own loop instead of `iter_dataframe`

In [None]:
runOnlyExamples = False  # when True, the final full save after the manual loop is skipped
examplesTORun = None  # identifiers to restrict a run to; not read by any active code visible in this notebook (also overrides any value set in the path-selection cell)

In [None]:
## Read in the LLM analysis file
# Default NA parsing is disabled so empty strings survive; only 'NaN' counts as missing,
# and the text 'None' is converted to a real None afterwards.
df = pd.read_csv(
    LLM_analysisFilePath,
    sep='\t',
    keep_default_na=False,
    na_values=['NaN'],
).replace({'None': None})

In [None]:
# On a fresh run, add an empty result column; the manual loop treats None as "not processed yet".
if runVersion == 'initial':
    df['referenced_analysis'] = None

In [None]:
#####USE get_references_for_paragraph####
# Look up references for each row's LLM analysis and store them in the
# 'referenced_analysis' column, checkpointing the dataframe to disk along the way.

# toSaveFilePath carries no extension; add '.tsv' so the checkpoints land on the same
# file that is re-read when runVersion == 'additional'.
tsvSavePath = toSaveFilePath + '.tsv'

for i, row in df.iterrows():
    # i is an index *label* (from iterrows), so read with label-based access to stay
    # consistent with the df.loc[i, ...] write below.
    already_done = df.at[i, 'referenced_analysis']
    if runVersion == "initial" and already_done is not None:
        continue
    if runVersion == "additional" and not (already_done == ''):
        continue # skip this row because already done

    print('=========================================')
    print('=========================================')
    print('=========================================')

    print(['dataframe row', i])
    # check out the llm analysis
    example_analysis = row[LLM_analysisCol]

    if not isinstance(example_analysis, str):
        # The loader maps 'None' -> None and 'NaN' -> NaN, so the analysis cell may not be
        # text; record an empty result instead of crashing on .split().
        print(['No LLM analysis text for row', i])
        references = ''
    else:
        # Keep only lines with more than 5 words (drops headings and blank lines).
        paragraphs = [p for p in example_analysis.split("\n") if len(p.split()) > 5]
        try:
            references, paragraph_dict = get_references_for_paragraphs(paragraphs, email = email, config =config, n=3, verbose=True, papers_query=20, return_paragraph_ref_data=True)
        except Exception as err:
            # Best effort: report the failure and store '' so an 'additional' run retries this row.
            # Exception (rather than a bare except) lets Ctrl-C still interrupt a long run.
            print(['Cannot get references for row', i])
            print(repr(err))
            references = ''

    # str.replace returns a new string; the result must be assigned to take effect.
    references = references.replace('\n', '')

    df.loc[i, 'referenced_analysis'] = references
    if i % 5 == 0:
        # periodic checkpoint so a crash loses at most a few rows of work
        df.to_csv(tsvSavePath, sep = '\t')

if not runOnlyExamples:
    df.to_csv(tsvSavePath, sep = '\t')

['dataframe row', 0]
Extracting keywords from paragraph
Paragraph:
1. CTRL, or carboxyl-terminal esterase/lipase, is involved in lipid metabolism, particularly in the hydrolysis of triglycerides and cholesteryl esters. This activity is crucial for lipid digestion and the subsequent absorption and processing within cells.
187
Query:
 
I have a paragraph
Paragraph:
1. CTRL, or carboxyl-terminal esterase/lipase, is involved in lipid metabolism, particularly in the hydrolysis of triglycerides and cholesteryl esters. This activity is crucial for lipid digestion and the subsequent absorption and processing within cells.

I would like to search PubMed to find supporting evidence for the statements in this paragraph. Give me a list of gene symbols from the paragraph. Please only include genes. Return the genes as a comma separated list without spacing, if there are no genes in the statements, please return "Unknown" 
Result:
CTRL
301
Query:

I would like to search PubMed to find supporting evi

In [None]:
# save MarkedParagraphs
# NOTE(review): MarkedParagraphs is not assigned in any cell visible in this notebook;
# confirm where it is defined before running this cell, otherwise it raises NameError.
with open('data/MarkedParagraphs.pickle', 'wb') as f:
    pickle.dump(MarkedParagraphs, f)

In [None]:
# toSaveFilePath has no extension; write to '<stem>.tsv', the file an 'additional' run reads back.
df.to_csv(toSaveFilePath + '.tsv', sep = '\t')