# Notebook for running literature search and validation 

In [1]:
import os
import pandas as pd
import json 
from Bio import Entrez
import openai
from utils.reference_checker import get_references_for_paragraphs
import pickle

In [2]:
runVersion = 'initial'

In [3]:
dataType = "Omics"

In [9]:
if dataType == "Omics":
    LLM_analysisFilePath  = 'data/omics_LLM_Enrichr_simVals_DF.tsv'
    toSaveFilePath = 'data/omics_LLM_Enrichr_simVals_refs_DF.tsv'
    nameCol = 'GeneSetName'
    

elif dataType == "GO_sets":
    LLM_analysisFilePath = 'data/GO_term_analysis/simrank_LLM_processed_selected_1000_go_terms.tsv'
    toSaveFilePath = 'data/GO_term_analysis/simrank_LLM_processed_selected_1000_go_terms_refs.tsv'
    jsonFilePath = 'reference_checking_task1.json'
    examplesTORun = ["GO:0019433"] 
    nameCol = 'GO'
    
else:
    print("Not implemented for dataType")
    

In [10]:
jsonFilePath = 'jsonFiles/reference_checking_revision.json'

In [11]:
with open(jsonFilePath) as json_file:
    config = json.load(json_file)

openai.api_key = os.environ["OPENAI_API_KEY"]
email = config['EMAIL']

 # save the paragraph with corresponding keywords and references to a json file 
savejsonPath = 'paragraph_ref_data_revision' #this is the default, change to your own json file name (no need to add '.json') 

openai.api_key = os.environ["OPENAI_API_KEY"] # Environment variable

In [12]:
MarkedParagraphs = []

In [13]:
## Read in the LLM analysis file
df = pd.read_csv(LLM_analysisFilePath, sep='\t', keep_default_na=False, na_values=['NaN'])
df = df.replace({'None':None})

In [14]:
if (runVersion == "initial") or (runVersion == "rerun") or (runVersion == "test"):
    df['References'] = None

In [None]:
for i, row in df.iterrows():
#for i in range(startRow, df.shape[0]):
#    row = df.iloc[i]
  #  if runOnlyExamples: # Only run examples
  #      if df.iloc[i][nameCol] not in examplesTORun: 
  #          continue

    
    if runVersion == "initial":
        if df.iloc[i]['References'] is not None:
            continue
    if runVersion == "additional":
        if not (df.iloc[i]['References'] == ''):
            continue # skip this row because already done
    
    print('=========================================')

    print(['dataframe row', i])
    # check out the llm analysis 
    example_analysis = df.iloc[i]['LLM Analysis']
    paragraphs = list(filter(lambda p: len(p.split()) > 5, example_analysis.split("\n")))
    
    try:
        references = get_references_for_paragraphs(paragraphs, email = email, config =config, n=3, verbose=True, MarkedParagraphs = MarkedParagraphs, saveto=savejsonPath)
        print(['Cannot get references for row', i])
    except:
        references = ''
        
    references.replace( '\n', '')
        
    df.loc[i, 'References'] = references
    if i%5==0:
        df.to_csv(toSaveFilePath, sep = '\t')
        
if not runOnlyExamples:
    if MarkedParagraphs:
        with open('data/marked_paragraph_reference.json', 'w') as fp:
            json.dump(MarkedParagraphs, fp)
if not runOnlyExamples: 
    df.to_csv(toSaveFilePath, sep = '\t')

In [None]:
# save MarkedParagraphs
with open('data/MarkedParagraphs.pickle', 'wb') as f:
    pickle.dump(MarkedParagraphs, f)

In [None]:
df.to_csv(toSaveFilePath, sep = '\t')

In [15]:
i =1

In [16]:
row = df.iloc[1]

In [17]:
row

Unnamed: 0.2                                                               1
Unnamed: 0.1                                                               1
index                                                                  97309
Unnamed: 0                                                                 0
Source                                                               Disease
GeneSetName                                       Aortic valve calcification
GeneList                         NOTCH1 TP53 GATA5 NKX2-5 SMAD6 AKT1 SLC20A1
n_Genes                                                                    7
LLM Name                   Regulation of Cardiogenesis and Cellular Growt...
LLM Analysis               1. NOTCH1 is a transmembrane receptor involved...
Score                                                                   0.92
Rank                                                                       0
Overlap                                                               Apr-37

In [18]:
example_analysis = df.iloc[i]['LLM Analysis']


In [19]:
paragraphs = list(filter(lambda p: len(p.split()) > 5, example_analysis.split("\n")))


In [20]:
references = get_references_for_paragraphs(paragraphs, email = email, config =config, n=3, verbose=True, MarkedParagraphs = MarkedParagraphs, saveto=savejsonPath)


Extracting keywords from paragraph
Paragraph:
1. NOTCH1 is a transmembrane receptor involved in the Notch signaling pathway, which plays a critical role in cardiac development, particularly in the differentiation of cardiac progenitor cells and the formation of the heart valves. It also influences cell fate decisions and promotes endothelial-to-mesenchymal transition (EndMT), a process essential for heart development.
208
Query:
 
I have a paragraph
Paragraph:
1. NOTCH1 is a transmembrane receptor involved in the Notch signaling pathway, which plays a critical role in cardiac development, particularly in the differentiation of cardiac progenitor cells and the formation of the heart valves. It also influences cell fate decisions and promotes endothelial-to-mesenchymal transition (EndMT), a process essential for heart development.

I would like to search PubMed to find supporting evidence for the statements in this paragraph. Give me a list of gene symbols from the paragraph. Please only

ValueError: not enough values to unpack (expected 4, got 2)

In [21]:
references

NameError: name 'references' is not defined

In [23]:
len(paragraphs)

8

In [22]:
from utils.reference_checker import get_keywords_combinations


In [25]:
keywords, genes, functions, flag_working = get_keywords_combinations(paragraphs[0], config, verbose=False)

208
319


In [32]:
keywords6, genes6, functions6, flag_working6 = get_keywords_combinations(paragraphs[6], config, verbose=False)

188
297


In [34]:
paragraphs[6]

'7. SLC20A1, also known as Pit-1, is a sodium-dependent phosphate transporter. While its direct role in heart development is less clear, phosphate homeostasis is important for energy metabolism and may indirectly influence cardiac function and development.'

In [33]:
keywords6, genes6, functions6, flag_working6 

('("SLC20A1") AND ("phosphate transport" OR "energy metabolism" OR "cardiac development") AND (hasabstract[text])',
 ['SLC20A1'],
 ['phosphate transport', 'energy metabolism', 'cardiac development'],
 False)

In [31]:
keywords7, genes7, functions7, flag_working7 = get_keywords_combinations(paragraphs[7], config, verbose=False)

219
332


ValueError: not enough values to unpack (expected 4, got 2)

In [30]:
[keywords7, genes7, functions7, flag_working7

NameError: name 'keywords7' is not defined

In [28]:
get_keywords_combinations(paragraphs[7], config, verbose=False)

219
332


([], False)

In [29]:
paragraphs[7]

'To summarize, the proteins in this set are involved in the regulation of heart development and cardiogenesis, with overlapping roles in cellular growth and survival signaling pathways. The majority of these proteins are transcription factors or signaling molecules that directly contribute to the development and function of the heart, while others play supportive roles in cellular metabolism and stress responses. The interactions between these proteins suggest a coordinated network that ensures proper heart formation and adaptation to physiological demands.'

In [None]:

    references.replace( '\n', '')
        
    df.loc[i, 'References'] = references
    if i%5==0:
        df.to_csv(toSaveFilePath, sep = '\t')