# Notebook for running literature search and validation 

In [1]:
import os
import pandas as pd
import json 
from Bio import Entrez
import openai
from utils.reference_checker import get_references_for_paragraphs, iter_dataframe
import pickle
%load_ext autoreload
%autoreload 2

In [2]:
# Run mode. 'initial' reads the raw LLM analysis file; 'additional' re-reads the
# previously saved output (toSaveFilePath + '.tsv') instead.
runVersion = 'initial'

In [3]:
# Dataset selector; the configuration cell below handles "Omics" and "GO_sets".
dataType = "Omics"

In [4]:
# Choose the input/output paths and column names for the selected dataset.
if dataType == "Omics":
    LLM_analysisFilePath  = 'data/omics_revamped_LLM_Enrichr_simVals_DF.tsv'
    # No '.tsv' extension here: the output is saved as a dataframe (.tsv) and a
    # dictionary (.json) in the function.
    toSaveFilePath = 'data/omics_revamped_LLM_Enrichr_simVals_refs_2_DF'
    jsonFilePath = 'jsonFiles/reference_checking_revision.json'
    nameCol = 'GeneSetName'
    LLM_analysisCol = 'LLM Analysis'

elif dataType == "GO_sets":
    LLM_analysisFilePath = 'data/GO_term_analysis/simrank_LLM_processed_selected_1000_go_terms.tsv'
    toSaveFilePath = 'data/GO_term_analysis/simrank_LLM_processed_selected_1000_go_terms_refs'
    jsonFilePath = 'reference_checking_task1.json'
    examplesTORun = ["GO:0019433"]
    nameCol = 'GO'
    # LLM_analysisCol is read by the iter_dataframe call further down, so it must be
    # defined for this dataset as well.
    # NOTE(review): assumes the GO file uses the same column name as the Omics file
    # - TODO confirm against the TSV header.
    LLM_analysisCol = 'LLM Analysis'

else:
    # Stop here: continuing would only fail later on undefined path variables.
    raise ValueError(
        f"Not implemented for dataType {dataType!r}; expected 'Omics' or 'GO_sets'"
    )
    

In [5]:
# An 'additional' run reads the output of a previous run instead of the raw analysis file.
if runVersion == 'additional':
    LLM_analysisFilePath = f'{toSaveFilePath}.tsv'

In [6]:
# Display the input path chosen by the cells above.
LLM_analysisFilePath

'data/omics_revamped_LLM_Enrichr_simVals_DF.tsv'

In [7]:
# Load the run settings from the JSON file selected in the configuration cell.
with open(jsonFilePath) as config_file:
    config = json.load(config_file)

# The contact e-mail comes from the config; the OpenAI key is taken from the
# environment so it never has to be written into the notebook.
email = config['EMAIL']
openai.api_key = os.environ["OPENAI_API_KEY"]


In [15]:
## Read in the LLM analysis file
# Only the literal text 'NaN' is parsed as missing; cells holding the string 'None'
# are then replaced with a real None.
df = (
    pd.read_csv(LLM_analysisFilePath, sep='\t', keep_default_na=False, na_values=['NaN'])
    .replace({'None': None})
)

In [16]:
# ### Toy example: uncomment this cell to run on a small subset.
# ### Keep it commented out when running the full dataset.

# df = df.head(2)  # only take the first 2 rows as a toy example
# print(df)
# # write the toy results to their own file
# toSaveFilePath = 'data/omics_toy'

   Unnamed: 0.1  Unnamed: 0 Source    GeneSetID  GeneSetName  \
0             0           0   NeST  Cluster1-10  Cluster1-10   
1             1           0   NeST  Cluster1-11  Cluster1-11   

                                            GeneList  n_Genes  \
0  CTRL HSD17B14 KIAA0232 PAQR8 PLA2G1B RNF145 SG...       12   
1  LMF1 MFHAS1 MR1 PLA2G1B RASL11A RNF145 SLC2A6 ...       12   

                                    LLM Name  \
0     Lipid Metabolism and Membrane Dynamics   
1  Lipid Metabolism and Membrane Trafficking   

                                        LLM Analysis  Score  Rank Overlap  \
0  1. CTRL, or carboxyl-terminal esterase/lipase,...   0.85     0    2/21   
1  1. LMF1 (Lipase Maturation Factor 1) is crucia...   0.85     0    2/85   

    P-value  Adjusted P-value             Genes  \
0  0.000069          0.008264  PLA2G1B;HSD17B14   
1  0.001146          0.050873       PLA2G1B;MR1   

                                                Term  \
0               Lipid Ca

In [17]:
## UPDATES 02/13/2023
# Iterate through df with the library helper; it returns a dataframe and a dict,
# and is given toSaveFilePath as the location for saving them.
res_df, res_dict = iter_dataframe(
    df,
    email,
    config,
    n=3,
    papers_query=20,
    verbose=False,
    return_paragraph_ref_data=False,
    id_col=nameCol,
    paragraph_col=LLM_analysisCol,
    runVersion=runVersion,
    save_path=toSaveFilePath,
)

187
301
Serching paper with keywords...
20 references are queried
166
165
171
173
163
167
176
201
167
194
194
169
180
The number of title matching paper: 1
551
In paragraph 1, 0 references are matched


204
314
Serching paper with keywords...
6 references are queried
193
205
184
202
195
The number of title matching paper: 2
457
Development of a clinical prediction model for diabetic kidney disease with glucose and lipid metabolism disorders based on machine learning and bioinformatics technology.
405
In paragraph 2, 2 references are matched


176
285
Serching paper with keywords...
1 references are queried
156
The number of title matching paper: 0
In paragraph 3, 0 references are matched


174
286
Serching paper with keywords...
2 references are queried
151
163
The number of title matching paper: 1
609
Expression of membrane progesterone receptors (mPR/PAQR) in ovarian cancer cells: implications for progesterone-induced signaling events.
In paragraph 4, 1 references are matched


196
3

=======================================

## Alternative: iterate over the dataframe with your own loop

In [9]:
# Settings for the manual loop below. The input TSV and the save path share the
# same stem, so results are written back over the file they were read from.
toSaveFilePath  = './data/CH_omics_revamped_LLM_Enrichr_simVals_failure_refs_newThresh_DF'
LLM_analysisFilePath = './data/CH_omics_revamped_LLM_Enrichr_simVals_failure_refs_newThresh_DF.tsv'
nameCol = 'GeneSetName'
LLM_analysisCol = 'LLM Analysis'
runVersion = 'additional'

# Load the run settings and credentials.
jsonFilePath = 'jsonFiles/reference_checking_revision_test.json'
with open(jsonFilePath) as config_file:
    config = json.load(config_file)

email = config['EMAIL']
openai.api_key = os.environ["OPENAI_API_KEY"]

In [3]:
# Switch and list for restricting a run to a fixed set of gene set names.
# NOTE: the filter in the reference loop that reads these is currently commented out.
runOnlyExamples = True
if runOnlyExamples:
    examplesTORun = [
        'BRD-A00993607 Alprenolol hydrochloride MCF7 6.0 h 10.0 um',
        'BRD-A13964793 -666 MCF7 6.0 h 10.0 um',
        'BRD-A19633847 PERHEXILINE MALEATE MCF7 6.0 h 10.0 um',
        'BRD-A31204924 -666 MCF7 6.0 h 10.0 um',
        'Cluster2-126',
        'Cluster2-140',
        'Cluster2-147',
        'Cluster2-169',
        'Cluster2-183',
        'Cluster2-191',
        'Cluster2-200',
    ]


In [10]:
## Read in the LLM analysis file and index it by the nameCol column
# Only the literal text 'NaN' is parsed as missing; cells holding the string 'None'
# are replaced with a real None before the index is set.
df = (
    pd.read_csv(LLM_analysisFilePath, sep='\t', keep_default_na=False, na_values=['NaN'])
    .replace({'None': None})
    .set_index(nameCol)
)
# # rename 'References' to 'referenced_analysis'
# df = df.rename(columns={'References':'referenced_analysis'})

In [11]:
# Rows whose 'referenced_analysis' is an empty string; an 'additional' run of the
# loop below processes exactly these rows.
is_unreferenced = df['referenced_analysis'] == ''
df[is_unreferenced]

Unnamed: 0_level_0,Unnamed: 0.1,...1,...2,Unnamed: 0,Source,GeneSetID,GeneList,n_Genes,LLM Name,LLM Analysis,...,Term,GO term,GO ID,GO_term_genes,LLM_name_GO_term_sim,enrichr_JI,LLM_success_TF,enrichr_success_TF,referenced_analysis,enrichr_success_TF_0.1
GeneSetName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BRD-A31204924 -666 MCF7 6.0 h 10.0 um,85,85,9762,1,L1000,BRD-A31204924_-666_MCF7_6.0_h_10.0_um,1060P11.3 ANPEP DPP4 FAM129A HNMT MKNK1 MUC5B ...,15,System of unrelated proteins,1. 1060P11.3 is a hypothetical protein with li...,...,Histamine Metabolic Process (GO:0001692),Histamine Metabolic Process,GO:0001692,SLC22A3 PRG3 TRH HNMT SLC29A4,0.264298,0.052632,False,False,,False


In [None]:
# Initial run: create the 'referenced_analysis' column filled with None so the
# reference loop below has a column to write its results into.
if runVersion == 'initial':
    df['referenced_analysis'] = None

In [14]:
#####USE get_references_for_paragraph####
# For every gene set in df: split its LLM analysis into paragraphs, look up
# references for them, store the result in df['referenced_analysis'] and
# saved_dict, and checkpoint both to toSaveFilePath + '.tsv' / '.json'.

CHECKPOINT_EVERY = 5  # write partial results to disk after this many processed rows


def _save_progress(frame, ref_dict, path_stem):
    """Write frame to path_stem + '.tsv' and ref_dict to path_stem + '.json'."""
    frame.to_csv(path_stem + '.tsv', sep = '\t')
    with open(path_stem + '.json', 'w') as f:
        json.dump(ref_dict, f)


saved_dict = {}
if runVersion == "additional":
    # Resume from the dictionary written by the previous run. It is loaded once,
    # before the loop: re-reading it for every row would throw away the entries
    # added in memory since the last checkpoint.
    with open(toSaveFilePath + '.json') as f:
        saved_dict = json.load(f)

i = 0  # number of rows actually processed (skipped rows are not counted)
# Iterate over the index instead of iterrows(): only the label is needed and df is
# modified inside the loop.
for set_id in df.index:
    # Optional: restrict the run to the hand-picked examples.
    # if runOnlyExamples and set_id not in examplesTORun:
    #     continue
    current_refs = df.loc[set_id, 'referenced_analysis']
    if runVersion == "initial" and current_refs is not None:
        continue  # this row already has a result
    if runVersion == "additional" and not (current_refs == ''):
        continue  # skip this row because already done

    print('=========================================')
    print('=========================================')
    print('=========================================')

    print(['dataframe row', set_id])
    # Split the LLM analysis on newlines and keep only lines with more than 5 words.
    example_analysis = df.loc[set_id, LLM_analysisCol]
    paragraphs = [p for p in example_analysis.split("\n") if len(p.split()) > 5]

    try:
        references, paragraph_dict = get_references_for_paragraphs(paragraphs, email = email, config =config, n=3, verbose=True, papers_query=20, return_paragraph_ref_data=True)

    except Exception as e:
        # Best effort: report the failure and carry on with the next row. The empty
        # string keeps the row eligible for a later 'additional' run, and
        # paragraph_dict is reset so that neither an unbound name nor the previous
        # row's data ends up stored under this set_id.
        print('Cannot get references for row', set_id, e)
        references = ''
        paragraph_dict = None

    # str.replace returns a new string, so the result has to be assigned for the
    # newlines to actually be removed.
    references = references.replace('\n', '')

    df.loc[set_id, 'referenced_analysis'] = references
    saved_dict[set_id] = paragraph_dict
    i += 1
    if i % CHECKPOINT_EVERY == 0:
        _save_progress(df, saved_dict, toSaveFilePath)

# Final save, covering the rows processed since the last checkpoint.
_save_progress(df, saved_dict, toSaveFilePath)
# Number of rows still holding an empty string (e.g. rows whose lookup failed).
print(len(df[df['referenced_analysis'] == '']))

['dataframe row', 'BRD-A31204924 -666 MCF7 6.0 h 10.0 um']
Extracting keywords from paragraph
Paragraph:
1. 1060P11.3 is a hypothetical protein with limited information available regarding its function. Without clear evidence of its biological role, it cannot be easily integrated into a specific process with the other proteins listed.
177
Query:
 
I have a paragraph
Paragraph:
1. 1060P11.3 is a hypothetical protein with limited information available regarding its function. Without clear evidence of its biological role, it cannot be easily integrated into a specific process with the other proteins listed.

I would like to search PubMed to find supporting evidence for the statements in this paragraph. Give me a list of gene symbols from the paragraph. Please only include genes. Return the genes as a comma separated list without spacing, if there are no genes in the statements, please return "Unknown" 
Result:
Unknown
291
Query:

I would like to search PubMed to find supporting evidence f

In [16]:
# Write the current dataframe to the output TSV once more.
df.to_csv(f'{toSaveFilePath}.tsv', sep='\t')

In [15]:
# Summarise the run log as per-set averages.
with open('./logs/reference_checker_revision_test_11log.json', 'r') as log_file:
    log_data = json.load(log_file)

# Divisor for the averages. Presumably the number of gene sets in the logged run
# (the log file name contains "11") - TODO confirm.
n_sets = 11
time_per_set = log_data['time_taken_total'] / n_sets
cost_per_set = log_data['dollars_spent'] / n_sets

print('est time per set: ', time_per_set)
print('est. cost per set: ', cost_per_set)


est time per set:  70.13306418332186
est. cost per set:  0.8977145454545454
