In [1]:
import sys
import os
sys.path.append("..")

from  pathlib            import  Path
from  utils.pubmed_utils import Neural_Retriever_PubMed

# Using Clinfo.AI 

In this tutorial, we will go through each step of the Clinfo.AI workflow. Before we start, we need to set up a few things. 


### 1.- Setting up the environment:
1.a.- Install the conda environment using the YAML file provided.

``` conda env create -f environment.yaml ```

1.b.- Select your environment to run the notebook. I recommend using VS Code: 



### 2.- Creating Accounts

You will need at least one account and at most two (depending on how many calls/hour you plan to do):
* OPENAI account: If you start a free account for the first time, you will get $5 in API credits.
* NCBI_API_KEY: This is only necessary if you plan to make more than 10 calls per hour.


Once you have created your account(s), go to the **src\config.py** file and: 

* Set OPENAI_API_KEY to your openAI API key

If you created an NCBI API account, add your key and email to the following values: 
* NCBI_API_KEY 
* EMAIL 

Otherwise, leave them as None.





### 3.- Defining your own prompts:
We have designed prompts for each step of the Clinfo.AI workflow, leveraging the power of in-context learning. If you want to use your own prompts, you can edit them in **src\prompts**.


In [2]:
# Make sure you have completed at least steps 1-2 before running this cell.
from  config import OPENAI_API_KEY, NCBI_API_KEY, EMAIL

# Fail early with an actionable message: assigning None (or any non-string) to
# os.environ raises an opaque TypeError, and a blank key would presumably only
# surface later, on the first OpenAI call.
if not isinstance(OPENAI_API_KEY, str) or not OPENAI_API_KEY.strip():
    raise ValueError(
        "OPENAI_API_KEY is not set. Add your OpenAI API key to the config file "
        "(see step 2) and re-run this cell."
    )

# Expose the key through the environment as well as passing it explicitly later on.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [3]:

## 4.- Initialise the neural retriever from the architecture file.
# Keep this path if you want to use the base prompts; otherwise specify your own architecture.
file_path = os.path.join("..", "prompts", "PubMed", "Architecture_1", "master.json")
nrpm = Neural_Retriever_PubMed(
    architecture_path=file_path,
    verbose=True,
    debug=False,
    open_ai_key=OPENAI_API_KEY,
    email=EMAIL,
)


Task Name: pubmed_query_prompt
------------------------------------------------------------------------
Loading prompt: system  from file task_1_sys.json
Loading prompt: template  from file task_1_prompt.json

Task Name: relevance_prompt
------------------------------------------------------------------------
Loading prompt: system  from file task_2_sys.json
Loading prompt: template  from file task_2_prompt.json

Task Name: summarization_prompt
------------------------------------------------------------------------
Loading prompt: system  from file task_3_sys.json
Loading prompt: template  from file task_3_prompt.json

Task Name: synthesize_prompt
------------------------------------------------------------------------
Loading prompt: system  from file task_4_sys.json
Loading prompt: template  from file task_4_prompt.json
{'$schema': {'pubmed_query_prompt': {'system': PromptTemplate(input_variables=[], output_parser=None, partial_variables={}, template='You are a helpful assistant th

# Let's start!

In [4]:
### Step 0: Ask a question ###
question = "What is the prevalence of COVID-19 in the United States?"

### Step 1: Search PubMed ###
# To achieve this we will:
#   1.a Convert the question into a query using LLMs.
#   1.b Use this PubMed query to look for PubMed abstracts about the topic.
# This step returns a tuple:
#   ([list of queries used to retrieve articles], [list of article ids (PMIDs) that were retrieved])
pubmed_queries, article_ids = nrpm.search_pubmed(
    question,
    num_results=10,        # Limit on the number of results retrieved per query
    num_query_attempts=1,  # Number of attempts to generate a query (use more than 1 for better results)
)

# One value per line: the retrieved count, the queries used, then the PMIDs.
print(f"Articles retrived: {len(article_ids)}", pubmed_queries, article_ids, sep="\n")


********************************************************
Generated pubmed query: ("prevalence" AND "COVID-19" AND "United States")



            Email address is not specified.

            To make use of NCBI's E-utilities, NCBI requires you to specify your
            email address with each request.  As an example, if your email address
            is A.N.Other@example.com, you can specify it as follows:
               from Bio import Entrez
               Entrez.email = 'A.N.Other@example.com'
            In case of excessive usage of the E-utilities, NCBI will attempt to contact
            a user at the email address provided before blocking access to the
            E-utilities.


Retrieved 10 IDs
['35429399', '35121209', '35816430', '32711058', '32876685', '34762110', '33481900', '36279944', '33627448', '33932476']
Search IDs: {'35121209', '33932476', '33627448', '36279944', '33481900', '32711058', '35816430', '32876685', '34762110', '35429399'}
Articles retrived: 10
['("prevalence" AND "COVID-19" AND "United States")']
['35121209', '33932476', '33627448', '36279944', '33481900', '32711058', '35816430', '32876685', '34762110', '35429399']


In [5]:
## Step 2: Fetch article data
# Previously we only extracted the PMIDs. Now we use those PMIDs to retrieve the metadata:
articles = nrpm.fetch_article_data(article_ids)
print(articles)

# Print one example article. Indexing is 0-based, so article_num = 1 selects the
# second retrieved article (use 0 for the first one).
article_num = 1
if article_num < len(articles):
    example_article = articles[article_num]
    print(f"Article :{article_num}")
    print(example_article.keys())
    print(example_article['PubmedData'])
    article_record = example_article["MedlineCitation"]["Article"]
    # Some PubMed records carry no abstract; fall back to a message instead of a KeyError.
    print(article_record.get("Abstract", {}).get("AbstractText", "No abstract available"))
    print(article_record)
else:
    # A different question may retrieve fewer articles than the chosen index requires.
    print(f"Only {len(articles)} article(s) retrieved; there is no article at index {article_num}.")


[{'MedlineCitation': DictElement({'InvestigatorList': [], 'CitationSubset': ['IM'], 'SpaceFlightMission': [], 'KeywordList': [ListElement([StringElement('COVID-19', attributes={'MajorTopicYN': 'Y'}), StringElement('ICU', attributes={'MajorTopicYN': 'Y'}), StringElement('Long-COVID', attributes={'MajorTopicYN': 'Y'}), StringElement('Long-Haulers', attributes={'MajorTopicYN': 'Y'}), StringElement('Neuro-COVID-19', attributes={'MajorTopicYN': 'Y'}), StringElement('PCNS', attributes={'MajorTopicYN': 'Y'}), StringElement('Post-COVID-19 neurological syndrome', attributes={'MajorTopicYN': 'Y'}), StringElement('Post-COVID-19 syndrome', attributes={'MajorTopicYN': 'Y'}), StringElement('SARS-CoV-2', attributes={'MajorTopicYN': 'Y'})], attributes={'Owner': 'NOTNLM'})], 'OtherID': [], 'OtherAbstract': [], 'GeneralNote': [], 'PMID': StringElement('35121209', attributes={'Version': '1'}), 'DateCompleted': {'Year': '2022', 'Month': '04', 'Day': '01'}, 'DateRevised': {'Year': '2022', 'Month': '12', 'D

In [6]:
# STEP 3: Summarize each article
# Although this looks like a single call, the step is parallelized and performs
# one summarization call per article.
# A separate LLM call then provides the relevancy of each article with respect
# to the original question.
article_summaries, irrelevant_articles = nrpm.summarize_each_article(
    articles,
    question,
)

McAuley JL, Corcilius L, Tan HX, Payne RJ, McGuckin MA, Brown LE. The cell surface mucin MUC1 limits the severity of influenza A virus infection. Mucosal Immunol . 2017;10:1581–1593.Logue J.K., Franko N.M., McCulloch D.J., et al. Sequelae in Adults at 6 Months After COVID-19 Infection. JAMA Netw. Open. 2021;4(2):e210830.
~~~~~~~~~~
IMPORTANCE:
Neurological and neuropsychiatric symptoms that persist or develop three months after the onset of COVID-19 pose a significant threat to the global healthcare system. These symptoms are yet to be synthesized and quantified via meta-analysis.

OBJECTIVE:
To determine the prevalence of neurological and neuropsychiatric symptoms reported 12 weeks (3 months) or more after acute COVID-19 onset in adults.

DATA SOURCES:
A systematic search of PubMed, EMBASE, Web of Science, Google Scholar and Scopus was conducted for studies published between January 1st, 2020 and August 1st, 2021. The systematic review was guided by Preferred Reporting Items for Syste

In [7]:
# Summaries for the relevant articles (first value returned by summarize_each_article).
# Displayed as the cell's last expression.
article_summaries

[{'title': 'COVID-19 Prevalence and Related Practices among Dental Hygienists in the United States.',
  'url': 'https://pubmed.ncbi.nlm.nih.gov/33627448/',
  'abstract': "<b>Purpose:</b> Throughout the COVID-19 pandemic, health care professionals have been challenged to provide appropriate preventive and therapeutic measures while using precautions to minimize disease transmission. The purpose of this study was to estimate the prevalence of COVID-19 among United States (US) dental hygienists, describe infection prevention and control procedures and any associated trends in mental health.<b>Methods:</b> Registered dental hygienists (RDHs) licensed in the US were invited to participate in a 30-question web-based survey. COVID-19 infection items included probable and confirmed results, COVID-19 related symptoms experienced in the last month, and level of concern about COVID-19 transmission to patients and themselves. The validated Patient Health Questionnaire 4 screened respondents for de

In [8]:
# Articles deemed irrelevant (second value returned by summarize_each_article)
irrelevant_articles 

[{'title': 'Mid and long-term neurological and neuropsychiatric manifestations of post-COVID-19 syndrome: A meta-analysis.',
  'url': 'https://pubmed.ncbi.nlm.nih.gov/35121209/',
  'abstract': 'IMPORTANCE:\nNeurological and neuropsychiatric symptoms that persist or develop three months after the onset of COVID-19 pose a significant threat to the global healthcare system. These symptoms are yet to be synthesized and quantified via meta-analysis.\n\nOBJECTIVE:\nTo determine the prevalence of neurological and neuropsychiatric symptoms reported 12\xa0weeks (3\xa0months) or more after acute COVID-19 onset in adults.\n\nDATA SOURCES:\nA systematic search of PubMed, EMBASE, Web of Science, Google Scholar and Scopus was conducted for studies published between January 1st, 2020 and August 1st, 2021. The systematic review was guided by Preferred Reporting Items for Systematic Review and Meta-Analyses.\n\nSTUDY SELECTION:\nStudies were included if the length of follow-up satisfied the National In

In [9]:
# STEP 4: Synthesize all the summaries into a single answer to the question
synthesis = nrpm.synthesize_all_articles(article_summaries, question)
# Print the variable itself; quoting the name would only print the literal word "synthesis".
print(synthesis)

=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
Literature Summary: Two cross-sectional surveys were conducted to estimate the prevalence of COVID-19 in the United States. The first study, involving 4,776 dental hygienists, reported a prevalence of 3.1% as of October 2020 (Estrich et al., 2020). The second study, conducted among a community-based sample of 5,203 adults, found that 14.5% of participants reported COVID-19-like symptoms, and 24.3% of those who obtained a diagnostic test tested positive for COVID-19 (CDC, 2020). Both studies relied on self-reported data, which could introduce bias. Additionally, the studies were conducted during specific timeframes, which may not capture the full extent of COVID-19 prevalence in the US.

TL;DR: The prevalence of COVID-19 in the United States varies based on the population and timeframe studied, with reported rates ranging from 3.1% among dental hygienists to 24.3% among symptomatic adults who obtained a diagnostic test. 

References:
1. Estrich C

# Great! We answered our first question using Clinfo.AI!
## Here are all the steps condensed:

In [32]:
# Re-create the retriever with verbose output switched off for the condensed run.
file_path = os.path.join("..", "prompts", "PubMed", "Architecture_1", "master.json")
nrpm = Neural_Retriever_PubMed(
    architecture_path=file_path,
    verbose=False,
    debug=False,
    open_ai_key=OPENAI_API_KEY,
    email=EMAIL,
)

### Step 0: Ask a question ###
question = "What is the prevalence of COVID-19 in the United States?"

### Step 1: Search PubMed ###
# The question is converted into a query using GPT.
# Returns a list of queries (used to retrieve articles) and a list of the article ids retrieved.
pubmed_queries, article_ids = nrpm.search_pubmed(
    question,
    num_results=10,
    num_query_attempts=1,
)

### Step 2: Fetch article data ###
# Convert the list of ids into a list of dictionaries (populated by the PubMed API).
articles = nrpm.fetch_article_data(article_ids)

### Step 3: Summarize each article (relevant and irrelevant ones are returned separately) ###
article_summaries, irrelevant_articles = nrpm.summarize_each_article(
    articles,
    question,
)

### Step 4: Synthesize the results ###
synthesis = nrpm.synthesize_all_articles(article_summaries, question)

# Intermediate results remain available for inspection:
# synthesis, article_summaries, irrelevant_articles, articles, article_ids, pubmed_queries

print(synthesis)


Task Name: pubmed_query_prompt
------------------------------------------------------------------------

Task Name: relevance_prompt
------------------------------------------------------------------------

Task Name: summarization_prompt
------------------------------------------------------------------------

Task Name: synthesize_prompt
------------------------------------------------------------------------
********************************************************
Generated pubmed query: COVID-19 prevalence United States

Retrieved 10 IDs
['34888288', '33663642', '34281357', '37639043', '34311990', '35996224', '36508742', '35206474', '36333051', '37422043']
Search IDs: {'34888288', '34311990', '35996224', '33663642', '35206474', '37639043', '36508742', '34281357', '37422043', '36333051'}
