In [None]:
!pip install langchain_community
!pip install -qU langchain-openai

In [2]:
import requests
import getpass
import json
from typing import List, Optional
from pydantic import BaseModel, Field
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
import xml.etree.ElementTree as ET

In [3]:
api_key = getpass.getpass("Enter your NCBI API key: ")

Enter your NCBI API key: ··········


In [4]:
import os
os.environ["OPENAI_API_KEY"] = getpass.getpass()

··········


In [5]:
# Load the BioASQ Synergy test set; later cells iterate over data["questions"].
# NOTE(review): absolute Colab-style path — adjust when running elsewhere.
with open('/content/BioASQ-taskSynergy_v2025-testset4.json', 'r') as file:
    data = json.load(file)

In [6]:
len(data["questions"])

74

Creating question keywords using an LLM — do not run this section again; the resulting mapping is saved to mapping2.json and can be loaded directly below.

In [29]:
# Prompt that asks the model for five keyword-only PubMed queries for one
# question, ordered from most to least specific. The single template variable
# is {question_body}; the doubled braces are literal braces in the rendered text.
query_list_prompt = PromptTemplate(
    input_variables=["question_body"],
    template="""
You are an expert in formulating optimal PubMed search queries and biomedical information retrieval. Given the following question, generate 5 PubMed search queries that can be used to retrieve relevant articles from pubmed. Each query should:
- Contain only single-word keywords separated by a space.
- Replace any dash-connected words (e.g. "early-onset") with separate words ("early onset").
- Be ordered from most specific (i.e. containing the most keywords) to least specific (with fewer keywords).
- Remove less relevant or stop words first while keeping core biomedical or medical terms intact.
- Queries should be ordered in decreasing order from most words to less words in them. Queries with more words will definetely be more useful to fetch
more relevant articles but sometimes very specific search returns no articles, thus we gradually remove less relevant words usually not the biomedical terms
from queries in hopes to retrieve some articles.
- Do not explicitly insert logical operators like AND
- Do not include stop words, punctuation , helper words (remove them outright), then gradually remove less relevant words over the 5 quesries generated.
- The query you form will be used to get most relevant articles that can help answer the given question, thus the search query formed should be the most optimal.

Question: {question_body}

Return your answer in JSON format exactly matching the structure below:
{{
    "queries": [<string>, <string>, <string>, <string>, <string>]
}}
"""
)
# Structured-output schema for the query-generation call: a single list of
# query strings.
# NOTE(review): documented with comments rather than a class docstring because
# a docstring on a pydantic model may be included in the schema sent to the
# model — confirm before converting.
class QueryList(BaseModel):
    # Queries as returned by the model; the list length is not validated here.
    queries: List[str] = Field(
        ...,
        description="A list of 5 PubMed queries ordered from most specific (with the most keywords) to least specific (with fewer keywords)."
    )

# Chat model used only for query generation, wrapped so that invoke() returns
# a QueryList instance instead of free text.
llm_key = ChatOpenAI(model="gpt-4o", temperature=0.1)
structured_key_llm = llm_key.with_structured_output(QueryList)

In [30]:
# Build question id -> PubMed query: for each question, ask the LLM for
# candidate queries and keep the first (most specific) one that retrieves
# at least one article.
# NOTE: this cell calls generate_article_context(), which is defined further
# down in the "Main task" section; on a fresh kernel run that definition first.
final_mapping = {}

for question in data["questions"]:
    response = structured_key_llm.invoke(query_list_prompt.format(question_body=question["body"]))
    # Tolerate a missing response or an empty list instead of crashing mid-run.
    query_list = getattr(response, "queries", None) or []

    selected_query = None
    for q in query_list:
        # PubMed queries are expected to be dash-free keyword lists.
        processed_query = q.replace("-", " ")
        _, pmids, _ = generate_article_context(processed_query, api_key)
        if pmids:
            selected_query = processed_query
            break

    if not selected_query:
        print(f"No relevant articles found for question: '{question['body']}' using queries: {query_list}")
        # Fall back to the least specific query; when the model produced no
        # queries at all (previously an IndexError), use the question text.
        selected_query = query_list[-1].replace("-", " ") if query_list else question["body"]

    final_mapping[question["id"]] = selected_query

In [32]:
print (len(final_mapping))

74


In [33]:
# Persist the question id -> query mapping so the LLM query generation above
# does not have to be repeated.
with open("mapping2.json", "w") as json_file:
    json.dump(final_mapping, json_file, indent=4)

Load the saved question-to-query mapping directly (skips the LLM query-generation step above).

In [34]:
# Reload the saved question id -> query mapping from disk.
# NOTE(review): this reads /content/mapping2.json while the cell above writes
# the relative path "mapping2.json"; they coincide only when the working
# directory is /content.
with open("/content/mapping2.json", "r") as json_file:
    mapping = json.load(json_file)

In [43]:
len(mapping)

74

Main task

In [42]:
def retrieve_pubmed_documents(question_body, api_key):
    """Search PubMed and return the top usable articles for a query.

    Runs an ESearch (sorted by relevance) followed by an EFetch against the
    NCBI E-utilities, then keeps at most 10 articles that have both a title
    and an abstract, are not flagged as "Retracted Publication", and whose
    abstract is at most 4000 whitespace-separated words.

    Args:
        question_body: Search term sent to PubMed.
        api_key: NCBI API key; omitted from the request when empty/None.

    Returns:
        A ``(documents, pmids)`` tuple. ``documents`` maps PMID to
        ``{"title": ..., "abstract": ...}`` and ``pmids`` lists the same
        PMIDs in the order they were returned. On "no hits", HTTP failure
        or malformed XML the result is ``({}, [])`` so that callers can
        always unpack two values.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    db = "pubmed"
    retmax = 20  # over-fetch: some hits are discarded by the filters below
    max_documents = 10
    max_abstract_words = 4000
    timeout_seconds = 30  # never hang the whole batch on one stalled request

    # Let requests build the query string so characters such as "&", "#" or
    # "+" inside the search term cannot corrupt the URL.
    common_params = {"db": db, "retmode": "xml"}
    if api_key:
        common_params["api_key"] = api_key

    try:
        # Fetch PMIDs
        esearch_response = requests.get(
            f"{base_url}esearch.fcgi",
            params={**common_params, "term": question_body, "retmax": retmax, "sort": "relevance"},
            timeout=timeout_seconds,
        )
        esearch_response.raise_for_status()
        # Parse the raw bytes so the XML declaration decides the encoding.
        esearch_root = ET.fromstring(esearch_response.content)
        pmids = [id_elem.text for id_elem in esearch_root.findall(".//IdList/Id") if id_elem.text]

        if not pmids:
            return {}, []

        # Fetch article details
        efetch_response = requests.get(
            f"{base_url}efetch.fcgi",
            params={**common_params, "id": ",".join(pmids), "rettype": "abstract"},
            timeout=timeout_seconds,
        )
        efetch_response.raise_for_status()
        efetch_root = ET.fromstring(efetch_response.content)

        documents = {}
        pmid_ret = []

        for article in efetch_root.findall(".//PubmedArticle"):
            pmid = article.findtext(".//PMID")
            if not pmid:
                continue

            # Extract title
            title_elem = article.find(".//ArticleTitle")
            title = " ".join(title_elem.itertext()).strip() if title_elem is not None else None

            # Extract abstract (structured abstracts have several AbstractText parts)
            abstract_parts = [
                " ".join(part.itertext()).strip()
                for part in article.findall(".//AbstractText")
            ]
            abstract = "\n".join(abstract_parts) if abstract_parts else None

            # Skip if title or abstract is missing
            if not title or not abstract:
                continue

            # Skip retracted articles; an empty <PublicationType/> has text None.
            publication_types = article.findall(".//PublicationTypeList/PublicationType")
            if any("Retracted Publication" in (pub_type.text or "") for pub_type in publication_types):
                continue

            # Skip abstracts too long to fit comfortably in the prompt context.
            if len(abstract.split()) > max_abstract_words:
                continue

            documents[pmid] = {"title": title, "abstract": abstract}
            pmid_ret.append(pmid)

            if len(documents) >= max_documents:
                break

        return documents, pmid_ret

    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return {}, []
    except ET.ParseError as e:
        print(f"XML Parse error: {e}")
        return {}, []

In [41]:
# Structured-output schemas, one per question type. Each is passed to
# with_structured_output() so the model's reply is parsed into these fields.
# NOTE(review): the field descriptions say "max 200 words" while the prompt
# templates ask for a maximum of 150 words — confirm which limit is intended.
# NOTE(review): documented with comments rather than class docstrings because
# a pydantic model docstring may be included in the schema sent to the model.

# Answer to a yes/no question.
class YesNoAnswer(BaseModel):
    ideal_answer: str = Field(
        ...,
        description="A paragraph-sized ideal answer summarizing the most relevant information (max 200 words)."
    )
    # Plain str: the 'yes'/'no' restriction is stated in the description only,
    # not enforced by the type.
    exact_answer: str = Field(
        ...,
        description="Exact answer: must be either 'yes' or 'no'."
    )

# Answer to a factoid question.
class FactoidAnswer(BaseModel):
    ideal_answer: str = Field(
        ...,
        description="A paragraph-sized ideal answer summarizing the most relevant information (max 200 words)."
    )
    # List of single-element lists; the size limits are described, not validated.
    exact_answer: List[List[str]] = Field(
        ...,
        description=("A list of lists containing up to 5 entity names, ordered by decreasing confidence. "
                     "Each inner list must contain only one element.")
    )

# Answer to a list question.
class ListAnswer(BaseModel):
    ideal_answer: str = Field(
        ...,
        description="A paragraph-sized ideal answer summarizing the most relevant information (max 200 words)."
    )
    # List of single-element lists, one per entity.
    exact_answer: List[List[str]] = Field(
        ...,
        description=("A list of lists corresponding to the entities sought by the question. "
                     "Each inner list must contain exactly one element (no synonyms allowed).")
    )

# Answer to a summary question: only an ideal answer, no exact answer.
class SummaryAnswer(BaseModel):
    ideal_answer: str = Field(
        ...,
        description="A paragraph-sized ideal answer summarizing the most relevant information (max 200 words)."
    )

In [40]:
# Prompt for yes/no questions. Variables: {question_body} and
# {articles_context}; doubled braces render as literal braces.
# NOTE(review): asks for at most 150 words while YesNoAnswer's field
# description says 200 — confirm which limit is intended.
yesno_prompt = PromptTemplate(
    input_variables=["question_body", "articles_context"],
    template="""
You are a biomedical expert. Answer the following yes/no question based on the provided articles context. The articles are ordered by decreasing relevance.
Don't mention about given articles in ideal answer, just use the knowledge given in articles and answer as if you are a biomedical expert.

Question:
{question_body}

Articles Context:
{articles_context}

Type of question: yes/no

Instructions:
1. Provide an ideal answer summarizing the most relevant information from the context provided (maximum 150 words).
2. Provide an exact answer that must be either "yes" or "no".
3. Return your answer in JSON format exactly matching the structure below:
{{
    "ideal_answer": <string>,
    "exact_answer": <string>
}}
"""
)

In [39]:
# Prompt for factoid questions: an ideal answer plus up to five candidate
# entities, each wrapped in its own single-element list.
# Variables: {question_body} and {articles_context}.
factoid_prompt = PromptTemplate(
    input_variables=["question_body", "articles_context"],
    template="""
You are a biomedical expert. Answer the following factoid question based on the provided articles context. The articles are ordered by decreasing relevance.
Don't mention about given articles in ideal answer, just use the knowledge given in articles and answer as if you are a biomedical expert.

Question:
{question_body}

Articles Context:
{articles_context}

Type of question: factoid

Instructions:
1. Provide an ideal answer summarizing the most relevant information from the context provided (maximum 150 words).
2. Provide an exact answer as a list of lists containing up to 5 entity names (up to 5 inner lists are allowed), (e.g., up to 5 names of drugs),numbers, or similar short expressions, ordered by decreasing confidence.
   Each entity can be considered as a factoid answer to the question.
   It is not neccessary to return upto 5 inner list , it is the max limit, if you think only less than 5 answers exist for the question , return those only in decreasing order of confidence.
   Note: Each inner list must contain exactly one element/entity/string (1 element but not neccesarily 1 word, an entity can be a combination of words).
3. Return your answer in JSON format exactly matching the structure below:
{{
    "ideal_answer": <string>,
    "exact_answer": <list of lists of strings>
}}
"""
)

In [44]:
# Prompt for list questions: an ideal answer plus one single-element list per
# entity. Variables: {question_body} and {articles_context}.
list_prompt = PromptTemplate(
    input_variables=["question_body", "articles_context"],
    template="""
You are a biomedical expert. Answer the following list question based on the provided articles context. The articles are ordered by decreasing relevance.
Don't mention about given articles in ideal answer, just use the knowledge given in articles and answer as if you are a biomedical expert.

Question:
{question_body}

Articles Context:
{articles_context}

Type of question: list

Instructions:
1. Provide an ideal answer summarizing the most relevant information from the context provided (maximum 150 words).
2. Provide an exact answer as a list of lists corresponding to the entities sought by the question.
Each inner list entity will be jointly taken to constitute a single answer for the list type question asked.
Each inner list will contain one of the entities (or numbers, or other similar short expressions) seeked by the question.
Note: Each inner list must contain exactly one element/entity/string (1 element but not neccesarily 1 word, an entity can be a combination of words).
3. Return your answer in JSON format exactly matching the structure below:
{{
    "ideal_answer": <string>,
    "exact_answer": <list of lists of strings>
}}
"""
)

In [45]:
# Prompt for summary questions: only an ideal answer is requested.
# Variables: {question_body} and {articles_context}.
summary_prompt = PromptTemplate(
    input_variables=["question_body", "articles_context"],
    template="""
You are a biomedical expert. Answer the following summary question based on the provided articles context. The articles are ordered by decreasing relevance.
Don't mention about given articles in ideal answer, just use the knowledge given in articles and answer as if you are a biomedical expert.

Question:
{question_body}

Articles Context:
{articles_context}

Instructions:
1. Provide an ideal answer summarizing the most relevant information from the context provided (maximum 150 words).
2. Return your answer in JSON format exactly matching the structure below:
{{
    "ideal_answer": <string>
}}
"""
)

In [46]:
# Prompt for extractive snippet selection: up to 10 verbatim passages taken
# from the article titles/abstracts. Variables: {question_body} and
# {articles_context}.
# NOTE(review): the JSON example names the PMID key "document" (a string),
# whereas the ExtractedSnippet schema used for parsing names it
# "document_id" (an int) — confirm the mismatch is harmless.
snippet_prompt = PromptTemplate(
    input_variables=["question_body", "articles_context"],
    template="""
You are a biomedical expert. Given the following question and the content of several articles (each article has its document ID, title, and abstract), extract relevant text snippets that answer the question.
The snippets should not be modified in any form and should just be a part of original text provided and should contain somewhat relevant information that can answer or answer atleast a part of the question.
This can be seen as extractive summarization.
Return at most 10 snippets in descending order of relevance in usefulness to answer the question.

Question:
{question_body}

Articles Context:
{articles_context}

For each snippet, include:
- "document": the article's unique identifier (PMID)
- "section": indicate whether the snippet is extracted from the "title" or "abstract"
- "text": the exact snippet text.
Return your answer in JSON format exactly matching the structure below:
{{
    "snippets": [
        {{
            "document": <string>,
            "section": <string>,
            "text": <string>
        }},
        ... (up to 10 snippets)
    ]
}}
"""
)

In [47]:
# Structured-output schemas for snippet extraction.
# NOTE(review): documented with comments rather than class docstrings because
# a pydantic model docstring may be included in the schema sent to the model.

# One snippet as reported by the model.
class ExtractedSnippet(BaseModel):
    # PMID as an integer; generate_snippets() converts it back to str to look
    # the article up in the documents dict.
    document_id: int = Field(
        ...,
        description="The document (PMID) from which the snippet is extracted. Just the id as an integer."
    )
    # Plain str: the 'title'/'abstract' restriction is described, not enforced.
    section: str = Field(
        ...,
        description="The section from which the snippet is extracted (either 'title' or 'abstract')."
    )
    text: str = Field(
        ...,
        description="The extracted snippet text."
    )
# Container for the model's snippet list; the "max 10" limit is described
# only and is not validated by the schema.
class ExtractedSnippetsOutput(BaseModel):
    snippets: List[ExtractedSnippet] = Field(
        ...,
        description="A list of extracted snippets ordered by decreasing relevance. Max length of list = 10."
    )

In [48]:
def _normalize_with_positions(text):
    """Lower-case ``text`` and collapse whitespace, remembering source indices.

    Returns ``(normalized, positions)`` where ``normalized`` has no leading or
    trailing whitespace and single spaces between words, and ``positions[i]``
    is the index in ``text`` of the character that produced ``normalized[i]``.
    """
    chars = []
    positions = []
    gap_start = None  # index of the first whitespace char of a pending run
    for index, char in enumerate(text):
        if char.isspace():
            # Leading whitespace is dropped; inner runs become one space.
            if chars and gap_start is None:
                gap_start = index
            continue
        if gap_start is not None:
            chars.append(" ")
            positions.append(gap_start)
            gap_start = None
        # Lower-casing may expand one character into several.
        for lowered in char.lower():
            chars.append(lowered)
            positions.append(index)
    return "".join(chars), positions


def _locate_snippet(section_text, snippet_text):
    """Find ``snippet_text`` in ``section_text``, ignoring case and spacing.

    Returns ``(begin, end)`` as a half-open span in ``section_text``
    coordinates, or ``None`` when the snippet does not occur.
    """
    haystack, positions = _normalize_with_positions(section_text)
    needle, _ = _normalize_with_positions(snippet_text)
    candidates = [needle]
    # The model sometimes appends a full stop the source does not have.
    without_period = needle.rstrip(".").rstrip()
    if without_period != needle:
        candidates.append(without_period)
    for candidate in candidates:
        if not candidate:
            continue
        start = haystack.find(candidate)
        if start != -1:
            last = start + len(candidate) - 1
            return positions[start], positions[last] + 1
    return None


def generate_snippets(question_body, article_context, documents, llm):
    """Ask the LLM for supporting snippets and anchor them in the source text.

    Args:
        question_body: The question text inserted into the snippet prompt.
        article_context: The formatted articles text inserted into the prompt.
        documents: Mapping of PMID (str) to a dict with "title" and "abstract".
        llm: Chat model exposing ``with_structured_output``.

    Returns:
        A list of at most 10 dicts with keys "document", "beginSection",
        "endSection", "offsetInBeginSection", "offsetInEndSection" and "text".
        Offsets index the stored section text itself (not a normalised copy),
        and "text" is the verbatim slice those offsets cover. Snippets that
        reference an unknown document/section, or that are empty, are dropped.
        A snippet that cannot be located falls back to the whole section.
    """
    structured_snippets_llm = llm.with_structured_output(ExtractedSnippetsOutput)
    snippet_response = structured_snippets_llm.invoke(
        snippet_prompt.format(question_body=question_body, articles_context=article_context)
    )

    final_snippets = []
    for snippet in snippet_response.snippets[:10]:
        doc_id = str(snippet.document_id)
        section = snippet.section.lower().strip()
        if doc_id not in documents or section not in documents[doc_id]:
            continue

        section_text = documents[doc_id][section]
        # An empty snippet would "match" at offset 0 and yield an end offset of -1.
        if not section_text or not snippet.text.strip():
            continue

        span = _locate_snippet(section_text, snippet.text)
        if span is None:
            # Not a verbatim quote: cite the whole section so that the text
            # and the offsets still agree with each other.
            begin, end = 0, len(section_text)
        else:
            begin, end = span

        final_snippets.append({
            "document": doc_id,
            "beginSection": section,
            "endSection": section,
            "offsetInBeginSection": begin,
            # Inclusive end offset, as in the original implementation.
            # NOTE(review): confirm against the BioASQ offset convention.
            "offsetInEndSection": end - 1,
            "text": section_text[begin:end],
        })
    return final_snippets

In [49]:
# Chat model used for answering and snippet extraction, plus one
# structured-output wrapper per question type so that invoke() returns the
# matching answer model.
llm = ChatOpenAI(model="gpt-4o", temperature=0.05)
structured_factoid_llm = llm.with_structured_output(FactoidAnswer)
structured_list_llm = llm.with_structured_output(ListAnswer)
structured_summary_llm = llm.with_structured_output(SummaryAnswer)
structured_yesno_llm = llm.with_structured_output(YesNoAnswer)

In [50]:
def generate_article_context(question_body, api_key):
    """Retrieve PubMed articles for a query and format them as prompt context.

    Args:
        question_body: Search term passed to ``retrieve_pubmed_documents``.
        api_key: NCBI API key passed to ``retrieve_pubmed_documents``.

    Returns:
        ``(context, pmids, documents)`` where ``context`` is the numbered,
        human-readable listing of every retrieved article. When retrieval
        yields nothing (no hits, or a failed request/parse) the result is
        ``(None, None, None)``.
    """
    retrieved = retrieve_pubmed_documents(question_body, api_key)
    # The retriever may hand back None after a request/parse failure;
    # unpacking that directly would raise TypeError.
    if not retrieved:
        return None, None, None

    documents, pmids = retrieved
    if not documents:
        return None, None, None

    context = "".join(
        f"Article {article_num}:\n"
        f"PMID: {pmid}\n"
        f"Title: {doc_info['title']}\n"
        f"Abstract: {doc_info['abstract']}\n\n"
        " --- END OF ARTICLE ---\n\n"
        for article_num, (pmid, doc_info) in enumerate(documents.items(), start=1)
    )
    return context, pmids, documents

In [51]:
def generate_answer(question_text,api_key,question):
    """Build the submission entry for one question.

    Args:
        question_text: PubMed query used to retrieve supporting articles.
        api_key: NCBI API key forwarded to the retrieval step.
        question: Question dict; must provide "body", "type", "id" and
            "answerReady".

    Returns:
        A dict with keys "body", "type", "id", "answer_ready", "documents",
        "snippets", "exact_answer" and "ideal_answer". When no article is
        retrieved, the documents/snippets lists and both answers stay empty.
        For "summary" questions "exact_answer" stays "". For a question type
        without a configured prompt, documents and snippets are still
        returned but both answers stay empty.
    """
    answer_json = {
        "body": question["body"],
        "type": question["type"],
        "id": question["id"],
        "answer_ready": question["answerReady"],
        "documents": [],
        "snippets": [],
        "exact_answer": "",
        "ideal_answer": "",
    }

    article_context,pmids,documents = generate_article_context(question_text, api_key)
    if not article_context:
        return answer_json

    # Question type -> (prompt template, structured LLM producing that answer model).
    answerers = {
        "yesno": (yesno_prompt, structured_yesno_llm),
        "factoid": (factoid_prompt, structured_factoid_llm),
        "list": (list_prompt, structured_list_llm),
        "summary": (summary_prompt, structured_summary_llm),
    }
    selected = answerers.get(question["type"])

    answer = None
    if selected is None:
        # Previously an unknown type left `answer` unbound and crashed after
        # the snippet call had already been made.
        print(f"Unsupported question type '{question['type']}' for question id {question['id']}; leaving answers empty.")
    else:
        prompt, structured_llm = selected
        answer = structured_llm.invoke(
            prompt.format(question_body=question["body"], articles_context=article_context)
        )

    answer_json["documents"] = pmids[:10]
    answer_json["snippets"] = generate_snippets(question["body"], article_context, documents, llm)

    if answer is not None:
        answer_json["ideal_answer"] = answer.ideal_answer
        # Summary answers carry no exact answer.
        if question["type"] != "summary":
            answer_json["exact_answer"] = answer.exact_answer
    return answer_json

Find which question queries do not retrieve any PubMed articles.

In [52]:
# Sanity check: run retrieval once for every stored query and report the
# questions whose query yields no PMIDs, so those queries can be fixed by hand.
for question in data["questions"]:
  _,ids,_ = generate_article_context(mapping[question["id"]],api_key)
  if not ids:
    q = mapping[question["id"]]
    print(f"No relevant articles found from pubmed. Please recheck the query - {q}. For question - {question['body']}")
print ()




In [53]:
# Main run: answer every question using its stored PubMed query and collect
# the per-question entries under a top-level "questions" list.
ans = {"questions": []}
for question in data["questions"]:
  ans["questions"].append(generate_answer(mapping[question["id"]],api_key,question))

In [54]:
# Write the collected answers to answer.json in the working directory.
with open("answer.json", "w") as json_file:
    json.dump(ans, json_file, indent=4)