In [None]:
!pip install langchain_community
!pip install -qU langchain-openai

In [181]:
import requests
import getpass
import json
from typing import List, Optional
from pydantic import BaseModel, Field
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
import xml.etree.ElementTree as ET
import re

In [3]:
# Ask for the NCBI API key with a hidden prompt so it is never stored in the
# notebook source; `api_key` is later passed to the PubMed retrieval helpers.
api_key = getpass.getpass("Enter your NCBI API key: ")

Enter your NCBI API key: ··········


In [4]:
import os
# Read the OpenAI key with a hidden prompt and export it as an environment
# variable for this process (presumably picked up by ChatOpenAI — confirm).
os.environ["OPENAI_API_KEY"] = getpass.getpass()

··········


In [7]:
# Load the BioASQ Synergy test set into `data`; later cells iterate over
# data["questions"] and read each question's "id", "body", "type" and "answerReady".
# Alternative input (testset1), kept for reference:
# with open('/content/BioASQ-taskSynergy_v2025-testset1', 'r') as file:
#     data = json.load(file)
with open('/content/BioASQ-taskSynergy_v2025-testset3', 'r') as file:
    data = json.load(file)

Creating question keywords using an LLM - do not run this again (the results are saved to mapping.json)

In [146]:
# Prompt that turns a question into a keyword-style PubMed search query.
# The only placeholder is {question_body}; the doubled braces are escaped literal
# braces of the JSON example shown to the model.
key_prompt = PromptTemplate(
    input_variables=["question_body"],
    template="""
I am working on a task which depends on extracting the most relevant pubmed articles for a given query/question. I am using pubmed api to get the articles. To search on pubmed we have to effectively structure our query so that pubmed search can get most relevant articles for our query.
I will be giving you a question, generate the most appropriate pubmed search query for it which will help in getting most accurate and relevant articles back from api.
The strategy should be keeping important keywords from question and removing rest while maintaing the order. Don't use multi-word phrases, use single words separated by a space.
Question - {question_body}
Please return the answer in a JSON format exactly matching the structure below:
{{
    "query": <string>
}}
"""
)
class KeyAnswer(BaseModel):
    # Structured-output schema for the keyword-extraction call: a single required
    # string field holding the PubMed search query.
    # (Documented with comments on purpose: a class docstring would become part
    # of the schema description handed to the model.)
    query: str = Field(
        default=...,
        description="A string representing appropriate pubmed query for given question.",
    )

# Chat model used only for query generation, wrapped so that invoke() returns a
# KeyAnswer instance instead of free text.
llm_key = ChatOpenAI(model="gpt-4o", temperature=0.2)
structured_key_llm = llm_key.with_structured_output(KeyAnswer)

In [147]:
# Build question id -> structured keyword answer, one LLM call per question.
mapping = {}
for question in data["questions"]:
    keyword_request = key_prompt.format(question_body=question["body"])
    mapping[question["id"]] = structured_key_llm.invoke(keyword_request)

In [148]:
# Replace each structured KeyAnswer by its plain query string.
# Values that are already strings (cell re-run, or `mapping` loaded back from
# mapping.json) are left as they are, so this cell is safe to run repeatedly
# instead of failing with AttributeError on `str.query`.
for key, keyword_result in list(mapping.items()):
    mapping[key] = getattr(keyword_result, "query", keyword_result)

In [150]:
# Persist the question id -> query mapping so the keyword-generation cells
# above do not have to be executed again.
with open("mapping.json", "w") as json_file:
    json.dump(mapping, json_file, indent=4)

Load the saved mapping directly

In [152]:
# Reload the previously saved question id -> query mapping from disk.
with open("mapping.json", "r") as json_file:
    mapping = json.load(json_file)

Main task

In [215]:
def retrieve_pubmed_documents(question_body, api_key, retmax=15, max_documents=10, timeout=30):
    """Search PubMed for a query and return the usable top hits.

    Runs an E-utilities ``esearch`` (sorted by relevance) followed by one
    ``efetch`` for the returned PMIDs. An article is kept only if it has both a
    title and an abstract, is not tagged "Retracted Publication", and its
    abstract has at most 4000 whitespace-separated words.

    Args:
        question_body: Search term sent to PubMed.
        api_key: NCBI API key sent with both requests.
        retmax: Maximum number of PMIDs requested from esearch.
        max_documents: Maximum number of articles returned.
        timeout: Per-request timeout in seconds.

    Returns:
        A ``(documents, pmids)`` tuple. ``documents`` maps PMID (str) to
        ``{"title": ..., "abstract": ...}`` and ``pmids`` lists the kept PMIDs
        in the same order. When the search has no hits, or a request / XML
        parsing error occurs, ``({}, [])`` is returned so callers can always
        unpack the result.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    db = "pubmed"

    try:
        # Fetch PMIDs. Passing `params` lets requests URL-encode the query, so
        # characters such as '&' or '#' in the term cannot corrupt the URL.
        esearch_response = requests.get(
            f"{base_url}esearch.fcgi",
            params={
                "db": db,
                "term": question_body,
                "retmax": retmax,
                "retmode": "xml",
                "sort": "relevance",
                "api_key": api_key,
            },
            timeout=timeout,
        )
        esearch_response.raise_for_status()
        esearch_data_root = ET.fromstring(esearch_response.text)
        pmids = [
            id_elem.text
            for id_elem in esearch_data_root.findall(".//IdList/Id")
            if id_elem.text
        ]

        if not pmids:
            print("No PMIDs found for the query.")
            return {}, []

        # Fetch article details for all PMIDs in a single call.
        efetch_response = requests.get(
            f"{base_url}efetch.fcgi",
            params={
                "db": db,
                "id": ",".join(pmids),
                "rettype": "abstract",
                "retmode": "xml",
                "api_key": api_key,
            },
            timeout=timeout,
        )
        efetch_response.raise_for_status()
        efetch_data_root = ET.fromstring(efetch_response.text)

        documents = {}
        pmid_ret = []

        for article in efetch_data_root.findall(".//PubmedArticle"):
            pmid_elem = article.find(".//PMID")
            if pmid_elem is None or not pmid_elem.text:
                continue
            pmid = pmid_elem.text

            # itertext() also collects text nested in inline markup (e.g. <i>, <sub>).
            title_elem = article.find(".//ArticleTitle")
            title = " ".join(title_elem.itertext()).strip() if title_elem is not None else None

            # Structured abstracts come as several AbstractText parts; join them line by line.
            abstract_texts = [
                " ".join(part.itertext()).strip()
                for part in article.findall(".//AbstractText")
            ]
            abstract = "\n".join(abstract_texts) if abstract_texts else None

            # Skip if title or abstract is missing
            if not title or not abstract:
                continue

            # Skip retracted articles; an empty <PublicationType/> has text None.
            publication_types = article.findall(".//PublicationTypeList/PublicationType")
            if any("Retracted Publication" in (pub_type.text or "") for pub_type in publication_types):
                continue

            # Skip very long abstracts (more than 4000 words).
            if len(abstract.split()) > 4000:
                continue

            documents[pmid] = {"title": title, "abstract": abstract}
            pmid_ret.append(pmid)

            if len(documents) >= max_documents:
                break

        return documents, pmid_ret

    except requests.exceptions.RequestException as e:
        # The failing URL (and therefore the API key) can appear in the exception
        # text; mask it before it lands in the notebook output.
        message = str(e)
        if api_key:
            message = message.replace(str(api_key), "***")
        print(f"Request error: {message}")
        return {}, []
    except ET.ParseError as e:
        print(f"XML Parse error: {e}")
        return {}, []

In [216]:
class YesNoAnswer(BaseModel):
    # Structured-output schema for yes/no questions.
    # NOTE(review): this description allows 200 words while yesno_prompt asks for
    # at most 150 — confirm which limit is intended.
    ideal_answer: str = Field(
        default=...,
        description="A paragraph-sized ideal answer summarizing the most relevant information (max 200 words).",
    )
    # Required free-form string; the 'yes'/'no' restriction is only stated in the
    # description, it is not validated here.
    exact_answer: str = Field(
        default=...,
        description="Exact answer: must be either 'yes' or 'no'.",
    )

class FactoidAnswer(BaseModel):
    # Structured-output schema for factoid questions.
    # NOTE(review): this description allows 200 words while factoid_prompt asks
    # for at most 150 — confirm which limit is intended.
    ideal_answer: str = Field(
        default=...,
        description="A paragraph-sized ideal answer summarizing the most relevant information (max 200 words).",
    )
    # List of single-element lists; the limits (up to 5, one element each) are
    # stated in the description only and are not validated here.
    exact_answer: List[List[str]] = Field(
        default=...,
        description=(
            "A list of lists containing up to 5 entity names, ordered by decreasing confidence. "
            "Each inner list must contain only one element."
        ),
    )

class ListAnswer(BaseModel):
    # Structured-output schema for list questions.
    # NOTE(review): this description allows 200 words while list_prompt asks for
    # at most 150 — confirm which limit is intended.
    ideal_answer: str = Field(
        default=...,
        description="A paragraph-sized ideal answer summarizing the most relevant information (max 200 words).",
    )
    # One inner list per entity; the "exactly one element" rule is stated in the
    # description only and is not validated here.
    exact_answer: List[List[str]] = Field(
        default=...,
        description=(
            "A list of lists corresponding to the entities sought by the question. "
            "Each inner list must contain exactly one element (no synonyms allowed)."
        ),
    )

class SummaryAnswer(BaseModel):
    # Structured-output schema for summary questions: only an ideal answer, no
    # exact answer.
    # NOTE(review): this description allows 200 words while summary_prompt asks
    # for at most 150 — confirm which limit is intended.
    ideal_answer: str = Field(
        default=...,
        description="A paragraph-sized ideal answer summarizing the most relevant information (max 200 words).",
    )

In [217]:
# Prompt for yes/no questions. Placeholders: {question_body} and
# {articles_context}; the doubled braces are escaped literal braces of the JSON
# example shown to the model.
# NOTE(review): the prompt limits the ideal answer to 150 words while the
# YesNoAnswer field description says 200 — confirm which is intended.
yesno_prompt = PromptTemplate(
    input_variables=["question_body", "articles_context"],
    template="""
You are a biomedical expert. Answer the following yes/no question based on the provided articles context. The articles are ordered by decreasing relevance.
Don't mention about given articles in ideal answer, just use the knowledge given in articles and answer as if you are a biomedical expert.

Question:
{question_body}

Articles Context:
{articles_context}

Type of question: yes/no

Instructions:
1. Provide an ideal answer summarizing the most relevant information from the context provided (maximum 150 words).
2. Provide an exact answer that must be either "yes" or "no".
3. Return your answer in JSON format exactly matching the structure below:
{{
    "ideal_answer": <string>,
    "exact_answer": <string>
}}
"""
)

In [218]:
# Prompt for factoid questions. Placeholders: {question_body} and
# {articles_context}; the doubled braces are escaped literal braces of the JSON
# example shown to the model.
# NOTE(review): instruction 4 talks about "the list question" although this is
# the factoid prompt — the text appears to be shared with list_prompt; confirm
# that this wording is intended here.
factoid_prompt = PromptTemplate(
    input_variables=["question_body", "articles_context"],
    template="""
You are a biomedical expert. Answer the following factoid question based on the provided articles context. The articles are ordered by decreasing relevance.
Don't mention about given articles in ideal answer, just use the knowledge given in articles and answer as if you are a biomedical expert.

Question:
{question_body}

Articles Context:
{articles_context}

Type of question: factoid

Instructions:
1. Provide an ideal answer summarizing the most relevant information from the context provided (maximum 150 words).
2. Provide an exact answer as a list of lists containing up to 5 entity names (up to 5 inner lists are allowed), (e.g., up to 5 names of drugs),numbers, or similar short expressions, ordered by decreasing confidence.
   Note: Each inner list must contain exactly one element/entity/string (1 element but not neccesarily 1 word) (also no multiple names (synonyms) should be submitted for any entity).
3. Return your answer in JSON format exactly matching the structure below:
{{
    "ideal_answer": <string>,
    "exact_answer": <list of lists of strings>
}}
4. If any of the seeked entities has multiple names (synonyms), the corresponding inner list should only contain one of them. In the following example the exact golden answer to the list question (in the feedback file) contains three entities and the second entity has two names, i.e, "influenza" and "grippe":
"exact_answer": [["pneumonia"], ["influenza", "grippe"], ["bronchitis"]]
However, the submitted answer by the participants should be one of the following:
"exact_answer": [["pneumonia"], ["influenza"], ["bronchitis"]]
or "exact_answer": [["pneumonia"], ["grippe"], ["bronchitis"]]
"""
)

In [219]:
# Prompt for list questions. Placeholders: {question_body} and
# {articles_context}; the doubled braces are escaped literal braces of the JSON
# example shown to the model.
# NOTE(review): the prompt limits the ideal answer to 150 words while the
# ListAnswer field description says 200 — confirm which is intended.
list_prompt = PromptTemplate(
    input_variables=["question_body", "articles_context"],
    template="""
You are a biomedical expert. Answer the following list question based on the provided articles context. The articles are ordered by decreasing relevance.
Don't mention about given articles in ideal answer, just use the knowledge given in articles and answer as if you are a biomedical expert.

Question:
{question_body}

Articles Context:
{articles_context}

Type of question: list

Instructions:
1. Provide an ideal answer summarizing the most relevant information from the context provided (maximum 150 words).
2. Provide an exact answer as a list of lists corresponding to the entities sought by the question. Each element of the outmost list is a list corresponding to one of the entities (or numbers, or other similar short expressions) seeked by the question.
   Note: no multiple names (synonyms) should be submitted for any entity, therefore each inner list should only contain one element/entity/string (1 element but not neccesarily 1 word).
3. Return your answer in JSON format exactly matching the structure below:
{{
    "ideal_answer": <string>,
    "exact_answer": <list of lists of strings>
}}
4. If any of the seeked entities has multiple names (synonyms), the corresponding inner list should only contain one of them. In the following example the exact golden answer to the list question (in the feedback file) contains three entities and the second entity has two names, i.e, "influenza" and "grippe":
"exact_answer": [["pneumonia"], ["influenza", "grippe"], ["bronchitis"]]
However, the submitted answer by the participants should be one of the following:
"exact_answer": [["pneumonia"], ["influenza"], ["bronchitis"]]
or "exact_answer": [["pneumonia"], ["grippe"], ["bronchitis"]]
"""
)

In [220]:
# Prompt for summary questions: asks only for an ideal answer (no exact answer).
# Placeholders: {question_body} and {articles_context}; the doubled braces are
# escaped literal braces of the JSON example shown to the model.
# NOTE(review): the prompt limits the ideal answer to 150 words while the
# SummaryAnswer field description says 200 — confirm which is intended.
summary_prompt = PromptTemplate(
    input_variables=["question_body", "articles_context"],
    template="""
You are a biomedical expert. Answer the following summary question based on the provided articles context. The articles are ordered by decreasing relevance.
Don't mention about given articles in ideal answer, just use the knowledge given in articles and answer as if you are a biomedical expert.

Question:
{question_body}

Articles Context:
{articles_context}

Instructions:
1. Provide an ideal answer summarizing the most relevant information from the context provided (maximum 150 words).
2. Return your answer in JSON format exactly matching the structure below:
{{
    "ideal_answer": <string>
}}
"""
)

In [221]:
# Prompt for extractive snippet selection. Placeholders: {question_body} and
# {articles_context}; the doubled braces are escaped literal braces of the JSON
# example shown to the model.
# The JSON keys described here mirror the ExtractedSnippet schema that
# generate_snippets binds to the model (`document_id` as an integer, `section`,
# `text`), so the prompt and the enforced schema no longer contradict each other.
snippet_prompt = PromptTemplate(
    input_variables=["question_body", "articles_context"],
    template="""
You are a biomedical expert. Given the following question and the content of several articles (each article has its document ID, title, and abstract), extract relevant text snippets that answer the question.
The snippets should not be modified in any form and should just be a part of original text provided and should contain somewhat relevant information that can answer or answer atleast a part of the question.
This can be seen as extractive summarization.
Return at most 10 snippets in descending order of relevance in usefulness to answer the question.

Question:
{question_body}

Articles Context:
{articles_context}

For each snippet, include:
- "document_id": the article's unique identifier (PMID), as an integer
- "section": indicate whether the snippet is extracted from the "title" or "abstract"
- "text": the exact snippet text.
Return your answer in JSON format exactly matching the structure below:
{{
    "snippets": [
        {{
            "document_id": <integer>,
            "section": <string>,
            "text": <string>
        }},
        ... (up to 10 snippets)
    ]
}}
"""
)

In [222]:
class ExtractedSnippet(BaseModel):
    # One snippet returned by the model: source PMID, source section, and text.
    # generate_snippets converts document_id to str before looking it up.
    document_id: int = Field(
        default=...,
        description="The document (PMID) from which the snippet is extracted. Just the id as an integer.",
    )
    section: str = Field(
        default=...,
        description="The section from which the snippet is extracted (either 'title' or 'abstract').",
    )
    text: str = Field(
        default=...,
        description="The extracted snippet text.",
    )
class ExtractedSnippetsOutput(BaseModel):
    # Top-level structured output of the snippet-extraction call. The limit of 10
    # is stated in the description only; generate_snippets truncates the list itself.
    snippets: List[ExtractedSnippet] = Field(
        default=...,
        description="A list of extracted snippets ordered by decreasing relevance. Max length of list = 10.",
    )

In [223]:
def generate_snippets(question_body, article_context, documents, llm):
    """Ask the LLM for supporting snippets and locate each one in its source text.

    Args:
        question_body: The question text inserted into ``snippet_prompt``.
        article_context: Rendered article context inserted into ``snippet_prompt``.
        documents: Mapping of PMID (str) to a dict of section name -> text; the
            sections looked up here are the ones named by the model
            ("title" / "abstract").
        llm: Chat model exposing ``with_structured_output``.

    Returns:
        A list of at most 10 dicts with the keys ``document``, ``beginSection``,
        ``endSection``, ``offsetInBeginSection``, ``offsetInEndSection`` (inclusive
        index of the last character) and ``text``.

        Snippets are matched case-insensitively and with any run of whitespace
        treated as equivalent, but offsets index the *stored* section text and
        ``text`` is the verbatim matched slice. If a snippet cannot be found, the
        offsets span the whole section and ``text`` is the model's snippet with
        whitespace collapsed. Snippets that name an unknown document/section, or
        that are empty, are dropped.
    """
    structured_snippets_llm = llm.with_structured_output(ExtractedSnippetsOutput)
    snippet_response = structured_snippets_llm.invoke(
        snippet_prompt.format(question_body=question_body, articles_context=article_context)
    )

    final_snippets = []
    for snippet in snippet_response.snippets[:10]:
        doc_id = str(snippet.document_id)
        section = snippet.section.lower().strip()
        if doc_id not in documents or section not in documents[doc_id]:
            continue

        section_text = documents[doc_id][section]
        # A trailing period is ignored so "…risk." still matches "…risk" mid-sentence.
        words = snippet.text.strip().rstrip('.').split()
        if not words or not section_text:
            continue

        # Search the original text directly: offsets computed on a lower-cased,
        # whitespace-collapsed copy do not line up with the stored text whenever
        # that copy differs in length (double spaces, some Unicode case mappings).
        pattern = r"\s+".join(re.escape(word) for word in words)
        match = re.search(pattern, section_text, flags=re.IGNORECASE)

        if match is not None:
            offset_in_begin = match.start()
            offset_in_end = match.end() - 1
            snippet_text = match.group(0)
        else:
            # Best effort for paraphrased snippets: point at the whole section.
            offset_in_begin = 0
            offset_in_end = len(section_text) - 1
            snippet_text = " ".join(words)

        final_snippets.append({
            "document": doc_id,
            "beginSection": section,
            "endSection": section,
            "offsetInBeginSection": offset_in_begin,
            "offsetInEndSection": offset_in_end,
            "text": snippet_text
        })
    return final_snippets

In [224]:
# Chat model used for answering and snippet extraction, plus one structured
# wrapper per question type so each invoke() returns the matching answer schema.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)
structured_factoid_llm = llm.with_structured_output(FactoidAnswer)
structured_list_llm = llm.with_structured_output(ListAnswer)
structured_summary_llm = llm.with_structured_output(SummaryAnswer)
structured_yesno_llm = llm.with_structured_output(YesNoAnswer)

In [225]:
def generate_article_context(question_body, api_key):
    """Retrieve PubMed articles for a query and render them as one context string.

    Args:
        question_body: Search query passed to ``retrieve_pubmed_documents``.
        api_key: NCBI API key passed to ``retrieve_pubmed_documents``.

    Returns:
        ``(context, pmids, documents)`` where ``context`` lists every article as
        "Article N / PMID / Title / Abstract" followed by an end-of-article
        marker. ``(None, None, None)`` is returned when retrieval failed or
        yielded no usable article.
    """
    retrieved = retrieve_pubmed_documents(question_body, api_key)
    # Guard against a None result (nothing to unpack) so that a failed retrieval
    # is handled exactly like an empty one instead of raising TypeError.
    if not retrieved:
        return None, None, None

    documents, pmids = retrieved
    if not documents:
        return None, None, None

    article_blocks = [
        f"Article {article_num}:\n"
        f"PMID: {pmid}\n"
        f"Title: {doc_info['title']}\n"
        f"Abstract: {doc_info['abstract']}\n\n"
        " --- END OF ARTICLE ---\n\n"
        for article_num, (pmid, doc_info) in enumerate(documents.items(), start=1)
    ]
    return "".join(article_blocks), pmids, documents

In [226]:
def generate_answer(question_text, api_key, question):
    """Build the answer record for one question.

    Args:
        question_text: PubMed search query used to retrieve the articles.
        api_key: NCBI API key.
        question: Question dict; the keys "body", "type", "id" and "answerReady"
            are read.

    Returns:
        A dict with the keys ``body``, ``type``, ``id``, ``answer_ready``,
        ``documents``, ``snippets``, ``exact_answer`` and ``ideal_answer``.
        When no article context is available only the first four are filled.
        ``exact_answer`` stays ``""`` for summary questions. For a question type
        without a configured prompt, or when the model returns no structured
        answer, documents and snippets are still filled and both answers stay
        ``""`` instead of raising.
    """
    answer_json = {
        "body": question["body"],
        "type": question["type"],
        "id": question["id"],
        "answer_ready": question["answerReady"],
        "documents": [],
        "snippets": [],
        "exact_answer": "",
        "ideal_answer": "",
    }

    article_context, pmids, documents = generate_article_context(question_text, api_key)
    if not article_context:
        return answer_json

    # One (prompt, structured model) pair per supported question type.
    answerers = {
        "yesno": (yesno_prompt, structured_yesno_llm),
        "factoid": (factoid_prompt, structured_factoid_llm),
        "list": (list_prompt, structured_list_llm),
        "summary": (summary_prompt, structured_summary_llm),
    }

    answer = None
    selected = answerers.get(question["type"])
    if selected is None:
        print(f"Unsupported question type {question['type']!r} for question id - {question['id']}; answers left empty.")
    else:
        prompt, structured_llm = selected
        answer = structured_llm.invoke(
            prompt.format(question_body=question["body"], articles_context=article_context)
        )

    answer_json["documents"] = pmids[:10]
    answer_json["snippets"] = generate_snippets(question["body"], article_context, documents, llm)

    if answer is not None:
        answer_json["ideal_answer"] = answer.ideal_answer
        if question["type"] != "summary":
            answer_json["exact_answer"] = answer.exact_answer
    return answer_json

Find which question queries do not retrieve any PubMed articles

In [165]:
# Dry run of the retrieval step: report every question whose generated query
# yields no usable PubMed article, so the query can be fixed by hand.
for question in data["questions"]:
    q = mapping[question["id"]]
    _, ids, _ = generate_article_context(q, api_key)
    if ids:
        continue
    print(f"No relevant articles found from pubmed. Please recheck the query - {q}. For question id - {question['id']}")
print()

No PMIDs found for the query.
No relevant articles found from pubmed. Please recheck the query - percentage women successfully fertility treatment European Union. For question id - 6593d3ab06a2ea257c00001a
No PMIDs found for the query.
No relevant articles found from pubmed. Please recheck the query - four gene diagnostic signature neonatal early-onset sepsis bacterial infection. For question id - 677ecc65592fa48873000029
No PMIDs found for the query.
No relevant articles found from pubmed. Please recheck the query - mechanism action Ponsegromab. For question id - 677e8514592fa48873000020
No PMIDs found for the query.
No relevant articles found from pubmed. Please recheck the query - disadvantages screening dementia asymptomatic older populations. For question id - 6777bfbe592fa48873000016
No PMIDs found for the query.
No relevant articles found from pubmed. Please recheck the query - transcriptomic profiling differentiate gram-positive gram-negative sepsis preterm infants. For questio

In [228]:
# Answer every question using its stored PubMed query and collect the records
# under the "questions" key.
ans = {"questions": []}
for question in data["questions"]:
    pubmed_query = mapping[question["id"]]
    ans["questions"].append(generate_answer(pubmed_query, api_key, question))

In [231]:
# Write all answer records to answer.json.
with open("answer.json", "w") as json_file:
    json.dump(ans, json_file, indent=4)

In [232]:
# Dump the complete answer structure for a quick visual check (output can be long).
print (ans)

