In [70]:
import json
import os
import re
import time

import pandas as pd
from Bio import Entrez

# Define topic for summary document. Don't be too broad or you will retrieve a very large number of papers.
topic = "hemochromatosis"  # THOUGHTS - could use LLM to refine topic?
topics = [topic]

# Define date ranges (YYYY/MM/DD); both ends are matched against the record's create date.
start_date = "2024/12/01"
end_date = "2025/07/05"
date_range = f'("{start_date}"[Date - Create] : "{end_date}"[Date - Create])'

# Define max number of results to return.
# If you plan to search for more than about 15 articles, you will need to create your own Entrez account,
# generate an API key, and expose both through the environment variables read below.
max_results = 20

# Entrez credentials come from the environment so that no secret is stored in the notebook.
# Set ENTREZ_EMAIL and ENTREZ_API_KEY before starting the kernel
# (keys can be generated at https://account.ncbi.nlm.nih.gov/settings/).
# If you plan on only summarizing less than about 15 articles, both may be left unset.
# SECURITY: a literal key used to be committed here; treat it as leaked and regenerate it.
Entrez.email = os.environ.get("ENTREZ_EMAIL") or None
Entrez.api_key = os.environ.get("ENTREZ_API_KEY") or None

# Build the query dynamically based on the available topics
queries = []

if topics:
    topic_queries = [f"{term}[Title/Abstract]" for term in topics]
    queries.append("(" + " OR ".join(topic_queries) + ")")

# Join topic clauses and the date filter in one pass so that an empty topic
# list yields just the date filter instead of a query starting with " AND ".
full_query = " AND ".join(queries + [date_range])

# Search PubMed for relevant records
handle = Entrez.esearch(db="pubmed", retmax=max_results, term=full_query)
try:
    record = Entrez.read(handle)
finally:
    # Release the HTTP connection even when parsing fails.
    handle.close()
id_list = record["IdList"]

In [71]:
# Display the parsed esearch response produced by the search cell.
# NOTE(review): the saved output below reports RetMax 5 while max_results is set to 20,
# so it appears to come from an earlier run — re-run before trusting it.
record

{'Count': '96', 'RetMax': '5', 'RetStart': '0', 'IdList': ['40607335', '40603805', '40603799', '40603795', '40603794'], 'TranslationSet': [], 'QueryTranslation': '"hemochromatosis"[Title/Abstract] AND 2024/12/01:2025/07/05[Date - Create]'}

In [63]:
import pandas as pd
import json
from Bio import Entrez
import time


# Column order of the results table.
result_columns = [
    "PMID",
    "Title",
    "Abstract",
    "Journal",
    "Keywords",
    "URL",
    "PubDate",
]

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; concatenating inside the loop re-copies the whole frame on every article.
rows = []

# Fetch information for each record in the id_list
for pmid in id_list:
    handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
    try:
        records = Entrez.read(handle)
    finally:
        # Release the HTTP connection even when parsing fails.
        handle.close()

    # Process each PubMed article in the response.
    # The loop variable is deliberately not called `record`, so the esearch
    # result of that name from the search cell is not overwritten.
    for pubmed_article in records["PubmedArticle"]:
        citation = pubmed_article["MedlineCitation"]
        article = citation["Article"]

        # Abstracts may be split into several sections; join them into one string.
        if "Abstract" in article and "AbstractText" in article["Abstract"]:
            abstract = " ".join(article["Abstract"]["AbstractText"])
        else:
            abstract = ""

        # The MeSH descriptor names serve as the keyword list when present.
        if "MeshHeadingList" in citation:
            keywords = ", ".join(
                heading["DescriptorName"] for heading in citation["MeshHeadingList"]
            )
        else:
            keywords = ""

        rows.append(
            {
                "PMID": pmid,
                "Title": article["ArticleTitle"],
                "Abstract": abstract,
                "Journal": article["Journal"]["Title"],
                "Keywords": keywords,
                "URL": f"https://www.ncbi.nlm.nih.gov/pubmed/{pmid}",
                # Kept as the raw PubDate mapping; formatted in a later cell.
                "PubDate": article["Journal"]["JournalIssue"]["PubDate"],
            }
        )
    # Pause between requests — presumably to stay under NCBI's request-rate limit.
    time.sleep(0.33)

# DataFrame holding the extracted data (columns exist even when nothing was fetched)
df = pd.DataFrame(rows, columns=result_columns)

In [64]:
# Convert PubDate to standardized MM/DD/YYYY format
def standardize_date(pub_date):
    """Format a PubMed ``PubDate`` entry as ``MM/DD/YYYY``.

    Args:
        pub_date: Mapping with optional "Year", "Month", "Day" and
            "MedlineDate" entries, or a string that was already formatted.

    Returns:
        The date as ``MM/DD/YYYY``. A missing month or day falls back to
        "01". When there is no "Year", the year (and the first month name, if
        any) is taken from "MedlineDate"; if no four-digit year can be found
        there, the raw "MedlineDate" text is returned (empty string when that
        is absent too). A string argument is returned unchanged.
    """
    # Already formatted (e.g. the cell was run twice): pass through instead of
    # failing on the missing .get().
    if isinstance(pub_date, str):
        return pub_date

    # Month abbreviation -> two-digit month number
    month_map = {
        "Jan": "01",
        "Feb": "02",
        "Mar": "03",
        "Apr": "04",
        "May": "05",
        "Jun": "06",
        "Jul": "07",
        "Aug": "08",
        "Sep": "09",
        "Oct": "10",
        "Nov": "11",
        "Dec": "12",
    }

    year = str(pub_date.get("Year", ""))
    month = str(pub_date.get("Month", "01"))
    day = str(pub_date.get("Day", "01"))

    if not year:
        # Some records carry only a free-text MedlineDate such as "2024 Nov-Dec".
        medline_date = str(pub_date.get("MedlineDate", ""))
        year_match = re.search(r"\b\d{4}\b", medline_date)
        if year_match is None:
            return medline_date
        year = year_match.group(0)
        month_match = re.search(r"\b(?:" + "|".join(month_map) + ")", medline_date)
        if month_match is not None:
            month = month_match.group(0)

    # Accept "Jul", "July" or "JUL"; numeric months fall through untouched.
    month = month_map.get(month[:3].title(), month)

    # Ensure month and day are 2 digits
    return f"{month.zfill(2)}/{day.zfill(2)}/{year}"


# Apply the date standardization to the PubDate column
# NOTE(review): this overwrites the column in place, so on a second run of this
# cell standardize_date receives strings rather than PubDate mappings — re-run
# the fetch cell first if it cannot handle that.
df["PubDate"] = df["PubDate"].apply(standardize_date)

# Output path; defined before use because the upload cell reads this name too.
file_name = "PubMed_results.md"

# Create markdown content: one section per article, joined once at the end.
sections = ["# PubMed Search Results\n\n"]

for _, row in df.iterrows():
    sections.append(
        f"## {row['Title']}\n\n"
        f"**Journal:** {row['Journal']}\n\n"
        f"**Publication Date:** {row['PubDate']}\n\n"
        f"**Keywords:** {row['Keywords']}\n\n"
        f"**Abstract:**\n{row['Abstract']}\n\n"
        # Well-formed markdown link; the previous "[url)" form had mismatched brackets.
        f"**URL:** [{row['URL']}]({row['URL']})\n\n"
        f"**PMID:** [{row['PMID']}]\n\n"
        "---\n\n"
    )

markdown_content = "".join(sections)

# Save to markdown file
with open(file_name, "w", encoding="utf-8") as f:
    f.write(markdown_content)

In [65]:
# Importing basics for LLM (OpenAI)

import os
import requests
import time
from io import BytesIO
from openai import OpenAI

from dotenv import load_dotenv

# load_dotenv() is expected to populate os.environ from a local .env file, so
# the API key does not have to be written into the notebook.
load_dotenv()

# One client instance, with the key passed explicitly from OPENAI_API_KEY.
# The client used to be constructed a second time right after this line,
# which discarded this instance; that redundant call has been removed.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [66]:
def create_file(client, file_path):
    """Upload a local file or a downloadable URL and return the new file id.

    Args:
        client: Object exposing ``files.create(file=..., purpose=...)``.
        file_path: Local path, or an ``http://`` / ``https://`` URL whose
            content is downloaded first and then uploaded.

    Returns:
        The ``id`` attribute of the object returned by ``client.files.create``.

    Raises:
        requests.HTTPError: If the download answers with an error status.
        OSError: If the local file cannot be opened.
    """
    if file_path.startswith(("http://", "https://")):
        from urllib.parse import urlparse

        # Download the file content from the URL. The timeout keeps the cell
        # from hanging forever, and raise_for_status stops an HTML error page
        # from being uploaded as if it were the document.
        response = requests.get(file_path, timeout=30)
        response.raise_for_status()
        file_content = BytesIO(response.content)
        # Name the upload after the last path segment, without query string or fragment.
        upload_name = urlparse(file_path).path.rsplit("/", 1)[-1] or "download"
        result = client.files.create(
            file=(upload_name, file_content), purpose="assistants"
        )
    else:
        # Handle local file path
        with open(file_path, "rb") as file_content:
            result = client.files.create(file=file_content, purpose="assistants")
    print(result.id)
    return result.id


# Upload the markdown file named by file_name (replace with your own file path or URL)
file_id = create_file(client, file_name)

# Create a vector store
vector_store = client.vector_stores.create(name="knowledge_base")
print(vector_store.id)
print("\n")

# Add the file to the vector store and wait until it has been processed.
# A plain create() returns immediately, so a file_search issued right
# afterwards could run before the file is indexed.
vector_store_file = client.vector_stores.files.create_and_poll(
    vector_store_id=vector_store.id, file_id=file_id
)
if vector_store_file.status != "completed":
    raise RuntimeError(
        f"Vector store indexing ended with status {vector_store_file.status!r}: "
        f"{vector_store_file.last_error}"
    )

result = client.vector_stores.files.list(vector_store_id=vector_store.id)
print(result)

file-S57pphsZedBRN5D9tK4v6r
vs_68745130c6488191a9fe4c24c0b35180


SyncCursorPage[VectorStoreFile](data=[VectorStoreFile(id='file-S57pphsZedBRN5D9tK4v6r', created_at=1752453426, last_error=None, object='vector_store.file', status='completed', usage_bytes=69570, vector_store_id='vs_68745130c6488191a9fe4c24c0b35180', attributes={}, chunking_strategy=StaticFileChunkingStrategyObject(static=StaticFileChunkingStrategy(chunk_overlap_tokens=400, max_chunk_size_tokens=800), type='static'))], has_more=False, object='list', first_id='file-S57pphsZedBRN5D9tK4v6r', last_id='file-S57pphsZedBRN5D9tK4v6r')


In [67]:
from datetime import datetime

# Ask the model for the summary document; the papers are supplied through the
# file_search tool pointed at the vector store.
# NOTE(review): file_search retrieves chunks rather than the whole file, so
# "summarize every paper" may not be guaranteed for long result lists — confirm.
response = client.responses.create(
    model="gpt-4.1",
    instructions="You are a skilled and detail oriented analyst with a background in data interpretation and technical writing. You have a talent for identifying patterns and extracting meaningful insights from research data, then communicating those insights effectively and succinctly through well-crafted reports. You are creating monthly documents for a medical team the summarizes papers what is new in a specific medical topic.",
    input="""Create a summary document titled 'What's New in XXX' (where XXX is the topic of the papers). The document summarizes every single research papers included the file submitted to you. Make sure you summarize every research paper, do not skip any papers. The summary document should be formatted as follows:
    # What's New in XXX
    ## Randomized Controlled Trials
    ## Observational Studies (cohort studies, case-control studies, cross-sectional studies)
    ## Basic Science Research
    ## Meta-analyses
    ## Systematic Reviews
    ## Narrative Reviews
    ## Case Reports and Case Series
    ## Other
    One line below each paper's summary, include a brief citation in the format of: 
    <em>Citation: Title, Journal, PubDate, PMID: [PMID] (URL)</em>
    Return your response in markdown format.""",
    tools=[{"type": "file_search", "vector_store_ids": [vector_store.id]}],
)
# print(response.output_text)

# Save the summary as a markdown file named <MM-DD-YYYY>_<topic>_summary.md
today = datetime.today().strftime("%m-%d-%Y")
filename = f"{today}_{topic}_summary.md"

# Explicit UTF-8 so non-ASCII characters in the summary do not depend on the
# platform's default encoding (matches how PubMed_results.md is written).
with open(filename, "w", encoding="utf-8") as f:
    f.write(response.output_text)

print(f"Summary saved to {filename}")

Summary saved to 2025-07-13_gabapentin_summary.md
