In [72]:
import json
import os
import time

import pandas as pd
from Bio import Entrez
from dotenv import load_dotenv

# 1. Define the topic for your summary document. Broad topics retrieve a very
# large number of papers, so either be specific or use a narrow date range.
topic = "carpal tunnel syndrome"
topics = [topic]

# Define date ranges for paper publication (YYYY/MM/DD, matched against the
# PubMed record creation date).
start_date = "2025/06/20"
end_date = "2025/07/20"
date_range = f'("{start_date}"[Date - Create] : "{end_date}"[Date - Create])'

# Define max number of papers to return
max_results = 30

# Entrez account email address and API key.
# Entrez accounts are free and can be created at https://account.ncbi.nlm.nih.gov
# Entrez API keys can be generated at https://account.ncbi.nlm.nih.gov/settings/
# If you plan on summarizing fewer than about 15 articles, both may be left unset.
#
# SECURITY: credentials are read from the environment (or a local .env file)
# instead of being hard-coded in the notebook. Any key that was previously
# committed in this cell should be revoked and regenerated.
load_dotenv()
Entrez.email = os.getenv("NCBI_EMAIL")
Entrez.api_key = os.getenv("NCBI_API_KEY")

# Build the query dynamically based on the available topics. This element of
# the code was adapted from https://github.com/TLDWTutorials/PubmedAPI/tree/main
queries = []

if topics:
    topic_queries = [f"{term}[Title/Abstract]" for term in topics]
    queries.append("(" + " OR ".join(topic_queries) + ")")

# The date filter is joined like any other clause so that an empty `topics`
# list no longer yields a malformed query beginning with " AND ".
queries.append(date_range)
full_query = " AND ".join(queries)

# Search PubMed for relevant records
handle = Entrez.esearch(db="pubmed", retmax=max_results, term=full_query)
try:
    record = Entrez.read(handle)
finally:
    # Release the HTTP response even if parsing fails.
    handle.close()
id_list = record["IdList"]

# esearch returns at most `retmax` IDs; make the truncation visible.
total_hits = int(record["Count"])
if total_hits > len(id_list):
    print(
        f"Note: query matched {total_hits} records but only {len(id_list)} "
        f"were retrieved (max_results={max_results})."
    )

In [73]:
# Inspect the raw esearch response. Comparing 'Count' with 'RetMax' shows
# whether max_results truncated the list of returned IDs.
record

{'Count': '36', 'RetMax': '30', 'RetStart': '0', 'IdList': ['40684234', '40670253', '40667672', '40661096', '40656870', '40655054', '40653641', '40647592', '40645791', '40635968', '40634896', '40633087', '40631542', '40630820', '40616458', '40612066', '40612064', '40612063', '40608955', '40605207', '40599036', '40592791', '40589440', '40583556', '40583360', '40579745', '40579300', '40576202', '40563051', '40557028'], 'TranslationSet': [], 'QueryTranslation': '"carpal tunnel syndrome"[Title/Abstract] AND 2025/06/20:2025/07/20[Date - Create]'}

In [74]:
import pandas as pd
import json
from Bio import Entrez
import time


# Column order of the results table.
result_columns = [
    "PMID",
    "Title",
    "Abstract",
    "Journal",
    "Keywords",
    "URL",
    "PubDate",
]

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; concatenating onto a DataFrame inside the loop is quadratic and raises
# a FutureWarning for empty-frame concatenation in recent pandas releases.
rows = []

# Fetch information for each record in the id_list
for pmid in id_list:
    handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
    try:
        records = Entrez.read(handle)
    finally:
        # Release the HTTP response even if parsing fails.
        handle.close()

    # Process each PubMed article in the response. The loop variable is not
    # called `record` so the esearch response from the search cell survives.
    for pubmed_article in records["PubmedArticle"]:
        citation = pubmed_article["MedlineCitation"]
        article = citation["Article"]

        # Abstract sections are joined with spaces; empty string when absent.
        if "Abstract" in article and "AbstractText" in article["Abstract"]:
            abstract = " ".join(article["Abstract"]["AbstractText"])
        else:
            abstract = ""

        # Keywords are the MeSH descriptor names; empty string when the
        # record has no MeSH heading list.
        if "MeshHeadingList" in citation:
            keywords = ", ".join(
                heading["DescriptorName"] for heading in citation["MeshHeadingList"]
            )
        else:
            keywords = ""

        rows.append(
            {
                "PMID": pmid,
                "Title": article["ArticleTitle"],
                "Abstract": abstract,
                "Journal": article["Journal"]["Title"],
                "Keywords": keywords,
                "URL": f"https://www.ncbi.nlm.nih.gov/pubmed/{pmid}",
                # Kept as the raw parsed PubDate mapping; formatted later.
                "PubDate": article["Journal"]["JournalIssue"]["PubDate"],
            }
        )

    # Pause between requests to stay under the E-utilities rate limit.
    time.sleep(0.33)

# Passing `columns` keeps the expected schema even when no rows were fetched.
df = pd.DataFrame(rows, columns=result_columns)

In [75]:
# Convert PubDate to standardized MM/DD/YYYY format
def standardize_date(pub_date):
    """Format a parsed PubMed ``PubDate`` mapping as ``MM/DD/YYYY``.

    Args:
        pub_date: Mapping with optional ``Year``, ``Month``, ``Day`` and
            ``MedlineDate`` keys, or a string that has already been formatted.

    Returns:
        ``MM/DD/YYYY`` when a year is present; a missing month or day
        defaults to ``01``. When there is no ``Year``, the free-text
        ``MedlineDate`` value is returned unchanged, or an empty string if
        that is missing too. A string input is returned as-is.
    """
    # Already-formatted values pass through, so re-applying this function to
    # a converted column does not raise.
    if isinstance(pub_date, str):
        return pub_date

    year = str(pub_date.get("Year", "")).strip()
    if not year:
        # No structured year: fall back to the free-text date rather than
        # emitting a meaningless "01/01/".
        return str(pub_date.get("MedlineDate", "")).strip()

    # str() guards against non-string values before zfill is applied.
    month = str(pub_date.get("Month", "01")).strip()
    day = str(pub_date.get("Day", "01")).strip()

    # Convert month name to number if needed
    month_map = {
        "Jan": "01",
        "Feb": "02",
        "Mar": "03",
        "Apr": "04",
        "May": "05",
        "Jun": "06",
        "Jul": "07",
        "Aug": "08",
        "Sep": "09",
        "Oct": "10",
        "Nov": "11",
        "Dec": "12",
    }
    # Match on the first three letters so "July" / "JUL" resolve as well;
    # numeric months are not in the map and fall through unchanged.
    month = month_map.get(month[:3].title(), month)

    # Ensure month and day are 2 digits
    return f"{month.zfill(2)}/{day.zfill(2)}/{year}"


# Apply the date standardization to the PubDate column. Values that are
# already strings are left alone so re-running this cell does not fail on
# previously converted dates.
df["PubDate"] = df["PubDate"].apply(
    lambda pub_date: pub_date
    if isinstance(pub_date, str)
    else standardize_date(pub_date)
)

# Create markdown content: one section per paper, joined once at the end
# instead of repeatedly concatenating onto a growing string.
sections = ["# PubMed Search Results\n\n"]

for _, row in df.iterrows():
    sections.append(
        f"## {row['Title']}\n\n"
        f"**Journal:** {row['Journal']}\n\n"
        f"**Publication Date:** {row['PubDate']}\n\n"
        f"**Keywords:** {row['Keywords']}\n\n"
        f"**Abstract:**\n{row['Abstract']}\n\n"
        # Fixed: the link was emitted as "[url)", which is not valid markdown.
        f"**URL:** [{row['URL']}]({row['URL']})\n\n"
        f"**PMID:** [{row['PMID']}]\n\n"
        "---\n\n"
    )

markdown_content = "".join(sections)

# Save to markdown file
file_name = "PubMed_results.md"
with open(file_name, "w", encoding="utf-8") as f:
    f.write(markdown_content)

In [76]:
# Importing basics for LLM (OpenAI)

import os
import requests
import time
from io import BytesIO
from openai import OpenAI

from dotenv import load_dotenv

# Load environment variables (including OPENAI_API_KEY) from a local .env
# file, if one is present.
load_dotenv()

# Create the client once. The original cell built a second `OpenAI()`
# immediately afterwards, which discarded this instance.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [84]:
# Read the exported search results. The file was written as UTF-8, so it is
# read back with the same encoding, and the handle is closed promptly.
with open("PubMed_results.md", encoding="utf-8") as results_file:
    pubmed_results = results_file.read()

# Ask the model to assign every paper to exactly one research type and to
# answer as a JSON object keyed by category. The separate
# "systematic_reviews" bucket was removed from the schema: it contradicted
# the note that systematic reviews belong with meta-analyses, and the summary
# document has no section for it.
response = client.responses.create(
    model="gpt-4.1",
    text={"format": {"type": "json_object"}},
    input=[
        {
            "role": "user",
            "content": f"""Please categorize all of the papers into the following research types and output as JSON. Each paper should only be categorized into one research type. Use only the PMID to identify each paper.
            
            {{
                "randomized_controlled_trials": [PMID1,PMID2],
                "observational_studies": [], 
                "basic_science_research": [],
                "meta_analyses_and_systemic_reviews": [],
                "narrative_reviews": [],
                "case_reports_and_series": [],
                "other": []
            }}

            Note: observational studies include retrospective cohort studies, prospective cohort studies, case-control studies, and cross-sectional studies.

            Note: "meta_analyses_and_systemic_reviews" includes both meta-analyses and systematic reviews
            
            """,
        },
        {
            "role": "user",
            "content": "Here is the content to categorize: " + pubmed_results,
        },
    ],
)
# print(response.output_text)

# Save the categorization as a date-stamped markdown file
from datetime import datetime

today = datetime.today().strftime("%m-%d-%Y")
filename = f"{today}_{topic}_categories.md"

# Explicit UTF-8 so non-ASCII model output cannot fail on platforms whose
# default encoding is narrower.
with open(filename, "w", encoding="utf-8") as f:
    f.write(response.output_text)

print(f"Categorization saved to {filename}")

Categorization saved to 07-20-2025_carpal tunnel syndrome_categories.md


In [85]:
# Load both inputs up front with explicit UTF-8 and context managers so the
# file handles are closed and decoding matches how the files were written.
# `today` and `topic` come from the earlier cells, so this reads the
# categories file produced by the categorization step.
with open(f"{today}_{topic}_categories.md", encoding="utf-8") as categories_file:
    categories_text = categories_file.read()

with open("PubMed_results.md", encoding="utf-8") as results_file:
    pubmed_results = results_file.read()

# Ask the model for a per-paper summary grouped by research type.
# Fixed: the title template previously ended with a stray double quote.
response = client.responses.create(
    model="gpt-4.1",
    input=[
        {
            "role": "user",
            "content": f"""Please create a summary document of the papers. Give a short (1-2 sentence) summary of each paper as well based on the PubMed_results.md file. If there is no abstract available, your summary sentence should just be: "No abstract available for summarization."

            One line below each paper's summary, include a brief citation in the format of:<em>Citation: Title, Journal, PubDate, PMID: [PMID] (URL)</em>.
            
            Use the categories in the categories.md file to organize the papers by paper type. 
            
            Your summary document should be formatted as:
            # What's new in {topic} research
            ## Randomized Controlled Trials
            ## Observational Studies (cohort studies, case-control studies, cross-sectional studies)
            ## Basic Science Research
            ## Meta-analyses and Systematic Reviews
            ## Narrative Reviews
            ## Case Reports & Case Series
            ## Other
            """,
        },
        {
            "role": "user",
            "content": "Here are the categories, referenced by PMID: "
            + categories_text
            + "\n\nHere is the content to summarize: "
            + pubmed_results,
        },
    ],
)
# print(response.output_text)

# Save the summary as a date-stamped markdown file
from datetime import datetime

today = datetime.today().strftime("%m-%d-%Y")
filename = f"{today}_{topic}_summary.md"

# Explicit UTF-8 so non-ASCII model output cannot fail on platforms whose
# default encoding is narrower.
with open(filename, "w", encoding="utf-8") as f:
    f.write(response.output_text)

print(f"Summary saved to {filename}")

Summary saved to 07-20-2025_carpal tunnel syndrome_summary.md
