In [1]:
%pip install biopython pandas tqdm


Collecting biopython
  Downloading biopython-1.86-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
     ---------------------------------------- 0.0/57.7 kB ? eta -:--:--
     ---------------------------------------- 0.0/57.7 kB ? eta -:--:--
     -------------------- ----------------- 30.7/57.7 kB 435.7 kB/s eta 0:00:01
     -------------------------------------- 57.7/57.7 kB 504.6 kB/s eta 0:00:00
Downloading biopython-1.86-cp311-cp311-win_amd64.whl (2.7 MB)
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   - -------------------------------------- 0.1/2.7 MB 3.0 MB/s eta 0:00:01
   --- ------------------------------------ 0.2/2.7 MB 2.4 MB/s eta 0:00:02
   ----- ---------------------------------- 0.4/2.7 MB 2.5 MB/s eta 0:00:01
   ------- -------------------------------- 0.5/2.7 MB 2.7 MB/s eta 0:00:01
   --------- ------------------------------ 0.6/2.7 MB 2.9 MB/s eta 0:00:01
   ----------- -


[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\Juan\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
%pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.2.1-py3-none-any.whl.metadata (25 kB)
Downloading python_dotenv-1.2.1-py3-none-any.whl (21 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.2.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\Juan\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import os
import time
import pandas as pd
from Bio import Entrez
from tqdm import tqdm
from dotenv import load_dotenv

In [3]:
# -------------------------
# LOAD ENV VARIABLES
# -------------------------
# Load variables from a .env file (when one is present) before reading them.
load_dotenv()

# The contact address handed to Entrez; the notebook refuses to run without it.
ENTREZ_EMAIL = os.environ.get("ENTREZ_EMAIL")
if ENTREZ_EMAIL is None or ENTREZ_EMAIL == "":
    raise ValueError("ENTREZ_EMAIL not found. Please set it in a .env file.")

Entrez.email = ENTREZ_EMAIL

# -------------------------
# USER INPUT
# -------------------------
def build_search_query():
    """Interactively build a PubMed query restricted to the Title/Abstract field.

    Prompts (via ``input``) for a comma-separated list of keywords and for the
    boolean operator that combines them. Each keyword is wrapped as
    ``("keyword"[Title/Abstract])`` and the pieces are joined with the operator.

    Returns:
        str: The assembled query string.

    Raises:
        ValueError: If no usable keyword was entered (empty input, or nothing
            but commas, whitespace and quotes), or if the operator is neither
            AND nor OR (an empty answer selects AND).
    """
    print("PubMed Abstract Extraction")
    print("--------------------------")

    raw_terms = input(
        "Enter keywords separated by commas (e.g. vegan diet, health outcome): "
    ).strip()

    # Every term is wrapped in double quotes below, so quotes typed by the user
    # are dropped first; a nested quote would end the phrase early and corrupt
    # the query.
    cleaned = (piece.replace('"', "").strip() for piece in raw_terms.split(","))
    terms = [piece for piece in cleaned if piece]

    # Validate the parsed list rather than the raw string, so input such as
    # ", ," is rejected instead of silently producing an empty query.
    if not terms:
        raise ValueError("At least one keyword is required.")

    operator = input("Choose operator (AND / OR) [AND]: ").strip().upper()
    if operator not in ["AND", "OR", ""]:
        raise ValueError("Operator must be AND or OR.")

    # An empty answer means "use the default".
    operator = operator or "AND"

    query_parts = [
        f'("{term}"[Title/Abstract])'
        for term in terms
    ]

    return f" {operator} ".join(query_parts)


# -------------------------
# CONFIGURATION
# -------------------------
# Upper bound on the number of PMIDs requested from the search; the main
# pipeline passes it to search_pubmed, which forwards it as `retmax`.
MAX_RESULTS: int = 200
# Pause, in seconds, that fetch_abstracts applies with time.sleep between
# per-article requests.
REQUEST_DELAY: float = 0.4

# -------------------------
# PUBMED FUNCTIONS
# -------------------------
def search_pubmed(query, max_results):
    """Run an Entrez ``esearch`` against the ``pubmed`` database.

    Args:
        query: Search term passed to ``Entrez.esearch`` as ``term``.
        max_results: Maximum number of IDs to request (passed as ``retmax``).

    Returns:
        The ``"IdList"`` entry of the parsed search response.

    Raises:
        Whatever ``Entrez.esearch`` / ``Entrez.read`` raise; the response handle
        is closed before the exception propagates.
    """
    handle = Entrez.esearch(
        db="pubmed",
        term=query,
        retmax=max_results
    )
    try:
        results = Entrez.read(handle)
    finally:
        # Close even when parsing fails so the connection is not leaked.
        handle.close()
    return results["IdList"]

def fetch_abstracts(id_list, search_query):
    """Fetch one PubMed record per PMID and collect the results in a DataFrame.

    Each PMID is requested individually with ``Entrez.efetch`` (XML mode). A
    failure for one PMID is printed and that PMID is skipped; the remaining
    PMIDs are still processed.

    Args:
        id_list: Iterable of PMIDs to fetch.
        search_query: Query string stored alongside every row for provenance.

    Returns:
        pandas.DataFrame with the columns ``pmid``, ``title``, ``abstract``,
        ``journal``, ``year`` and ``search_query``. The columns are present
        even when no record could be fetched.
    """
    columns = ["pmid", "title", "abstract", "journal", "year", "search_query"]
    records = []

    for pmid in tqdm(id_list, desc="Fetching abstracts"):
        try:
            handle = Entrez.efetch(
                db="pubmed",
                id=pmid,
                rettype="abstract",
                retmode="xml"
            )
            try:
                article = Entrez.read(handle)
            finally:
                # Release the connection even when the response cannot be parsed.
                handle.close()

            article_data = article["PubmedArticle"][0]["MedlineCitation"]["Article"]
            journal = article_data["Journal"]
            pub_date = journal["JournalIssue"]["PubDate"]

            year = pub_date.get("Year")
            if not year:
                # Some records carry a free-text MedlineDate (e.g.
                # "1998 Dec-1999 Jan") instead of Year; use its leading four
                # digits when present. Assumes the string starts with the
                # year -- TODO confirm against the PubMed XML documentation.
                leading = str(pub_date.get("MedlineDate", "")).strip()[:4]
                year = leading if len(leading) == 4 and leading.isdigit() else None

            records.append({
                "pmid": pmid,
                "title": article_data.get("ArticleTitle", ""),
                # Abstract is optional; a missing one becomes an empty string.
                "abstract": " ".join(
                    article_data.get("Abstract", {}).get("AbstractText", [])
                ),
                "journal": journal["Title"],
                "year": year,
                "search_query": search_query
            })

        except Exception as e:
            # Best effort: report the failure and continue with the next PMID.
            print(f"Error fetching PMID {pmid}: {e}")

        finally:
            # Throttle after every request, including failed ones, so a run of
            # errors does not turn into a burst of back-to-back requests.
            time.sleep(REQUEST_DELAY)

    # Explicit columns keep the frame (and the CSV header) stable when empty.
    return pd.DataFrame(records, columns=columns)


# -------------------------
# MAIN PIPELINE
# -------------------------
if __name__ == "__main__":
    # Ask the user for keywords, search PubMed, fetch each hit and save a CSV.
    query = build_search_query()
    print(f"\nSearching PubMed for:\n{query}\n")

    pmids = search_pubmed(query, MAX_RESULTS)
    df = fetch_abstracts(pmids, query)

    output_path = "data/pubmed_abstracts.csv"
    # Create the target folder first: to_csv fails if "data/" does not exist,
    # which would discard everything fetched above.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)

    print(f"\nSaved {len(df)} abstracts to {output_path}")


PubMed Abstract Extraction
--------------------------

Searching PubMed for:
("breast cancer"[Title/Abstract]) AND ("multiple imputation"[Title/Abstract])



Fetching abstracts: 100%|██████████| 137/137 [02:12<00:00,  1.03it/s]


Saved 137 abstracts to data/pubmed_abstracts.csv



