# NCBI PubMed Literature Search Strategy & CSV Generation
This notebook lets you test different keyword strategies against the NCBI E-utilities API (PubMed database).

Obtain your API key through your account (optional): https://account.ncbi.nlm.nih.gov/settings/

Edit `groups` and `logic` in the keyword cell of Section 2, then run the subsequent cells to see the results.


In [None]:
import subprocess
import sys

# Make sure the third-party 'requests' package is importable. If the import
# fails, install it with pip into the interpreter running this notebook
# (sys.executable) and import it again.
try:
    import requests
except ModuleNotFoundError:
    print("'requests' not found. Installing now...")
    pip_command = [sys.executable, "-m", "pip", "install", "requests"]
    subprocess.check_call(pip_command)
    import requests
    print("'requests' has been installed successfully.")
else:
    print("'requests' is already installed.")


# 1. Setup & Define Your Folders and API Key

In the cell below, uncomment (Ctrl+/ on PC or Command+/ on Mac) the lines that match your operating system to define the CSV and summary folders and to set your API key. An API key for the NCBI API is optional but recommended.

In [None]:
# === SECTION: USER SETUP (PC/Windows) ===
# Uncomment below and edit these variables to match your Windows setup
# csv_folder = r"C:\Users\YOUR_USERNAME\Documents\csvs\pubmed_csv"
# summary_folder = r"C:\Users\YOUR_USERNAME\Documents\csvs\summaries"
# api_key = "YOUR_NCBI_API_KEY"  # Replace with your NCBI API key

# === SECTION: USER SETUP (Mac) ===
# Edit these lines to match your Mac setup. They run after the Windows block,
# so comment them out if you uncommented the Windows lines above.
csv_folder = r"/Users/YOUR_USERNAME/Documents/csvs/pubmed_csv"
summary_folder = r"/Users/YOUR_USERNAME/Documents/csvs/summaries"
api_key = "YOUR_NCBI_API_KEY"  # Replace with your NCBI API key (https://account.ncbi.nlm.nih.gov/settings/)

# === SECTION: FOLDER CREATION AND CHECK ===
import os

# Create both output folders up front; existing folders are left untouched.
os.makedirs(csv_folder, exist_ok=True)
os.makedirs(summary_folder, exist_ok=True)

# Pair each setting's label with whether it is usable. The API key counts as
# unset while it is empty or still the placeholder text.
setup_checks = (
    ("API key", bool(api_key) and api_key != "YOUR_NCBI_API_KEY"),
    ("CSV folder", os.path.isdir(csv_folder)),
    ("Summary folder", os.path.isdir(summary_folder)),
)
missing = [label for label, is_ok in setup_checks if not is_ok]

if not missing:
    print("✅ Output folders and API key are set up and ready.")
else:
    print(f"⚠️ WARNING: Please check the following: {', '.join(missing)}")

# 2. Test and adjust your keyword strategy

The four cells below help you test different keyword groups and their combinations.
- 2.1. Run to define your keyword groups and your exclusion keyword group using AND/OR rules, then define the combination logic
- 2.2. Run to see the number of results returned for each keyword group and for the combined query
- 2.3. Run to see the first 10 titles for each keyword group
- 2.4. Run to see the first 10 titles for the combined query

In [None]:
# === EDIT THIS CELL TO CHANGE YEAR RANGE ===
# Year filter, written in PubMed's range syntax.
year_range = "2016:3000[dp]"

# === EDIT THIS CELL TO CHANGE KEYWORDS/LOGIC ===
# Named search fragments. The "excluded" fragment carries its own leading NOT
# because the logic template below inserts it without an operator.
# NOTE(review): the "excluded" placeholder uses a lowercase "or" while the
# other groups use uppercase "OR" - confirm PubMed treats both as operators.
groups = dict(
    group1="keyword OR keyword",
    group2="keyword OR keyword AND keyword",
    excluded="NOT (keyword or keyword)",
)

# Template combining the groups; each {name} refers to a key of `groups`.
logic = "({group1}) AND ({group2}) {excluded}"
combined_query = logic.format(**groups)

print(f"Keyword groups and logic defined.\nYear filter: {year_range}")
print("Combined PubMed query:", combined_query)

In [None]:
# === SECTION: Run API Query and Return Total Results for Each Group and Combined Query ===

import requests
import xml.etree.ElementTree as ET

def run_pubmed_query(query, max_results=0, year_range="2016:3000[dp]"):
    """Run a PubMed search and return the hit count, PMIDs and article titles.

    Args:
        query: PubMed search expression, without the year filter.
        max_results: Number of records to request. With the default of 0 no
            titles are fetched; ESearch is still asked for one ID (retmax=1).
        year_range: Date filter appended to ``query`` with ``AND``.

    Returns:
        A tuple ``(count, pmids, titles)``: the total number of hits reported
        by ESearch, the list of PMIDs returned for this request, and the
        titles of those records (empty unless ``max_results`` > 0).

    Raises:
        requests.HTTPError: If an E-utilities request returns an HTTP error
            status.
    """
    request_timeout = 30  # seconds; prevents the notebook from hanging forever

    query_with_year = f"{query} AND {year_range}"
    params = {
        'db': 'pubmed',
        'term': query_with_year,
        'retmax': max_results or 1,
        'retmode': 'xml'
    }
    esearch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    esearch_resp = requests.get(esearch_url, params=params, timeout=request_timeout)
    # Fail with a clear HTTP error instead of an XML parse error on error pages.
    esearch_resp.raise_for_status()
    root = ET.fromstring(esearch_resp.content)
    count = int(root.findtext('.//Count', '0') or '0')
    pmids = [id_elem.text for id_elem in root.findall('.//Id')]

    titles = []
    if max_results > 0 and pmids:
        efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
        efetch_params = {'db': 'pubmed', 'id': ','.join(pmids), 'retmode': 'xml'}
        efetch_resp = requests.get(efetch_url, params=efetch_params, timeout=request_timeout)
        efetch_resp.raise_for_status()
        articles = ET.fromstring(efetch_resp.content).findall('.//PubmedArticle')
        for article in articles:
            title_elem = article.find('.//ArticleTitle')
            # Titles can contain inline markup (<i>, <sub>, ...). Element.text
            # stops at the first child tag, so gather every text fragment.
            raw_title = '' if title_elem is None else ''.join(title_elem.itertext())
            titles.append(raw_title.replace('\n', ' ').strip())
    return count, pmids, titles

# Print year filter info
print(f"Year limit applied to all queries: {year_range}\n")

# Test individual keyword groups.
# year_range is passed explicitly so the counts honour the filter announced
# above instead of the function's built-in default.
print("="*50)
print("INDIVIDUAL GROUP RESULTS:")
print("="*50)
for name, query in groups.items():
    if name != 'excluded':  # Skip exclusion group
        count, _, _ = run_pubmed_query(query, year_range=year_range)
        print(f"{name.upper():<25}: {count} results")

# Print combined query
print("\n" + "="*50)
print("COMBINED LOGIC RESULTS:")
print("="*50)
print(f"Logic: {logic}\n")
print(f"Combined query: {combined_query}")
combined_count, pmids, _ = run_pubmed_query(combined_query, year_range=year_range)
print(f"Combined results: {combined_count}")

The next block will show the first 10 titles for each keyword group (except the excluded keyword group).

Based on this, you can go back and adjust your keyword groups.

In [None]:
# === SECTION: Print First 10 Titles for Each Group (Excluding 'excluded') ===

max_titles = 10  # Number of titles to print for each group

for name, query in groups.items():
    # The exclusion fragment is not a query of its own, so it is skipped.
    if name == 'excluded':
        continue

    print("\n" + "="*50)
    print(f"FIRST {max_titles} TITLES FOR GROUP: {name.upper()}")
    print("="*50)

    count, pmids, titles = run_pubmed_query(query, max_results=max_titles, year_range=year_range)
    if not titles:
        print("No titles found for this group.")
        continue

    # Number the titles starting from 1.
    for i, title in enumerate(titles, start=1):
        print(f"{i}. {title}")


The next block will show the first 10 titles for the combined query.

Based on the results you can go back and adjust your groups and logic.

In [None]:
# === SECTION: Show First 10 Titles for Combined Keyword Group ===

max_titles_to_print = 10 # <-- Adjust this if you want more or fewer

# One call returns both the total hit count and the titles, so the search
# does not need to be sent to NCBI a second time just to obtain the count.
combined_count, _, titles = run_pubmed_query(
    combined_query,
    max_results=max_titles_to_print,
    year_range=year_range,
)

if combined_count > 0:
    print("\n" + "="*50)
    print(f"FIRST {max_titles_to_print} TITLES FOR COMBINED QUERY:")
    print("="*50)
    for i, title in enumerate(titles, 1):
        print(f"{i}. {title}")
else:
    print("\nNo results found for the combined query.")


# 3. Export PubMed Results to CSV

The script below uses your combined query to download titles and abstracts and saves them to a CSV file, including author names, title, abstract, year and DOI. It also appends a row to the summary table with the number of records found and downloaded, the source, the final query and a timestamp for record-keeping purposes.

In [None]:
# === SECTION: Download CSV and Update Summary ===

# === SECTION: IMPORTS ===
import os
import csv
import requests
import xml.etree.ElementTree as ET
from datetime import datetime

# === SECTION: UTILITY FUNCTION: Get Next CSV Name ===
def get_next_csv_name(folder, base_name):
    """Find the first unused versioned CSV name in ``folder``.

    Candidates are ``<base_name>_v1.csv``, ``<base_name>_v2.csv``, ... and the
    first one that does not exist on disk is chosen.

    Returns:
        A tuple ``(name, path)``: the versioned name without the ``.csv``
        extension (used in the summary) and the full path of the CSV file.
    """
    version = 1
    candidate_path = os.path.join(folder, f"{base_name}_v{version}.csv")
    # Keep bumping the version until the candidate file name is free.
    while os.path.exists(candidate_path):
        version += 1
        candidate_path = os.path.join(folder, f"{base_name}_v{version}.csv")
    return f"{base_name}_v{version}", candidate_path

# === SECTION: MAIN FUNCTION: Download PubMed Results to CSV ===
def download_ncbi_to_csv(query, csv_folder, api_key, year_range, max_results=None):
    """Search PubMed and save the matching records to a new versioned CSV file.

    Args:
        query: PubMed search expression, without the year filter.
        csv_folder: Folder for the CSV file; created if it does not exist.
        api_key: NCBI API key. It is only sent when it is non-empty and not
            the ``"YOUR_NCBI_API_KEY"`` placeholder.
        year_range: Date filter appended to ``query`` with ``AND``.
        max_results: Maximum number of IDs to request from ESearch; ``None``
            requests up to 10000.

    Returns:
        A tuple ``(found, downloaded, base_name, timestamp)``: the total hit
        count reported by ESearch (the number of retrieved IDs if no count is
        present), the number of rows written, the CSV name without extension,
        and a ``YYYY-MM-DDTHH:MM`` timestamp taken after the download.

    Raises:
        requests.HTTPError: If an E-utilities request returns an HTTP error
            status.
    """
    import time  # local import: only needed to pace the EFetch batches

    os.makedirs(csv_folder, exist_ok=True)
    esearch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    request_timeout = 60  # seconds
    batch_size = 200  # keeps each EFetch URL short enough for a GET request

    # Sending the unedited placeholder as a key makes NCBI reject the request,
    # so the key is only attached when it looks like a real one.
    has_key = bool(api_key) and api_key != "YOUR_NCBI_API_KEY"
    key_params = {'api_key': api_key} if has_key else {}
    # Pause between batches to stay under NCBI's request-rate limits.
    pause_seconds = 0.11 if has_key else 0.34

    query_with_year = f"{query} AND {year_range}"
    params = {
        'db': 'pubmed',
        'term': query_with_year,
        'retmax': max_results if max_results is not None else 10000,
        'retmode': 'xml',
        **key_params,
    }
    print(f"Querying PubMed for: {query_with_year}")
    esearch_resp = requests.get(esearch_url, params=params, timeout=request_timeout)
    esearch_resp.raise_for_status()
    root = ET.fromstring(esearch_resp.content)
    ids = [id_elem.text for id_elem in root.findall('.//Id')]
    # "found" is the total number of matches, which can exceed the number of
    # IDs actually returned when the result set is larger than retmax.
    found = int(root.findtext('.//Count') or len(ids))
    print(f"Retrieved {len(ids)} PubMed IDs")
    if found > len(ids):
        print(f"NOTE: PubMed reports {found} matching records; only the first {len(ids)} will be downloaded")

    # Fetch everything before creating the CSV so that a failed request does
    # not leave a header-only file behind and use up a version number.
    rows = []
    for start in range(0, len(ids), batch_size):
        if start:
            time.sleep(pause_seconds)
        efetch_params = {
            'db': 'pubmed',
            'id': ','.join(ids[start:start + batch_size]),
            'retmode': 'xml',
            **key_params,
        }
        efetch_resp = requests.get(efetch_url, params=efetch_params, timeout=request_timeout)
        efetch_resp.raise_for_status()
        batch_root = ET.fromstring(efetch_resp.content)
        rows.extend(_article_to_row(article) for article in batch_root.findall('.//PubmedArticle'))

    base_name, csv_path = get_next_csv_name(csv_folder, "pubmed")
    print(f"Writing results to CSV: {csv_path}")
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['first_author', 'all_authors', 'title', 'abstract', 'year', 'doi'])
        writer.writerows(rows)
    count_downloaded = len(rows)

    print(f"Downloaded {count_downloaded} records to {csv_path}")
    # Capture timestamp at end of download in ISO format without seconds
    timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M")
    return found, count_downloaded, base_name, timestamp


def _element_text(elem):
    """Return all text inside ``elem`` on one line, or '' if ``elem`` is None.

    Uses itertext() so text nested in inline markup (<i>, <sub>, ...) is kept;
    Element.text alone stops at the first child tag.
    """
    if elem is None:
        return ''
    return ''.join(elem.itertext()).replace('\n', ' ').strip()


def _article_to_row(article):
    """Convert one <PubmedArticle> element into a CSV row.

    The row order is: first_author, all_authors, title, abstract, year, doi.
    """
    authors = []
    first_author = ''
    author_list = article.find('.//AuthorList')
    if author_list is not None:
        for position, author in enumerate(author_list.findall('Author')):
            fore = author.findtext('ForeName', default='') or ''
            last = author.findtext('LastName', default='') or ''
            full_name = f"{fore} {last}".strip()
            if not full_name:
                # Group authors carry a CollectiveName instead of personal names.
                full_name = (author.findtext('CollectiveName', default='') or '').strip()
            if not full_name:
                continue
            authors.append(full_name)
            if position == 0:
                first_author = full_name
    all_authors = '; '.join(authors)

    title = _element_text(article.find('.//ArticleTitle'))

    abstract = ''
    abstract_elem = article.find('.//Abstract')
    if abstract_elem is not None:
        sections = (_element_text(part) for part in abstract_elem.findall('AbstractText'))
        abstract = ' '.join(section for section in sections if section)

    year = ''
    pub_date = article.find('.//Journal/JournalIssue/PubDate')
    if pub_date is not None:
        year = (pub_date.findtext('Year', default='') or '').strip()
        if not year:
            # Some records only have a free-text MedlineDate such as
            # "2019 Nov-Dec"; take the first token that starts with 4 digits.
            medline_date = pub_date.findtext('MedlineDate', default='') or ''
            for token in medline_date.replace('-', ' ').split():
                if len(token) >= 4 and token[:4].isdigit():
                    year = token[:4]
                    break

    # Look only at the article's own ID list: a document-wide search could
    # return a DOI from the reference list when the article itself has none.
    doi = article.findtext('./PubmedData/ArticleIdList/ArticleId[@IdType="doi"]', default='') or ''
    if not doi:
        doi = article.findtext('.//Article/ELocationID[@EIdType="doi"]', default='') or ''
    doi = doi.strip()

    return [first_author, all_authors, title, abstract, year, doi]

# === SECTION: SUMMARY ROW APPEND WITH TIMESTAMP ===
def append_summary_row(summary_folder, base_name, found, downloaded, query, timestamp):
    """Append one record of a download run to ``summary_csv.csv``.

    The header row is written first when the summary file does not exist yet
    or is empty.

    Args:
        summary_folder: Folder holding the summary file; created if missing.
        base_name: Name of the downloaded CSV (the "source" column).
        found: Number of records found.
        downloaded: Number of records written to the CSV.
        query: The query string that was run.
        timestamp: Timestamp string recorded for the run.
    """
    os.makedirs(summary_folder, exist_ok=True)
    summary_csv_path = os.path.join(summary_folder, "summary_csv.csv")

    # Check if file exists and if it's empty
    needs_header = (
        not os.path.exists(summary_csv_path)
        or os.path.getsize(summary_csv_path) == 0
    )

    with open(summary_csv_path, 'a', encoding='utf-8', newline='') as f:
        # csv.writer escapes quotes, commas and newlines inside the query;
        # PubMed phrase searches such as "heart failure"[tiab] contain double
        # quotes that would corrupt a hand-built row. '\n' keeps the line
        # ending used by rows already in the file.
        writer = csv.writer(f, lineterminator='\n')
        if needs_header:
            writer.writerow(['source', 'found', 'downloaded', 'query combination', 'timestamp'])
        writer.writerow([base_name, found, downloaded, query, timestamp])
    print(f"Summary row added for {base_name}")

# === SECTION: RUN DOWNLOAD AND SUMMARY ===
# Export the combined query to a new CSV file. max_results=None lets the
# function use its own cap (retmax of 10000); pass an integer to download fewer.
found, downloaded, base_name, timestamp = download_ncbi_to_csv(
    query=combined_query,
    csv_folder=csv_folder,
    api_key=api_key,
    year_range=year_range,
    max_results=None,
)

# Record this run (source file, counts, query and timestamp) in the summary CSV.
append_summary_row(
    summary_folder,
    base_name,
    found,
    downloaded,
    combined_query,
    timestamp,
)
