## Requirements

In [3]:
import argparse
import csv
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import re
from typing import List, Dict, Optional, Union
import xml.dom.minidom

## URLs

In [2]:
# NCBI E-utilities "esearch" endpoint; fetch_pubmed_ids() queries it for matching PubMed IDs.
PUBMED_API_BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
# NCBI E-utilities "efetch" endpoint; fetch_paper_details() queries it for the XML records.
FETCH_BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

## Fetch PubMed IDs

In [4]:
"""Fetch PubMed IDs based on the query."""
def fetch_pubmed_ids(
    query: str,
    retmax: int = 100,
    debug: bool = False,
    timeout: float = 30.0,
) -> List[str]:
    """Search PubMed and return the IDs of the matching records.

    Args:
        query: Search term sent as the ``term`` parameter.
        retmax: Maximum number of IDs to request. Defaults to 100 (previously
            ``100`` was written as the annotation, which made the argument
            mandatory instead of giving it a default).
        debug: When true, print the query before the request is sent.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The ``esearchresult.idlist`` entries of the JSON response, or an empty
        list when the response does not contain them.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    params = {
        "db": "pubmed",
        "term": query,
        "retmode": "json",
        "retmax": retmax,
    }
    if debug:
        print(f"Fetching PubMed IDs with query: {query}")
    response = requests.get(PUBMED_API_BASE_URL, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()
    return data.get("esearchresult", {}).get("idlist", [])

### Test

In [7]:
# Smoke-test the search helper: request at most 10 IDs for the sample query.
test_query = "Cardiology [tiab]"
pubmed_ids = fetch_pubmed_ids(query=test_query, retmax=10)

print(pubmed_ids)

['39787625', '39786967', '39786044', '39784919', '39784423', '39782713', '39781229', '39781152', '39780469', '39780464']


## Fetch Paper Details

In [8]:
"""Fetch paper details for the given PubMed IDs."""

def fetch_paper_details(
    pubmed_ids: List[str],
    debug: bool = False,
    timeout: float = 30.0,
) -> str:
    """Download the XML records for the given PubMed IDs.

    Args:
        pubmed_ids: PubMed IDs; they are joined with commas into the ``id``
            request parameter.
        debug: When true, print the joined IDs before the request is sent.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The response body as text (XML is requested via ``retmode``). When no
        IDs are supplied, an empty ``<PubmedArticleSet>`` document is returned
        without contacting the server, so downstream XML parsing simply sees
        zero articles.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    ids = ",".join(pubmed_ids)
    if not ids:
        # Nothing to look up (e.g. the search returned no hits): skip the request.
        return "<PubmedArticleSet></PubmedArticleSet>"

    params = {
        "db": "pubmed",
        "id": ids,
        "retmode": "xml",
    }
    if debug:
        print(f"Fetching details for PubMed IDs: {ids}")
    response = requests.get(FETCH_BASE_URL, params=params, timeout=timeout)
    response.raise_for_status()
    return response.text

### Test fetch paper details

In [9]:
# Fetch the XML records for the IDs returned by the search cell.
papers_xml = fetch_paper_details(pubmed_ids)

file_path = "pubmed_sample.xml"

# Keep a local copy for inspection. The encoding is pinned to UTF-8 because the
# platform default (e.g. cp1252 on Windows) cannot represent every character
# that may appear in author names or titles, and would fail or corrupt the file.
with open(file_path, "w", encoding="utf-8") as file:
    file.write(papers_xml)

file_path


'pubmed_sample.xml'

In [None]:
# Pretty-print the whole fetched XML document (every record, not only the first child).
dom = xml.dom.minidom.parseString(papers_xml)
print(dom.toprettyxml())

<?xml version="1.0" ?>
<!DOCTYPE PubmedArticleSet
  PUBLIC '-//NLM//DTD PubMedArticle, 1st January 2024//EN'
  'https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_240101.dtd'>
<PubmedArticleSet>
	

	<PubmedArticle>
		<MedlineCitation Status="MEDLINE" Owner="NLM" IndexingMethod="Automated">
			<PMID Version="1">39779262</PMID>
			<DateCompleted>
				<Year>2025</Year>
				<Month>01</Month>
				<Day>08</Day>
			</DateCompleted>
			<DateRevised>
				<Year>2025</Year>
				<Month>01</Month>
				<Day>08</Day>
			</DateRevised>
			<Article PubModel="Electronic">
				<Journal>
					<ISSN IssnType="Electronic">2044-6055</ISSN>
					<JournalIssue CitedMedium="Internet">
						<Volume>15</Volume>
						<Issue>1</Issue>
						<PubDate>
							<Year>2025</Year>
							<Month>Jan</Month>
							<Day>08</Day>
						</PubDate>
					</JournalIssue>
					<Title>BMJ open</Title>
					<ISOAbbreviation>BMJ Open</ISOAbbreviation>
				</Journal>
				<ArticleTitle>Study protocol for an international registry obser

In [11]:
# Disabled scratch code: the snippet below is wrapped in a string literal, so none
# of it executes; the cell only evaluates the string and echoes its repr.
# NOTE(review): the quoted snippet indexes with an empty subscript
# (`getElementsByTagName("Affiliation")[]`), which would be a SyntaxError if the
# quotes were removed.
'''
import xml.dom.minidom

# Parse the XML string
dom = xml.dom.minidom.parseString(papers_xml)

# Find the first Author element
first_author = dom.getElementsByTagName("Author")[0]

# Find the Affiliation element within the first Author
affiliation = first_author.getElementsByTagName("Affiliation")[]

# Extract and print the affiliation text content
affiliation_text = affiliation.firstChild.nodeValue
print("First Author's Affiliation:", affiliation_text)

'''

'\nimport xml.dom.minidom\n\n# Parse the XML string\ndom = xml.dom.minidom.parseString(papers_xml)\n\n# Find the first Author element\nfirst_author = dom.getElementsByTagName("Author")[0]\n\n# Find the Affiliation element within the first Author\naffiliation = first_author.getElementsByTagName("Affiliation")[]\n\n# Extract and print the affiliation text content\naffiliation_text = affiliation.firstChild.nodeValue\nprint("First Author\'s Affiliation:", affiliation_text)\n\n'

## Parse PubMed XML

In [12]:
"""Parse PubMed XML to extract details."""

def _element_text(parent, path: str) -> str:
    """Return all text inside ``parent.find(path)``, or "" when it is missing or empty."""
    node = parent.find(path)
    if node is None:
        return ""
    # itertext() also collects text inside nested markup and never yields None.
    return "".join(node.itertext())


def parse_pubmed_xml(xml_data: str) -> List[List[Union[str, None, List[Dict[str, str]]]]]:
    """Parse PubMed XML and list the non-academic authors of every article.

    NOTE: this notebook defines ``parse_pubmed_xml`` a second time in a later
    cell; on a top-to-bottom run that later definition replaces this one.

    Args:
        xml_data: XML text containing ``PubmedArticle`` elements.

    Returns:
        One ``[pmid, title, pub_date, authors]`` list per ``PubmedArticle``:

        * ``pmid`` / ``title``: element text ("" when the element is missing).
        * ``pub_date``: the ``Year``, ``Month`` and ``Day`` found under
          ``PubDate`` joined with "-"; if none of them is present the
          ``MedlineDate`` text is used; ``None`` if that is absent too; ""
          when the article has no ``PubDate`` element at all.
        * ``authors``: a list of ``{"name", "affiliation", "email"}`` dicts for
          every author whose first affiliation contains none of the academic
          keywords, or the string ``"There are no non-academic authors"`` when
          that list would be empty. Authors without any affiliation text are
          therefore included.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_data`` is not well-formed XML.
    """
    root = ET.fromstring(xml_data)
    articles = []
    academic_keywords = {"university", "college", "institute", "academy", "labs", "school"}
    # Punctuation that commonly clings to an address at the end of a sentence.
    email_trim_chars = ".,;:()<>[]"

    for article in root.findall(".//PubmedArticle"):
        pmid = _element_text(article, ".//PMID")
        title = _element_text(article, ".//ArticleTitle")

        pub_date = ""
        pub_date_element = article.find(".//PubDate")
        if pub_date_element is not None:
            date_parts = [_element_text(pub_date_element, tag) for tag in ("Year", "Month", "Day")]
            pub_date = (
                "-".join(filter(None, date_parts))
                or _element_text(pub_date_element, "MedlineDate")
                or None
            )

        authors_data = []
        for author in article.findall(".//Author"):
            fore_name = _element_text(author, "ForeName")
            last_name = _element_text(author, "LastName")
            full_name = f"{fore_name} {last_name}".strip()

            # Only the first affiliation of the author is inspected.
            affiliation = _element_text(author, "AffiliationInfo/Affiliation")

            # The keyword check runs on the full text, e-mail address included.
            affiliation_lower = affiliation.lower()
            if any(keyword in affiliation_lower for keyword in academic_keywords):
                continue

            # First whitespace-delimited token containing "@", if any.
            raw_email = next((word for word in affiliation.split() if "@" in word), "")
            if raw_email:
                affiliation = affiliation.replace(raw_email, "").strip()

            authors_data.append({
                "name": full_name,
                "affiliation": affiliation,
                "email": raw_email.strip(email_trim_chars),
            })

        if not authors_data:
            authors_data = "There are no non-academic authors"
        articles.append([pmid, title, pub_date, authors_data])

    return articles

In [56]:
def _element_text(parent, path: str) -> str:
    """Return all text inside ``parent.find(path)``, or "" when it is missing or empty."""
    node = parent.find(path)
    if node is None:
        return ""
    # itertext() also collects text inside nested markup and never yields None.
    return "".join(node.itertext())


def parse_pubmed_xml(xml_data: str) -> List[List[Union[str, None, List[Dict[str, str]]]]]:
    """Parse PubMed XML and list the non-academic authors of every article.

    NOTE: this is the second definition of ``parse_pubmed_xml`` in the
    notebook; on a top-to-bottom run it replaces the one from the earlier cell.

    Args:
        xml_data: XML text containing ``PubmedArticle`` elements.

    Returns:
        One ``[pmid, title, pub_date, authors]`` list per ``PubmedArticle``:

        * ``pmid`` / ``title``: element text ("" when the element is missing).
        * ``pub_date``: the ``Year``, ``Month`` and ``Day`` found under
          ``PubDate`` joined with "-"; if none of them is present the
          ``MedlineDate`` text is used; ``None`` if that is absent too; ""
          when the article has no ``PubDate`` element at all.
        * ``authors``: a list of ``{"name", "affiliation", "email"}`` dicts for
          every author whose first affiliation contains none of the academic
          keywords, or the string ``"There are no non-academic authors"`` when
          that list would be empty. Authors without any affiliation text are
          therefore included.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_data`` is not well-formed XML.
    """
    root = ET.fromstring(xml_data)
    articles = []
    academic_keywords = {"university", "college", "institute", "academy", "labs", "school"}
    # Punctuation that commonly clings to an address at the end of a sentence.
    email_trim_chars = ".,;:()<>[]"

    for article in root.findall(".//PubmedArticle"):
        pmid = _element_text(article, ".//PMID")
        title = _element_text(article, ".//ArticleTitle")

        pub_date = ""
        pub_date_element = article.find(".//PubDate")
        if pub_date_element is not None:
            date_parts = [_element_text(pub_date_element, tag) for tag in ("Year", "Month", "Day")]
            pub_date = (
                "-".join(filter(None, date_parts))
                or _element_text(pub_date_element, "MedlineDate")
                or None
            )

        authors_data = []
        for author in article.findall(".//Author"):
            fore_name = _element_text(author, "ForeName")
            last_name = _element_text(author, "LastName")
            full_name = f"{fore_name} {last_name}".strip()

            # Only the first affiliation of the author is inspected.
            affiliation = _element_text(author, "AffiliationInfo/Affiliation")

            # The keyword check runs on the full text, e-mail address included.
            affiliation_lower = affiliation.lower()
            if any(keyword in affiliation_lower for keyword in academic_keywords):
                continue

            # First whitespace-delimited token containing "@", if any.
            raw_email = next((word for word in affiliation.split() if "@" in word), "")
            if raw_email:
                affiliation = affiliation.replace(raw_email, "").strip()

            authors_data.append({
                "name": full_name,
                "affiliation": affiliation,
                "email": raw_email.strip(email_trim_chars),
            })

        if not authors_data:
            authors_data = "There are no non-academic authors"
        articles.append([pmid, title, pub_date, authors_data])

    return articles

### Test

In [13]:
# Run the parser over the fetched XML and tabulate one row per article.
processed_data = parse_pubmed_xml(papers_xml)

result_columns = [
    "PubmedID",
    "Title",
    "Publication Date",
    "Non academic Authors -  Company Affiliations - Corresponding Author Email",
]
pd.DataFrame(processed_data, columns=result_columns)

Unnamed: 0,PubmedID,Title,Publication Date,Non academic Authors - Company Affiliations - Corresponding Author Email
0,39787625,Prevalence and factors associated with potenti...,2024-Dec-01,There are no non-academic authors
1,39786967,A PAS-targeting hERG1 activator reduces arrhyt...,2025-Jan-09,There are no non-academic authors
2,39786044,Biomarkers.,2024-Dec,There are no non-academic authors
3,39784919,57th Annual Meeting of the Association for Eur...,2024-Oct,There are no non-academic authors
4,39784423,Public Health.,2024-Dec,"[{'name': 'Nathaniel A Chin', 'affiliation': '..."
5,39782713,Comprehensive exploration of unexplained dyspn...,2025-Jan-09,"[{'name': 'Emmanuelle Berthelot', 'affiliation..."
6,39781229,Perspectives of Policymakers on Barriers to an...,2024,There are no non-academic authors
7,39781152,The Critical Role of Pulse Oximetry Screening ...,2024-Dec,"[{'name': 'Alina Turenschi', 'affiliation': 'P..."
8,39780469,The prevalence of obesity in children with CHD...,2025-Jan-09,There are no non-academic authors
9,39780464,Achieving excellence in paediatric cardiac car...,2025-Jan-09,"[{'name': 'Colin J McMahon', 'affiliation': 'D..."
