In [1]:
pip install notebook requests lxml

Note: you may need to restart the kernel to use updated packages.


In [2]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from typing import List, Dict, Optional

In [3]:
def fetch_pubmed_ids(query: str, max_results: int = 20) -> List[str]:
    """Fetches a list of PubMed IDs for a given query"""
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": query,
        "retmode": "xml",
        "retmax": max_results
    }
    response = requests.get(url, params=params)
    root = ET.fromstring(response.content)
    ids = [elem.text for elem in root.findall(".//Id")]
    return ids


In [4]:
def fetch_paper_details(pubmed_ids: List[str]) -> ET.Element:
    """Fetches detailed metadata for given PubMed IDs"""
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pubmed",
        "id": ",".join(pubmed_ids),
        "retmode": "xml"
    }
    response = requests.get(url, params=params)
    root = ET.fromstring(response.content)
    return root


In [5]:
def is_non_academic(affiliation: str) -> bool:
    academic_keywords = [
        "university", "institute", "college", "school", "hospital", 
        "center", "faculty", "department", "academy", "laboratory", "lab"
    ]
    return not any(keyword.lower() in affiliation.lower() for keyword in academic_keywords)


In [6]:
def parse_pubmed_articles(root: ET.Element) -> List[Dict[str, str]]:
    """Extracts and filters metadata for non-academic authors"""
    papers = []
    for article in root.findall(".//PubmedArticle"):
        pmid = article.findtext(".//PMID")
        title = article.findtext(".//ArticleTitle") or "No Title"
        pub_date = article.findtext(".//PubDate/Year") or "Unknown"

        authors = article.findall(".//Author")
        non_academic_authors = []
        affiliations = set()
        email = None

        for author in authors:
            aff = author.findtext(".//AffiliationInfo/Affiliation")
            if aff and is_non_academic(aff):
                name = f"{author.findtext('ForeName') or ''} {author.findtext('LastName') or ''}".strip()
                non_academic_authors.append(name)
                affiliations.add(aff)
                if "@" in aff and not email:
                    email = aff.split()[-1]  # crude but effective

        if non_academic_authors:
            papers.append({
                "PubmedID": pmid,
                "Title": title,
                "Publication Date": pub_date,
                "Non-academic Author(s)": "; ".join(non_academic_authors),
                "Company Affiliation(s)": "; ".join(affiliations),
                "Corresponding Author Email": email or "Not found"
            })

    return papers


In [7]:
query = "cancer immunotherapy AND 2025[dp]"
max_results = 500

# Step 1: Fetch PubMed IDs
ids = fetch_pubmed_ids(query, max_results=max_results)
print(f"✅ Found {len(ids)} PubMed IDs")

# Step 2: Fetch paper metadata in batches
def fetch_paper_details_in_batches(pubmed_ids, batch_size=100):
    roots = []
    for i in range(0, len(pubmed_ids), batch_size):
        batch = pubmed_ids[i:i + batch_size]
        try:
            root = fetch_paper_details(batch)
            roots.append(root)
        except Exception as e:
            print(f"❌ Error in batch {i}-{i+batch_size}: {e}")
    return roots

roots = fetch_paper_details_in_batches(ids)

# Step 3: Parse and filter all papers
all_papers = []
for root in roots:
    all_papers.extend(parse_pubmed_articles(root))

# Step 4: Save to CSV
df = pd.DataFrame(all_papers)
df.to_csv("filtered_pubmed_papers.csv", index=False)

# Step 5: Preview last 5 results
df.head()


✅ Found 500 PubMed IDs


Unnamed: 0,PubmedID,Title,Publication Date,Non-academic Author(s),Company Affiliation(s),Corresponding Author Email
0,40628625,Expression of Kappa Myeloma Antigen (KMA) and ...,2025,Rosanne Dunn,"Haemalogix Ltd, Sydney, New South Wales 2001, ...",Not found
1,40628289,Understanding antibody-target antigen interact...,2025,Armin Sepp; James Yates,"Certara UK Limited, Sheffield, UK.; GSK plc, S...",Not found
2,40628264,Neutralization of acyl coenzyme A binding prot...,2025,Long Pan; Lucie Poupel; Christophe Klein,"CHICS, Centre de Recherche des Cordeliers, Ins...",Not found
3,40627846,EML4-ALK rearrangement creates a distinctive m...,2025,Yukari Nishito; Hideaki Mizuno; Yukiko Sonobe;...,"Roche (Switzerland), Basel, Basel-stadt, Switz...",Not found
4,40627813,RP1 Combined With Nivolumab in Advanced Anti-P...,2025,Judith Michels; Praveen K Bommareddy; Junhong ...,"Replimune, Inc., Woburn, MA, USA.; Département...",Not found


In [8]:
query = "cancer immunotherapy AND 2025[dp]"
max_results = 500

# Step 1: Fetch PubMed IDs
ids = fetch_pubmed_ids(query, max_results=max_results)
print(f"✅ Found {len(ids)} PubMed IDs")

# Step 2: Fetch paper metadata in batches
def fetch_paper_details_in_batches(pubmed_ids, batch_size=100):
    roots = []
    for i in range(0, len(pubmed_ids), batch_size):
        batch = pubmed_ids[i:i + batch_size]
        try:
            root = fetch_paper_details(batch)
            roots.append(root)
        except Exception as e:
            print(f"❌ Error in batch {i}-{i+batch_size}: {e}")
    return roots

roots = fetch_paper_details_in_batches(ids)

# Step 3: Parse and filter all papers
all_papers = []
for root in roots:
    all_papers.extend(parse_pubmed_articles(root))

# Step 4: Save to CSV
df = pd.DataFrame(all_papers)
df.to_csv("filtered_pubmed_papers.csv", index=False)

# Step 5: Preview last 5 results
df.tail()


✅ Found 500 PubMed IDs


Unnamed: 0,PubmedID,Title,Publication Date,Non-academic Author(s),Company Affiliation(s),Corresponding Author Email
78,40597707,A scoping review of human papillomavirus relat...,2025,Ruixuan Wang; Jun Zhang; Ya-Ting Chen; Wei Wang,"MSD R&D (China) Co., Ltd, Beijing, 110105, Chi...",wei.wang40@merck.com.
79,40597177,Faecalibacterium prausnitzii promotes anti-PD-...,2025,Hongwen Li,Lymphoma Diagnosis and Treatment Centre of Hen...,Not found
80,40596656,Prognostic and immunotherapeutic significance ...,2025,Yao Tian,"Qilu Pharmaceutical Co., Ltd, Jinan, 250100, C...",Not found
81,40595293,Macrophages foster anti-tumor immunity by ZEB1...,2025,Christoph Becker,"Deutsches Zentrum Immuntherapie (DZI), Erlange...",Not found
82,40594568,General practitioners' recommendation of HPV v...,2025,Laura X Gil Sánchez; Charlotte Bauquier; Thoma...,"Unité Inserm 1296 « Radiations : Défense, Sant...",patriciavillain1@gmail.com.
