In [8]:
# PMC fulltext Save in single and separate both at the same time
import os
import requests
from xml.etree import ElementTree

def search_pmc_ids(keyword, retmax=1000):
    print(f"🔍 Searching PMC for: {keyword}")
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pmc",
        "term": keyword + " open access[filter]",
        "retmax": retmax,
        "retmode": "xml"
    }

    response = requests.get(base_url, params=params)
    response.raise_for_status()
    root = ElementTree.fromstring(response.content)
    return [elem.text for elem in root.findall(".//Id")]

def fetch_pmc_article_xml(pmcid):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pmc",
        "id": pmcid,
        "retmode": "xml"
    }

    response = requests.get(base_url, params=params)
    response.raise_for_status()
    return response.text

def save_pmc_xml(xml_data, pmcid, output_folder):
    os.makedirs(output_folder, exist_ok=True)
    filepath = os.path.join(output_folder, f"PMC{pmcid}.xml")
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(xml_data)
    print(f"✅ Saved: {filepath}")

# === MAIN ===
if __name__ == "__main__":
    keyword = "Alum vaccine adjuvant immune mechanism"
    output_dir = "Dataset/Dataset_PMC"

    pmc_ids = search_pmc_ids(keyword)

    if not pmc_ids:
        print("❌ No full-text articles found in PMC.")
    else:
        for pmcid in pmc_ids:
            try:
                xml = fetch_pmc_article_xml(pmcid)
                save_pmc_xml(xml, pmcid, output_dir)
            except Exception as e:
                print(f"⚠️ Failed for PMC{pmcid}: {e}")


🔍 Searching PMC for: Alum vaccine adjuvant immune mechanism
❌ No full-text articles found in PMC.


In [2]:
# PMC fulltext with boolean query Save in single and separate both at the same time
import os
import requests
import time
from xml.etree import ElementTree

def get_boolean_query():
    return (
        '("aluminum sulfate"[Supplementary Concept] OR "aluminum sulfate"[All Fields] OR "alum"[All Fields]) '
        'AND ("adjuvants, vaccine"[Supplementary Concept] OR "adjuvants, vaccine"[All Fields] OR "vaccine adjuvant"[All Fields] '
        'OR "adjuvants, vaccine"[MeSH Terms] OR ("adjuvants"[All Fields] AND "vaccine"[All Fields]) '
        'OR "vaccine adjuvants"[All Fields] OR ("vaccine"[All Fields] AND "adjuvant"[All Fields])) '
        'AND ("immunity"[MeSH Terms] OR "immunity"[All Fields] OR ("immune"[All Fields] AND "response"[All Fields]) '
        'OR "immune response"[All Fields]) AND mechanism[All Fields]'
    )

    

def search_pmc(query, retmax=10000):
    print("🔍 Searching PMC...")
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pmc",
        "term": query,
        "retmode": "xml",
        "retmax": retmax
    }
    response = requests.get(url, params=params)
    response.raise_for_status()

    root = ElementTree.fromstring(response.content)
    pmc_ids = [id_elem.text for id_elem in root.findall(".//Id")]
    print(f"✅ Found {len(pmc_ids)} PMC IDs.")
    return pmc_ids

def fetch_pmc_article_xml(pmcid):
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pmc",
        "id": pmcid,
        "retmode": "xml"
    }
    response = requests.get(url, params=params)
    response.raise_for_status()
    return response.text

def save_xml(xml_data, pmcid, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, f"PMC{pmcid}.xml")
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(xml_data)
    print(f"📄 Saved: {file_path}")

# === MAIN ===
if __name__ == "__main__":
    query = get_boolean_query()
    output_dir = "Dataset/Dataset_PMC"

    pmc_ids = search_pmc(query)

    if not pmc_ids:
        print("❌ No articles found.")
    else:
        for pmcid in pmc_ids:
            try:
                xml = fetch_pmc_article_xml(pmcid)
                save_xml(xml, pmcid, output_dir)
                # after each fetch
                time.sleep(0.4)  # 400ms pause (NCBI recommends <= 3 requests/sec)
            except Exception as e:
                print(f"⚠️ Error with PMC{pmcid}: {e}")


🔍 Searching PMC...
✅ Found 7074 PMC IDs.
📄 Saved: Dataset/Dataset_PMC/PMC12282187.xml
📄 Saved: Dataset/Dataset_PMC/PMC12281594.xml
📄 Saved: Dataset/Dataset_PMC/PMC12280796.xml
📄 Saved: Dataset/Dataset_PMC/PMC12268350.xml
📄 Saved: Dataset/Dataset_PMC/PMC12261873.xml
📄 Saved: Dataset/Dataset_PMC/PMC12257635.xml
📄 Saved: Dataset/Dataset_PMC/PMC12247452.xml
📄 Saved: Dataset/Dataset_PMC/PMC12242436.xml
📄 Saved: Dataset/Dataset_PMC/PMC12242420.xml
📄 Saved: Dataset/Dataset_PMC/PMC12239572.xml
📄 Saved: Dataset/Dataset_PMC/PMC12239286.xml
📄 Saved: Dataset/Dataset_PMC/PMC12235371.xml
📄 Saved: Dataset/Dataset_PMC/PMC12229859.xml
📄 Saved: Dataset/Dataset_PMC/PMC12223812.xml
📄 Saved: Dataset/Dataset_PMC/PMC12221512.xml
📄 Saved: Dataset/Dataset_PMC/PMC12197771.xml
📄 Saved: Dataset/Dataset_PMC/PMC12197412.xml
📄 Saved: Dataset/Dataset_PMC/PMC12193398.xml
📄 Saved: Dataset/Dataset_PMC/PMC12190922.xml
📄 Saved: Dataset/Dataset_PMC/PMC12184675.xml
📄 Saved: Dataset/Dataset_PMC/PMC12175286.xml
📄 Saved: Datas

In [1]:
import os
from xml.etree import ElementTree
from collections import Counter

def detect_article_type(xml_path):
    try:
        tree = ElementTree.parse(xml_path)
        root = tree.getroot()
        article = root.find(".//article")
        if article is not None:
            article_type = article.attrib.get("article-type", "").lower()
            return article_type if article_type else "unknown"
    except Exception as e:
        print(f"⚠️ Failed to parse {xml_path}: {e}")
    return "unknown"

def summarize_article_types(folder):
    stats = Counter()
    files = [f for f in os.listdir(folder) if f.endswith(".xml")]
    total = len(files)

    for filename in files:
        path = os.path.join(folder, filename)
        article_type = detect_article_type(path)
        stats[article_type] += 1

    print(f"📊 Stats from folder: {folder}")
    print(f"Total articles: {total}")
    for art_type, count in stats.items():
        print(f"- {art_type}: {count}")
    return stats

# === MAIN ===
if __name__ == "__main__":
    folder = "Dataset/Dataset_PMC"  # Change if needed
    summarize_article_types(folder)


📊 Stats from folder: Dataset/Dataset_PMC
Total articles: 7029
- research-article: 4654
- abstract: 21
- review-article: 2060
- chapter-article: 65
- brief-report: 61
- editorial: 28
- discussion: 9
- methods-article: 2
- other: 43
- news: 7
- article-commentary: 20
- meeting-report: 14
- systematic-review: 13
- case-report: 10
- in-brief: 2
- calendar: 3
- reply: 5
- rapid-communication: 5
- letter: 5
- product-review: 2


In [2]:
import os
from xml.etree import ElementTree
import shutil

SOURCE_DIR = "Dataset/Dataset_PMC"
DEST_DIR = "Dataset/Dataset_PMC_Filtered"
os.makedirs(DEST_DIR, exist_ok=True)

def has_full_text(root):
    return root.find(".//body") is not None

def is_nih_funded(root):
    text = ElementTree.tostring(root, encoding="utf-8").decode().lower()
    return "nih" in text or "national institutes of health" in text

def get_publication_year(root):
    year_elem = root.find(".//pub-date/year")
    if year_elem is not None and year_elem.text and year_elem.text.isdigit():
        return int(year_elem.text)
    return None

def filter_pmc_articles(source_dir, dest_dir):
    total = 0
    matched = 0

    for fname in os.listdir(source_dir):
        if not fname.endswith(".xml"):
            continue
        total += 1
        src_path = os.path.join(source_dir, fname)

        try:
            root = ElementTree.parse(src_path).getroot()
        except Exception as e:
            print(f"⚠️ Failed to parse {fname}: {e}")
            continue

        if not has_full_text(root):
            continue
        if not is_nih_funded(root):
            continue
        pub_year = get_publication_year(root)
        if pub_year is None or pub_year <= 2015:
            continue

        # All filters passed — copy the file
        dst_path = os.path.join(dest_dir, fname)
        shutil.copy(src_path, dst_path)
        matched += 1

    print(f"\n📊 Filter Summary:")
    print(f"Total scanned: {total}")
    print(f"Matched (fulltext, NIH, year>2015): {matched}")

# === MAIN ===
if __name__ == "__main__":
    filter_pmc_articles(SOURCE_DIR, DEST_DIR)



📊 Filter Summary:
Total scanned: 7029
Matched (fulltext, NIH, year>2015): 1674


In [3]:
import os
from xml.etree import ElementTree
import shutil

SOURCE_DIR = "Dataset/Dataset_PMC_Filtered"
DEST_DIR = "Dataset/Dataset_PMC_Filtered_Reviews"
os.makedirs(DEST_DIR, exist_ok=True)

TARGET_TYPES = {"review-article", "systematic-review"}

def get_article_type(root):
    article = root.find(".//article")
    return article.attrib.get("article-type", "unknown") if article is not None else "unknown"

def filter_review_articles(source_dir, dest_dir):
    total = 0
    matched = 0

    for fname in os.listdir(source_dir):
        if not fname.endswith(".xml"):
            continue
        total += 1
        path = os.path.join(source_dir, fname)

        try:
            root = ElementTree.parse(path).getroot()
            article_type = get_article_type(root)
        except Exception as e:
            print(f"⚠️ Error parsing {fname}: {e}")
            continue

        if article_type in TARGET_TYPES:
            shutil.copy(path, os.path.join(dest_dir, fname))
            matched += 1

    print(f"\n📊 Review Filtering Summary:")
    print(f"Total scanned: {total}")
    print(f"Matched reviews: {matched}")

# === MAIN ===
if __name__ == "__main__":
    filter_review_articles(SOURCE_DIR, DEST_DIR)



📊 Review Filtering Summary:
Total scanned: 1674
Matched reviews: 603
