f0b970f706ba8babb0bb582a384964d8b508

In [4]:
import csv
import requests
import xml.etree.ElementTree as ET
import time
from tinydb import TinyDB, Query
import os
import sys

# Contact e-mail sent with every E-utilities request (NCBI asks callers to provide one).
EMAIL = "lhua0420@gmail.com"  # Replace with your own valid e-mail address

# NCBI API key; when non-empty it is added to every request as `api_key`.
# NOTE(review): a real-looking key is committed here in plain text — consider
# rotating it and loading it from the environment instead.
API_KEY = "f0b970f706ba8babb0bb582a384964d8b508"  # Optional; leave empty if you have none

# Input and output CSV file paths
INPUT_CSV = "./maude_papers.csv"
OUTPUT_CSV = "./maude_papers_output.csv"

# Open the TinyDB database (created at import time, as a module-level side effect)
db = TinyDB('articles.json')

# Base URL of the NCBI E-utilities endpoints
EUTILS_BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

def normalize_fieldnames(fieldnames):
    """Return CSV header names with BOM removed, whitespace stripped, lower-cased.

    :param fieldnames: iterable of header strings, or ``None`` (which is what
        ``csv.DictReader.fieldnames`` yields for an empty file).
    :return: list of normalized header names, in the original order; an empty
        list when ``fieldnames`` is ``None``.
    """
    if fieldnames is None:
        # An empty CSV has no header row; treat it as "no columns" rather
        # than failing with a TypeError while iterating None.
        return []
    return [field.replace('\ufeff', '').strip().lower() for field in fieldnames]

def get_abstract(pmid):
    """Fetch the abstract of a PubMed record through the EFetch endpoint.

    :param pmid: PubMed identifier of the article.
    :return: the abstract text (labelled sections are rendered as
        ``"LABEL: text"`` and joined with single spaces),
        ``"No Abstract Available"`` when the record has no abstract text, or
        ``"Error Fetching Abstract"`` when the request or XML parsing fails.
    """
    fetch_url = f"{EUTILS_BASE_URL}efetch.fcgi"
    params = {
        "db": "pubmed",
        "id": pmid,
        "rettype": "abstract",
        "retmode": "xml",
        "email": EMAIL
    }
    if API_KEY:
        params["api_key"] = API_KEY
    try:
        # A timeout keeps one stalled connection from hanging the whole run;
        # the resulting exception is reported through the handler below.
        response = requests.get(fetch_url, params=params, timeout=30)
        response.raise_for_status()
        # Parse the raw bytes so the XML declaration decides the encoding.
        root = ET.fromstring(response.content)
        sections = []
        for node in root.findall(".//Abstract/AbstractText"):
            # itertext() also collects text inside and after inline markup
            # (<i>, <sup>, ...); `.text` alone stops at the first child tag.
            text = "".join(node.itertext()).strip()
            label = node.attrib.get('Label')
            section = f"{label}: {text}" if label else text
            if section:
                sections.append(section)
        abstract_text = " ".join(sections).strip()
        return abstract_text if abstract_text else "No Abstract Available"
    except Exception as e:
        print(f"Error fetching abstract for PMID {pmid}: {e}")
        return "Error Fetching Abstract"

def get_abstract_by_title(title):
    """Search PubMed by title and fetch the abstract of the first hit.

    :param title: article title used verbatim as the ESearch term.
    :return: the abstract of the first matching PMID, ``"No Abstract
        Available"`` when the title is empty or nothing matches, the error
        marker returned by ``get_abstract`` when fetching the hit fails, or
        ``"Error Fetching Abstract by Title"`` when the search itself fails.
    """
    if not title or not title.strip():
        # An empty search term cannot match anything; skip the request.
        return "No Abstract Available"

    search_url = f"{EUTILS_BASE_URL}esearch.fcgi"
    search_params = {
        "db": "pubmed",
        "term": title,
        "retmode": "xml",
        "retmax": 1,  # only the first hit is ever used
        "email": EMAIL
    }
    if API_KEY:
        search_params["api_key"] = API_KEY
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(search_url, params=search_params, timeout=30)
        response.raise_for_status()
        # Parse the raw bytes so the XML declaration decides the encoding.
        root = ET.fromstring(response.content)
        # Take the first matching PMID, if any.
        first_pmid = (root.findtext(".//IdList/Id") or "").strip()
        if not first_pmid:
            return "No Abstract Available"
        # NOTE(review): the first hit is not checked against the requested
        # title, so it may belong to a different article.
        abstract = get_abstract(first_pmid)
        # Only announce success for a real abstract, not for an error marker.
        if abstract != "No Abstract Available" and not abstract.startswith("Error"):
            print(f"Abstract found by title search for Title: '{title}' using PMID: {first_pmid}")
        return abstract
    except Exception as e:
        print(f"Error fetching abstract by title '{title}': {e}")
        return "Error Fetching Abstract by Title"

def get_fulltext_link(pmid):
    """Look up the PubMed Central full-text URL for a PubMed record via ELink.

    :param pmid: PubMed identifier of the article.
    :return: ``https://www.ncbi.nlm.nih.gov/pmc/articles/PMC<id>/`` when a
        PMC copy of the article is linked, ``"No Full Text Available"`` when
        none is, or ``"Error Fetching Full Text Link"`` when the request or
        XML parsing fails.
    """
    elink_url = f"{EUTILS_BASE_URL}elink.fcgi"
    params = {
        "dbfrom": "pubmed",
        "db": "pmc",
        "id": pmid,
        "retmode": "xml",
        "email": EMAIL
    }
    if API_KEY:
        params["api_key"] = API_KEY
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(elink_url, params=params, timeout=30)
        response.raise_for_status()
        # Parse the raw bytes so the XML declaration decides the encoding.
        root = ET.fromstring(response.content)
        pmc_id = None
        # In ELink XML the target database and link name are child elements
        # of <LinkSetDb>; <Link> itself carries no DbTo attribute.
        for link_group in root.findall(".//LinkSetDb"):
            if link_group.findtext("DbTo") != "pmc":
                continue
            # "pubmed_pmc" is the article's own PMC copy; other link names
            # (e.g. citing articles) point at different papers.
            if link_group.findtext("LinkName") != "pubmed_pmc":
                continue
            candidate = (link_group.findtext("Link/Id") or "").strip()
            if candidate:
                pmc_id = candidate
                break
        if pmc_id:
            return f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_id}/"
        return "No Full Text Available"
    except Exception as e:
        print(f"Error fetching full-text link for PMID {pmid}: {e}")
        return "Error Fetching Full Text Link"

def _clean_cell(row, field):
    """Return ``row[field]`` stripped, treating a missing or None cell as ''."""
    return (row.get(field) or '').strip()


def _load_processed_pmids(output_csv):
    """Collect PMIDs of rows in an existing output CSV that carry no error marker."""
    abstract_errors = ("Error Fetching Abstract", "Missing PMID", "Error Fetching Abstract by Title")
    link_errors = ("Error Fetching Full Text Link", "Missing PMID")
    processed = set()
    if not os.path.exists(output_csv):
        return processed
    with open(output_csv, mode='r', encoding='utf-8', newline='') as outfile:
        reader = csv.DictReader(outfile)
        if not reader.fieldnames:
            return processed
        # The output header copies the input header verbatim, so the PMID
        # column may be spelled differently from the literal 'PMID'.
        lookup = {name.replace('\ufeff', '').strip().lower(): name for name in reader.fieldnames}
        pmid_column = lookup.get('pmid', 'PMID')
        for row in reader:
            pmid = _clean_cell(row, pmid_column)
            if not pmid:
                continue
            if row.get('Abstract') in abstract_errors or row.get('FullTextLink') in link_errors:
                # Rows that recorded an error are fetched again.
                # NOTE(review): the retry is appended as a new row, so the old
                # error row stays in the file alongside it.
                continue
            processed.add(pmid)
    return processed


def main():
    """Enrich the input CSV with abstracts and PMC links, resuming earlier runs.

    Reads ``INPUT_CSV`` (which must have ``pmid``, ``title`` and
    ``publication date`` columns, matched case-insensitively), skips PMIDs
    already present without error markers in ``OUTPUT_CSV``, and for every
    remaining article fetches the abstract (by PMID, then by title) and the
    full-text link. Each result is inserted into the TinyDB database and
    appended to ``OUTPUT_CSV``. Progress and errors are printed; nothing is
    returned.
    """
    # Collect PMIDs that an earlier run already handled successfully.
    processed_pmids = _load_processed_pmids(OUTPUT_CSV)

    # Read the input CSV.
    try:
        with open(INPUT_CSV, mode='r', encoding='utf-8-sig', newline='') as infile:
            reader = csv.DictReader(infile)
            # Show the header names exactly as read.
            print(f"Original CSV Headers: {reader.fieldnames}")
            normalized_headers = normalize_fieldnames(reader.fieldnames)
            print(f"Normalized CSV Headers: {normalized_headers}")
            # Make sure the required columns are present.
            expected_fields = ['pmid', 'title', 'publication date']
            missing_fields = [field for field in expected_fields if field not in normalized_headers]
            if missing_fields:
                print(f"Error: Missing expected fields in CSV headers: {', '.join(missing_fields)}")
                print(f"Available headers: {reader.fieldnames}")
                return
            # Map normalized header name -> header name as written in the file.
            header_mapping = {field.replace('\ufeff', '').strip().lower(): field for field in reader.fieldnames}
            pmid_field = header_mapping['pmid']
            title_field = header_mapping['title']
            pubdate_field = header_mapping['publication date']
            # Output columns: the input columns plus the two fetched ones.
            fieldnames = list(reader.fieldnames) + ['Abstract', 'FullTextLink']
            articles = list(reader)
    except FileNotFoundError:
        print(f"Error: File '{INPUT_CSV}' not found.")
        return
    except Exception as e:
        print(f"Error reading '{INPUT_CSV}': {e}")
        return

    # Keep only articles that have a PMID and were not handled before.
    to_process = []
    for article in articles:
        pmid = _clean_cell(article, pmid_field)
        if pmid and pmid not in processed_pmids:
            to_process.append(article)

    total_to_process = len(to_process)
    print(f"Total articles to process: {total_to_process}")

    # Nothing to do: stop before touching the output file.
    if total_to_process == 0:
        print("No articles need processing. Exiting.")
        return

    # A header is needed for a new file and for an existing but empty one.
    write_header = not os.path.exists(OUTPUT_CSV) or os.path.getsize(OUTPUT_CSV) == 0
    try:
        with open(OUTPUT_CSV, mode='a', encoding='utf-8', newline='') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            if write_header:
                writer.writeheader()
            for batch_count, article in enumerate(to_process, start=1):
                pmid = _clean_cell(article, pmid_field)
                title = _clean_cell(article, title_field)
                publication_date = _clean_cell(article, pubdate_field)

                # Debug output
                print(f"Processing PMID={pmid}, Title='{title}', Publication Date='{publication_date}'")

                # Fetch the abstract by PMID, falling back to a title search.
                abstract = get_abstract(pmid)
                if abstract == "No Abstract Available":
                    abstract = get_abstract_by_title(title)
                    if abstract == "No Abstract Available":
                        print(f"No abstract found for PMID {pmid} and Title '{title}'.")

                # Fetch the full-text link.
                fulltext_link = get_fulltext_link(pmid)

                # Error markers are stored as-is so a later run can retry them.
                article['Abstract'] = abstract
                article['FullTextLink'] = fulltext_link

                # Store the record in TinyDB.
                db.insert({
                    'PMID': pmid,
                    'Title': title,
                    'Publication Date': publication_date,
                    'Abstract': abstract,
                    'FullTextLink': fulltext_link
                })

                # Append the row to the output CSV.
                writer.writerow(article)

                # Flush to disk every 10 articles.
                if batch_count % 10 == 0:
                    outfile.flush()
                    print(f"Processed {batch_count} articles. Data saved.")

                # Pause to stay clear of the NCBI rate limit.
                # NOTE(review): each article issues two to four requests
                # before this pause, so the request rate is higher than one
                # per 0.34 s — confirm it fits the limit for the API key.
                time.sleep(0.34)

            # Make sure everything is on disk once the loop finishes.
            outfile.flush()
            print(f"All {total_to_process} articles have been processed and saved.")
    finally:
        # Close the database even when the loop is interrupted by an error.
        db.close()

if __name__ == "__main__":
    main()


Original CSV Headers: ['PMID', 'Title', 'Publication Date']
Normalized CSV Headers: ['pmid', 'title', 'publication date']
Total articles to process: 559
Processing PMID=39318983, Title='Adverse Events Associated With 3 Vitrectomy Platforms Reported to the US FDA MAUDE Database.', Publication Date='Sep, 2024'
No abstract found for PMID 39318983 and Title 'Adverse Events Associated With 3 Vitrectomy Platforms Reported to the US FDA MAUDE Database.'.
Processing PMID=39311943, Title='Adverse events related to robotic-assisted knee arthroplasty: a cross-sectional study from the MAUDE database.', Publication Date='Sep, 2024'
Processing PMID=39284394, Title='How Often Does Technology Fail in Robotic-Assisted Arthroplasty? A Comprehensive Analysis of a United States Food and Drug Administration Database.', Publication Date='Sep, 2024'
Processing PMID=39281406, Title='Adverse Events of Femtosecond-Assisted Laser-Assisted In situ Keratomileusis: A Manufacturer and User Facility Device Experience

In [4]:
import pandas as pd
from scholarly import scholarly
import time
import os
import sys

def read_output_csv(output_csv):
    """
    Load the output CSV and keep only the rows whose ``Abstract`` column
    equals "No Abstract Available".

    Prints a message and exits the process with status 1 when the file is
    missing or cannot be read or filtered.

    :param output_csv: path of the output CSV file
    :return: DataFrame holding the rows without an abstract
    """
    try:
        table = pd.read_csv(output_csv, encoding='utf-8')
        lacks_abstract = table['Abstract'] == "No Abstract Available"
        return table[lacks_abstract]
    except FileNotFoundError:
        print(f"错误: 文件 '{output_csv}' 未找到。")
        sys.exit(1)
    except Exception as e:
        print(f"读取 '{output_csv}' 时出错: {e}")
        sys.exit(1)

def search_abstract_by_title(title):
    """
    Search for an abstract by article title using the scholarly library.

    Only the first search result is inspected.
    NOTE(review): the result is not checked against the requested title, so
    the returned abstract may belong to a different publication.

    :param title: article title; anything that is not a non-blank string
        (e.g. the NaN pandas uses for an empty cell) is treated as missing
    :return: the abstract text, "No Abstract Available" when the title is
        missing or nothing usable is found, or
        "Error Fetching Abstract by Title" when the search raises
    """
    if not isinstance(title, str) or not title.strip():
        # Nothing to search for; skip the request instead of recording an error.
        print("标题为空，跳过搜索。")
        return "No Abstract Available"
    try:
        search_query = scholarly.search_pubs(title)
        pub = next(search_query, None)
        if not pub:
            print("通过标题搜索未找到任何结果。")
            return "No Abstract Available"
        # A result without a 'bib' entry is handled like one without an abstract.
        abstract = (pub.get('bib') or {}).get('abstract')
        if abstract:
            print(f"找到摘要: {abstract[:100]}...")  # show the first 100 characters
            return abstract
        print("通过标题搜索未找到摘要。")
        return "No Abstract Available"
    except Exception as e:
        print(f"搜索标题 '{title}' 时出错: {e}")
        return "Error Fetching Abstract by Title"

def update_abstracts(df, output_csv, backup_csv):
    """
    Look up an abstract for every row of ``df`` and write each result back
    to the output CSV, saving the file after every single update.

    A backup copy of the output CSV is created first unless one already
    exists at ``backup_csv``. The process exits with status 1 when the
    output CSV cannot be re-read or saved.

    :param df: DataFrame of the rows that have no abstract
    :param output_csv: path of the output CSV file to update
    :param backup_csv: path of the backup CSV file
    :return: the full, updated DataFrame
    """
    # Create the backup only when none exists yet.
    if os.path.exists(backup_csv):
        print(f"备份文件 '{backup_csv}' 已存在。")
    else:
        snapshot = pd.read_csv(output_csv, encoding='utf-8')
        snapshot.to_csv(backup_csv, index=False, encoding='utf-8')
        print(f"已创建备份文件 '{backup_csv}'。")

    # Load the complete output CSV that will receive the updates.
    try:
        full_df = pd.read_csv(output_csv, encoding='utf-8')
    except Exception as e:
        print(f"读取 '{output_csv}' 时出错: {e}")
        sys.exit(1)

    # Walk the rows without an abstract and update them one at a time.
    for _, record in df.iterrows():
        pmid, title = record['PMID'], record['Title']
        print(f"\n处理 PMID: {pmid}, Title: '{title}'")
        new_abstract = search_abstract_by_title(title)
        same_pmid = full_df['PMID'] == pmid
        full_df.loc[same_pmid, 'Abstract'] = new_abstract

        # Persist after every update so an interruption loses at most one row.
        try:
            full_df.to_csv(output_csv, index=False, encoding='utf-8')
            print(f"已更新并保存到 '{output_csv}'。")
        except Exception as e:
            print(f"保存更新到 '{output_csv}' 时出错: {e}")
            sys.exit(1)

        # Pause between searches to avoid triggering anti-scraping measures.
        time.sleep(2)

    return full_df


def main():
    """
    Back-fill abstracts for every output-CSV row still marked
    "No Abstract Available", then report how many remain.
    """
    OUTPUT_CSV = "./maude_papers_output.csv"
    BACKUP_CSV = "./maude_papers_output_backup.csv"

    # Load the rows that still lack an abstract.
    pending = read_output_csv(OUTPUT_CSV)
    total_no_abstract = len(pending)
    print(f"总共有 {total_no_abstract} 条记录的摘要为 'No Abstract Available'。")

    if total_no_abstract == 0:
        print("所有记录都有摘要。无需进一步操作。")
        return

    # Search for the missing abstracts and write them back.
    refreshed = update_abstracts(pending, OUTPUT_CSV, BACKUP_CSV)

    # Count what is still missing after the update.
    still_missing = refreshed['Abstract'] == "No Abstract Available"
    updated_no_abstract = len(refreshed[still_missing])
    print(f"\n更新后仍有 {updated_no_abstract} 条记录的摘要为 'No Abstract Available'。")

if __name__ == "__main__":
    main()


总共有 1 条记录的摘要为 'No Abstract Available'。
备份文件 './maude_papers_output_backup.csv' 已存在。

处理 PMID: 37225596, Title: 'EDITORIAL COMMENT Re: Device Failure and Adverse Events Related to Single-use and Reusable Flexible Ureteroscopes: Findings and New Insights From an 11-Year Analysis of the Manufacturer and User Facility Device Experience Database. Urology. 2023 Jul;177:41-47.'
通过标题搜索未找到任何结果。
已更新并保存到 './maude_papers_output.csv'。

更新后仍有 1 条记录的摘要为 'No Abstract Available'。
