In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time

# Search term inserted into the PubMed query for every year.
term = "bisbenzylisoquinolines"
# Publication-year range to scan; both ends are included by the main loop.
start_year = 1945
end_year = 2025
# Output file that per-year results are appended to.
csv_file = "pubmed_bisBIAs_results.csv"
# Column order of the output CSV; rows appended later must match it.
columns = ["Year", "PMID", "DOI", "Abstract"]

# Initialise the CSV file: write a header-only file so later appends
# (which pass header=False) land under the right column names.
pd.DataFrame(columns=columns).to_csv(csv_file, index=False)


def get_literature_id(term, year):
    """Return the PubMed IDs matching ``term`` for one publication year.

    Sends a single ESearch request for ``"{term} AND {year}[pdat]"``.

    Args:
        term: Search term inserted into the PubMed query.
        year: Publication year used for the ``[pdat]`` filter.

    Returns:
        A ``(id_list, count)`` tuple. ``id_list`` holds the ``<Id>`` values
        from the response as strings; ``count`` is the total number of hits
        reported by the server. On a request failure ``([], 0)`` is returned
        after printing the error.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    # ESearch documents 10,000 as the largest retmax it will honour, so
    # asking for more only hides the fact that results can be truncated.
    retmax = 10000
    params = {
        "db": "pubmed",
        "term": f"{term} AND {year}[pdat]",
        "retmax": retmax,
    }

    try:
        time.sleep(0.34)  # stay under the NCBI rate limit (~3 requests/s)
        response = requests.get(base_url, params=params, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching IDs for {year}: {str(e)}")
        return [], 0

    soup = BeautifulSoup(response.text, "xml")
    id_list = [id_tag.text for id_tag in soup.find_all("Id")]

    # The first <Count> element is the total hit count for the query.
    count_tag = soup.find("Count")
    try:
        count = int(count_tag.text) if count_tag is not None else 0
    except ValueError:
        # Malformed count: fall back to what was actually returned rather
        # than crashing or discarding the IDs.
        count = len(id_list)

    if count > len(id_list):
        # Make truncation visible instead of silently losing records.
        print(
            f"Warning: {year} has {count} matches but only "
            f"{len(id_list)} IDs were returned (retmax={retmax})"
        )

    return id_list, count


def parse_article(article_text):
    """Extract PMID, DOI and abstract from one MEDLINE-format record.

    Args:
        article_text: Text of a single record, one tagged field per line
            (``"PMID- ..."``, ``"AB  - ..."``, ``"AID - ... [doi]"``), with
            wrapped field values continued on indented lines.

    Returns:
        A dict with keys ``"PMID"``, ``"DOI"`` and ``"Abstract"``. A field
        that cannot be found is ``None``. If the input is not text or has
        no PMID line, a message is printed and all three values are ``None``.
    """
    record = {"PMID": None, "DOI": None, "Abstract": None}

    try:
        # Only trailing whitespace is removed: the leading indentation is
        # what distinguishes a continuation line from a new tagged field.
        lines = [line.rstrip() for line in article_text.split("\n")]
    except (AttributeError, TypeError) as e:
        print(f"解析文献时发生错误: {str(e)}")
        return record

    # PMID
    pmid_line = next(
        (line for line in lines if line.startswith("PMID- ")), None
    )
    if pmid_line is None:
        print("解析文献时发生错误: PMID field not found")
        return record
    record["PMID"] = pmid_line[len("PMID- "):].strip()

    # DOI: prefer the article identifier (AID); fall back to the location
    # identifier (LID), which is the only place some records carry it.
    for tag in ("AID - ", "LID - "):
        doi_line = next(
            (
                line for line in lines
                if line.startswith(tag) and "[doi]" in line
            ),
            None,
        )
        if doi_line is not None:
            # Slice off the tag instead of splitting on "- ", which would
            # cut the value short if it ever contained that sequence.
            record["DOI"] = doi_line[len(tag):].split("[doi]")[0].strip()
            break

    # Abstract: the "AB  - " line plus the indented lines that follow it.
    abstract = []
    in_abstract = False
    for line in lines:
        if not in_abstract:
            if line.startswith("AB  - "):
                in_abstract = True
                abstract.append(line[6:].strip())
            continue
        if line[:1].isspace() and line.strip():
            abstract.append(line.strip())
        else:
            # A blank line or any new tagged field ends the abstract.
            break
    record["Abstract"] = " ".join(abstract) if abstract else None

    return record


def get_detailed_info(id_list, year):
    """Download and parse the MEDLINE records for a list of PubMed IDs.

    IDs are requested from EFetch in batches of 200. A batch whose request
    fails is reported and skipped; the remaining batches are still fetched.

    Args:
        id_list: PubMed IDs as strings; an empty or ``None`` value yields
            an empty result.
        year: Year stored in each output row and used in log messages.

    Returns:
        A list of ``[year, PMID, DOI, Abstract]`` rows, one per record whose
        PMID could be parsed.
    """
    if not id_list:
        return []

    efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    batch_size = 200
    rows = []

    for offset in range(0, len(id_list), batch_size):
        chunk = id_list[offset:offset + batch_size]
        query = {
            "db": "pubmed",
            "id": ",".join(chunk),
            "rettype": "medline",
            "retmode": "text",
        }

        try:
            time.sleep(0.34)  # pause between requests
            response = requests.get(efetch_url, params=query, timeout=10)
            response.raise_for_status()
        except Exception as e:
            print(f"Error fetching details for {year} batch {offset}: {str(e)}")
            continue

        # Each record runs from its "PMID- " line up to the next one (or
        # the end of the response), so the PMID line stays in the record.
        matches = re.finditer(
            r'PMID- .*?(?=\nPMID- |\Z)', response.text, flags=re.DOTALL
        )
        for match in matches:
            parsed = parse_article(match.group(0))
            if parsed["PMID"]:
                rows.append(
                    [year, parsed["PMID"], parsed["DOI"], parsed["Abstract"]]
                )
            else:
                print(f"跳过无法解析PMID的记录，年份：{year}")

    return rows


# Main loop: query, fetch and store the results one year at a time.
for year in range(start_year, end_year + 1):
    print(f"Processing year {year}...")

    # Step 1: look up the matching PubMed IDs for this year.
    id_list, count = get_literature_id(term, year)
    print(f"Found {count} articles for {year}")
    if count == 0:
        continue

    # Step 2: download and parse the records behind those IDs.
    details = get_detailed_info(id_list, year)
    if not details:
        print(f"No details found for {year}\n")
        continue

    # Step 3: append this year's rows under the header written earlier.
    pd.DataFrame(details, columns=columns).to_csv(
        csv_file,
        mode="a",
        header=False,
        index=False,
    )
    print(f"Saved {len(details)} records for {year}\n")

print("All years processed!")

Processing year 1945...
Found 4 articles for 1945
Saved 4 records for 1945

Processing year 1946...
Found 16 articles for 1946
Saved 16 records for 1946

Processing year 1947...
Found 45 articles for 1947
Saved 45 records for 1947

Processing year 1948...
Found 32 articles for 1948
Saved 32 records for 1948

Processing year 1949...
Found 39 articles for 1949
Saved 39 records for 1949

Processing year 1950...
Found 81 articles for 1950
Saved 81 records for 1950

Processing year 1951...
Found 70 articles for 1951
Saved 70 records for 1951

Processing year 1952...
Found 65 articles for 1952
Saved 65 records for 1952

Processing year 1953...
Found 76 articles for 1953
Saved 76 records for 1953

Processing year 1954...
Found 54 articles for 1954
Saved 54 records for 1954

Processing year 1955...
Found 42 articles for 1955
Saved 42 records for 1955

Processing year 1956...
Found 60 articles for 1956
Saved 60 records for 1956

Processing year 1957...
Found 61 articles for 1957
Saved 61 record