从网上下载rss文件

In [1]:
import requests
import os
import xml.etree.ElementTree as ET
name_list = ["miccai","isbi","mlmi","spiemi","smc","bibm","chil","bhi","bcb","amia","ipmi","cbms","ner","ichi","biostec","cinc","psb","miccai","icphm","memea","chase","bibe","bmei","recomb","mie","bsn","icbbe","icdh","miua","sipaim","cibcb","vcbm","aime","isbra","segah","ict4ageingwell"]
def parse_rss_items(conf_id):
    rss_url = f"https://dblp.uni-trier.de/feed/streams/conf/{conf_id}.rss"
    response = requests.get(rss_url)

    if response.status_code != 200:
        raise Exception(f"❌ Failed to fetch RSS: {response.status_code}")
    root = ET.fromstring(response.content)
    items = root.findall(".//item")

    result = []
    for item in items:
        title = item.find("title").text     # 如 "BSN 2024"
        link = item.find("link").text       # 如 https://dblp.org/db/conf/bsn/bsn2024.html
        result.append((title, link))
    return result


从 dblp 链接中提取 conf_id 和年份，构造 API 地址并下载 JSON 文件

In [1]:
import os
import requests
import xml.etree.ElementTree as ET
import re
def parse_local_rss_file(file_path):
    """
    从本地 RSS 文件中提取 link 列表
    """
    tree = ET.parse(file_path)
    root = tree.getroot()
    items = root.findall(".//item")

    result = []
    for item in items:
        link = item.find("link").text.strip()
        result.append(link)
    return result
def parse_conf_year_from_link(link):
    """
    从 link 中提取 conf 和 year
    例：https://dblp.org/db/conf/bsn/bsn2024.html → ('bsn', '2024')
    """
    parts = link.strip().split("/")
    conf = parts[-2]
    filename = parts[-1].replace(".html", "")  # 如 bsn2024
    year_match = re.search(r'(\d{4})', filename)
    if year_match:
        year = year_match.group(1)
        return conf, year
    else:
        return conf, None  # 无年份信息

In [2]:
import time


def download_dblp_json_from_link(link, base_dir="data"):
    """
    从 link 构造 API 请求并保存为 data/{conf}/{conf}_{year}.json
    """
    conf, year = parse_conf_year_from_link(link)
    if not year:
        print(f"⚠️ 无法从链接中提取年份: {link}")
        return

    conf_year_id = f"{conf}{year}"
    query = f"toc:db/conf/{conf}/{conf_year_id}.bht:"
    url = f"https://dblp.org/search/publ/api?q={query}&h=1000&format=json"

    save_dir = os.path.join(base_dir, conf.upper())
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, f"{conf.upper()}_{year}.json")
    
    time.sleep(3)  # 避免请求过快导致被封禁
    response = requests.get(url)
    if response.status_code == 200:
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(response.text)
        print(f"✅ Saved: {save_path}")
    else:
        print(f"❌ Failed: {conf}_{year} ({response.status_code})")

In [3]:
def download_all_from_local_rss(rss_file_path):
    """
    主函数：读取本地 RSS，逐个处理链接并下载 JSON
    """
    print(f"📂 正在读取本地 RSS 文件: {rss_file_path}")
    links = parse_local_rss_file(rss_file_path)
    print(f"🔍 共找到 {len(links)} 个年份链接")

    for link in links:
        print(f"➡️ 正在处理: {link}")
        download_dblp_json_from_link(link)

In [4]:
# miccai 不行
download_all_from_local_rss("rss/miccai.rss")

📂 正在读取本地 RSS 文件: miccai.rss
🔍 共找到 281 个年份链接
➡️ 正在处理: https://dblp.org/db/conf/miccai/toothfairy2024.html
✅ Saved: data\MICCAI\MICCAI_2024.json
➡️ 正在处理: https://dblp.org/db/conf/miccai/uwf4dr2024.html
✅ Saved: data\MICCAI\MICCAI_2024.json
➡️ 正在处理: https://dblp.org/db/conf/miccai/stacom2024.html
✅ Saved: data\MICCAI\MICCAI_2024.json
➡️ 正在处理: https://dblp.org/db/conf/miccai/cdmri2024.html
✅ Saved: data\MICCAI\MICCAI_2024.json
➡️ 正在处理: https://dblp.org/db/conf/miccai/diamond2024.html
✅ Saved: data\MICCAI\MICCAI_2024.json
➡️ 正在处理: https://dblp.org/db/conf/miccai/ldtm2024.html
✅ Saved: data\MICCAI\MICCAI_2024.json
➡️ 正在处理: https://dblp.org/db/conf/miccai/care2024.html
✅ Saved: data\MICCAI\MICCAI_2024.json
➡️ 正在处理: https://dblp.org/db/conf/miccai/grail2024.html
✅ Saved: data\MICCAI\MICCAI_2024.json
➡️ 正在处理: https://dblp.org/db/conf/miccai/hntsmrg2024.html
✅ Saved: data\MICCAI\MICCAI_2024.json
➡️ 正在处理: https://dblp.org/db/conf/miccai/lisa2024.html
✅ Saved: data\MICCAI\MICCAI_2024.json
➡️ 正在处理: