In [1]:

import os
import requests


sudo docker run --rm -p 8070:8070 grobid/grobid:0.8.1运行前命令行启动 

In [40]:

def parse_pdfs_with_grobid(pdf_dir, grobid_url="http://localhost:8070/api/processFulltextDocument"):
    """
    解析指定文件夹内所有 PDF 文件，返回解析结果列表（每项包含文件名和返回XML文本）
    
    参数:
        pdf_dir (str): PDF 文件夹路径
        grobid_url (str): GROBID API 地址（默认是本地端口 8070）
    
    返回:
        List[Dict]: [{'filename': 'xxx.pdf', 'tei': '<TEI>...</TEI>'}, ...]
    """
    result_list = []

    for filename in os.listdir(pdf_dir):
        if filename.lower().endswith(".pdf"):
            filepath = os.path.join(pdf_dir, filename)
            print(f"解析中: {filename}")
            try:
                with open(filepath, 'rb') as f:
                    response = requests.post(
                        grobid_url,
                        files={'input': (filename, f, 'application/pdf')}
                    )
                if response.status_code == 200:
                    result_list.append({
                        "filename": filename,
                        "tei": response.text
                    })
                else:
                    print(f"❌ 解析失败: {filename}, 状态码: {response.status_code}")
            except Exception as e:
                print(f"❌ 解析出错: {filename}, 错误: {e}")
    
    return result_list


In [41]:
pdf_folder_path = "/home/wbh/knowledge/files"
results = parse_pdfs_with_grobid(pdf_folder_path)

解析中: chi24_使用强化学习模拟人类情绪.pdf
解析中: chi24_用强化学习方法来进行任务切换.pdf
解析中: chi21_通过强化学习调整用户界面.pdf


In [31]:
import os

def save_tei_results(results, output_folder):
    """
    将 results 中的每个 TEI 字符串保存为 .tei.xml 文件。
    每个元素应该是 {'filename': ..., 'tei': ...}
    """
    os.makedirs(output_folder, exist_ok=True)

    for idx, item in enumerate(results):
        tei_xml = item.get("tei", "")  # 安全获取
        filename = os.path.splitext(os.path.basename(item.get("filename", f"document_{idx+1}")))[0]
        file_path = os.path.join(output_folder, f"{filename}.tei.xml")

        with open(file_path, "w", encoding="utf-8") as f:
            f.write(tei_xml)

        print(f"已保存: {file_path}")




In [32]:
save_tei_results(results, "./tei_files")

已保存: ./tei_files/chi24_使用强化学习模拟人类情绪.tei.xml
已保存: ./tei_files/chi24_用强化学习方法来进行任务切换.tei.xml
已保存: ./tei_files/chi21_通过强化学习调整用户界面.tei.xml


In [42]:
import re
from bs4 import BeautifulSoup, Tag, NavigableString

def extract_nested_sections_with_numbered_citations(tei_xml):
    soup = BeautifulSoup(tei_xml, "lxml-xml")
    section_tree = {}

    # 引文原始结构映射：id -> biblStruct
    bibl_structs = {b.get("xml:id"): b for b in soup.find_all("biblStruct")}

    path_stack = []

    for div in soup.find_all("div"):
        head = div.find("head")
        if not head:
            continue

        section_title = head.get_text(strip=True)
        n = head.get("n", None)
        level = len(n.split(".")) if n else 1
        path_stack = path_stack[:level - 1]
        path_stack.append(section_title)

        # 进入嵌套结构
        current = section_tree
        for t in path_stack[:-1]:
            current = current.setdefault(t, {"text": "", "citations": [], "subsections": {}})["subsections"]

        section_text = ""
        citation_order = []

        for p in div.find_all("p"):
            new_p = ""
            pending_refs = []

            children = list(p.children)
            i = 0
            while i < len(children):
                node = children[i]

                if isinstance(node, NavigableString):
                    for ref_id in pending_refs:
                        if ref_id not in citation_order and ref_id in bibl_structs:
                            citation_order.append(ref_id)
                        if ref_id in citation_order:
                            ref_number = citation_order.index(ref_id) + 1
                            new_p += f"[{ref_number}]"
                    pending_refs = []

                    cleaned = re.sub(r'\[\d+(?:,\s*\d+)*\]', '', node)
                    new_p += cleaned

                elif isinstance(node, Tag) and node.name == "ref" and node.get("type") == "bibr":
                    ref_id = node.get("target", "").lstrip("#")
                    if ref_id:
                        pending_refs.append(ref_id)

                else:
                    for ref_id in pending_refs:
                        if ref_id not in citation_order and ref_id in bibl_structs:
                            citation_order.append(ref_id)
                        if ref_id in citation_order:
                            ref_number = citation_order.index(ref_id) + 1
                            new_p += f"[{ref_number}]"
                    pending_refs = []

                    try:
                        raw = node.get_text()
                        cleaned = re.sub(r'\[\d+(?:,\s*\d+)*\]', '', raw)
                        new_p += cleaned
                    except:
                        continue

                i += 1

            for ref_id in pending_refs:
                if ref_id not in citation_order and ref_id in bibl_structs:
                    citation_order.append(ref_id)
                if ref_id in citation_order:
                    ref_number = citation_order.index(ref_id) + 1
                    new_p += f"[{ref_number}]"
            pending_refs = []

            section_text += new_p.strip() + " "

        # 构造结构化引用信息
        structured_citations = []
        for i, ref_id in enumerate(citation_order):
            bibl = bibl_structs.get(ref_id)
            if not bibl:
                continue
            title_tag = bibl.find("title")
            title = title_tag.get_text(strip=True) if title_tag else "Unknown Title"

            authors = bibl.find_all("persName")
            author_names = [a.get_text(" ", strip=True) for a in authors]
            author_str = ", ".join(author_names)

            date_tag = bibl.find("date")
            year = date_tag.get("when", "") if date_tag else ""

            structured_citations.append({
                "id": f"[{i+1}]",
                "text": title,
                "author": author_str,
                "year": year
            })

        # 填充最终结构
        current[path_stack[-1]] = {
            "text": section_text.strip(),
            "citations": structured_citations,
            "subsections": {}
        }

    return section_tree


In [43]:
import os
import json

tei_folder = "/home/wbh/knowledge/tei_files"
json_folder = "/home/wbh/knowledge/json_files"

# 如果 json_files 文件夹不存在就创建它
os.makedirs(json_folder, exist_ok=True)

for filename in os.listdir(tei_folder):
    if filename.endswith(".xml"):
        file_path = os.path.join(tei_folder, filename)

        # 读取 TEI XML 内容
        with open(file_path, 'r', encoding='utf-8') as f:
            tei_xml = f.read()

        # 提取结构信息
        parsed = extract_nested_sections_with_numbered_citations(tei_xml)

        # 保存为 JSON，路径换成 json_folder 目录
        json_filename = filename.replace(".xml", ".json")
        json_path = os.path.join(json_folder, json_filename)

        with open(json_path, "w", encoding="utf-8") as f_out:
            json.dump(parsed, f_out, indent=2, ensure_ascii=False)

        print(f"✅ Saved: {json_path}")


✅ Saved: /home/wbh/knowledge/json_files/chi24_使用强化学习模拟人类情绪.tei.json
✅ Saved: /home/wbh/knowledge/json_files/chi24_用强化学习方法来进行任务切换.tei.json
✅ Saved: /home/wbh/knowledge/json_files/chi21_通过强化学习调整用户界面.tei.json


In [None]:
import os
import subprocess

# 设置路径
jar_path = "pdffigures2/pdffigures2.jar"
pdf_dir = "files"
output_dir = "output"

figures_dir = os.path.join(output_dir, "figures")
json_dir = os.path.join(output_dir, "json")

# 创建输出目录
os.makedirs(figures_dir, exist_ok=True)
os.makedirs(json_dir, exist_ok=True)

# 遍历所有 PDF
for filename in os.listdir(pdf_dir):
    if not filename.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_dir, filename)
    base_name = os.path.splitext(filename)[0]

    # prefix 只是路径前缀，不是文件名
    figure_prefix = os.path.join(figures_dir, base_name + "_")
    json_prefix = os.path.join(json_dir, base_name + "_")

    print(f"📄 正在处理: {filename} ...")

    cmd = [
        "java", "-Xmx4G", "-jar", jar_path,
        "-m", figure_prefix,                   # 图片文件前缀
        "-d", json_prefix,                     # JSON 文件前缀
        "-e",                                   # 忽略错误，继续执行
        pdf_path
    ]

    try:
        subprocess.run(cmd, check=True)
        print(f"✅ 处理完成: {filename}")
    except subprocess.CalledProcessError as e:
        print(f"❌ 出错: {filename}\n{e}")

print("\n🎉 所有 PDF 处理完成！")


📄 正在处理: chi24_使用强化学习模拟人类情绪.pdf ...
❌ 出错: chi24_使用强化学习模拟人类情绪.pdf
Command '['java', '-Xmx4G', '-jar', 'pdffigures2.jar', '-m', 'output/figures/chi24_使用强化学习模拟人类情绪/chi24_使用强化学习模拟人类情绪-', '-d', 'output/json/chi24_使用强化学习模拟人类情绪', '-e', 'files/chi24_使用强化学习模拟人类情绪.pdf']' returned non-zero exit status 1.
📄 正在处理: chi24_用强化学习方法来进行任务切换.pdf ...
❌ 出错: chi24_用强化学习方法来进行任务切换.pdf
Command '['java', '-Xmx4G', '-jar', 'pdffigures2.jar', '-m', 'output/figures/chi24_用强化学习方法来进行任务切换/chi24_用强化学习方法来进行任务切换-', '-d', 'output/json/chi24_用强化学习方法来进行任务切换', '-e', 'files/chi24_用强化学习方法来进行任务切换.pdf']' returned non-zero exit status 1.
📄 正在处理: chi21_通过强化学习调整用户界面.pdf ...
❌ 出错: chi21_通过强化学习调整用户界面.pdf
Command '['java', '-Xmx4G', '-jar', 'pdffigures2.jar', '-m', 'output/figures/chi21_通过强化学习调整用户界面/chi21_通过强化学习调整用户界面-', '-d', 'output/json/chi21_通过强化学习调整用户界面', '-e', 'files/chi21_通过强化学习调整用户界面.pdf']' returned non-zero exit status 1.

🎉 所有 PDF 处理完成！


Error: Unable to access jarfile pdffigures2.jar
Error: Unable to access jarfile pdffigures2.jar
Error: Unable to access jarfile pdffigures2.jar
