# 测试 Unstructured 对 PDF 的解析能力

这个 notebook 用于验证 `unstructured` 库是否能正确解析指定的 PDF 文件，并识别出其中的不同内容元素，例如表格、标题、正文等。

In [1]:
import os
from unstructured.partition.pdf import partition_pdf
from unstructured.documents.elements import Text, Title, Table

# Path of the PDF used for this parsing test.
pdf_path = os.path.join('data', 'embodia', 'pdf', 's10846-019-01130-x.pdf')

# Require a regular file: a directory with the same name would satisfy a bare
# existence check and only fail later, inside the parser.
if not os.path.isfile(pdf_path):
    print(f"错误：文件未找到 '{pdf_path}'")
else:
    print(f"正在处理文件: {pdf_path}")

    # Parse the PDF with the 'hi_res' strategy, which relies on an underlying
    # layout model to recognise the document structure more precisely.
    # NOTE(review): newer unstructured releases appear to call this parameter
    # `hi_res_model_name` -- confirm against the installed version.
    elements = partition_pdf(
        filename=pdf_path,
        strategy='hi_res',  # high-resolution strategy, better at tables etc.
        infer_table_structure=True,  # try to infer the table structure
        model_name='yolox',  # state the layout model explicitly
    )

    print("\n--- 解析出的所有元素类型 ---")
    # Distinct element class names, printed in sorted order.
    element_types = {type(el).__name__ for el in elements}
    print(sorted(element_types))

    print("\n--- 元素内容预览 (前5个) ---")
    for i, el in enumerate(elements[:5], start=1):
        print(f"- Element {i} (类型: {type(el).__name__}):")
        # Show at most the first 200 characters of each element's text.
        print(f"{str(el)[:200]}...\n")

    print("\n--- 识别到的表格内容 ---")
    tables = [el for el in elements if isinstance(el, Table)]
    if tables:
        for i, table in enumerate(tables, start=1):
            print(f"- Table {i}:")
            # A Table element exposes its plain-text form as `.text` and an
            # HTML form as `.metadata.text_as_html`; only the text is shown.
            print(table.text)
            print("-----\n")
    else:
        print("未在此文档中识别到任何表格。")

    print("\n--- 过滤掉表格和特定类型后的主体内容 ---")
    # Elements are filtered by their class name. Commonly excluded kinds are
    # Header, Footer, PageNumber and Table.
    # Note: finer-grained kinds such as 'Bibliography' or 'Formula' may not
    # always be recognised reliably.
    excluded_types = ["Header", "Footer", "PageNumber", "Table"]

    main_content = [
        str(el) for el in elements if type(el).__name__ not in excluded_types
    ]

    # Merge the remaining elements into a single newline-separated string.
    full_text = "\n".join(main_content)

    print("处理后的主体内容（预览前500个字符）:")
    print(full_text[:500])

    # The complete text can also be written to a file for inspection:
    # with open("output_text.txt", "w", encoding="utf-8") as f:
    #     f.write(full_text)
    # print("\n完整内容已写入 output_text.txt")

  from .autonotebook import tqdm as notebook_tqdm


正在处理文件: data\embodia\pdf\s10846-019-01130-x.pdf


The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`



--- 解析出的所有元素类型 ---
['FigureCaption', 'Formula', 'Header', 'Image', 'ListItem', 'NarrativeText', 'Table', 'Text', 'Title']

--- 元素内容预览 (前5个) ---
- Element 1 (类型: Header):
Journal of Intelligent & Robotic Systems (2020) 99:407–425 https://doi.org/10.1007/s10846-019-01130-x...

- Element 2 (类型: Title):
Ground Vehicle Driving by Full Sized Humanoid...

- Element 3 (类型: Image):
Check for updates...

- Element 4 (类型: Title):
Kiwon Sohn1 · Giho Jang2...

- Element 5 (类型: NarrativeText):
Received: 18 January 2019 / Accepted: 2 December 2019 / Published online: 27 December 2019 © Springer Nature B.V. 2019...


--- 识别到的表格内容 ---
- Table 1:
Tasks Success rate [%] Driving 92.1 29 - 122 29 - 55 with driving assistant system Door 94.4 43 - 95 Valve 93.0 55 - 129 Rubble 86.3 30 - 228 Stairs 91.2 218 - 298 Avg. Rate 91.4
-----

- Table 2:
Tasks S F Best team and record Driving Door Valve Rubble Stairs 19 18 16 8 7 4 5 7 15 16 DRC-Hubo@UNLV: 00:55 NEDO-JSK: 00:50 KAIST: 00:33 DRC-Hubo@UNLV: 00:57 NEDO-