In [7]:
import sys
from pathlib import Path
import requests
import argparse
import logging


def test_single_file(file_path: str):
    """
    Test processing of a single document
    
    Args:
        file_path: Path to the document file to test
    """
    # API endpoint
    url = "http://localhost:8000/process"
    
    # Check file exists
    file_path = Path(file_path)
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")
    
    # logger.info(f"Testing file: {file_path.name}")
    
    # Prepare file for request
    with open(file_path, "rb") as f:
        files = [("files", f)]
        
        try:
            # Send request
            # logger.info("Sending request to API...")
            response = requests.post(url, files=files)
            
            # Check response
            assert response.status_code == 200, f"API request failed with status code: {response.status_code}"
            
            # Parse results
            results = response.json()
            return results
            
            # logger.info("Test completed successfully!")
            
        except Exception as e:
            # logger.error(f"Error during testing: {str(e)}")
            raise


In [12]:
# Run the helper on one sample PDF and keep the parsed API response for the cells below.
# NOTE(review): this is an absolute, user-specific path; it will not resolve on
# another machine unless the same directory layout exists there.
result=test_single_file("/Users/frank/PycharmProjects/myOmniDocParser/sampledocs/谷歌发布SigLIP 2：多语言视觉-语言编码器的革命性进步.pdf")

In [13]:
# Bare expression: the notebook displays the whole parsed response held in `result`.
result

{'谷歌发布SigLIP 2：多语言视觉-语言编码器的革命性进步.pdf': [{'content': '2/22/25, 8:11 AM 谷歌发布 SigLIP 2 ：多语言视觉 - 语言编码器的革命性进步\n谷歌发布SigLIP 2：多语言视觉-语言编码器的革命性进步\nInternLM3等LLM 2025年02月22日 06:48 机智流\n<image id="001">\nIt appears that the image is blank or contains no visible content. Therefore, there is no information to extract or analyze. If you have another image or specific content you\'d like me to look at, please share!\n</image>\n作者： InternLM3 等 LLM （内容可能有误，请仔细甄别）\n全文约  2400  字，预计阅读时间  6  分钟\n论文链接： https://huggingface.co/papers/2502.14786\n在人工智能领域，视觉-语言模型（Vision-Language Models, VLMs）的技术进步日新月异。最\n近，Google DeepMind推出了一款名为SigLIP 2的新多语言视觉-语言编码器。这款模型在原有的\nSigLIP基础上进行了大幅优化，不仅在核心任务上表现出色，还扩展了多语言支持、密集预测和定\n位能力。今天，我们将以第三方的视角，为大家详细解读SigLIP 2的亮点和优势。\n🌟 SigLIP 2是什么？有哪些突破？\nSigLIP 2是SigLIP的升级版本，它通过融合多种先进技术，包括基于字幕的预训练、自监督损失\n（如自蒸馏、掩码预测）和在线数据整理，显著提升了性能。相比前代，SigLIP 2在零-shot分\n类、图像-文本检索以及作为VLMs视觉编码器的迁移性能上都有了明显进步。更令人兴奋的是，它\n在定位任务和密集预测任务（如分割和深度估计）上也实现了质的飞跃。\n🔍 SigLIP 2的五大核心优势\n1. 🚀 强大的多语言支持\nSigLIP 2不仅在以英语为主的视觉-语言任务中表现出色，

In [16]:
# The response is keyed by the uploaded file name and maps to a list of entries;
# print the 'content' text of the entry at index 2.
print(result['谷歌发布SigLIP 2：多语言视觉-语言编码器的革命性进步.pdf'][2]['content'])

2/22/25, 8:11 AM 谷歌发布 SigLIP 2 ：多语言视觉 - 语言编码器的革命性进步
<image id="003">
The image contains a bar graph comparing various metrics across different datasets and models. Here’s a structured breakdown of the relevant information:

### Datasets/Models:
1. **A2D**
2. **AOKVQA-DA (val)**
3. **AOKVQA-MC (val)**
4. **COCO-35L (avg34)**
5. **COCO-35L (en)**
6. **COCOCap**
7. **CountBenchQA**
8. **DocVQA (val)**
9. **GQA**
10. **InfoVQA (val)**
11. **NLVR2**
12. **NoCaps**
13. **OCR-VQA**
14. **OKVQA**
15. **RefCOCO (testA)**
16. **RefCOCO (testB)**
17. **RefCOCO+ (val)**
18. **RefCOCO+ (testA)**
19. **RefCOCO+ (testB)**
20. **RefCOCOg (test)**
21. **RefCOCOg (val)**
22. **ST-VQA (val)**
23. **SciCap**
24. **ScienceQA**
25. **Screen2Words**
26. **TallyQA (complex)**
27. **TallyQA (simple)**
28. **TextCaps**
29. **TextVQA (val)**
30. **VQA2 (minival)**
31. **VizWizVQA (val)**
32. **WidgetCap**
33. **XM3600 (avg35)**
34. **XM3600 (en)**
35. **Average**

### Metrics:
- The bars represent different conf