In [36]:
import glob
import json
from pathlib import Path
import pymupdf
from papermage.magelib import Document
from docling_core.types.doc import DoclingDocument

from workspace.helpers import positions_from_box, get_cleaned_captions, get_caption_for_box, image_from_box

In [2]:
import logging

# Silence every log record: disabling at one level above CRITICAL (FATAL is an
# alias of CRITICAL) switches off all of the standard severities.
logging.disable(logging.CRITICAL + 1)

In [49]:
# Collect the evaluation PDFs. glob returns matches in arbitrary order, so sort
# them: later cells iterate over this list and pick the first matching entry.
input_pdf_paths = sorted(glob.glob("/workspace/data/test_eval/pdfs_100/*.pdf"))
len(input_pdf_paths)


100

In [50]:
# Collect the papermage JSON outputs. Sorted so that the "first path containing
# the DOI" lookup done later does not depend on the order glob happens to return.
papermage_paths = sorted(glob.glob("/workspace/data/test_eval/pm/*.json"))
len(papermage_paths)

97

In [51]:
# Collect the docling "fast" JSON outputs. Sorted so that the "first path
# containing the DOI" lookup done later does not depend on glob's arbitrary order.
docling_fast_paths = sorted(glob.glob("/workspace/data/test_eval/docling/fast/*.json"))
len(docling_fast_paths)

77

In [52]:
# Collect the docling "accurate" JSON outputs. Sorted so that the "first path
# containing the DOI" lookup done later does not depend on glob's arbitrary order.
docling_accurate_paths = sorted(glob.glob("/workspace/data/test_eval/docling/accurate/*.json"))
len(docling_accurate_paths)

79

In [7]:
# One identifier per input PDF: the file name without its extension.
# Path.stem strips only the final ".pdf" suffix, whereas str.replace would also
# delete a ".pdf" occurring anywhere else in the file name.
dois_100 = [Path(path).stem for path in input_pdf_paths]

In [8]:
def check_doi_in_paths(doi, paths):
    """Return the first entry of ``paths`` that contains ``doi``.

    The check is a plain substring test against each whole path string, in the
    order the paths are given.

    Args:
        doi: Identifier string to look for.
        paths: Iterable of path strings to search.

    Returns:
        The first matching path, or ``False`` when no path contains ``doi``.
    """
    return next((candidate for candidate in paths if doi in candidate), False)


In [58]:
# Identifier (PDF file name without ".pdf") of the single document inspected here.
TARGET_DOI = "10.1002$jmv.20254"
# Directory that receives the cropped papermage table images.
PM_IMAGE_DIR = Path("/workspace/data/test_eval/pm_images")


def _load_json(json_path):
    """Read and parse a UTF-8 JSON file, closing it before returning."""
    with open(json_path, "r", encoding="utf-8") as f:
        return json.load(f)


def _print_papermage_tables(doi, pm_doc, pdf_path):
    """Print the caption of each papermage table and save a crop of its first box."""
    cleaned_captions = get_cleaned_captions(pm_doc)
    # Make sure the output directory exists before images are written into it;
    # harmless if image_from_box already takes care of this.
    PM_IMAGE_DIR.mkdir(parents=True, exist_ok=True)
    for t_id, table in enumerate(pm_doc.tables):
        box = table.boxes[0]
        t_caption = get_caption_for_box(box=box, captions=pm_doc.captions, caption_ids=cleaned_captions["tables"])
        # Image name encodes the document, the page of the box and the table index.
        im_path = str(PM_IMAGE_DIR / f"{doi}_{box.page}_{t_id}.png")
        image_from_box(box=box, pdf_path=pdf_path, im_path=im_path, scale=pymupdf.Matrix(2, 2))
        print(t_caption)


def _print_docling_tables(doc):
    """Print the caption and the markdown rendering of each table in ``doc``."""
    for table in doc.tables:
        print(table.caption_text(doc))
        print(table.export_to_dataframe().to_markdown())


for doi in dois_100:
    # Only the target document is inspected; every other DOI is skipped.
    if doi != TARGET_DOI:
        continue
    print("--------------------------------")
    print(doi)
    print("--------------------------------")
    print("papermage")
    pm_path = check_doi_in_paths(doi, papermage_paths)
    if pm_path:
        pm_doc = Document.from_json(_load_json(pm_path))
        # Looked up once per document rather than once per table.
        pdf_path = check_doi_in_paths(doi, input_pdf_paths)
        _print_papermage_tables(doi, pm_doc, pdf_path)
    print()
    print("docling fast")
    dl_fast_path = check_doi_in_paths(doi, docling_fast_paths)
    if dl_fast_path:
        doc = DoclingDocument.model_validate(_load_json(dl_fast_path))
        _print_docling_tables(doc)

    print()
    print("docling accurate")
    dl_acc_path = check_doi_in_paths(doi, docling_accurate_paths)
    if dl_acc_path:
        doc = DoclingDocument.model_validate(_load_json(dl_acc_path))
        _print_docling_tables(doc)

--------------------------------
10.1002$jmv.20254
--------------------------------
papermage
TABLE I. Results of Anti-SARS-CoV IgG Antibody Detected by ELISARS TM Kit Study

docling fast

docling accurate


In [59]:
from docling.document_converter import DocumentConverter

# Convert the inspected PDF again with a default-constructed DocumentConverter
# and show the resulting document as markdown.
source = "/workspace/data/test_eval/pdfs_100/10.1002$jmv.20254.pdf"  # PDF path or URL
converter = DocumentConverter()
result = converter.convert(source)
markdown_text = result.document.export_to_markdown()
print(markdown_text)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 30102.66it/s]


## Evaluation of a Recombinant Nucleocapsid Protein-Based Assay for Anti-SARS-CoV IgG Detection

Paul K.S. Chan,$^{1,2}$* Esther Y.M. Liu,$^{2}$Danny T.M. Leung,$^{3}$Jo L.K. Cheung,$^{2}$C.H. Ma, 3 Frankie C.H. Tam,$^{3}$Mamie Hui,$^{1,2}$John S. Tam,$^{1,2}$and Pak Leong Lim 3

$^{1}$Centre for Emerging Infectious Diseases, The Chinese University of Hong Kong, Prince of Wales Hospital, Shatin, New Territories, Hong Kong SAR, China

$^{2}$Department of Microbiology, The Chinese University of Hong Kong, Prince of Wales Hospital, Shatin,

New Territories, Hong Kong SAR, China

$^{3}$Clinical Immunology Unit, The Chinese University of Hong Kong, Prince of Wales Hospital, Shatin,

New Territories, Hong Kong SAR, China

A high throughput accurate assay for anti-SARSCoV IgG detection is needed for large-scale epidemiological studies. The evaluation of a commercial recombinant nucleocapsid proteinbased microtitre plate enzyme immunoassay, ELISARS TM is described. The results on 150 sera from