In [1]:
!pip install docling



In [2]:
import glob
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
from docling.datamodel.base_models import InputFormat

In [10]:
pdf_files = glob.glob("/Users/fabian/Python/Joppie/cord19plus/src/pdfs/*.pdf") 

In [12]:
test_pdf_files = pdf_files[:10]

In [46]:
converter = DocumentConverter()

def convert_document(source):
    return converter.convert(source).document

In [47]:
list_of_docs = [convert_document(source) for source in test_pdf_files]

Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 98048.66it/s]


In [49]:
import model
from model import Base

In [None]:
len(list_of_docs)

In [130]:
max([len(b) for b in [a.text for a in current_doc.texts]])

3382

In [50]:
def get_table_name_from_caption(caption: str) -> str:
    m = re.search(
        r"(^(tab[^ \t\r\f\n]+)([-\t \r\f])([a-z]?([_.-])?[0-9]+)([_.-])?[^\s:\-\.]?([a-z])?)", caption, flags=re.I
    )
    return m.group(0).rstrip(".:-") if m else None
    
def get_fulltext_references(doc, caption) -> list:
    """
    if sentence contains caption: take sentence[i-1:i+2]

    :param doc: docling Document
    :param caption: caption to look for in the full text
    :return: context surrounding caption
    """
    table = get_table_name_from_caption(caption)
    refs = []
    i_sent = []
 
    if table:
        for block in [a.text for a in current_doc.texts]:
            # only consider boxes, that are not intersecting with captions
            if not block == caption:
                if table in block.text:
                    refs.append(block.text[:500])
   
    return refs


In [51]:
!pip install habanero



In [52]:
from habanero import Crossref
cr = Crossref()

In [53]:
list_of_docs

[DoclingDocument(schema_name='DoclingDocument', version='1.0.0', name='10.1016$j.vetmic.2019.08.005', origin=DocumentOrigin(mimetype='application/pdf', binary_hash=16177480908108895704, filename='10.1016$j.vetmic.2019.08.005.pdf', uri=None), furniture=GroupItem(self_ref='#/furniture', parent=None, children=[], name='_root_', label=<GroupLabel.UNSPECIFIED: 'unspecified'>), body=GroupItem(self_ref='#/body', parent=None, children=[RefItem(cref='#/pictures/0'), RefItem(cref='#/texts/0'), RefItem(cref='#/texts/1'), RefItem(cref='#/pictures/1'), RefItem(cref='#/pictures/2'), RefItem(cref='#/texts/2'), RefItem(cref='#/texts/3'), RefItem(cref='#/texts/4'), RefItem(cref='#/texts/5'), RefItem(cref='#/texts/6'), RefItem(cref='#/texts/7'), RefItem(cref='#/groups/0'), RefItem(cref='#/texts/10'), RefItem(cref='#/texts/11'), RefItem(cref='#/texts/12'), RefItem(cref='#/texts/13'), RefItem(cref='#/texts/14'), RefItem(cref='#/texts/15'), RefItem(cref='#/texts/16'), RefItem(cref='#/texts/17'), RefItem(cr

In [54]:
def cr_clean_authors(l_authors):
    ret = []
    for a in l_authors:
        ret.append(f"{a['family']}, {a['given']}")

    return ret

In [56]:
current_doc.name

'10.1007$978-3-030-03344-6_24'

In [57]:
# mehr metadata wären möglich
cr_clean_authors(cr.works(ids = '10.1371/journal.pone.0033693')["message"]["author"])

['Sadasivan, Shankar',
 'Pond, Brooks B.',
 'Pani, Amar K.',
 'Qu, Chunxu',
 'Jiao, Yun',
 'Smeyne, Richard J.']

In [58]:
# testing bbox position translation
table = current_doc.pictures[0]
position_left = table.prov[0].bbox.normalized(current_doc.pages[1].size).l
position_top = 1-table.prov[0].bbox.normalized(current_doc.pages[1].size).t
position_page = table.prov[0].page_no
width = 1 - table.prov[0].bbox.normalized(current_doc.pages[1].size).l - (1-table.prov[0].bbox.normalized(current_doc.pages[1].size).r)
height = table.prov[0].bbox.normalized(current_doc.pages[1].size).t - (1-table.prov[0].bbox.normalized(current_doc.pages[1].size).b)

print(position_left, position_top, position_page, width, height)

0.8031685993933853 0.09199629469116466 1 0.07563139027132637 0.7666686213300813


In [60]:
def parse_docling_doc(current_doc, model):

    # publication
    p = model.Publication()

    # document
    current_doi = current_doc.name
    ## title
    try:
        title = cr.works(ids = current_doi)["message"].title
        current_d = model.Document(doi=current_doi,
                        title=title,
                        publication = p)  
    except:
        try:
            current_d = model.Document(doi=current_doi,
                        # first line of text in document, usually title (except: e.g. Preprints with arxiv watermark on left)
                        title=current_doc.titles[0].text,
                        publication = p)
        except:
            current_d = model.Document(doi=current_doi,
                        title="",
                        publication = p)

    try:
        current_doc.authors = cr_clean_authors(cr.works(ids = current_doi)["message"].authors)
        # alternative with more detailed, structured information
        # current_d.authors = cr.works(ids = current_doi)["message"].authors
    except:
        current_d.authors = []

    # tables 
    tabs = []
    for table in current_doc.tables:
        # l, t, w, h, page = positions_from_box(table)
        try:
            t = model.Table(content=' '.join(table.export_to_dataframe().to_string().split()),
                        # pass doc information here 
                        document_id = current_doi,
                        document = current_d,
                        header = table.export_to_dataframe().columns.to_list()
                        content = table.export_to_dataframe().values.tolist()
                        pm_content = " ".join([a.text for a in table.data.table_cells])
                        # caption self referential with doc
                        caption=table.caption_text(current_doc),
                        # position has to be normalized by page size, docling also calculated position relative to bottom left, e.g. left is l, right is 1-r
                        position_left = table.prov[0].bbox.normalized(current_doc.pages[1].size).l,
                        position_top = 1-table.prov[0].bbox.normalized(current_doc.pages[1].size).t,
                        position_page = table.prov[0].page_no,
                        width = 1 - table.prov[0].bbox.normalized(current_doc.pages[1].size).l - (1-table.prov[0].bbox.normalized(current_doc.pages[1].size).r),
                        height = table.prov[0].bbox.normalized(current_doc.pages[1].size).t - (1-table.prov[0].bbox.normalized(current_doc.pages[1].size).b),
                        # proxy references 
                        references=["d1", "d2", "d3"]) 
            tabs.append(t)
        except:
            print("table parsing error")
            pass

    # figures
    figs = []
    for figure in current_doc.pictures:
        
        try:
            f = model.Figure(document_id = current_doi,
                    document = current_d,
                    document_id = current_doi,
                    position_left = figure.prov[0].bbox.normalized(current_doc.pages[1].size).l,
                    position_top = 1-figure.prov[0].bbox.normalized(current_doc.pages[1].size).t,
                    position_page = figure.prov[0].page_no,
                    width = 1 - figure.prov[0].bbox.normalized(current_doc.pages[1].size).l - (1-figure.prov[0].bbox.normalized(current_doc.pages[1].size).r),
                    height = figure.prov[0].bbox.normalized(current_doc.pages[1].size).t - (1-figure.prov[0].bbox.normalized(current_doc.pages[1].size).b)) 
            figs.append(f)
        except:
            pass
        

    # Currently, equations are not supported 

    # add lists of tables, figures, equations to document
    current_d.tables=tabs
    current_d.figures=figs

    # HAS TO BE EDITED: CURRENTLY ONLY ONE DOCUMENT TO ONE PUBLICATION
    # add list of documents to publication
    p.documents = [current_d]

    return p, current_d, tabs, figs

In [61]:
res = parse_docling_doc(current_doc, model)

In [70]:
results = [parse_docling_doc(d, model) for d in list_of_docs]

In [72]:
results[1]

(<model.Publication at 0x31d7226d0>,
 <model.Document at 0x31d722dd0>,
 [<model.Table at 0x31d7c5150>],
 [<model.Figure at 0x36c5fbad0>])

In [63]:
res

(<model.Publication at 0x31d5b57d0>,
 <model.Document at 0x31d5b4810>,
 [<model.Table at 0x31785b890>],
 [<model.Figure at 0x4889705d0>, <model.Figure at 0x36c519a10>])

In [123]:
" ".join([a.text for a in list_of_docs[3].tables[0].data.table_cells])

'Strain Isolate Viral load (genome copies/ml) Replication HCoV-229E 552 6.67E + 03 No  349 6.58E + 03 Yes HCoV-NL63 R2354 1.53E + 07 Yes  P0588 5.89E + 05 No  J1816 5.01E + 04 No  K1341 9.14E + 03 No  K0109 5.49E + 03 No  Ams-057 1.09E + 06 No HCoV-HKU1 516 4.71E + 06 Yes  561 7.11E + 05 No  476 1.33E + 05 Yes  315 1.46E + 04 Yes  548 6.88E + 02 No  350 2.77E + 03 Yes HCoV-OC43 500 1.57E + 02 Yes  671 3.43E + 03 Yes  634 1.66E + 02 Yes  562 3.28E + 03 Yes'

In [None]:
    id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    ir_id: Mapped[str] = mapped_column(String(1024))
    ir_tab_id: Mapped[str] = mapped_column(String(1024))
    table_name = mapped_column(String(2048), nullable=True) 

    pm_content: Mapped[str] = mapped_column(String(2**15), nullable=True) #32768

    #header_json: Mapped[JSON] = mapped_column(type_=JSON, nullable=False)
    #content_json: Mapped[JSON] = mapped_column(type_=JSON, nullable=False)

    header: Mapped[List[str]] = mapped_column(ARRAY(String(2**15)), nullable=True) # done 
    #content: Mapped[List[List[str]]] = mapped_column(ARRAY(String(2**15), dimensions=2), nullable=True)
    content: Mapped[List[List[str]]] = mapped_column(JSONB, nullable=True) #done
    document_id: Mapped[int] = mapped_column(ForeignKey("document.doi")) #done
    document: Mapped["Document"] = relationship(back_populates="tables") #done

    position_page: Mapped[int] = mapped_column(Integer(), nullable=True) #do we want to map this to document pages?
    position_left: Mapped[float] = mapped_column(Float(), nullable=True) #done
    position_top: Mapped[float] = mapped_column(Float(), nullable=True) #done
    width: Mapped[float] = mapped_column(Float(), nullable=True) #done
    height: Mapped[float] = mapped_column(Float(), nullable=True) #done

    caption: Mapped[str] = mapped_column(String(2**15), nullable=True) #done
    references: Mapped[List[str]] = mapped_column(ARRAY(String(2**15)), nullable=True)

    class Figure(Base):
    __tablename__ = "figure"

    id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    ir_id: Mapped[str] = mapped_column(String(1024))

    caption: Mapped[str] = mapped_column(String(2**15), nullable=True)

    document_id: Mapped[int] = mapped_column(ForeignKey("document.doi"))
    document: Mapped["Document"] = relationship(back_populates="figures")  #done

    position_page: Mapped[int] =  mapped_column(Integer())  #done
    position_left: Mapped[float] = mapped_column(Float()) #done
    position_top: Mapped[float] = mapped_column(Float()) #done
    width: Mapped[float] = mapped_column(Float()) #done
    height: Mapped[float] = mapped_column(Float()) #done