In [7]:
from database_setup import *
from model import *

from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
from docling.datamodel.base_models import InputFormat
from docling_core.types.doc import DoclingDocument


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [56]:
from tqdm import tqdm

In [8]:
docling_docs = sorted(glob.glob("/workspace/src/docling_out/*"))

In [9]:
len(docling_docs)

63869

In [11]:
docling_dois = [x.split("/")[-1].replace(".json", "").replace("$", "/") for x in docling_docs]
len(docling_dois)

63869

In [4]:
# adding titles
import ir_datasets
dataset = ir_datasets.load("cord19/fulltext/trec-covid")


In [62]:
test_doc = DoclingDocument.load_from_json("/workspace/src/docling_out/10.1016$j.antiviral.2007.12.008.json")

In [74]:
# um Titel zu extrahieren

for text in test_doc.texts:
    #if "Oligonucleotide antiviral therapeutics:" in text.text:
    if text.label == "section_header":
        print(text.text)

Oligonucleotide antiviral therapeutics: Antisense and RNA interference for highly pathogenic RNA viruses
Abstract
1. Introduction
2. Antisense oligonucleotides (ASOs)
3. RNA interference (RNAi)
4. Antisense and siRNA as antiviral therapeutics
4.1. Filoviruses
4.2. Flaviviruses
4.3. Arenaviruses
4.4. Alphaviruses
4.5. SARS-associated coronavirus (SARS Co-V)
4.6. Influenza A
5. Conclusion
Acknowledgements
References


In [17]:
cord19_doc = dataset.docs

In [57]:
titles = {}
abstracts = {}
for doc in tqdm(cord19_doc):
    if doc.doi in docling_dois:
        titles[doc.doi] = doc.title
        abstracts[doc.doi] = doc.abstract

100%|██████████| 192509/192509 [00:54<00:00, 3561.85it/s]


In [32]:
db_vals = dotenv_values("/workspace/src/.env")
session = setup_engine_session(db_vals['USER'], db_vals['PASSWORD'], db_vals['ADDRESS'], db_vals['PORT'], db_vals['DB'], echo=False)

In [31]:
#parallel_extract_data(docling_docs[38860:], model, session, num_workers=8, batch_size = 10)

In [54]:
def update_document_title(doi, titles_dict, session):
    session.query(Document).filter(Document.doi == doi).update(
    {
        Document.title: titles_dict[doi]
    },
    synchronize_session=False
    )
    try:
        session.commit()
    except Exception as e:
        logging.error(f"Error committing session: {e}")
        session.rollback()


def update_document(doi, title_dict, abstract_dict, session):
    session.query(Document).filter(Document.doi == doi).update(
    {
        Document.abstract: abstract_dict[doi],
        Document.title: titles_dict[doi]
    },
    synchronize_session=False
    )
    try:
        session.commit()
    except Exception as e:
        logging.error(f"Error committing session: {e}")
        session.rollback()





def parallel_update_data(data_dict, session, field="title", num_workers=None, batch_size = 10):
    # data_dict is {doi:data, …} for a given field, e.g.: {1001.3456: "this is a title", …} for title
    data_list = [(a, data_dict, session) for a in list(data_dict.keys())]
    with tqdm(total=len(data_list), desc="Overall Progress") as pbar:
        with ProcessPoolExecutor(max_workers=num_workers) as executor:
            for i in range(0, len(data_list), batch_size):
                batch_paths = data_list[i:i + batch_size]

                # Extract data in parallel for the current batch
                with tqdm(total=len(batch_paths), desc="Extracting Data", leave=False) as extract_pbar:
  
                    # Using list() to eagerly evaluate executor.map and catch exceptions
                    for data in executor.map(update_document_title, batch_paths):
                        if data is not None:
                            if field == "title":
                                update_document_title(data)
                        extract_pbar.update(1)

In [51]:
parallel_update_data(titles, session)

Overall Progress:   0%|          | 0/63869 [00:00<?, ?it/s]


TypeError: cannot pickle 'weakref.ReferenceType' object

In [33]:
update_document_title('10.1001/amajethics.2020.344', titles, session)

In [58]:
for doi in tqdm(titles.keys()):
    update_document_title(doi, titles, session)

100%|██████████| 63869/63869 [1:25:30<00:00, 12.45it/s]


In [53]:
session.rollback()