In [1]:
import json
from pathlib import Path
import os
import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pydantic_settings import BaseSettings, SettingsConfigDict

class Settings(BaseSettings):
    """Notebook configuration loaded from the environment / a dotenv file.

    Values are read from ``../.env`` (UTF-8); keys that do not correspond to
    a declared field are ignored rather than rejected.
    """

    model_config = SettingsConfigDict(
        env_file="../.env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # Base directory under which the datasets used by this notebook live.
    data_dir: str
    # Directory exported as HF_HOME further down in this cell.
    docling_model_dir: str
    
# Load the configuration once; both fields are required (no defaults), so
# instantiation fails if they are not provided via the environment or ../.env.
settings = Settings()
# Export the model directory as HF_HOME for this process.
# NOTE(review): presumably this makes the Hugging Face tooling used by Docling
# cache/load its models there, and must run before those libraries are
# imported — confirm.
os.environ["HF_HOME"] = settings.docling_model_dir

In [4]:
import sys
# Make the project-local packages importable. "src" is a relative path, so it
# is resolved against the kernel's current working directory.
# NOTE(review): re-running this cell appends the same entry again (harmless).
sys.path.append("src")

# Project-local modules (presumably located under src/ — verify).
from core.base.schema import TextNode, ImageNode, TableNode, TextType, TextLabel, Document
from core.reader.docling.pdf_reader import DoclingPDFReader
from core.processor.document.text_merger import TextNodeMerger
from core.splitter.text.langchain_text_splitters import LangchainRecursiveCharacterTextSplitter

In [5]:
# Create the PDF reader with its default arguments; it is reused by the
# "Run Reader" cell below.
reader = DoclingPDFReader()

In [6]:
# Locate the evaluation PDFs. os.listdir() returns entries in arbitrary order,
# so the names are sorted to keep index-based selection (e.g. pdf_fnames[0])
# reproducible across machines and runs.
pdf_dir = os.path.join(settings.data_dir, "allganize-RAG-Evaluation-Dataset-KO/finance")
pdf_fnames = sorted(x for x in os.listdir(pdf_dir) if x.endswith(".pdf"))
print("num files:", len(pdf_fnames))

# Input document for the rest of the notebook. Keep exactly ONE assignment
# active; the alternatives stay commented out for quick switching.
# file_path = os.path.join(pdf_dir, pdf_fnames[0])
# file_path = "resources/finance-small-images.pdf"
file_path = "resources/1706.03762v7.pdf"
# file_path = "resources/1706.03762v7-sample.pdf"
# file_path = "resources/list_group_sample_msword.pdf"
# file_path = "resources/list_group_sample_google.pdf"

num files: 10


In [7]:
# Run Reader
# Read the PDF selected in `file_path`; the returned object exposes the
# extracted content as `document.nodes` (used by the following cells).
document = reader.run(file_path)

In [8]:
# Run Processor (merge)
# Print the node count before merging for comparison with the count after.
nodes = document.nodes
print(len(nodes))

merger = TextNodeMerger()
# NOTE(review): `document` is rebound to the merger's result, so re-running
# this cell without re-running the reader cell feeds the already-merged
# document back into the merger — the cell is not idempotent.
document = merger.run(document)
# Last expression: node count after merging (shown as the cell output).
len(document.nodes)

136


26

In [18]:
# Run Splitter for each node
#
# Every TextNode is passed through the splitter and replaced by the nodes it
# returns; all other node types are carried over unchanged, preserving the
# original order. The splitter is invoked exactly once per TextNode and its
# result is reused for both the debug printout and the final node list.

processed_nodes = []
splitter = LangchainRecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=128,
)

for i, node in enumerate(document.nodes):
    if not isinstance(node, TextNode):
        # Non-text nodes are never split.
        processed_nodes.append(node)
        continue

    split_nodes = splitter.run(node)
    if len(split_nodes) > 1:
        # Only report nodes that were actually broken into several pieces.
        print(f"SPLITTING NODE {i}")
        print(f"TEXT: {len(node.text)} {node.text[:100]}")
        print("-"*100)
        for split_node in split_nodes:
            print(f"SPLIT TEXT: {len(split_node.text)} {split_node.text[:100]}")
            print("-"*100)

    processed_nodes.extend(split_nodes)

processed_document = Document(nodes=processed_nodes)
# Last expression: total node count after splitting (shown as the cell output).
len(processed_document.nodes)

SPLITTING NODE 0
TEXT: 5162 arXiv:1706.03762v7  [cs.CL]  2 Aug 2023
Provided proper attribution is provided, Google hereby grant
----------------------------------------------------------------------------------------------------
SPLIT TEXT: 648 arXiv:1706.03762v7  [cs.CL]  2 Aug 2023
Provided proper attribution is provided, Google hereby grant
----------------------------------------------------------------------------------------------------
SPLIT TEXT: 1023 The dominant sequence transduction models are based on complex recurrent or convolutional neural net
----------------------------------------------------------------------------------------------------
SPLIT TEXT: 242 a small fraction of the training costs of the best models from the literature. We show that the Tran
----------------------------------------------------------------------------------------------------
SPLIT TEXT: 431 1 Introduction
Recurrent neural networks, long short-term memory [13] and gated recurrent [7] neura

55