# Read Document Files (v2507)
Read PDF Files into Documents -> Split -> Save to Docstore

## Methodology
```
1. Load Document Readers
    1-1. Initialize DoclingPDFReader (PDF Backend)
    1-2. Initialize DoclingPDFReader (VLM Doctags Backend) - Disabled for now
    1-3. Initialize PDF2ImageReader
2. Load PDF File Data
3. Ingest Data
    3-1. (Reader) PDF File -> PSIKing Document
    3-2. (Splitter) Chunk Documents
4. Insert into DocumentStore
    4-1. Insert
    4-2. Save to Disk
```

In [1]:
import json
import os

from pathlib import Path
import time
from typing import Any, Dict, List, Optional

import pandas as pd
from pydantic import BaseModel
from tqdm import tqdm

from config import settings

from psiking.core.base.schema import Document, TextNode, ImageNode, TableNode

# 1. Load PSIKing Document Readers

## 1-1. Load DoclingPDFReader (PDF Backend)

In [2]:
from docling_core.types.doc import PictureItem

from docling.datamodel.base_models import InputFormat

from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    VlmPipelineOptions,
    PdfPipelineOptions,
    PictureDescriptionApiOptions,
    ResponseFormat,
    TableStructureOptions,
    TableFormerMode
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
from docling.datamodel.pipeline_options_vlm_model import (
    ApiVlmOptions,
    InferenceFramework,
    InlineVlmOptions,
    ResponseFormat,
    TransformersModelType
)

In [3]:
# Options for the Docling PDF pipeline used by the PDF-backend reader.
pipeline_options = PdfPipelineOptions()

# If force_backend_text = True, text from backend will be used instead of generated text
pipeline_options.force_backend_text = False

# Image generation settings (page images and picture images).
pipeline_options.images_scale = 1.5
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

# OCR is turned off for this pipeline.
pipeline_options.do_ocr = False

# TableStructure
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options = TableStructureOptions(mode=TableFormerMode.ACCURATE)

# NOTE(review): the accelerator is hard-coded to MPS — presumably this targets
# Apple Silicon; confirm/adjust before running on other hardware.
pipeline_options.accelerator_options.device = AcceleratorDevice.MPS

In [4]:
from psiking.core.reader.pdf.docling.pipeline_options.picture_description import (
    openai_options as docling_openai_picture_description_options,
)

# Turn on picture description and allow the pipeline to use remote services.
pipeline_options.do_picture_description = True
pipeline_options.enable_remote_services = True

# Show which VLM model the settings select before wiring it in.
print(settings.vlm_model)

# Build the picture-description options from the configured API key and model.
pipeline_options.picture_description_options = docling_openai_picture_description_options(
    api_key=settings.vlm_api_key,
    model=settings.vlm_model,
)

gpt-4.1-nano


In [5]:
# Converter that accepts only PDF input and applies the pipeline options
# configured in `pipeline_options`.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)

docling_pdf_converter = DocumentConverter(
    allowed_formats=[InputFormat.PDF],
    format_options={InputFormat.PDF: pdf_format_option},
)

In [6]:
from psiking.core.reader.pdf.docling import DoclingPDFReader

# Initialize the reader around the PDF-backend converter (docling_pdf_converter)
docling_pdf_reader = DoclingPDFReader(converter=docling_pdf_converter)

## 1-2. Load DoclingPDFReader (VLM Doctags Backend)
* Disabled for now due to poor Korean OCR performance

In [7]:
# pipeline_options = VlmPipelineOptions()
# # If force_backend_text = True, text from backend will be used instead of generated text
# pipeline_options.force_backend_text = False
# pipeline_options.generate_picture_images = True

# pipeline_options.accelerator_options.device = AcceleratorDevice.MPS


# # smoldocling-preview
# # vlm_conversion_options = InlineVlmOptions(
# #     repo_id = "ds4sd/SmolDocling-256M-preview",
# #     inference_framework=InferenceFramework.TRANSFORMERS,
# #     supported_devices=[AcceleratorDevice.MPS],
# #     transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
# #     prompt="Convert this page to docling.",
# #     response_format=ResponseFormat.DOCTAGS,
# #     load_in_8bit=False,
# #     quantized=False
# # )

# ## Remote (vllm) Version
# pipeline_options.enable_remote_services=True
# vlm_conversion_options = ApiVlmOptions(
#     url='http://localhost:8081/v1/chat/completions',
#     prompt="Convert this page to docling.",
#     response_format=ResponseFormat.DOCTAGS,
#     concurrency=8
# )


# ## Pick a VLM model. We choose SmolDocling-256M by default
# pipeline_options.vlm_options = vlm_conversion_options

In [8]:
# docling_vlm_converter = DocumentConverter(
#     format_options={
#         InputFormat.PDF: PdfFormatOption(
#             pipeline_cls=VlmPipeline,
#             pipeline_options=pipeline_options,
#         ),
#     }
# )

In [9]:
# # initialize reader
# docling_vlm_reader = DoclingPDFReader(converter=docling_vlm_converter)

## 1-3. Load PDF2ImageReader

In [10]:
from psiking.core.reader.pdf.pdf2image import PDF2ImageReader

# Directory holding the poppler binaries. The default is a version-pinned
# Homebrew path, so it can be overridden with the POPPLER_PATH environment
# variable instead of editing the notebook on another machine.
poppler_path = os.environ.get(
    "POPPLER_PATH",
    "/opt/homebrew/Cellar/poppler/25.07.0/bin",
)
pdf2img_reader = PDF2ImageReader(poppler_path=poppler_path)

# 2. Load PDF File Data

In [11]:
# Directory containing the source PDF files
# pdf_dir = os.path.join(settings.data_dir, "retrieval_dataset/allganize-RAG-Evaluation-Dataset-KO/finance")
pdf_dir = 'data/pdf/finance'

# Keep only the directory entries whose name ends with ".pdf"
dir_entries = os.listdir(pdf_dir)
pdf_fnames = [entry for entry in dir_entries if entry.endswith(".pdf")]

print("num files:", len(pdf_fnames))
# Display (up to) the first ten file names as the cell result
pdf_fnames[:10]

num files: 10


['7373884a-8255-482d-9e7c-00b919083526.pdf',
 '5484364a-38de-48b7-a0a6-b009f361bd9e.pdf',
 'b59c836c-ec57-44ba-b4a8-2ae3d58a22e4.pdf',
 '99d45724-817a-4c05-85e2-83e0aa8ac8c0.pdf',
 '03d95093-ed1f-4a66-83dc-5534dfbd87e3.pdf',
 'c94f675e-7d81-48bd-88f8-c5ff766190cc.pdf',
 '053248f8-4311-413e-b34b-9a65a4251f4f.pdf',
 '72b54f4b-7002-48ea-ad20-2c613d8360f6.pdf',
 'bbd035d6-51a2-41ba-b913-8357d89b7852.pdf',
 '980889bb-16cd-447f-b5eb-1384b84903cc.pdf']

In [13]:
# Map each PDF file name to its file id in the metadata table.
metadata_df = pd.read_csv('data/metadata.tsv', sep='\t')

# Collect the known ids once instead of filtering the DataFrame per file.
known_file_ids = set(metadata_df['id'])

pdf_file_ids = []
for fname in pdf_fnames:
    # Path.stem removes only the trailing extension, whereas
    # str.replace('.pdf', '') would also strip '.pdf' inside the name.
    file_id = Path(fname).stem
    if file_id not in known_file_ids:
        # Fail with the offending file name rather than an opaque IndexError.
        raise LookupError(
            "{}: id {!r} not found in data/metadata.tsv".format(fname, file_id)
        )
    pdf_file_ids.append(file_id)

In [14]:
# Sanity check: pdf_file_ids holds one id per entry in pdf_fnames
len(pdf_file_ids)

10

# 3. Ingest Files

## 3-1. Reader - File->Document

In [15]:
# Read every PDF file into a Document: try the Docling PDF-backend reader
# first and fall back to PDF2ImageReader when it raises.
documents = []
failed_fnames = []  # files that every enabled reader failed on


def _reader_extra_info(file_id, method):
    """Build the extra_info dict handed to a reader for one source file.

    Args:
        file_id: Identifier of the source file (stored as "source_id").
        method: Label of the reading method that produced the document.

    Returns:
        A new dict with "source_id", "domain" and "method" keys.
    """
    return {
        "source_id": file_id,
        "domain": "finance",
        "method": method,
    }


# pdf_fnames and pdf_file_ids are index-aligned, so iterate them in lockstep.
for fname, file_id in tqdm(zip(pdf_fnames, pdf_file_ids), total=len(pdf_fnames)):
    file_path = os.path.join(pdf_dir, fname)

    # Method 1 - Docling with PDF Backend
    try:
        document = docling_pdf_reader.run(
            file_path,
            extra_info=_reader_extra_info(file_id, "docling-pdf"),
        )
    except Exception:
        # Deliberate best-effort: any failure here moves on to the fallback reader.
        print("[DOCLING PDF READER] failed {}, Falling back to PDF2IMG".format(fname))
    else:
        documents.append(document)
        continue

    # Method 2 - Docling with VLM (Doctags) Backend (fallback 1)
    ## Disabled due to bad Korean support
    # try:
    #     document = docling_vlm_reader.run(
    #         file_path,
    #         extra_info=_reader_extra_info(file_id, "docling-vlm"),
    #     )
    # except Exception:
    #     print("[DOCLING VLM READER] failed {}, Falling back to PDF2IMG".format(fname))
    # else:
    #     documents.append(document)
    #     continue

    # Method 3 - PDF2Image (fallback 2)
    try:
        document = pdf2img_reader.run(
            file_path,
            extra_info=_reader_extra_info(file_id, "pdf2image"),
        )
    except Exception as e:
        # No reader left to try: report the error and record the file as failed.
        print("[PDF2IMG READER] failed {} - {}".format(fname, str(e)))
        failed_fnames.append(fname)
    else:
        documents.append(document)

 20%|██        | 2/10 [06:18<21:46, 163.34s/it]Encountered an error during conversion of document 02616dbc4dc47f992b7008e68e4f1d4cb49ccece229e7fad02a38a3470346a63:
Traceback (most recent call last):

  File "/opt/miniconda3/envs/psiking/lib/python3.10/site-packages/docling/pipeline/base_pipeline.py", line 160, in _build_document
    for p in pipeline_pages:  # Must exhaust!

  File "/opt/miniconda3/envs/psiking/lib/python3.10/site-packages/docling/pipeline/base_pipeline.py", line 126, in _apply_on_pages
    yield from page_batch

  File "/opt/miniconda3/envs/psiking/lib/python3.10/site-packages/docling/models/page_assemble_model.py", line 70, in __call__
    for page in page_batch:

  File "/opt/miniconda3/envs/psiking/lib/python3.10/site-packages/docling/models/table_structure_model.py", line 177, in __call__
    for page in page_batch:

  File "/opt/miniconda3/envs/psiking/lib/python3.10/site-packages/docling/models/layout_model.py", line 151, in __call__
    for page in page_batch:


[DOCLING PDF READER] failed b59c836c-ec57-44ba-b4a8-2ae3d58a22e4.pdf, Falling back to PDF2IMG


 40%|████      | 4/10 [06:41<06:06, 61.11s/it] Encountered an error during conversion of document ce014774ce984417127bff298a0e883db7ad2652e7cb66d49bbbb2423cc4176c:
Traceback (most recent call last):

  File "/opt/miniconda3/envs/psiking/lib/python3.10/site-packages/docling/pipeline/base_pipeline.py", line 160, in _build_document
    for p in pipeline_pages:  # Must exhaust!

  File "/opt/miniconda3/envs/psiking/lib/python3.10/site-packages/docling/pipeline/base_pipeline.py", line 126, in _apply_on_pages
    yield from page_batch

  File "/opt/miniconda3/envs/psiking/lib/python3.10/site-packages/docling/models/page_assemble_model.py", line 70, in __call__
    for page in page_batch:

  File "/opt/miniconda3/envs/psiking/lib/python3.10/site-packages/docling/models/table_structure_model.py", line 177, in __call__
    for page in page_batch:

  File "/opt/miniconda3/envs/psiking/lib/python3.10/site-packages/docling/models/layout_model.py", line 151, in __call__
    for page in page_batch:


[DOCLING PDF READER] failed 03d95093-ed1f-4a66-83dc-5534dfbd87e3.pdf, Falling back to PDF2IMG


100%|██████████| 10/10 [12:45<00:00, 76.59s/it] 


In [16]:
# Print the types of the first three nodes of `document`
# (the last document assigned by the ingestion loop)
for node in document.nodes[:3]:
    print(type(node))

<class 'psiking.core.base.schema.TextNode'>
<class 'psiking.core.base.schema.TextNode'>
<class 'psiking.core.base.schema.TextNode'>


In [17]:
# Display the metadata attached to `document`
document.metadata

{'reader': 'DoclingPDFReader',
 'source_id': '980889bb-16cd-447f-b5eb-1384b84903cc',
 'domain': 'finance',
 'method': 'docling-pdf'}

## 3-2. Splitter - Chunk Documents

In [18]:
from psiking.core.processor.document.text_merger import TextNodeMerger

# Run TextNodeMerger over every ingested document, keeping the input order.
# NOTE(review): presumably this merges text nodes within a document — confirm
# against TextNodeMerger before relying on it.
merger = TextNodeMerger()
merged_documents = [merger.run(source_document) for source_document in documents]

In [19]:
# merged_documents[0]
# Display the first node of the first merged document
merged_documents[0].nodes[0]

TextNode(id_='2ea3cca4-167e-42a7-a29c-925ca866676f', metadata={'prov': '[{"page_no": 1, "bbox": {"l": 71.444, "t": 702.6370374023437, "r": 511.598, "b": 645.7080374023437, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 37]}]'}, text_type=<TextType.PLAIN: 'plain'>, label=<TextLabel.PLAIN: 'plain'>, resource=MediaResource(data=None, text='증권사 리서치센터장, 자산운용사 대표와 함께하는 제1회 증시 콘서트\n2019 하반기 증시 대전망\n|\xa0일\xa0시\xa0| 2019.\xa07.\xa02\xa0(화)\xa014:30\n|\xa0장\xa0소\xa0| 금융투자협회\xa03층\xa0불스홀', path=None, url=None, mimetype=None))

In [20]:
# Run Splitter
import copy
from psiking.core.splitter.text.langchain_text_splitters import LangchainRecursiveCharacterTextSplitter

splitter = LangchainRecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=128,
)

# Each chunk is a Document wrapping exactly one node. Its metadata is the
# parent document's metadata plus the 'prov' entry of the node it came from.
chunks = []
for document in merged_documents:
    document_metadata = document.metadata

    for i, node in enumerate(document.nodes):
        if isinstance(node, TextNode):
            # Only text nodes go through the splitter.
            try:
                split_nodes = splitter.run(node)
            except Exception as e:
                # Show which node failed, then propagate the error unchanged.
                print(i, node)
                print(str(e))
                raise
        else:
            # Non-text nodes are kept whole.
            split_nodes = [node]

        node_prov = node.metadata['prov']

        for split_node in split_nodes:
            # Build a fresh metadata dict per chunk so that chunks split from
            # the same node never share one mutable dict.
            chunk_metadata = copy.deepcopy(document_metadata)
            chunk_metadata['prov'] = node_prov
            chunks.append(
                Document(
                    nodes=[split_node],
                    metadata=chunk_metadata,
                )
            )

In [21]:
# Total number of chunk Documents collected in `chunks`
print(len(chunks))

1032


# 4. Insert into DocumentStore

In [22]:
from psiking.core.storage.docstore.in_memory import InMemoryDocumentStore

In [23]:
# Create an empty in-memory document store
doc_store = InMemoryDocumentStore()

In [24]:
# 4-1. Insert all chunk Documents into the store
doc_store.add(chunks)

In [25]:
# Display the store's document count (compare with len(chunks))
doc_store.count()

1032

In [26]:
# 4-2. Save the document store to disk.
docstore_path = 'storage/docstore_v2507.json'
# Create the target directory if it is missing; whether save() creates it is
# not visible from here, and exist_ok makes this a no-op when it already exists.
os.makedirs(os.path.dirname(docstore_path), exist_ok=True)
doc_store.save(docstore_path)