# 1. Imports y definición de clases

In [1]:
from pathlib import Path
from typing import List
from PIL import Image
import base64
import io
import matplotlib.pyplot as plt

from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling_core.types.doc import DocItem, TableItem, PictureItem
from docling.chunking import HybridChunker
from docling_core.transforms.chunker import DocChunk

from dataclasses import dataclass
from typing import Literal, Dict, Any

# Chunk kinds produced by the parser, and the record type that carries one chunk.
ChunkType = Literal["text", "table", "image"]


@dataclass
class DocumentChunk:
    """A single piece of a parsed document.

    As populated by ``PdfParser.parse`` in this file, ``content`` is the chunk
    text for ``"text"`` chunks and a base64-encoded PNG for ``"table"`` and
    ``"image"`` chunks.
    """

    # Chunk payload (plain text, or base64 PNG data for visual chunks).
    content: str
    # Which kind of chunk this is; one of the ChunkType literals.
    type: ChunkType
    # Page number the chunk was taken from (the parser uses 0 when unknown).
    source_page: int
    # Extra per-chunk information, e.g. {"caption": ...} for tables/pictures.
    metadata: Dict[str, Any]

# [cell output] from .autonotebook import tqdm as notebook_tqdm


# 2. Clase Parser PDF

In [2]:
class PdfParser:
    """Turn a PDF into a flat list of text, table and picture chunks.

    The document is converted with Docling's ``DocumentConverter``; its text is
    split with ``HybridChunker``, while tables and pictures are rendered to PNG
    and stored base64-encoded in ``DocumentChunk.content``.
    """

    def __init__(self, chunk_size: int = 256, image_resolution_scale: float = 2.0):
        """Create the converter and the text chunker.

        Args:
            chunk_size: Maximum number of tokens requested per text chunk.
            image_resolution_scale: Forwarded as ``images_scale`` to the PDF
                pipeline options.
        """
        pipeline_options = PdfPipelineOptions(
            images_scale=image_resolution_scale,
            generate_page_images=True,
            generate_picture_images=True,
        )
        self.converter = DocumentConverter(
            format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
        )
        # ``max_tokens`` is the chunker's documented token-limit keyword; the
        # previous ``max_tokens_per_chunk`` is not one of its parameters, so
        # ``chunk_size`` was not being applied.
        # NOTE(review): newer docling-core releases carry the limit on the
        # tokenizer object instead - confirm against the installed version.
        self.chunker = HybridChunker(max_tokens=chunk_size)

    def _image_to_base64(self, image: Image.Image) -> str:
        """Return the image's PNG bytes encoded as a base64 (UTF-8) string."""
        buffered = io.BytesIO()
        image.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")

    def _base64_to_image(self, b64: str) -> Image.Image:
        """Decode a base64 string back into a PIL image."""
        return Image.open(io.BytesIO(base64.b64decode(b64)))

    @staticmethod
    def _first_page(prov: Any) -> int:
        """Return ``page_no`` of the first provenance entry, or 0 if there is none."""
        return prov[0].page_no if prov else 0

    def _text_chunk(self, chunk: DocChunk) -> DocumentChunk:
        """Build a ``"text"`` chunk from a chunker result."""
        doc_items = (chunk.meta.doc_items if chunk.meta else None) or []
        # Use the first doc item that actually has provenance. Indexing
        # ``prov[0]`` on an item without provenance used to raise and the
        # whole text chunk was then discarded by the caller's error handler.
        page = next(
            (self._first_page(d.prov) for d in doc_items if d.prov),
            0,
        )
        return DocumentChunk(content=chunk.text, type="text", source_page=page, metadata={})

    def _visual_chunk(self, item: Any, doc: Any, chunk_type: ChunkType) -> DocumentChunk:
        """Build a ``"table"`` or ``"image"`` chunk from a table/picture item.

        Raises:
            ValueError: If the item has no rendered image available.
        """
        image = item.get_image(doc)
        if image is None:
            # Fail with a readable reason instead of an AttributeError on None.
            raise ValueError(f"no image available for {chunk_type} item")
        return DocumentChunk(
            content=self._image_to_base64(image),
            type=chunk_type,
            source_page=self._first_page(item.prov),
            metadata={"caption": item.caption_text(doc)},
        )

    def parse(self, file_path: str) -> List[DocumentChunk]:
        """Convert ``file_path`` and return its chunks.

        Text chunks come first (in chunker order), followed by tables and
        pictures in document iteration order. Processing is best-effort: an
        item that fails is reported with ``print`` and skipped, so one bad
        element does not abort the whole document.

        Args:
            file_path: Path of the document to convert.

        Returns:
            The list of chunks that were processed successfully.
        """
        doc = self.converter.convert(file_path).document
        text_chunks = list(self.chunker.chunk(doc))
        visual_elements = [
            item
            for item, _ in doc.iterate_items()
            if isinstance(item, (TableItem, PictureItem))
        ]

        result_chunks: List[DocumentChunk] = []
        for item in text_chunks + visual_elements:
            try:
                if isinstance(item, DocChunk):
                    result_chunks.append(self._text_chunk(item))
                elif isinstance(item, TableItem):
                    result_chunks.append(self._visual_chunk(item, doc, "table"))
                elif isinstance(item, PictureItem):
                    result_chunks.append(self._visual_chunk(item, doc, "image"))
            except Exception as e:
                # Deliberately broad: skip the failing item and keep going.
                print(f"Error processing item: {e}")
        return result_chunks

# 3. Ejecutamos el parser con un PDF de prueba

In [None]:
# Run the parser on a sample PDF and report how many chunks it produced.
pdf_path = "./creacion de valor con genAI.pdf"

# The path is relative to the current working directory. Check it up front so
# a missing file is reported with its resolved location before the converter
# is built and the conversion is attempted.
if not Path(pdf_path).is_file():
    raise FileNotFoundError(f"PDF not found: {Path(pdf_path).resolve()}")

parser = PdfParser(chunk_size=256)
chunks = parser.parse(pdf_path)

print(f"Total de chunks encontrados: {len(chunks)}")

FileNotFoundError: [Errno 2] No such file or directory: '.creacion de valor con genAI.pdf'