## Convirtiendo un PDF en imágenes

In [None]:
from document_transformer import Document

class PDFDocument(Document):
    """Document subclass used to represent PDF documents.

    Adds no behavior of its own; everything is inherited from ``Document``.
    """

class ImageDocument(Document):
    """Document subclass used to represent image documents."""

    def saver(self, path):
        """Write the object held in ``self.data`` to ``path``.

        Delegates to ``self.data.save(path)``, so ``self.data`` must expose a
        ``save`` method accepting the destination path.
        NOTE(review): presumably this is the hook the inherited
        ``Document.save`` calls -- confirm against the library.

        Args:
            path: Destination passed unchanged to ``self.data.save``.

        Returns:
            This document instance.
        """
        payload = self.data
        payload.save(path)
        return self

: 

In [None]:
#!pip install pdf2image

In [None]:
from document_transformer import DocumentTransformer
import pdf2image  # install: pip install pdf2image
from typing import List
from pathlib import Path

class PDF2Images(DocumentTransformer):
    """Transformer that turns one PDF document into one image document per page."""

    input: PDFDocument = None
    output: List[ImageDocument] = []

    def transformer(self) -> List[ImageDocument]:
        """Render the input PDF with pdf2image and wrap each page image.

        Reads the PDF located at ``self.input.path``.

        Returns:
            One ``ImageDocument`` per image returned by
            ``pdf2image.convert_from_path``, in the same order. Each carries
            the image as ``data`` and a metadata dict with the PDF file name
            (``pdf_path``), the 1-based page number (``page``) and the image
            ``size``.
        """
        source_path = self.input.path
        rendered_pages = pdf2image.convert_from_path(source_path)

        page_documents = []
        # Page numbers are 1-based, hence start=1.
        for page_number, page_image in enumerate(rendered_pages, start=1):
            page_metadata = {
                'pdf_path': Path(source_path).name,
                'page': page_number,
                'size': page_image.size,
            }
            page_documents.append(
                ImageDocument(metadata=page_metadata, data=page_image)
            )
        return page_documents

In [None]:
# Wrap the source PDF and run the transformer directly on it.
pdf_doc = PDFDocument(path="document.pdf")
images = PDF2Images(input=pdf_doc).run()

# Save each resulting page image, then print its id, parents and metadata.
for image in images:
    page_number = image.metadata["page"]
    image.save(path=f'images/pag_{page_number}.jpg')
    print(f"Imagen: {image.id}")
    print(f"Parents: {image.parents}")
    print(f"Metadata: {image.metadata}")

In [None]:
from document_transformer import Pipeline, plot_graph

# Build the pipeline; append more transformers to this list as needed.
steps = [
    PDF2Images(to="images/pag_{metadata[page]}.jpg"),
    # Images2Markdown(to="images/pag_{metadata[page]}.md"),
    # ...
]
pipeline = Pipeline(transformers=steps)

# Define the input document and run it through the pipeline.
pdf_doc = PDFDocument(path="document.pdf")
images = pipeline.run(input=pdf_doc)

# Plot the graph built from the pipeline's transformer traces.
traces = pipeline.get_traces()
plot_graph(traces)