# 이미지 PDF 로더 (with EasyOCR)

In [1]:
# uv add easyocr
# uv add -U PyMupdf

In [2]:
import fitz
import numpy as np
from PIL import Image
import easyocr
from langchain.docstore.document import Document

In [3]:
# reader = easyocr.Reader(['ko', 'en'], gpu=True)
# file_path = '../data/Image_Samsung_Electronics_Sustainability_Report_2025_KOR_partial.pdf'

# # with fitz.open(file_path) as pdf:
# #     mat = fitz.Matrix(300, 300)
# #     pix = pdf[0].get_pixmap(matrix=mat, alpha=False)
# #     img = Image.frombytes("RGB")

# with fitz.open(file_path) as pdf:
#     dpi = 300
#     zoom = dpi / 72.0
#     mat = fitz.Matrix(zoom, zoom)
#     mat = fitz.Matrix(300, 300)
#     pix = pdf[0].get_pixmap(matrix=mat, alpha=False)
#     img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
#     arr = np.array(img)
#     result = reader.readtext(arr, detail=0, paragraph=True)

In [4]:
def load_pdf_with_easyocr(
    pdf_path: str,
    languages=("ko", "en"),  # OCR languages: Korean + English
    gpu: bool = True,        # run EasyOCR on the GPU
    dpi: int = 300,          # render resolution used for OCR quality
    page_from=None,          # 1-based first page; None = first page
    page_to=None,            # inclusive last page; None = last page
):
    """OCR a PDF page by page and return one ``Document`` per page.

    Each selected page is rendered to an RGB bitmap at ``dpi`` with PyMuPDF,
    passed through EasyOCR, and the recognised paragraphs are joined with
    newlines into the page's ``page_content``.

    Args:
        pdf_path: Path of the PDF file to open.
        languages: Language codes handed to ``easyocr.Reader``.
        gpu: Forwarded to ``easyocr.Reader(gpu=...)``.
        dpi: Render resolution; must be positive.
        page_from: 1-based first page to process. Values below 1 are
            clamped to 1; ``None`` starts at the first page.
        page_to: 1-based last page to process (inclusive). Values beyond the
            page count are clamped; ``None`` runs to the last page.

    Returns:
        A list of ``Document`` objects in page order. Metadata holds
        ``source``, the 1-based ``page``, ``dpi`` and ``ocr_engine``. The
        list is empty when the clamped range selects no pages.

    Raises:
        ValueError: If ``dpi`` is not positive.
    """
    # Fail fast: a non-positive DPI cannot produce a usable bitmap, and
    # checking here avoids loading the OCR models just to fail later.
    if dpi <= 0:
        raise ValueError(f"dpi must be a positive number, got {dpi!r}")

    reader = easyocr.Reader(list(languages), gpu=gpu)

    # PDF coordinates are in points (72 per inch), so dpi / 72 is the scale
    # factor. It is the same for every page, so build the matrix once.
    zoom = dpi / 72.0
    mat = fitz.Matrix(zoom, zoom)

    docs = []
    with fitz.open(pdf_path) as pdf:
        total = len(pdf)
        start = 1 if page_from is None else max(1, page_from)
        end = total if page_to is None else min(total, page_to)

        # `start`/`end` are 1-based and inclusive; page indices are 0-based.
        for p in range(start - 1, end):
            # alpha=False keeps the sample buffer at 3 bytes per pixel,
            # matching the "RGB" mode passed to PIL below.
            pix = pdf[p].get_pixmap(matrix=mat, alpha=False)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            # detail=0 -> text only; paragraph=True -> merged into paragraphs.
            result = reader.readtext(np.array(img), detail=0, paragraph=True)
            text = "\n".join(result) if result else ""

            docs.append(Document(
                page_content=text,
                metadata={"source": pdf_path, "page": p + 1, "dpi": dpi, "ocr_engine": "EasyOCR"},
            ))

    return docs

In [6]:
# Sample image-based PDF to run through the OCR loader.
file_path = '../data/Image_Samsung_Electronics_Sustainability_Report_2025_KOR_partial.pdf'
# No page range is given, so every page is OCR'd (Korean + English, on GPU).
docs = load_pdf_with_easyocr(
    file_path,
    languages=['ko', 'en'],
    gpu=True,
    dpi=300,
)

In [8]:
# Number of Documents returned by the loader (it emits one per processed page).
len(docs)

30

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the per-page OCR documents into overlapping chunks
# (chunk_size=1000, chunk_overlap=100).
# NOTE(review): the variable name `recursive_spliiter` (sic) is kept as-is
# because later notebook cells may reference it.
recursive_spliiter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
recursive_chunks = recursive_spliiter.split_documents(docs)
# Report how many chunks the split produced.
print(len(recursive_chunks), "전체 잘린 chunk 사이즈")