In [1]:
import json
import yaml
from pathlib import Path
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions

from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
# Load variables from a local .env file into the process environment (python-dotenv).
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
# Compile the pattern once, at the top of the module, so every call reuses it.
import re
NEWLINE_PATTERN = re.compile(r'\r\n\d+')


def normalize_newlines(text: str) -> str:
    """Normalize line breaks in ``text`` (synchronous function).

    Every carriage-return + line-feed pair that is immediately followed by one
    or more digits is replaced, digits included, by a single line feed. A CRLF
    that is not followed by a digit is left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string with the substitutions applied.
    """
    cleaned = re.sub(NEWLINE_PATTERN, '\n', text)
    return cleaned

In [5]:
# Quick check of normalize_newlines: each "\r\n" followed by digits should
# come out as a bare newline, with the digits removed.
text = "Hello\r\n123World\r\n456Python"
result = normalize_newlines(text)
print(result)

Hello
World
Python


In [28]:
import os
import time
import pickle
import pdfplumber
from tqdm.auto import tqdm
from langchain_core.documents import Document


# Shared pipeline options, defined once so they can be reused across conversions:
# OCR and table-structure extraction are switched on, and EasyOCR is configured
# with the "en" and "ko" language codes.
DEFAULT_PIPELINE_OPTIONS = PdfPipelineOptions(
    do_ocr=True,
    do_table_structure=True,
    ocr_options=EasyOcrOptions(lang=["en", "ko"])
    )


def parsing_pdf_by_page_with_docling(path:str, save_folder:str, delete_source:bool=True):
    """Convert a PDF with Docling and return one LangChain ``Document`` per page.

    Each page is exported to Markdown, passed through ``normalize_newlines`` and
    prefixed with a sentence naming the file and its category. The resulting
    list is pickled to ``../00_data/{save_folder}/{name}.pkl``, where ``name``
    is the file name with ``.pdf`` removed.

    Args:
        path: Location of the PDF. Backslashes are converted to forward slashes.
        save_folder: Category name. It appears in the sentence prepended to
            every page and is the output sub-folder under ``../00_data``.
        delete_source: When true (the default), the source PDF is deleted once
            the pickle has been written. Pass ``False`` to keep it.

    Returns:
        A list with one ``Document`` per page, in page order.
        ``metadata["filename"]`` is the PDF's file name and ``metadata["page"]``
        is the zero-based page index as a string.
    """
    path = path.replace("\\", "/")
    filename = path.split("/")[-1]
    # Computed once and reused below. Keeping the quoted arguments out of the
    # f-string expression also lets the code parse on Python versions < 3.12.
    doc_name = filename.replace(".pdf", "")

    first_sentence = f"This page explains {doc_name} that belongs to {save_folder} category.\n"

    converter = DocumentConverter(
        allowed_formats=[
            InputFormat.PDF
        ],
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=DEFAULT_PIPELINE_OPTIONS,
                backend=PyPdfiumDocumentBackend
            ),}
    )
    loaded_docs = converter.convert(path)

    # pdfplumber is consulted only for the page count, so the handle is
    # released before the per-page export starts.
    with pdfplumber.open(path) as pdf:
        page_count = len(pdf.pages)

    docs = []
    for page_index in tqdm(range(page_count)):
        # The export is requested with a 1-based page number, while the
        # metadata keeps the 0-based index.
        docling_text = loaded_docs.document.export_to_markdown(page_no=page_index + 1)
        docling_text = first_sentence + normalize_newlines(docling_text)
        docs.append(
            Document(
                page_content=docling_text,
                metadata={'filename': filename, 'page': str(page_index)},
            )
        )

    output_dir = f"../00_data/{save_folder}"
    os.makedirs(output_dir, exist_ok=True)

    # Write mode, not append: appending would place a second pickled list behind
    # the first on a re-run, and a single pickle.load() would then keep
    # returning the stale first list.
    with open(f"{output_dir}/{doc_name}.pkl", 'wb') as file:
        pickle.dump(docs, file)

    # Destructive step: the input PDF is removed after a successful save.
    if delete_source and os.path.exists(path):
        os.remove(path)

    return docs

In [29]:
import os

def get_file_list(folder_path):
    """
    Return the names of the files located directly inside ``folder_path``.

    Sub-directories are skipped and not descended into. When ``folder_path`` is
    not an existing directory, a notice is printed and an empty list is
    returned; a directory that contains no files also yields an empty list.
    """
    if os.path.isdir(folder_path):
        file_names = []
        for entry in os.listdir(folder_path):
            candidate = os.path.join(folder_path, entry)
            if os.path.isfile(candidate):
                file_names.append(entry)
        return file_names

    print("유효한 폴더 경로가 아닙니다.")
    return []

In [30]:
# Category label: used as the output sub-folder and in the sentence that is
# prepended to every parsed page.
저장폴더명 = "법령"
# One folder for both listing and path building, so the paths handed to the
# parser always point at the files that were actually listed.
source_folder = "../00_data"
files = get_file_list(folder_path=source_folder)


for file in tqdm(files):
    path = f"{source_folder}/{file}"
    result = parsing_pdf_by_page_with_docling(path=path, save_folder=저장폴더명)
print(">>> ALL is completed")

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

>>> ALL is completed





In [None]:
import pickle

# Path of the pickle file to inspect
file_path = "../00_data/법령/산업안전보건기준에 관한 규칙(고용노동부령)(제00417호)(20250629).pkl"

# Open the file and load its contents. pickle.load reads a single pickled object.
# NOTE(review): unpickling can execute arbitrary code; only open files produced by this notebook.
with open(file_path, 'rb') as f:
    data = pickle.load(f)

# Inspect the loaded data
print(data)