# Parsing

In [2]:
import json
import yaml
from pathlib import Path
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions

from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Compile the pattern once at module level so every call reuses it
import re
# Matches a CRLF line break immediately followed by one or more digits.
# NOTE(review): the digits are consumed together with the line break, so
# "a\r\n12b" becomes "a\nb" -- confirm that dropping them is intended.
NEWLINE_PATTERN = re.compile(r'\r\n\d+')


def normalize_newlines(text: str) -> str:
    """Replace every ``NEWLINE_PATTERN`` match in ``text`` with a single LF.

    Synchronous helper. Text that contains no match is returned unchanged.
    """
    normalized = re.sub(NEWLINE_PATTERN, '\n', text)
    return normalized

In [9]:
import os
import time
import pickle
import pdfplumber
from tqdm.auto import tqdm
from langchain_core.documents import Document


# Pipeline options shared by every conversion: OCR (EasyOCR with English and
# Korean) and table-structure recognition are both switched on.
DEFAULT_PIPELINE_OPTIONS = PdfPipelineOptions(
    ocr_options=EasyOcrOptions(lang=["en", "ko"]),
    do_table_structure=True,
    do_ocr=True,
)


def parsing_pdf_by_page_with_docling(
    path: str,
    lv1_cat: str,
    lv2_cat: str,
    delete_source: bool = True,
    output_root: str = "../docs",
):
    """Convert a PDF with Docling and return one LangChain ``Document`` per page.

    Each page is exported to Markdown, stripped of ``<!-- image -->``
    placeholders, passed through ``normalize_newlines`` and prefixed with a
    sentence naming the file and its two categories.

    Args:
        path: Path to the PDF. Backslashes are converted to forward slashes.
        lv1_cat: First-level category label, stored in the metadata.
        lv2_cat: Second-level category label, stored in the metadata.
        delete_source: When True (the default, matching the original
            behaviour) the source PDF is deleted after the pickle is written.
            Pass False to keep the input file.
        output_root: Directory under which ``{lv1_cat}_{lv2_cat}/`` is created
            for the pickled result.

    Returns:
        The list of page documents. Their metadata holds ``filename``,
        ``lv1_cat``, ``lv2_cat`` and ``page``; ``page`` is the zero-based page
        index as a string, while Docling itself is queried with 1-based
        page numbers.

    Side effects:
        Writes ``{output_root}/{lv1_cat}_{lv2_cat}/{name}.pkl`` (overwriting an
        existing file) and, if ``delete_source`` is True, removes ``path``.
    """
    path = path.replace("\\", "/")
    filename = path.split("/")[-1]
    parsed_filename = filename.replace(".pdf", "")

    # Built from a pre-computed name: nesting double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12.
    first_sentence = (
        f"This page explains {parsed_filename} that belongs to "
        f"{lv1_cat} and  {lv2_cat} categories.\n"
    )

    converter = DocumentConverter(
        allowed_formats=[InputFormat.PDF],
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=DEFAULT_PIPELINE_OPTIONS,
                backend=PyPdfiumDocumentBackend,
            ),
        },
    )
    loaded_docs = converter.convert(path)

    # pdfplumber is used only to learn how many pages the PDF has.
    with pdfplumber.open(path) as pdf:
        page_count = len(pdf.pages)

    docs = []
    for page_num in tqdm(range(page_count)):
        docling_text = loaded_docs.document.export_to_markdown(page_no=page_num + 1)
        docling_text = docling_text.replace("<!-- image -->", "")
        docling_text = normalize_newlines(docling_text)
        docling_text = first_sentence + docling_text
        docs.append(
            Document(
                page_content=docling_text,
                metadata={
                    'filename': filename,
                    'lv1_cat': lv1_cat,
                    'lv2_cat': lv2_cat,
                    'page': str(page_num),
                },
            )
        )

    output_dir = f"{output_root}/{lv1_cat}_{lv2_cat}"
    os.makedirs(output_dir, exist_ok=True)

    # 'wb', not 'ab': appending would stack a second pickled object behind the
    # first on every re-run, and a single pickle.load() only reads the first.
    with open(f"{output_dir}/{parsed_filename}.pkl", 'wb') as file:
        pickle.dump(docs, file)

    if delete_source and os.path.exists(path):
        os.remove(path)

    return docs

In [10]:
import os

def get_file_list(folder_path):
    """
    Return the names of all regular files directly inside ``folder_path``.

    Subdirectories are skipped. If ``folder_path`` is not an existing
    directory, a notice is printed and an empty list is returned; a directory
    that contains no files also yields an empty list.
    """
    if os.path.isdir(folder_path):
        file_names = []
        for entry in os.listdir(folder_path):
            if os.path.isfile(os.path.join(folder_path, entry)):
                file_names.append(entry)
        return file_names

    print("유효한 폴더 경로가 아닙니다.")
    return []

In [11]:
# Category labels attached to every parsed page, plus the files waiting in ../data.
lv1_cat = "RULE"
lv2_cat = "KR"
files = get_file_list("../data")
files

['KR Notation Guide_2025.pdf']

In [12]:
# Parse every file listed from ../data. `result` ends up holding the pages of
# the last file processed. Note that the parser removes each source PDF once
# it has been pickled, so this loop empties ../data.
for file in tqdm(files):
    path = f"../data/{file}"
    result = parsing_pdf_by_page_with_docling(path, lv1_cat, lv2_cat)
print(">>> ALL is completed")

  0%|          | 0/1 [00:00<?, ?it/s]2025-10-03 21:47:18,659 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-03 21:47:18,668 - INFO - Going to convert document batch...
2025-10-03 21:47:18,670 - INFO - Initializing pipeline for StandardPdfPipeline with options hash acb7af01651b138d096f2f800318bf95
2025-10-03 21:47:18,917 - INFO - Loading plugin 'docling_defaults'
2025-10-03 21:47:18,956 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-10-03 21:47:19,033 - INFO - Loading plugin 'docling_defaults'
2025-10-03 21:47:19,187 - INFO - Registered ocr engines: ['easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-10-03 21:47:23,685 - INFO - Accelerator device: 'cpu'
2025-10-03 21:47:26,347 - INFO - Accelerator device: 'cpu'
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To su

>>> ALL is completed





# Elasticsearch

In [None]:
import pickle

doc_path = "../docs/RULE_KR"

# Keep only the .pkl files: any other file in the folder would make
# pickle.load fail.
files = [name for name in get_file_list(folder_path=doc_path) if name.endswith(".pkl")]
print(files)

documents = []
for file in files:
    file_path = f"{doc_path}/{file}"
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load pickles produced by this notebook, never untrusted files.
    with open(file_path, 'rb') as f:
        document = pickle.load(f)
    documents.append(document)

# Sanity check: page count of the first file and a short preview. Printing the
# whole list floods the cell output with hundreds of pages.
if documents:
    print(len(documents[0]))
    print(documents[0][:2])
else:
    print(f"No pickled documents found in {doc_path}")

['KR Notation Guide_2025.pkl']
249
[Document(metadata={'filename': 'KR Notation Guide_2025.pdf', 'lv1_cat': 'RULE', 'lv2_cat': 'KR', 'page': '0'}, page_content='This page explains KR Notation Guide_2025 that belongs to RULE and  KR categories.\n\n\n## 2025\n\n## Notation Guide\n\n## KR'), Document(metadata={'filename': 'KR Notation Guide_2025.pdf', 'lv1_cat': 'RULE', 'lv2_cat': 'KR', 'page': '1'}, page_content='This page explains KR Notation Guide_2025 that belongs to RULE and  KR categories.\n## CONTENTS\n\n| CHAPTER 1                                                                                                                             | GENERAL ···········································································································  1                  |\n|---------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------

In [26]:
from langchain_ollama import OllamaEmbeddings
from langchain_elasticsearch import ElasticsearchStore, DenseVectorStrategy

def init_elastic_vectorstore(
    index_name: str,
    es_url: str = "http://localhost:9200",
    ollama_url: str = "http://localhost:11434",
    embedding_model: str = "bge-m3:latest",
):
    """Create an ``ElasticsearchStore`` whose embeddings come from Ollama.

    Args:
        index_name: Name of the Elasticsearch index the store reads and writes.
        es_url: Base URL of the Elasticsearch node.
        ollama_url: Base URL of the Ollama server that computes embeddings.
        embedding_model: Ollama model tag used for the embeddings.

    Returns:
        The configured ``ElasticsearchStore``. No credentials and no retrieval
        strategy are passed, so the library defaults apply.
    """
    embedding = OllamaEmbeddings(base_url=ollama_url, model=embedding_model)
    return ElasticsearchStore(
        index_name=index_name,
        embedding=embedding,
        es_url=es_url,
    )

In [32]:
# Build the vector store for the "rule" index; the trailing expression shows its repr.
index_name = "rule"
vector_store = init_elastic_vectorstore(index_name)
vector_store

2025-10-03 22:24:44,779 - INFO - GET http://localhost:9200/ [status:200 duration:0.012s]


<langchain_elasticsearch.vectorstores.ElasticsearchStore at 0x2033a723710>

In [33]:
from tqdm.auto import tqdm
from uuid import uuid4

from uuid import NAMESPACE_URL, uuid5


def _page_id(page, position: int) -> str:
    """Build a stable ID for one page from its category, file name and page metadata."""
    meta = page.metadata
    source = "/".join(str(meta.get(key, "")) for key in ("lv1_cat", "lv2_cat", "filename"))
    # Fall back to the list position when a document carries no 'page' entry.
    return str(uuid5(NAMESPACE_URL, f"{source}#{meta.get('page', position)}"))


# IDs are derived from the page metadata rather than random uuid4 values, so
# re-running this cell overwrites the same Elasticsearch documents instead of
# indexing a second copy of every page.
for doc in tqdm(documents):
    uuids = [_page_id(page, position) for position, page in enumerate(doc)]
    vector_store.add_documents(documents=doc, ids=uuids)

  0%|          | 0/1 [00:00<?, ?it/s]2025-10-03 22:24:48,632 - INFO - HEAD http://localhost:9200/rule [status:404 duration:0.006s]
2025-10-03 22:24:48,909 - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"
2025-10-03 22:24:50,286 - INFO - PUT http://localhost:9200/rule [status:200 duration:1.374s]
2025-10-03 22:45:06,791 - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"
2025-10-03 22:45:09,582 - INFO - PUT http://localhost:9200/_bulk?refresh=true [status:200 duration:2.402s]
100%|██████████| 1/1 [20:20<00:00, 1220.97s/it]


In [7]:
from elasticsearch import Elasticsearch

def get_elastic_index_list(es_url: str = "http://localhost:9200", basic_auth=None):
    """Return one dict per Elasticsearch index, as reported by ``_cat/indices``.

    Args:
        es_url: Base URL of the Elasticsearch node.
        basic_auth: Optional ``(user, password)`` tuple. When omitted, the
            ``ES_USER`` / ``ES_PASSWORD`` environment variables are used if
            both are set; otherwise the client connects without credentials.
            Credentials are deliberately no longer hard-coded in the notebook.

    Returns:
        A list of dicts, one per index.
    """
    import os

    if basic_auth is None:
        user, password = os.environ.get("ES_USER"), os.environ.get("ES_PASSWORD")
        if user and password:
            basic_auth = (user, password)

    client_kwargs = {"basic_auth": basic_auth} if basic_auth else {}
    es = Elasticsearch([es_url], **client_kwargs)
    try:
        # Get all index names
        response = es.cat.indices(index='*', format='json')
        return list(response)
    finally:
        # Release the client's connection pool once the listing is done.
        es.close()

# Fetch the index listing; the trailing expression renders it as the cell output.
index_list = get_elastic_index_list()
index_list

[{'health': 'yellow',
  'status': 'open',
  'index': 'rule',
  'uuid': 'rkTAl1bmSjCkU_yCrjMdAw',
  'pri': '1',
  'rep': '1',
  'docs.count': '249',
  'docs.deleted': '0',
  'store.size': '3.3mb',
  'pri.store.size': '3.3mb',
  'dataset.size': '3.3mb'}]

In [35]:
def delete_index(index_name: str, es_url: str = "http://localhost:9200", basic_auth=None):
    """Delete an Elasticsearch index and print the server's response.

    This is destructive: every document stored in ``index_name`` is removed.

    Args:
        index_name: Name of the index to delete.
        es_url: Base URL of the Elasticsearch node.
        basic_auth: Optional ``(user, password)`` tuple. When omitted, the
            ``ES_USER`` / ``ES_PASSWORD`` environment variables are used if
            both are set; otherwise the client connects without credentials.
            Credentials are deliberately no longer hard-coded in the notebook.
    """
    import os

    if basic_auth is None:
        user, password = os.environ.get("ES_USER"), os.environ.get("ES_PASSWORD")
        if user and password:
            basic_auth = (user, password)

    client_kwargs = {"basic_auth": basic_auth} if basic_auth else {}
    es = Elasticsearch([es_url], **client_kwargs)
    try:
        response = es.indices.delete(index=index_name)
        print(f"Index '{index_name}' deletion response:", response)
    finally:
        # Release the client's connection pool even if the delete call raises.
        es.close()

# delete_index(index_name="ship_safety")

In [1]:
import requests

url = 'http://127.0.0.1:9200/_cat/indices?v'
# The timeout keeps the cell from hanging indefinitely when Elasticsearch is unreachable.
response = requests.get(url, timeout=10)

if response.status_code == 200:
    print(response.text)
else:
    # The f prefix was missing, so the placeholder was printed literally
    # instead of the actual status code.
    print(f"Failed to retrieve indices. Status code: {response.status_code}")

health status index uuid                   pri rep docs.count docs.deleted store.size pri.store.size dataset.size
yellow open   rule  rkTAl1bmSjCkU_yCrjMdAw   1   1        249            0      3.3mb          3.3mb        3.3mb



In [5]:
# Display the HTTP response object returned by the _cat/indices request.
response

<Response [200]>