In [1]:
import os
import pickle
import pdfplumber
from PyPDF2 import PdfReader
from tqdm import tqdm
from docling.document_converter import DocumentConverter
from langchain_core.documents import Document

In [2]:
def extract_level_name(path:str) -> list:
    """Derive folder-level metadata (lv1, lv2, file name) from a "/"-separated path.

    The path is expected to look like ``<root>/<lv1>/<file>.pdf`` or
    ``<root>/<lv1>/<lv2>/.../<file>.pdf`` (for example ``./docs/sub_cat/FWG.pdf``).

    Args:
        path: File path that uses "/" as its separator.

    Returns:
        ``[lv1, lv2, lv3]`` where ``lv1`` is the second path component, ``lv2``
        is the third component when the file lives in a sub-folder (``None``
        when the file sits directly under ``lv1``), and ``lv3`` is the last
        component with every ".pdf" occurrence removed.

    Raises:
        ValueError: If the path has fewer than three components or ends with
            "/", so that no file name can be derived.
    """
    parts = path.split("/")
    if len(parts) < 3 or not parts[-1]:
        raise ValueError(f"Cannot derive level names from path: {path!r}")

    lv1 = parts[1]
    # Decide by position rather than by comparing names: a sub-folder that
    # happens to share the file's name is still a sub-folder.
    lv2 = parts[2] if len(parts) > 3 and parts[2] else None
    # Same stripping rule the duplicate check in main() applies to file names.
    lv3 = parts[-1].replace(".pdf", "")
    return [lv1, lv2, lv3]

def main_filepath_extractor(path:str, total_results=None) -> list:
    """Recursively collect the paths of every file below ``path``.

    All regular files are collected regardless of extension; back-slashes in
    the resulting paths are replaced by "/" so downstream code can split on "/".

    Args:
        path: Folder to scan.
        total_results: Optional list to extend with the found paths. A fresh
            list is created when omitted, so repeated calls do not accumulate
            results from earlier calls.

    Returns:
        The list (``total_results`` itself when one was supplied) holding the
        files of ``path`` followed by the files of each sub-folder.
    """
    if total_results is None:
        total_results = []

    entries = os.listdir(path)

    # Files directly inside this folder come first.
    total_results.extend(
        os.path.join(path, name).replace("\\", "/")
        for name in entries
        if os.path.isfile(os.path.join(path, name))
    )

    # Then descend into each sub-folder, forwarding the same accumulator so the
    # caller's list receives the nested files too.
    for name in entries:
        if os.path.isdir(os.path.join(path, name)):
            main_filepath_extractor(path + "/" + name, total_results)

    return total_results


In [3]:

def docling_parser(path:str):
    """Convert one PDF with docling and pickle one LangChain Document per page.

    For every page the markdown export of that page is wrapped in a
    ``Document`` whose metadata carries the 0-based page index ("Page"), the
    folder levels returned by ``extract_level_name`` and the file name/path.
    On success the list is written to ``./parsed_docs/parsed_<name>.pkl``.
    On any failure the pages collected so far are written to
    ``./parsed_docs/error_<name>.pkl`` instead, so a failed run is never
    mistaken for a finished one.

    Args:
        path: "/"-separated path of the PDF to parse.

    Returns:
        The list of ``Document`` objects on success, ``None`` when parsing
        failed (the error is printed, not raised).
    """
    # Bound before the try block so the error handler can always use them,
    # even when the failure happens before the first page is processed.
    results = []
    filename = path.split("/")[-1].replace(".pdf", "")
    os.makedirs('./parsed_docs', exist_ok=True)

    try:
        print(f">>> Parsing Start: {path}")

        lv1, lv2, filename = extract_level_name(path)
        converter = DocumentConverter()
        loaded_docs = converter.convert(path)

        # pdfplumber is only needed to learn how many pages the PDF has.
        with pdfplumber.open(path) as pdf:
            page_count = len(pdf.pages)

        for page_number in tqdm(range(page_count)):
            # docling page numbers are 1-based; the stored "Page" metadata is 0-based.
            docling_text = loaded_docs.document.export_to_markdown(page_no=page_number + 1)
            lang_doc = Document(
                page_content=docling_text,
                metadata={
                    "Page": str(page_number),
                    "First Division": str(lv1),
                    "Second Division": str(lv2),
                    "File Name": str(filename),
                    "File Path": str(path),
                },
            )
            results.append(lang_doc)

        print(f"------- Done (length of results: {len(results)}------------")

        # Written once, after all pages succeeded: a parsed_*.pkl file therefore
        # always holds a single, complete list.
        with open(f'./parsed_docs/parsed_{filename}.pkl', 'wb') as file:
            pickle.dump(results, file)
        return results

    except Exception as e:
        print(f"Error - {e}")
        with open(f'./parsed_docs/error_{filename}.pkl', 'wb') as file:
            pickle.dump(results, file)
        return None

def main(lv1_path:str):
    """Parse every file below ``lv1_path`` that has no finished pickle yet.

    A file counts as already parsed when ``./parsed_docs`` contains
    ``parsed_<name>.pkl``, where ``<name>`` is the file name with ".pdf"
    removed. Error pickles (``error_<name>.pkl``) do not count, so failed
    files are retried on the next run.

    Args:
        lv1_path: Root folder that is scanned recursively for files.
    """
    total_files = main_filepath_extractor(lv1_path)
    print(total_files)

    # A fresh checkout has no output folder yet.
    os.makedirs('./parsed_docs', exist_ok=True)

    # Strip only the leading "parsed_" and trailing ".pkl" so that document
    # names which themselves contain those substrings still match.
    prefix, suffix = "parsed_", ".pkl"
    already_parsed = {
        name[len(prefix):-len(suffix)]
        for name in os.listdir('./parsed_docs')
        if name.startswith(prefix) and name.endswith(suffix)
    }

    for file_path in tqdm(total_files):
        filename = file_path.split("/")[-1]
        filename = filename.replace(".pdf", "")

        if filename not in already_parsed:   # skip files that were parsed before
            print(f">>>>> Do Parsing :  {filename}")
            docling_parser(path=file_path)
        else:
            print(f">>>>> Already parsed : {filename}")

    print(">>>>> All Parsings are Completed")

# Root folder that main() scans recursively for files to parse.
lv1_path = "./docs"
# Runs the whole pipeline; files that already have a pickle in ./parsed_docs are skipped.
main(lv1_path=lv1_path)

['./docs/sub_cat/DNV Rules for Classification of Ships _2016_39_Fishing vessels.pdf', './docs/sub_cat/FWG.pdf', './docs/sub_cat/Guidance for Approval of Risk-based Ship Design_2015.pdf', './docs/sub_cat/Meta Llama Responsible Use Guide.pdf', './docs/sub_cat/PART 11_2014_Common Structural Rules for Bulk Carriers.pdf', './docs/sub_cat/Unit_Cooler.pdf', './docs/sub_cat/WinGD-Portfolio-Engines_2024_Digital.pdf']


  0%|          | 0/7 [00:00<?, ?it/s]

>>>>> Already parsed : DNV Rules for Classification of Ships _2016_39_Fishing vessels
>>>>> Do Parsing :  FWG
>>> Parsing Start: ./docs/sub_cat/FWG.pdf


  from .autonotebook import tqdm as notebook_tqdm
Fetching 9 files: 100%|██████████| 9/9 [00:00<?, ?it/s]
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
100%|██████████| 7/7 [00:00<00:00, 281.60it/s]
100%|██████████| 7/7 [01:07<00:00,  9.64s/it]

------- Done (length of results: 7------------
>>>>> Already parsed : Guidance for Approval of Risk-based Ship Design_2015
>>>>> Already parsed : Meta Llama Responsible Use Guide
>>>>> Already parsed : PART 11_2014_Common Structural Rules for Bulk Carriers
>>>>> Already parsed : Unit_Cooler
>>>>> Already parsed : WinGD-Portfolio-Engines_2024_Digital
>>>>> All Parsings are Completed





In [44]:
# Load the pickled parse result of one document for manual inspection.
filename = "PART 11_2014_Common Structural Rules for Bulk Carriers"
# filename = "Unit_Cooler"  # alternative sample document
# NOTE: pickle.load can execute arbitrary code; only open pickles written by this notebook.
with open(f'./parsed_docs/parsed_{filename}.pkl', 'rb') as file:
    parsed_text = pickle.load(file)

# Number of entries in the loaded list (docling_parser stores one Document per PDF page).
len(parsed_text)

524

In [45]:
# 0-based index into the loaded list; docling_parser stores the same 0-based value as "Page" metadata.
page_num = 13
print(parsed_text[page_num].metadata)
print(parsed_text[page_num].page_content)

{'Page': '13', 'First Division': 'docs', 'Second Division': 'sub_cat', 'File Name': 'PART 11_2014_Common Structural Rules for Bulk Carriers', 'File Path': './docs/sub_cat/PART 11_2014_Common Structural Rules for Bulk Carriers.pdf'}
| Plan or document                                                         | Containing also in formation on                                                                         |
|--------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------|
| Bulwarks and freeing ports                                               | Arrangement and dimensions of bulwarks and freeing ports on  the freeboard deck and superstructure deck |
| Windows and side scuttles, arrangements and de- tails                    |                                                                                                         |
| Scuppers and sanitary discharges  