In [1]:
import os
import pickle
import pdfplumber
from tqdm import tqdm
from docling_parse.docling_parse import pdf_parser_v2
from docling.document_converter import DocumentConverter
from langchain_core.documents import Document

In [None]:
def extract_level_name(path:str) -> list:
    """Extract folder-structure metadata (lv1, lv2, lv3) from a file path.

    Segments are read by position after splitting on "/", so the path is
    expected to look like ``./<lv1>/<file>.pdf`` or
    ``./<lv1>/<lv2>/.../<file>.pdf`` (the leading ``.`` occupies index 0).

    Args:
        path: File path that uses "/" as its separator.

    Returns:
        A list ``[lv1, lv2, lv3]`` where:
        - ``lv1`` is the segment at index 1, or None when nothing follows it
          (i.e. the path is too short to contain a first-level folder);
        - ``lv2`` is the segment at index 2 when at least one more segment
          follows it, otherwise None (an empty segment also yields None);
        - ``lv3`` is the last segment with ".pdf" removed.
    """
    parts = path.split("/")

    # Index 1 is only a folder when another segment comes after it.
    lv1 = parts[1] if len(parts) > 2 else None

    # Decide by position, not by comparing values: index 2 is a folder only
    # when it is not the final segment. This also keeps lv2 bound for short or
    # malformed paths instead of raising UnboundLocalError / IndexError.
    lv2 = (parts[2] or None) if len(parts) > 3 else None

    lv3 = parts[-1].replace(".pdf", "")
    return [lv1, lv2, lv3]

def main_filepath_extractor(path:str, total_results=None) -> list:
    """Recursively walk a folder tree and collect the full path of every file.

    Files directly inside ``path`` are added first, then each sub-folder is
    visited in ``os.listdir`` order. Backslashes in file paths are normalised
    to "/". No extension filter is applied, so non-PDF files are returned too.

    Args:
        path: Folder to scan.
        total_results: Optional list to append to. When omitted, a fresh list
            is created for this call (a mutable default would be shared by
            every call and accumulate duplicates).

    Returns:
        The list of collected file paths (``total_results`` itself when one
        was supplied).
    """
    if total_results is None:
        total_results = []

    entries = os.listdir(path)

    for entry in entries:
        full_path = os.path.join(path, entry)
        if os.path.isfile(full_path):
            total_results.append(full_path.replace("\\", "/"))

    for entry in entries:
        if os.path.isdir(os.path.join(path, entry)):
            # Pass the accumulator down explicitly so sub-folder files land in
            # the same list the caller receives.
            main_filepath_extractor(path + "/" + entry, total_results)

    return total_results

def docling_parser(path:str):
    """Parse one PDF with Docling and pickle one LangChain ``Document`` per page.

    The page count comes from ``pdf_parser_v2``; the document is converted once
    with ``DocumentConverter`` and each page is exported to Markdown. Every
    page becomes a ``Document`` whose metadata holds the page index, the
    folder levels from ``extract_level_name``, the file name and the file path.

    Note: Docling is asked for pages 1..num_pages (``page_no`` is
    ``page_number + 1``), while the "Page" metadata value stores the 0-based
    loop index as a string.

    Args:
        path: "/"-separated path of the PDF to parse.

    Returns:
        The list of per-page ``Document`` objects on success (also written to
        ``./parsed_docs/parsed_<filename>.pkl``). On failure the error is
        printed, whatever was collected so far is written to
        ``./parsed_docs/error_<filename>.pkl`` and None is returned.
    """
    # Resolved before the try block so the error handler can always name the
    # error file and dump the partial results.
    lv1, lv2, filename = extract_level_name(path)
    results = []
    os.makedirs('./parsed_docs', exist_ok=True)

    try:
        print(f">>> Parsing Start: {path}")
        parser = pdf_parser_v2()
        doc_key = f"key={path}"
        success = parser.load_document(doc_key, path)
        if not success:
            raise RuntimeError(f"docling_parse could not load document: {path}")
        num_pages = parser.number_of_pages(doc_key)
        print(f"------ Total Page Number : {num_pages} ----------")

        converter = DocumentConverter()
        loaded_docs = converter.convert(path)
        print("------ Doc Loading is Completed ----------")

        # Iterate over every page; range(num_pages - 1) skipped the last one.
        for page_number in range(num_pages):
            docling_text = loaded_docs.document.export_to_markdown(page_no=page_number + 1)
            lang_doc = Document(
                page_content=docling_text,
                metadata={
                    "Page": str(page_number),
                    "First Division": str(lv1),
                    "Second Division": str(lv2),
                    "File Name": str(filename),
                    "File Path": str(path),
                },
            )
            results.append(lang_doc)

        print(f"------- Done (length of results: {len(results)}------------")

        # Written once, only after every page succeeded, so a parsed_*.pkl
        # file always means the document was parsed completely.
        with open(f'./parsed_docs/parsed_{filename}.pkl', 'wb') as file:
            pickle.dump(results, file)
        return results

    except Exception as e:
        # Best effort: report, keep the partial results for inspection, and let
        # the caller continue with the next file.
        print(f"Error - {e}")
        with open(f'./parsed_docs/error_{filename}.pkl', 'wb') as file:
            pickle.dump(results, file)
        return None

def main(lv1_path:str):
    """Parse every file under ``lv1_path`` that has no result pickle yet.

    A file counts as already parsed when ``./parsed_docs`` contains
    ``parsed_<filename>.pkl``, where ``<filename>`` is the last path segment
    with ".pdf" removed. Only the base name is compared, so files that share a
    base name in different folders are treated as the same document.

    Args:
        lv1_path: Top-level folder to scan for files.
    """
    total_files = main_filepath_extractor(lv1_path)
    print(total_files)

    # Make sure the output folder exists before listing it on a first run.
    os.makedirs('./parsed_docs', exist_ok=True)
    existing_pickles = set(os.listdir('./parsed_docs'))

    for file_path in tqdm(total_files):
        filename = file_path.split("/")[-1].replace(".pdf", "")

        # Skip files that are already done. Match the exact pickle name rather
        # than stripping "parsed_" / ".pkl" from anywhere inside the stored
        # names, which breaks for sources whose own name contains those strings.
        if f"parsed_{filename}.pkl" not in existing_pickles:
            print(f">>>>> Do Parsing :  {filename}")
            docling_parser(path=file_path)
        else:
            print(f">>>>> Already parsed : {filename}")

    print(">>>>> All Parsings are Completed")

# Run the pipeline over the top-level documents folder.
lv1_path = "./docs"
main(lv1_path)

['./docs/sub_cat/Meta Llama Responsible Use Guide.pdf', './docs/sub_cat/Unit_Cooler.pdf']


100%|██████████| 2/2 [00:00<00:00, 2000.14it/s]

>>>>> Already parsed : Meta Llama Responsible Use Guide
>>>>> Already parsed : Unit_Cooler
>>>>> All Parsings are Completed





In [34]:
# Choose which parsed document to inspect (the other sample is "Unit_Cooler").
filename = "Meta Llama Responsible Use Guide"
pickle_path = f'./parsed_docs/parsed_{filename}.pkl'

# Load the list of per-page documents written by docling_parser.
with open(pickle_path, 'rb') as file:
    parsed_text = pickle.load(file)

# Display how many page documents were stored.
len(parsed_text)

26

In [45]:
# Show the metadata and Markdown content of a single parsed page.
page_num = 4
selected_page = parsed_text[page_num]
print(selected_page.metadata)
print(selected_page.page_content)

{'Page': '4', 'First Division': 'docs', 'Second Division': 'sub_cat', 'File Name': 'Meta Llama Responsible Use Guide', 'File Path': './docs/sub_cat/Meta Llama Responsible Use Guide.pdf'}
<!-- image -->

## How to use this guide

This guide is a resource for developers that outlines common approaches to building responsibly at each level of an LLM-powered product. It covers best practices and considerations that developers should evaluate in the context of their specific use case and market. It also highlights some mitigation strategies and resources available to developers to address risks at various points in the system. These best practices should be considered holistically because strategies adopted at one level can impact the entire system.

The recommendations included in this guide reflect current research on responsible generative AI. We expect these to evolve as the field advances and access to foundation models grows, inviting further innovation on AI safety. Decisions to impl