In [1]:
import pdfplumber
from typing import Iterator
from langchain_core.documents import Document
from paddleocr import PaddleOCR
from pprint import pprint
import re
import os
import pickle
from tqdm import tqdm
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings

In [2]:
def create_folder_if_not_exists(folder_path:str):
    """Create ``folder_path`` (including missing parents) unless it already exists.

    Used to prepare the folder that page images are saved into. A message is
    printed in either case; nothing is returned.
    """
    if os.path.exists(folder_path):
        print(f"폴더가 이미 존재합니다: {folder_path}")
        return
    os.makedirs(folder_path)
    print(f"폴더가 생성되었습니다: {folder_path}")

def save_pdf_to_img(path:str, file_name:str, page_num:int):
    """Render one page of a PDF to a PNG file and return the PNG's path.

    Args:
        path: Path of the PDF to open with pdfplumber.
        file_name: Name used for both the sub-folder under ``images`` and the
            PNG file prefix.
        page_num: Zero-based index into ``pdf.pages``.

    Returns:
        The path the image was written to:
        ``<cwd>/images/<file_name>/<file_name>_<page_num>.png``.
        The target folder is not created here.
    """
    save_path = f"{os.getcwd()}/images/{file_name}/{file_name}_{page_num}.png"
    with pdfplumber.open(path) as pdf:
        page_image = pdf.pages[page_num].to_image(resolution=150)
        page_image.save(save_path, format="PNG")
    return save_path

table_settings={      # settings passed to pdfplumber's extract_tables (modelled on the defaults documented on GitHub)
    # Both strategies are "lines"; the explicit line lists are left empty.
    "vertical_strategy": "lines",
    "horizontal_strategy": "lines",
    "explicit_vertical_lines": [],
    "explicit_horizontal_lines": [],
    "snap_tolerance": 3,
    "snap_x_tolerance": 3,
    "snap_y_tolerance": 3,
    "join_tolerance": 3,
    "join_x_tolerance": 3,
    "join_y_tolerance": 3,
    "edge_min_length": 3,
    "min_words_vertical": 3,
    "min_words_horizontal": 1,
    "intersection_tolerance": 3,
    "intersection_x_tolerance": 3,
    "intersection_y_tolerance": 3,
    "text_tolerance": 3,
    "text_x_tolerance": 3,
    "text_y_tolerance": 3,
    # "text_*": …,
    }

def convert_header_to_separator(header: str) -> str:
    """Build the Markdown separator row that goes under a table header row.

    Every cell of ``header`` (the text between ``|`` delimiters) is replaced by
    hyphens of the same length. An empty cell still gets one hyphen, because
    Markdown needs at least one ``-`` per column to render the table.

    Args:
        header: A pipe-delimited header row such as ``"|Name||Age|"``.

    Returns:
        The separator row, e.g. ``"|----|-|---|"``.
    """
    parts = header.split("|")
    last_index = len(parts) - 1
    separator_parts = []
    for index, part in enumerate(parts):
        # An empty first/last piece is just the text outside a leading or
        # trailing "|", not a real column, so it must stay empty.
        if part == "" and index in (0, last_index):
            separator_parts.append("")
        else:
            separator_parts.append("-" * max(1, len(part)))
    return "|".join(separator_parts)

def table_parser(pdf_path:str, page_num:int, crop:bool=False) -> list:
    """Extract every table on one PDF page as a Markdown-formatted string.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.
        page_num: Zero-based index into ``pdf.pages``.
        crop: When True, the page is first cropped to the fixed box
            ``(3, 70, 590, 770)`` (the full page default is noted in the
            original as ``(0, 0, 595, 841)``). Defaults to False.

    Returns:
        A list with one string per table. Rows are ``|``-delimited lines, and a
        separator row is inserted after the first row. ``None`` cells become the
        text ``'None'`` and line breaks inside cells become spaces.
    """
    markdown_tables = []
    with pdfplumber.open(pdf_path) as pdf:
        target_page = pdf.pages[page_num]
        if crop:
            target_page = target_page.crop((3, 70, 590, 770), relative=False, strict=True)

        for raw_table in target_page.extract_tables(table_settings = table_settings):
            lines = []
            for row_index, raw_row in enumerate(raw_table):
                cells = []
                for cell in raw_row:
                    if cell is None:
                        cells.append('None')
                    else:
                        # Wrapped cell text comes back with embedded newlines.
                        cells.append(cell.replace('\n', ' '))
                row_line = '|' + '|'.join(cells) + '|'
                lines.append(row_line)
                if row_index == 0:
                    # The first row is treated as the header.
                    lines.append(convert_header_to_separator(row_line))
            markdown_tables.append('\n'.join(lines))
    return markdown_tables

def extract_level_name(path:str) -> list:
    """Derive folder-level metadata ``[lv1, lv2, lv3]`` from a file path.

    The first path component is treated as the root marker (``"."`` in
    ``"./Rules/DNV/x.pdf"``) and ignored. ``lv1`` and ``lv2`` are the first and
    second folders after it, and ``lv3`` is the file name with ``".pdf"``
    removed.

    Both ``/`` and ``\\`` separators are accepted. A level that is missing or
    empty is returned as ``None`` instead of raising.

    Args:
        path: e.g. ``"./Rules/DNV/x.pdf"`` or ``".\\2024\\POS\\FWG.pdf"``.

    Returns:
        ``[lv1, lv2, lv3]``, e.g. ``["Rules", "DNV", "x"]``; ``lv2`` is ``None``
        when the file sits directly inside the ``lv1`` folder.
    """
    parts = path.replace("\\", "/").split("/")
    # Drop the root marker in front and the file name at the end.
    folders = parts[1:-1]
    lv1 = (folders[0] or None) if len(folders) >= 1 else None
    lv2 = (folders[1] or None) if len(folders) >= 2 else None
    # Kept as a plain replace so names match the "File Name" values already
    # stored in the vector DB.
    lv3 = parts[-1].replace(".pdf", "")
    return [lv1, lv2, lv3]

total_results = []  # mirrors the result of the most recent main_filepath_extractor() call

def _collect_file_paths(path:str) -> list:
    """Return the files directly in ``path`` followed by those of each sub-folder, recursively."""
    entries = os.listdir(path)
    found = [
        os.path.join(path, entry).replace("\\", "/")
        for entry in entries
        if os.path.isfile(os.path.join(path, entry))
    ]
    for entry in entries:
        if os.path.isdir(os.path.join(path, entry)):
            found.extend(_collect_file_paths(path + "/" + entry))
    return found

def main_filepath_extractor(path:str) -> list:
    """Recursively collect the full path of every file under ``path``.

    Paths are returned with ``/`` separators. Each call starts from an empty
    list, so re-running the cell no longer returns duplicated entries. The
    module-level ``total_results`` is rebound to the same list for code that
    still reads it.

    NOTE(review): every file is collected, not only PDFs; filter on the
    extension at the call site if the tree may contain other files.

    Args:
        path: Root folder to scan.

    Returns:
        List of file paths found under ``path``.
    """
    global total_results
    total_results = _collect_file_paths(path)
    return total_results

def _ocr_page_text(ocr, img_path:str) -> str:
    """Run OCR on a page image and join every recognised text line with spaces."""
    recognised = []
    for page_result in ocr.ocr(img_path) or []:
        if not page_result:
            # Nothing usable came back for this image; skip it rather than
            # failing or inserting placeholder text into the page content.
            continue
        for line in page_result:
            try:
                recognised.append(line[1][0])
            except (TypeError, IndexError):
                # Skip an entry that does not have the expected nested layout.
                continue
    return " ".join(recognised)

def main_parser(path:str, crop:bool=False, lang:str="en") -> Iterator[Document]:
    '''
    Parse a PDF into one langchain ``Document`` per page.

    - pdfplumber: table extraction and page-image rendering
    - PyPDF2: text extraction
    - PaddleOCR: fallback when a page yields no extractable text
    - lang candidates: ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka', 'latin', 'arabic', 'cyrillic', 'devanagari']

    Every page is also saved as a PNG under ``<cwd>/images/<file name>/``.

    Args:
        path: "/"-separated path of the PDF file.
        crop: Forwarded to ``table_parser`` to crop the page before table extraction.
        lang: Language passed to ``PaddleOCR``.

    Returns:
        A list of ``Document`` objects, one per page. ``page_content`` is the
        page text followed by all of the page's tables, each separated by a
        blank line. The metadata keys are "Page", "First Division",
        "Second Division", "File Name" and "File Path".
    '''
    file_name = path.split("/")[-1].split(".")[0].strip()
    img_save_folder = os.path.join(os.getcwd(), f"images/{file_name}")  # images/<file_name> under the working directory
    create_folder_if_not_exists(img_save_folder)
    ocr = PaddleOCR(use_angle_cls=True, lang=lang)

    # Per-file values: computed once instead of once per page.
    level_names = extract_level_name(path)
    reader = PdfReader(path)

    full_result = []
    for page_number, page in enumerate(tqdm(reader.pages)):
        img_path = save_pdf_to_img(path, file_name, page_number)
        text_result = (page.extract_text() or "").replace("\n", " ").replace("- ", "").replace("  ", " ")

        if len(text_result) == 0:  # no text layer on this page, so fall back to OCR
            print("이미지 OCR")
            text_result = _ocr_page_text(ocr, img_path)

        table_result = table_parser(path, page_number, crop)
        # Append every table after the text. Previously each table overwrote
        # the one before it, so only the last table of a page was kept.
        page_content = "\n\n".join([text_result, *table_result])

        full_result.append(
            Document(
                page_content=page_content,
                metadata={"Page": page_number, "First Division":level_names[0], "Second Division": level_names[1], "File Name": level_names[2], "File Path": path},
            )
        )
    return full_result   # list of langchain Document objects
### [End] Main Functions with pdfminer.six ###########################################################################################

def add_firstline_in_splitted_text(origin_splitted_text:"Document"):
    """Prefix a chunk with one sentence naming its file and categories.

    Args:
        origin_splitted_text: A ``Document`` whose metadata contains
            "First Division", "Second Division" and "File Name".

    Returns:
        A new ``Document`` whose content is the context sentence, a newline,
        then the original content. The metadata dict is reused unchanged.
    """
    metadata = origin_splitted_text.metadata
    lv1 = metadata["First Division"]
    lv2 = metadata["Second Division"]
    title = metadata["File Name"]
    # The sentence text (including its spelling) is left as-is so new chunks
    # stay comparable with ones already stored.
    first_sentence = f"This page explains {title}, that belongs to catogories of {lv1} and {lv2}."
    # A real newline: the previous "/n" ended up as literal text in the chunk.
    new_page_content = f"{first_sentence}\n{origin_splitted_text.page_content}"
    return Document(page_content=new_page_content, metadata=metadata)

In [3]:
import chromadb
import pandas as pd
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings

def read_vectordb_as_df(db_path:str):
    """Load the documents and metadata of a persistent Chroma DB into a DataFrame.

    Rows from every collection are concatenated. If the DB has no collections
    or no documents, an empty DataFrame with the same columns is returned
    instead of raising. Embeddings are not fetched because they are not part
    of the result.

    Args:
        db_path: Directory of the persistent Chroma database.

    Returns:
        DataFrame with columns ``ids``, ``first_div``, ``second_div``,
        ``filename``, ``documents`` and ``metadatas``. A metadata key that is
        missing yields ``None`` in the derived column.
    """
    columns = ["ids", "first_div", "second_div", "filename", "documents", "metadatas"]
    derived_columns = {
        "first_div": "First Division",
        "second_div": "Second Division",
        "filename": "File Name",
    }

    client = chromadb.PersistentClient(path=db_path)
    frames = []
    for collection in client.list_collections():
        # Some chromadb releases return collection names here instead of
        # Collection objects; resolve a name to its collection.
        if isinstance(collection, str):
            collection = client.get_collection(collection)
        data = collection.get(include=['documents', 'metadatas'])
        frame = pd.DataFrame({"ids":data["ids"],
                              "metadatas":data["metadatas"],
                              "documents":data["documents"]})
        for column, key in derived_columns.items():
            frame[column] = frame["metadatas"].apply(lambda meta, key=key: (meta or {}).get(key))
        if not frame.empty:
            frames.append(frame[columns])

    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)


def delete_document(filename:str, db_path:str):
    """Delete every chunk whose "File Name" metadata equals ``filename``.

    Opens collection "collection_01" of the Chroma DB at ``db_path``, looks up
    the ids of the matching chunks, deletes them and prints a confirmation.
    """
    embeddings = OllamaEmbeddings(model="bge-m3:latest")
    store = Chroma(collection_name="collection_01", persist_directory=db_path, embedding_function=embeddings)
    matching = store.get(where={'File Name':filename})
    store.delete(matching["ids"])
    print("Document is deleted")


In [4]:
lv1_dir = "./Rules"     # top-level parent folder that is scanned for documents
db_path = "./db/chroma_db_03"  # directory of the persistent Chroma database

In [5]:
# Collect the full path of every file under lv1_dir, de-duplicated and sorted.
total_paths = sorted(set(main_filepath_extractor(path=lv1_dir)))
print(total_paths)
print(len(total_paths))

# Sanity check: metadata levels derived from the last path.
res = extract_level_name(path=total_paths[-1])
print(res)

['./Rules/DNV/DNV Rules for Classification of Ships _2016_39_Fishing vessels.pdf', './Rules/DNV/DNV Rules for Classification of Ships _2016_40_Offshore Service Vessels, Tugs and Special Ships.pdf', './Rules/DNV/DNV Rules for Classification of Ships _2016_41_Slop reception and processing facilities.pdf', './Rules/DNV/DNV Rules for Classification of Ships _2016_42_Ships for Carriage of Refrigerated Cargoes.pdf', './Rules/DNV/DNV Rules for Classification of Ships _2016_43_Carriage of Dangerous Goods.pdf']
5
['Rules', 'DNV', 'DNV Rules for Classification of Ships _2016_43_Carriage of Dangerous Goods']


In [8]:
# Ingest every file that is not in the vector DB yet.
# The store, the splitter and the list of known file names are set up once,
# instead of re-opening Chroma twice and re-reading the whole DB for each file.
# The store is opened before the DB is read so the collection already exists.
vector_store = Chroma(collection_name="collection_01", persist_directory=db_path, embedding_function=OllamaEmbeddings(model="bge-m3:latest"))
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
df = read_vectordb_as_df(db_path=db_path)
vectordb_filenames = df["filename"].unique().tolist()

for path in tqdm(total_paths):
    print(">>> 중복 파일 체크")
    target_filename = extract_level_name(path=path)[-1]

    print(len(vectordb_filenames))
    print(target_filename)

    if target_filename in vectordb_filenames:
        print("Already Parsed Document")
        continue

    print("============= MAIN PARSER ============")
    parsed_text = main_parser(path=path, lang="en")
    splitted_texts = text_splitter.split_documents(parsed_text)
    new_splitted_texts = [add_firstline_in_splitted_text(text) for text in splitted_texts]
    print(f"============= Text Splitter - {len(new_splitted_texts)}============")

    print("============= Embedding  ============")
    # Added one chunk at a time so tqdm shows embedding progress.
    for div in tqdm(new_splitted_texts):
        vector_store.add_documents(documents=[div])

    # Keep the in-memory list in step with what was just written to the DB.
    if new_splitted_texts:
        vectordb_filenames.append(target_filename)

    print(f">>> [End]{path}--------------------------------------------")
    print("")

  0%|          | 0/5 [00:00<?, ?it/s]

>>> 중복 파일 체크
2
DNV Rules for Classification of Ships _2016_39_Fishing vessels
Already Parsed Document
>>> 중복 파일 체크
2
DNV Rules for Classification of Ships _2016_40_Offshore Service Vessels, Tugs and Special Ships
Already Parsed Document
>>> 중복 파일 체크
2
DNV Rules for Classification of Ships _2016_41_Slop reception and processing facilities
폴더가 생성되었습니다: d:\AA_develop\parsing\images/DNV Rules for Classification of Ships _2016_41_Slop reception and processing facilities
[2024/10/28 12:15:49] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\jongb/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_

100%|██████████| 11/11 [00:02<00:00,  4.49it/s]






: 

# VectorDB 조회

In [6]:
# Re-open the collection and summarise what is stored in it.
vector_store = Chroma(
    collection_name="collection_01",
    persist_directory=db_path,
    embedding_function=OllamaEmbeddings(model="bge-m3:latest"),
)
df = read_vectordb_as_df(db_path=db_path)
print(vector_store)
print(df.shape)
for column_name in ("first_div", "second_div"):
    print(df[column_name].unique())
print(df["filename"].unique()[-3:])  # the last three distinct file names

<langchain_chroma.vectorstores.Chroma object at 0x00000271BDDA4880>
(99, 6)
['Rules']
['DNV']
['DNV Rules for Classification of Ships _2016_39_Fishing vessels'
 'DNV Rules for Classification of Ships _2016_40_Offshore Service Vessels, Tugs and Special Ships']


In [7]:
# Free-text question used to probe the vector store.
query = """
what is the obligation of the master in a troubled vessel in singapore port?
"""
# Request k=3 results together with their relevance scores.
res = vector_store.similarity_search_with_relevance_scores(query=query, k=3)
res

[(Document(metadata={'File Name': 'DNV Rules for Classification of Ships _2016_39_Fishing vessels', 'File Path': './Rules/DNV/DNV Rules for Classification of Ships _2016_39_Fishing vessels.pdf', 'First Division': 'Rules', 'Page': 9, 'Second Division': 'DNV'}, page_content='This page explains DNV Rules for Classification of Ships _2016_39_Fishing vessels, that belongs to catogories of Rules and DNV./nRules for Ships, July 2016  Pt.5 Ch.6 Sec.1 General requirements – Page 10 DET NORSKE VERITAS ASare not fully met, or if the design of the weather deck is such that water may be trapped. The stability calculations shall take the effect of this water into account according to the re quirements of 703 to 705. 702  If hatches or similar openings have to be left periodically open during oper ation, the stability calculations shall take the effect of water in th e open compartment(s) in to account according to the requirements of 703 to 705, provided that the angle of downflooding for the critic

# 문서 삭제

In [None]:
# Delete one document by file name, then reload the DB to show the new size.
print(df.shape)
filename = 'Port Information Guide_Rotterdam_2024'
try:
    delete_document(filename=filename, db_path=db_path)
except Exception as exc:
    # `except Exception` (not a bare except) so an interrupt still stops the
    # cell. The real error is printed too, so a connection or embedding
    # failure is not mistaken for a missing document.
    print("문서가 없습니다.")
    print(f"({type(exc).__name__}: {exc})")

# The reload is identical whether or not the delete succeeded.
vector_store = Chroma(collection_name="collection_01", persist_directory=db_path, embedding_function=OllamaEmbeddings(model="bge-m3:latest"))
df = read_vectordb_as_df(db_path=db_path)
print(df.shape)