In [2]:
import chromadb
import ollama
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
import pandas as pd

In [3]:
def read_vectordb_as_df(db_path:str):
    client = chromadb.PersistentClient(path=db_path)
    for collection in client.list_collections():
        data = collection.get(include=['embeddings', 'documents', 'metadatas'])
        df = pd.DataFrame({"ids":data["ids"], 
                            "metadatas":data["metadatas"], 
                            "documents":data["documents"]})
        df["first_div"] = df["metadatas"].apply(lambda x: x["First Division"])
        df["second_div"] = df["metadatas"].apply(lambda x: x["Second Division"])
        df["filename"] = df["metadatas"].apply(lambda x: x["File Name"])
        df = df[["ids", "first_div", "second_div","filename","documents", "metadatas"]]
    return df

In [104]:
db_path = "./db/chroma_db_02"
vectorstore = Chroma(collection_name="collection_01", persist_directory=db_path, embedding_function=OllamaEmbeddings(model="bge-m3:latest"))
print(vectorstore)

<langchain_chroma.vectorstores.Chroma object at 0x000001B47AE165A0>


In [None]:
df = read_vectordb_as_df(db_path=db_path)
df.head(2)

In [84]:
# query = """
# according to "iss" manual, what is the "noon report" in iss system?
# """
query = """
what is the obligation of master of troubled vessel in "singapore" "port"
"""

# Semantic Search

In [152]:

def get_semantic_search_docs(query:str, vectorstore, k:int=100, fetch_k:int=200):
    retriever = vectorstore.as_retriever(
        search_type="mmr", 
        search_kwargs={'k': k, "fetch_k":fetch_k}
        )
    result = retriever.invoke(query)
    print(len(result))
    return result

result = get_semantic_search_docs(query=query, vectorstore=vectorstore, k=100, fetch_k=200)
result

100


[Document(metadata={'File Name': "Lloyd's Register Rules and Regulations for the Classification of Ships, July 2022", 'File Path': "/content/drive/MyDrive/Rules/LR/Lloyd's Register Rules and Regulations for the Classification of Ships, July 2022.pdf", 'First Division': 'Rules', 'Page': 1759, 'Second Division': 'LR'}, page_content="This page explains Lloyd's Register Rules and Regulations for the Classification of Ships, July 2022, that belongs to catogories of Rules and LR./nTable 12.2.1 Passenger ships Maximum noise levels in dB(A) may be considered where agreed between the Owner and Builder at specification/contract stage. Not more than 20 per cent of the passenger cabins, 30 per cent of the public spaces and 20 per cent of the crew cabins should exceed the relevant noise criteria by more than 3 dB(A). 2.2.4 Acoustic insulation of bulkheads and decks between passenger spaces is to be generally in accordance with the values of the weighted apparent sound reduction index R'w as given i

# BM25 Search

In [None]:
import re
import numpy as np
from rank_bm25 import BM25Okapi


def get_bm25_top_docs(query:str, documents:list, top_k:int=20):

    tokenized_corpus = [doc.page_content for doc in documents]
    bm25 = BM25Okapi(tokenized_corpus)

    pattern = r'"(.*?)"'  # 따옴표로 둘러싸인 단어만 검색 대상으로 리스트에 담기
    query = re.findall(pattern, query)
    doc_scores = bm25.get_scores(query)
    sorted_indices = np.argsort(doc_scores)  # 값에 대한 정렬된 인덱스
    order_values = np.empty_like(sorted_indices)
    order_values[sorted_indices] = np.arange(len(doc_scores))
    top_index = [i for i, value in enumerate(order_values) if value < top_k]
    top_docs = [i for i in documents if documents.index(i) in top_index ]
    print(len(top_docs))
    print(top_docs)

    return top_docs


result = get_bm25_top_docs(query=query, documents=result, top_k=20)
result

20
[Document(metadata={'File Name': "Lloyd's Register Rules and Regulations for the Classification of Ships, July 2022", 'File Path': "/content/drive/MyDrive/Rules/LR/Lloyd's Register Rules and Regulations for the Classification of Ships, July 2022.pdf", 'First Division': 'Rules', 'Page': 1759, 'Second Division': 'LR'}, page_content="This page explains Lloyd's Register Rules and Regulations for the Classification of Ships, July 2022, that belongs to catogories of Rules and LR./nTable 12.2.1 Passenger ships Maximum noise levels in dB(A) may be considered where agreed between the Owner and Builder at specification/contract stage. Not more than 20 per cent of the passenger cabins, 30 per cent of the public spaces and 20 per cent of the crew cabins should exceed the relevant noise criteria by more than 3 dB(A). 2.2.4 Acoustic insulation of bulkheads and decks between passenger spaces is to be generally in accordance with the values of the weighted apparent sound reduction index R'w as give

[Document(metadata={'File Name': "Lloyd's Register Rules and Regulations for the Classification of Ships, July 2022", 'File Path': "/content/drive/MyDrive/Rules/LR/Lloyd's Register Rules and Regulations for the Classification of Ships, July 2022.pdf", 'First Division': 'Rules', 'Page': 1759, 'Second Division': 'LR'}, page_content="This page explains Lloyd's Register Rules and Regulations for the Classification of Ships, July 2022, that belongs to catogories of Rules and LR./nTable 12.2.1 Passenger ships Maximum noise levels in dB(A) may be considered where agreed between the Owner and Builder at specification/contract stage. Not more than 20 per cent of the passenger cabins, 30 per cent of the public spaces and 20 per cent of the crew cabins should exceed the relevant noise criteria by more than 3 dB(A). 2.2.4 Acoustic insulation of bulkheads and decks between passenger spaces is to be generally in accordance with the values of the weighted apparent sound reduction index R'w as given i

# Keyword Matching

In [None]:
def get_keywords_matched_docs(query:str, documents:list, and_condition:bool=True):
    pattern = r'"(.*?)"'  # 따옴표로 둘러싸인 단어만 검색 대상으로 리스트에 담기
    extracted_keywords = re.findall(pattern, query)
    lower_keywors = [keyword.lower() for keyword in extracted_keywords]

    lower_docs = [doc.page_content.lower() for doc in documents]
    if and_condition: matching_sentences = [sentence for sentence in lower_docs if all(keyword in sentence for keyword in lower_keywors)]
    else: matching_sentences = [sentence for sentence in lower_docs if any(keyword in sentence for keyword in lower_keywors)]

    print(lower_docs)

    matched_index = [lower_docs.index(doc) for doc in matching_sentences]
    print(matched_index)

    final_matched_docs = [documents[i] for i in matched_index]

    return final_matched_docs

result = get_keywords_matched_docs(query=query, documents=result, and_condition=True)
result

["this page explains lloyd's register rules and regulations for the classification of ships, july 2022, that belongs to catogories of rules and lr./ntable 12.2.1 passenger ships maximum noise levels in db(a) may be considered where agreed between the owner and builder at specification/contract stage. not more than 20 per cent of the passenger cabins, 30 per cent of the public spaces and 20 per cent of the crew cabins should exceed the relevant noise criteria by more than 3 db(a). 2.2.4 acoustic insulation of bulkheads and decks between passenger spaces is to be generally in accordance with the values of the weighted apparent sound reduction index r'w as given in table 12.2.2 minimum apparent airborne sound insulation indices, r'w , calculated using iso 717/1. see also pt 7, ch 12, 2.2 passenger accommodation and public spaces 2.2.6 .rules and regulations for the classification of ships, july 2022 passenger and crew accommodation comfort part 7, chapter 12 section 2 1758 lloyd 's regist

[Document(metadata={'File Name': "Lloyd's Register Rules and Regulations for the Classification of Ships, July 2022", 'File Path': "/content/drive/MyDrive/Rules/LR/Lloyd's Register Rules and Regulations for the Classification of Ships, July 2022.pdf", 'First Division': 'Rules', 'Page': 1759, 'Second Division': 'LR'}, page_content="This page explains Lloyd's Register Rules and Regulations for the Classification of Ships, July 2022, that belongs to catogories of Rules and LR./nTable 12.2.1 Passenger ships Maximum noise levels in dB(A) may be considered where agreed between the Owner and Builder at specification/contract stage. Not more than 20 per cent of the passenger cabins, 30 per cent of the public spaces and 20 per cent of the crew cabins should exceed the relevant noise criteria by more than 3 dB(A). 2.2.4 Acoustic insulation of bulkheads and decks between passenger spaces is to be generally in accordance with the values of the weighted apparent sound reduction index R'w as given i

# Test

In [173]:
%timeit

# query = 'what is the obligation of master of troubled vessel in "singapore" port'
# query = 'according to "iss" manual, what is the "noon report" in iss system?'
# query = 'with reference to "lr" rules, explain the "noise" level of radar rooms.(vectorstore)'
query ='with reference to "lr" rule, explain the measurement procedure of noise'

pattern = r'"(.*?)"'  # 따옴표로 둘러싸인 단어만 검색 대상으로 리스트에 담기
extracted_keywords = re.findall(pattern, query)
if len(extracted_keywords) > 0:
    documents = get_semantic_search_docs(query=query, vectorstore=vectorstore, k=100, fetch_k=500)
    documents = get_bm25_top_docs(query=query, documents=documents, top_k=30)
    documents = get_keywords_matched_docs(query=query, documents=documents, and_condition=True)
else: documents = get_semantic_search_docs(query=query, vectorstore=vectorstore, k=3, fetch_k=500)

documents

100
30
[Document(metadata={'File Name': "Lloyd's Register Rules and Regulations for the Classification of Ships, July 2022", 'File Path': "/content/drive/MyDrive/Rules/LR/Lloyd's Register Rules and Regulations for the Classification of Ships, July 2022.pdf", 'First Division': 'Rules', 'Page': 1766, 'Second Division': 'LR'}, page_content="This page explains Lloyd's Register Rules and Regulations for the Classification of Ships, July 2022, that belongs to catogories of Rules and LR./nlevels should be measured for information only. Definition of long-term DP mode can be agreed between Owner and Builder. 4.2.5 Prior to survey, a test programme is to be submitted for approval by LR. This programme is to contain details of the following: (a) Measurement locations indicated on a general arrangement of the ship. (b) The ship's loading condition during survey. (c) The machinery operating condition, including HVAC system, during survey. (d) Noise and vibration measuring equipment. 4.3 Noise meas

[Document(metadata={'File Name': "Lloyd's Register Rules and Regulations for the Classification of Ships, July 2022", 'File Path': "/content/drive/MyDrive/Rules/LR/Lloyd's Register Rules and Regulations for the Classification of Ships, July 2022.pdf", 'First Division': 'Rules', 'Page': 1766, 'Second Division': 'LR'}, page_content="This page explains Lloyd's Register Rules and Regulations for the Classification of Ships, July 2022, that belongs to catogories of Rules and LR./nlevels should be measured for information only. Definition of long-term DP mode can be agreed between Owner and Builder. 4.2.5 Prior to survey, a test programme is to be submitted for approval by LR. This programme is to contain details of the following: (a) Measurement locations indicated on a general arrangement of the ship. (b) The ship's loading condition during survey. (c) The machinery operating condition, including HVAC system, during survey. (d) Noise and vibration measuring equipment. 4.3 Noise measurement

In [174]:
from pprint import pprint
pprint(documents[0].page_content)

("This page explains Lloyd's Register Rules and Regulations for the "
 'Classification of Ships, July 2022, that belongs to catogories of Rules and '
 'LR./nlevels should be measured for information only. Definition of long-term '
 'DP mode can be agreed between Owner and Builder. 4.2.5 Prior to survey, a '
 'test programme is to be submitted for approval by LR. This programme is to '
 'contain details of the following: (a) Measurement locations indicated on a '
 "general arrangement of the ship. (b) The ship's loading condition during "
 'survey. (c) The machinery operating condition, including HVAC system, during '
 'survey. (d) Noise and vibration measuring equipment. 4.3 Noise measurements '
 '4.3.1 Noise measurements are to be conducted in accordance with ISO 2923 and '
 'IMO Resolution MSC.337(91) – Adoption of the Code on Noise Levels on Board '
 'Ships – (Adopted on 30 November 2012)The Annex below is consolidated into '
 'Resolution MSC.337(91) . Measurements of noise levels a