In [3]:
import chromadb
import ollama
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
import pandas as pd

In [4]:
def read_vectordb_as_df(db_path:str):
    client = chromadb.PersistentClient(path=db_path)
    for collection in client.list_collections():
        data = collection.get(include=['embeddings', 'documents', 'metadatas'])
        df = pd.DataFrame({"ids":data["ids"], 
                            "metadatas":data["metadatas"], 
                            "documents":data["documents"]})
        df["first_div"] = df["metadatas"].apply(lambda x: x["First Division"])
        df["second_div"] = df["metadatas"].apply(lambda x: x["Second Division"])
        df["filename"] = df["metadatas"].apply(lambda x: x["File Name"])
        df = df[["ids", "first_div", "second_div","filename","documents", "metadatas"]]
    return df

In [5]:
db_path = "./db/chroma_db_02"
vectorstore = Chroma(collection_name="collection_01", persist_directory=db_path, embedding_function=OllamaEmbeddings(base_url="http://localhost:11434", model="bge-m3:latest"))
print(vectorstore)

<langchain_chroma.vectorstores.Chroma object at 0x000002895DBB1370>


In [6]:
df = read_vectordb_as_df(db_path=db_path)
df.head(2)

Unnamed: 0,ids,first_div,second_div,filename,documents,metadatas
0,faace8c4-ab2c-43b4-9b4e-7fc15319bc78,MANUAL,Common,[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보ᄋ...,This page explains [KISA Insight 2023 Vol.03] ...,{'File Name': '[KISA Insight 2023 Vol.03] Chat...
1,1a3d1b93-e5d3-4a96-990d-e4ba6b976e29,MANUAL,Common,[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보ᄋ...,This page explains [KISA Insight 2023 Vol.03] ...,{'File Name': '[KISA Insight 2023 Vol.03] Chat...


In [7]:
# query = """
# according to "iss" manual, what is the "noon report" in iss system?
# """
query = """
what is the obligation of master of troubled vessel in "singapore" "port"
"""

# Semantic Search

In [None]:

def get_semantic_search_docs(query:str, vectorstore, k:int=100, fetch_k:int=200):
    retriever = vectorstore.as_retriever(
        search_type="mmr", 
        search_kwargs={'k': k, "fetch_k":fetch_k}
        )
    result = retriever.invoke(query)
    print(len(result))
    return result

result = get_semantic_search_docs(query=query, vectorstore=vectorstore, k=10, fetch_k=100)
result

10


[Document(metadata={'File Name': 'Maritime and Port Authority of Singapore_2000', 'File Path': '/content/drive/MyDrive/PORT/Port Regulation/Maritime and Port Authority of Singapore_2000.pdf', 'First Division': 'PORT', 'Page': 29, 'Second Division': 'Port Regulation'}, page_content='This page explains Maritime and Port Authority of Singapore_2000, that belongs to catogories of PORT and Port Regulation./nPortMastermayrequirevesseltoleaveport 36.ThePortMaster maydirect avessel toleave theportifheisof theopinion thatitwould notbeintheinterest oftheAuthority forthe vessel toremain inport. Damaged vesselsentering port 37.Noperson maycause orpermit adamaged vessel toenter the portwithout theprior written permission ofthePortMaster whomay grant thewritten permission subject tosuch conditions asthePort Master thinks fit. [S518/2017 wef18/09/2017] PARTVI VESSELS BERTHED ALONGSIDE Unauthorised berthing, etc. 38.Noperson maycause orpermit avessel — (a)toproceed alongside, orliealongside, anyplace 

# Reranking - bge-reranker-v2-m3

In [None]:
from FlagEmbedding import FlagReranker

  from .autonotebook import tqdm as notebook_tqdm


: 

In [None]:
def reranking(query, docs):
    reranked_docs = []
    reranking_model_path = "D:/LLMs/bge-reranker-v2-m3"
    reranker = FlagReranker(model_name_or_path=reranking_model_path, use_fp16=True)
    inputs = [[query, doc.page_content] for doc in docs]
    scores = reranker.compute_score(inputs)
    if not isinstance(scores, list):
        scores = [scores]
    score_index = [(score, idx) for idx, score in enumerate(scores)]
    print(score_index)
    sorted_score_index = sorted(score_index, key=lambda x:x[0], reverse=True)
    sorted_indices = [idx for score, idx in sorted_score_index]
    for i in range(len(sorted_indices)):
        reranked_docs.append(result[sorted_indices[i]])
    return reranked_docs

In [None]:
result = reranking(query=query, docs=result)
result

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


[(2.660515785217285, 0), (2.1382970809936523, 1), (2.1106739044189453, 2), (1.899213433265686, 3), (1.6073378324508667, 4), (1.412809133529663, 5), (1.3553341627120972, 6), (1.3008028268814087, 7), (1.037865400314331, 8), (0.7517440319061279, 9)]


[Document(metadata={'File Name': 'Maritime and Port Authority of Singapore_2000', 'File Path': '/content/drive/MyDrive/PORT/Port Regulation/Maritime and Port Authority of Singapore_2000.pdf', 'First Division': 'PORT', 'Page': 14, 'Second Division': 'Port Regulation'}, page_content='This page explains Maritime and Port Authority of Singapore_2000, that belongs to catogories of PORT and Port Regulation./ndamage thatmayaffecttheseaworthiness ofsuch vessel, bytheearlier ofthefollowing: (i)within 24hours ofthatincident oroccurrence; [S518/2017 wef18/09/2017] (ii)before anyfurther application forportclearance is made; and [S518/2017 wef18/09/2017] (h)where portclearance hasbeen granted, surrender theport clearance tothePortMaster andmake afresh application forportclearance. [S518/2017 wef18/09/2017] (2)Themaster ,andanyother officer,person-in-char geandmember ofthecrew,ofthevessel referred toinparagraph (1)must comply withallinstructions given bythePortMaster . [S518/2017 wef18/09/2017] Emer

# BM25 Search

In [None]:
import re
import numpy as np
from rank_bm25 import BM25Okapi


def get_bm25_top_docs(query:str, documents:list, top_k:int=20):

    tokenized_corpus = [doc.page_content for doc in documents]
    bm25 = BM25Okapi(tokenized_corpus)

    pattern = r'"(.*?)"'  # 따옴표로 둘러싸인 단어만 검색 대상으로 리스트에 담기
    query = re.findall(pattern, query)
    doc_scores = bm25.get_scores(query)
    sorted_indices = np.argsort(doc_scores)  # 값에 대한 정렬된 인덱스
    order_values = np.empty_like(sorted_indices)
    order_values[sorted_indices] = np.arange(len(doc_scores))
    top_index = [i for i, value in enumerate(order_values) if value < top_k]
    top_docs = [i for i in documents if documents.index(i) in top_index ]
    print(len(top_docs))
    print(top_docs)

    return top_docs


result = get_bm25_top_docs(query=query, documents=result, top_k=20)
result

10
[Document(metadata={'File Name': 'Maritime and Port Authority of Singapore_2000', 'File Path': '/content/drive/MyDrive/PORT/Port Regulation/Maritime and Port Authority of Singapore_2000.pdf', 'First Division': 'PORT', 'Page': 14, 'Second Division': 'Port Regulation'}, page_content='This page explains Maritime and Port Authority of Singapore_2000, that belongs to catogories of PORT and Port Regulation./ndamage thatmayaffecttheseaworthiness ofsuch vessel, bytheearlier ofthefollowing: (i)within 24hours ofthatincident oroccurrence; [S518/2017 wef18/09/2017] (ii)before anyfurther application forportclearance is made; and [S518/2017 wef18/09/2017] (h)where portclearance hasbeen granted, surrender theport clearance tothePortMaster andmake afresh application forportclearance. [S518/2017 wef18/09/2017] (2)Themaster ,andanyother officer,person-in-char geandmember ofthecrew,ofthevessel referred toinparagraph (1)must comply withallinstructions given bythePortMaster . [S518/2017 wef18/09/2017] E

[Document(metadata={'File Name': 'Maritime and Port Authority of Singapore_2000', 'File Path': '/content/drive/MyDrive/PORT/Port Regulation/Maritime and Port Authority of Singapore_2000.pdf', 'First Division': 'PORT', 'Page': 14, 'Second Division': 'Port Regulation'}, page_content='This page explains Maritime and Port Authority of Singapore_2000, that belongs to catogories of PORT and Port Regulation./ndamage thatmayaffecttheseaworthiness ofsuch vessel, bytheearlier ofthefollowing: (i)within 24hours ofthatincident oroccurrence; [S518/2017 wef18/09/2017] (ii)before anyfurther application forportclearance is made; and [S518/2017 wef18/09/2017] (h)where portclearance hasbeen granted, surrender theport clearance tothePortMaster andmake afresh application forportclearance. [S518/2017 wef18/09/2017] (2)Themaster ,andanyother officer,person-in-char geandmember ofthecrew,ofthevessel referred toinparagraph (1)must comply withallinstructions given bythePortMaster . [S518/2017 wef18/09/2017] Emer

# Keyword Matching

In [None]:
def get_keywords_matched_docs(query:str, documents:list, and_condition:bool=True):
    pattern = r'"(.*?)"'  # 따옴표로 둘러싸인 단어만 검색 대상으로 리스트에 담기
    extracted_keywords = re.findall(pattern, query)
    lower_keywors = [keyword.lower() for keyword in extracted_keywords]

    lower_docs = [doc.page_content.lower() for doc in documents]
    if and_condition: matching_sentences = [sentence for sentence in lower_docs if all(keyword in sentence for keyword in lower_keywors)]  # 복수 키워드 and 조건
    else: matching_sentences = [sentence for sentence in lower_docs if any(keyword in sentence for keyword in lower_keywors)]  # 복수 키워드 or 조건 (and 조건이 null 이면)

    print(lower_docs)

    matched_index = [lower_docs.index(doc) for doc in matching_sentences]
    print(matched_index)

    final_matched_docs = [documents[i] for i in matched_index]

    return final_matched_docs

result = get_keywords_matched_docs(query=query, documents=result, and_condition=True)
result

['this page explains maritime and port authority of singapore_2000, that belongs to catogories of port and port regulation./ndamage thatmayaffecttheseaworthiness ofsuch vessel, bytheearlier ofthefollowing: (i)within 24hours ofthatincident oroccurrence; [s518/2017 wef18/09/2017] (ii)before anyfurther application forportclearance is made; and [s518/2017 wef18/09/2017] (h)where portclearance hasbeen granted, surrender theport clearance totheportmaster andmake afresh application forportclearance. [s518/2017 wef18/09/2017] (2)themaster ,andanyother officer,person-in-char geandmember ofthecrew,ofthevessel referred toinparagraph (1)must comply withallinstructions given bytheportmaster . [s518/2017 wef18/09/2017] emergency oraccident 6.intheevent ofanemergency onavessel, oranaccident toa vessel, thatisnotprovided forinregulation 5,themaster orpersonin-char geofthevessel must — (a)inform theport master oftheemergency oraccident without delay bythemost direct means andcomply with anyinstruction 

[Document(metadata={'File Name': 'Maritime and Port Authority of Singapore_2000', 'File Path': '/content/drive/MyDrive/PORT/Port Regulation/Maritime and Port Authority of Singapore_2000.pdf', 'First Division': 'PORT', 'Page': 14, 'Second Division': 'Port Regulation'}, page_content='This page explains Maritime and Port Authority of Singapore_2000, that belongs to catogories of PORT and Port Regulation./ndamage thatmayaffecttheseaworthiness ofsuch vessel, bytheearlier ofthefollowing: (i)within 24hours ofthatincident oroccurrence; [S518/2017 wef18/09/2017] (ii)before anyfurther application forportclearance is made; and [S518/2017 wef18/09/2017] (h)where portclearance hasbeen granted, surrender theport clearance tothePortMaster andmake afresh application forportclearance. [S518/2017 wef18/09/2017] (2)Themaster ,andanyother officer,person-in-char geandmember ofthecrew,ofthevessel referred toinparagraph (1)must comply withallinstructions given bythePortMaster . [S518/2017 wef18/09/2017] Emer

# Test

In [None]:
%timeit

# query = 'what is the obligation of master of troubled vessel in "singapore" port'
query = 'according to "iss" manual, what is the "noon report" in iss system?'
# query = 'with reference to "lr" rules, explain the "noise" level of radar rooms.(vectorstore)'
# query ='with reference to "lr" rule, explain the measurement procedure of noise'

pattern = r'"(.*?)"'  # 따옴표로 둘러싸인 단어만 검색 대상으로 리스트에 담기
extracted_keywords = re.findall(pattern, query)
if len(extracted_keywords) > 0:
    documents = get_semantic_search_docs(query=query, vectorstore=vectorstore, k=10, fetch_k=50)
    documents = reranking(query=query, docs=documents)
    documents = get_bm25_top_docs(query=query, documents=documents, top_k=30)
    documents = get_keywords_matched_docs(query=query, documents=documents, and_condition=True)
else: 
    documents = get_semantic_search_docs(query=query, vectorstore=vectorstore, k=3, fetch_k=50)
    documents = reranking(query=query, docs=documents)

documents

10


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


[(1.028217077255249, 0), (1.7737805843353271, 1), (0.8020984530448914, 2), (-7.5000457763671875, 3), (-3.6974735260009766, 4), (-6.289929389953613, 5), (-4.436821937561035, 6), (-8.44660472869873, 7), (-7.1805620193481445, 8), (-4.743778228759766, 9)]
10
[Document(metadata={'File Name': 'Maritime and Port Authority of Singapore_2000', 'File Path': '/content/drive/MyDrive/PORT/Port Regulation/Maritime and Port Authority of Singapore_2000.pdf', 'First Division': 'PORT', 'Page': 24, 'Second Division': 'Port Regulation'}, page_content='This page explains Maritime and Port Authority of Singapore_2000, that belongs to catogories of PORT and Port Regulation./nFailuretocomplywithregulations 25.—(1) Where themaster ofavessel is,inanyemergency affecting thesafety ofanyperson orproperty ,unable tocomply — (a)with anyprovision of,oranydirection given under , regulations 21to24,26and27;or (b)withanycondition imposed under regulation 24(1), themaster must take such steps asmay benecessary toavoid en

[]

In [None]:
from pprint import pprint
pprint(documents[0].page_content)

("This page explains Lloyd's Register Rules and Regulations for the "
 'Classification of Ships, July 2022, that belongs to catogories of Rules and '
 'LR./nlevels should be measured for information only. Definition of long-term '
 'DP mode can be agreed between Owner and Builder. 4.2.5 Prior to survey, a '
 'test programme is to be submitted for approval by LR. This programme is to '
 'contain details of the following: (a) Measurement locations indicated on a '
 "general arrangement of the ship. (b) The ship's loading condition during "
 'survey. (c) The machinery operating condition, including HVAC system, during '
 'survey. (d) Noise and vibration measuring equipment. 4.3 Noise measurements '
 '4.3.1 Noise measurements are to be conducted in accordance with ISO 2923 and '
 'IMO Resolution MSC.337(91) – Adoption of the Code on Noise Levels on Board '
 'Ships – (Adopted on 30 November 2012)The Annex below is consolidated into '
 'Resolution MSC.337(91) . Measurements of noise levels a