In [1]:
import chromadb
import ollama
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
import pandas as pd

In [2]:
def read_vectordb_as_df(db_path:str):
    client = chromadb.PersistentClient(path=db_path)
    for collection in client.list_collections():
        data = collection.get(include=['embeddings', 'documents', 'metadatas'])
        df = pd.DataFrame({"ids":data["ids"], 
                            "metadatas":data["metadatas"], 
                            "documents":data["documents"]})
        df["first_div"] = df["metadatas"].apply(lambda x: x["First Division"])
        df["second_div"] = df["metadatas"].apply(lambda x: x["Second Division"])
        df["filename"] = df["metadatas"].apply(lambda x: x["File Name"])
        df = df[["ids", "first_div", "second_div","filename","documents", "metadatas"]]
    return df

In [3]:
db_path = "./db/chroma_db_02"
vector_store = Chroma(collection_name="collection_01", persist_directory=db_path, embedding_function=OllamaEmbeddings(model="bge-m3:latest"))
print(vector_store)

<langchain_chroma.vectorstores.Chroma object at 0x000001F5E67E9250>


In [18]:
query = """
according to "iss" "manual", what is the "noon report" in "iss system"?
"""
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={'k': 3, "fetch_k":5})
result = retriever.invoke(query)
result

[Document(metadata={'File Name': 'DNV Rules for Classification of Ships _2016_55_Nautical Safety', 'File Path': '/content/drive/MyDrive/Rules/DNV/DNV Rules for Classification of Ships _2016_55_Nautical Safety.pdf', 'First Division': 'Rules', 'Page': 82, 'Second Division': 'DNV'}, page_content='This page explains DNV Rules for Classification of Ships _2016_55_Nautical Safety, that belongs to catogories of Rules and DNV./nRules for Ships, January 2014  Pt.6 Ch.8 Sec.7 Network based integration of naviga tion systems (ICS) – Page 83 DET N ORSKE V ERITAS AS — time — ENC — radar video. 302  The position, heading and speed information shall b e displayed together with the indication of its sou rce. Guidance note: Sensor data, e.g. GYR 1, GYR 2, GPS 1, GPS 2, EM log, Dop pler log, GPS, radar 1, radar 2 etc.; -or result of calculation or manual input; -unit if ambiguous, e.g. UTC for time. ---e-n-d---of---G-u-i-d-a-n-c-e---n-o-t-e-- 303  Display of sensor output data The ICS shall be capable o

In [19]:
df = read_vectordb_as_df(db_path=db_path)
df.head(2)

Unnamed: 0,ids,first_div,second_div,filename,documents,metadatas
0,faace8c4-ab2c-43b4-9b4e-7fc15319bc78,MANUAL,Common,[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보ᄋ...,This page explains [KISA Insight 2023 Vol.03] ...,{'File Name': '[KISA Insight 2023 Vol.03] Chat...
1,1a3d1b93-e5d3-4a96-990d-e4ba6b976e29,MANUAL,Common,[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보ᄋ...,This page explains [KISA Insight 2023 Vol.03] ...,{'File Name': '[KISA Insight 2023 Vol.03] Chat...


In [5]:
# d = "I like a apple"
# response = ollama.embeddings(model="bge-m3:latest", prompt=d)
# response

In [20]:
import numpy as np
from sentence_transformers.util import cos_sim

def get_similarity_search_score_rank(query:str, db_path:str):
    embedded_query = ollama.embeddings(model="bge-m3:latest", prompt=query)
    embedded_query=[np.float64(k) for k in embedded_query['embedding']]

    client = chromadb.PersistentClient(path=db_path)
    for collection in client.list_collections():
        data = collection.get(include=['embeddings'])
    
    results = []
    for d in data['embeddings']:
        similarity = cos_sim(embedded_query, d)
        results.append(similarity)

    results = [r.item() for r in results]

    sorted_indices = np.argsort(results)  # 값에 대한 정렬된 인덱스
    order_values = np.empty_like(sorted_indices)
    order_values[sorted_indices] = np.arange(len(results))

    return order_values

res1 = get_similarity_search_score_rank(query=query, db_path=db_path)
res1

array([64942, 64738, 68280, ..., 64225, 18917, 32203], dtype=int64)

In [22]:
len(res1)

68848

In [23]:
import re
import numpy as np
from rank_bm25 import BM25Okapi

def bm25_search_rank(query:str, db_path:str):
    client = chromadb.PersistentClient(path=db_path)
    for collection in client.list_collections():
        data = collection.get(include=['documents', 'metadatas'])
    tokenized_corpus = [doc.split(" ") for doc in data["documents"]]
    bm25 = BM25Okapi(tokenized_corpus)

    pattern = r'"(.*?)"'  # 따옴표로 둘러싸인 단어만 검색 대상으로 리스트에 담기
    tokenized_query = re.findall(pattern, query)
    print(tokenized_query)

    doc_scores = bm25.get_scores(tokenized_query)

    sorted_indices = np.argsort(doc_scores)  # 값에 대한 정렬된 인덱스
    order_values = np.empty_like(sorted_indices)
    order_values[sorted_indices] = np.arange(len(doc_scores))


    return order_values

res2 = bm25_search_rank(query=query, db_path=db_path)
res2

['iss', 'manual', 'noon report', 'iss system']


array([    0, 45454, 45455, ..., 22770, 11642, 66965], dtype=int64)

In [24]:
len(res2)

68848

In [25]:
def rrf(all_rankings: list[list[int]]):
    """Takes in list of rankings produced by multiple retrieval algorithms,
    and returns newly of ranked and scored items."""
    scores = {} # key is the index and value is the score of that index
    # 1. Take every retrieval algorithm ranking
    for algorithm_ranks in all_rankings:
        # 2. For each ranking, take the index and the ranked position
        for rank, idx in enumerate(algorithm_ranks):
            # 3. Calculate the score and add it to the index
            if idx in scores:
                scores[idx] += 1 / (60 + rank)
            else:
                scores[idx] = 1 / (60 + rank)

    # 4. Sort the indices based on accumulated scores
    sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return sorted_scores

In [26]:
new_ranks = rrf([res1, res2])
new_ranks[:10]

[(45458, 0.01677157793662648),
 (0, 0.016721766121182067),
 (64942, 0.016708704669020795),
 (45454, 0.016532312224395065),
 (64738, 0.016435364313233374),
 (68280, 0.016152219050867334),
 (45455, 0.016144346897328188),
 (45456, 0.015909929863418236),
 (31432, 0.015889654254434126),
 (45462, 0.015684648736418443)]

In [31]:
num = 64942
t_df = df.iloc[num:num+1, :]
t_df
from pprint import pprint
pprint(t_df["metadatas"].values[0])
pprint(t_df["documents"].values[0])

{'File Name': "Lloyd's Register Rules and Regulations for the Classification "
              'of Ships, July 2022',
 'File Path': "/content/drive/MyDrive/Rules/LR/Lloyd's Register Rules and "
              'Regulations for the Classification of Ships, July 2022.pdf',
 'First Division': 'Rules',
 'Page': 930,
 'Second Division': 'LR'}
("This page explains Lloyd's Register Rules and Regulations for the "
 'Classification of Ships, July 2022, that belongs to catogories of Rules and '
 'LR./nD1/4D1/4 D1/2D1/2 9 8765431 CL21Figure 10.6.3 Hull envelope plating '
 'Itemisation of parts n Section 7 Construction details and minimum thickness '
 '7.1 Symbols 7.1.1 The symbols used in this Section are defined in Pt 4, Ch '
 '9, 10.1 Symbols . 7.2 Compartment minimum thickness 7.2.1 The requirements '
 'of Pt 4, Ch 9, 10.2 Compartment minimum thickness are also applicable to '
 'small conventional single hull tankers. 7.3 Geometric properties and '
 'proportions of members 7.3.1 The depth of the w