In [1]:
import chromadb
import ollama
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
import pandas as pd

In [2]:
def read_vectordb_as_df(db_path:str):
    client = chromadb.PersistentClient(path=db_path)
    for collection in client.list_collections():
        data = collection.get(include=['embeddings', 'documents', 'metadatas'])
        df = pd.DataFrame({"ids":data["ids"], 
                            "metadatas":data["metadatas"], 
                            "documents":data["documents"]})
        df["first_div"] = df["metadatas"].apply(lambda x: x["First Division"])
        df["second_div"] = df["metadatas"].apply(lambda x: x["Second Division"])
        df["filename"] = df["metadatas"].apply(lambda x: x["File Name"])
        df = df[["ids", "first_div", "second_div","filename","documents", "metadatas"]]
    return df

In [3]:
db_path = "./db/chroma_db_02"
vector_store = Chroma(collection_name="my_collection", persist_directory=db_path, embedding_function=OllamaEmbeddings(model="bge-m3:latest"))
print(vector_store)

<langchain_chroma.vectorstores.Chroma object at 0x000001ED82541250>


In [4]:
df = read_vectordb_as_df(db_path=db_path)
df.head(2)

Unnamed: 0,ids,first_div,second_div,filename,documents,metadatas
0,faace8c4-ab2c-43b4-9b4e-7fc15319bc78,MANUAL,Common,[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보ᄋ...,This page explains [KISA Insight 2023 Vol.03] ...,{'File Name': '[KISA Insight 2023 Vol.03] Chat...
1,1a3d1b93-e5d3-4a96-990d-e4ba6b976e29,MANUAL,Common,[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보ᄋ...,This page explains [KISA Insight 2023 Vol.03] ...,{'File Name': '[KISA Insight 2023 Vol.03] Chat...


In [9]:
# d = "I like a apple"
# response = ollama.embeddings(model="bge-m3:latest", prompt=d)
# response

In [5]:
import numpy as np
from sentence_transformers.util import cos_sim

def get_similarity_search_score_rank(query:str, db_path:str):
    embedded_query = ollama.embeddings(model="bge-m3:latest", prompt=query)
    embedded_query=[np.float64(k) for k in embedded_query['embedding']]

    client = chromadb.PersistentClient(path=db_path)
    for collection in client.list_collections():
        data = collection.get(include=['embeddings'])
    
    results = []
    for d in data['embeddings']:
        similarity = cos_sim(embedded_query, d)
        results.append(similarity)

    results = [r.item() for r in results]

    sorted_indices = np.argsort(results)  # 값에 대한 정렬된 인덱스
    order_values = np.empty_like(sorted_indices)
    order_values[sorted_indices] = np.arange(len(results))

    return order_values


query = """
what is the noon report in iss system?
"""
res1 = get_similarity_search_score_rank(query=query, db_path=db_path)
res1

  from tqdm.autonotebook import tqdm, trange


array([47166, 44995, 53152, ..., 30774, 28023,  5082], dtype=int64)

In [6]:
len(res1)

53790

In [7]:
import re
import numpy as np
from rank_bm25 import BM25Okapi

def bm25_search_rank(query:str, db_path:str):
    client = chromadb.PersistentClient(path=db_path)
    for collection in client.list_collections():
        data = collection.get(include=['documents', 'metadatas'])
    tokenized_corpus = [doc.split(" ") for doc in data["documents"]]
    bm25 = BM25Okapi(tokenized_corpus)

    pattern = r'"(.*?)"'  # 따옴표로 둘러싸인 단어만 검색 대상으로 리스트에 담기
    tokenized_query = re.findall(pattern, query)
    print(tokenized_query)

    doc_scores = bm25.get_scores(tokenized_query)

    sorted_indices = np.argsort(doc_scores)  # 값에 대한 정렬된 인덱스
    order_values = np.empty_like(sorted_indices)
    order_values[sorted_indices] = np.arange(len(doc_scores))


    return order_values

res2 = bm25_search_rank(query='"what" is the "noon" "report" in "iss" system', db_path="./db/chroma_db_02")
res2

['what', 'noon', 'report', 'iss']


array([    0, 35667, 35668, ..., 17868,  8965, 52566], dtype=int64)

In [8]:
len(res2)

53790

In [9]:
def rrf(all_rankings: list[list[int]]):
    """Takes in list of rankings produced by multiple retrieval algorithms,
    and returns newly of ranked and scored items."""
    scores = {} # key is the index and value is the score of that index
    # 1. Take every retrieval algorithm ranking
    for algorithm_ranks in all_rankings:
        # 2. For each ranking, take the index and the ranked position
        for rank, idx in enumerate(algorithm_ranks):
            # 3. Calculate the score and add it to the index
            if idx in scores:
                scores[idx] += 1 / (60 + rank)
            else:
                scores[idx] = 1 / (60 + rank)

    # 4. Sort the indices based on accumulated scores
    sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return sorted_scores

In [10]:
new_ranks = rrf([res1, res2])
new_ranks[:10]

[(35429, 0.01699409210619973),
 (47166, 0.016729158855143107),
 (0, 0.016721766121182067),
 (44995, 0.016431307068236697),
 (35667, 0.0164252958433114),
 (53152, 0.016183638275647654),
 (35668, 0.01615240548169952),
 (35669, 0.015965420234501735),
 (35670, 0.015666296716911006),
 (193, 0.01565292672028597)]

In [29]:
num = 35429
t_df = df.iloc[num:num+1, :]
t_df

Unnamed: 0,ids,first_div,second_div,filename,documents,metadatas
35429,88a29fd6-e3b9-40ec-8985-705e48c774a7,Rules,KR,PART 15_2024_Structural Rules for Membrane Typ...,This page explains PART 15_2024_Structural Rul...,{'File Name': 'PART 15_2024_Structural Rules f...


In [30]:
from pprint import pprint
pprint(t_df["metadatas"].values[0])
pprint(t_df["documents"].values[0])

{'File Name': 'PART 15_2024_Structural Rules for Membrane Type Liquefied '
              'Natural Gas Carriers',
 'File Path': '/content/drive/MyDrive/Rules/KR/PART 15_2024_Structural Rules '
              'for Membrane Type Liquefied Natural Gas Carriers.pdf',
 'First Division': 'Rules',
 'Page': 30,
 'Second Division': 'KR'}
('This page explains PART 15_2024_Structural Rules for Membrane Type Liquefied '
 'Natural Gas Carriers, that belongs to catogories of Rules and KR./nPt15 '
 'Structural Rules for Membrane Type Liquefied Natural Gas Carriers Ch1General '
 'Principles Pt15, Ch1,Sec4 Rules fortheClassification ofSteel Ships 2024 '
 '252.4Scantlings 2.4.1 Unless otherwise specified, symbols regarding '
 'scantlings and their units used in these Rules are those defined in Table 5. '
 'Symbols Meaning Units S Static load case S+D Static + Dynamic load case '
 '\ue00f\ue0e9\ue0fc Total sea pressure, see Ch 4, Sec 5, [1.1] kN/m2 '
 '\ue00f\ue0ed\ue0f2 Total internal pressure due to liqu