In [12]:
import chromadb
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from pprint import pprint
import pandas as pd

In [13]:
def read_vectordb_as_df(db_path:str):
    client = chromadb.PersistentClient(path=db_path)
    for collection in client.list_collections():
        data = collection.get(include=['embeddings', 'documents', 'metadatas'])
        df = pd.DataFrame({"ids":data["ids"], 
                            "metadatas":data["metadatas"], 
                            "documents":data["documents"]})
        df["first_div"] = df["metadatas"].apply(lambda x: x["First Division"])
        df["second_div"] = df["metadatas"].apply(lambda x: x["Second Division"])
        df["filename"] = df["metadatas"].apply(lambda x: x["File Name"])
        df = df[["ids", "first_div", "second_div","filename","documents", "metadatas"]]
    return df

In [23]:
def read_vectordb_as_data(db_path:str):
    client = chromadb.PersistentClient(path=db_path)
    for collection in client.list_collections():
        data = collection.get(include=['embeddings', 'documents', 'metadatas'])
    return data

In [14]:
def delete_document(filename:str, db_path:str):
  vector_store = Chroma(collection_name="collection_01", persist_directory=db_path, embedding_function=OllamaEmbeddings(model="bge-m3:latest"))
  del_ids = vector_store.get(where={'File Name':filename})["ids"]
  vector_store.delete(del_ids)
  print("Document is deleted")

In [32]:
db_path = "./db/chroma_db_02"
vector_store = Chroma(collection_name="my_collection", persist_directory=db_path, embedding_function=OllamaEmbeddings(model="bge-m3:latest"))
print(vector_store)

<langchain_chroma.vectorstores.Chroma object at 0x0000021E4EE7E360>


In [39]:
df = read_vectordb_as_df(db_path=db_path)
df.head()

Unnamed: 0,ids,first_div,second_div,filename,documents,metadatas
0,faace8c4-ab2c-43b4-9b4e-7fc15319bc78,MANUAL,Common,[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보ᄋ...,This page explains [KISA Insight 2023 Vol.03] ...,{'File Name': '[KISA Insight 2023 Vol.03] Chat...
1,1a3d1b93-e5d3-4a96-990d-e4ba6b976e29,MANUAL,Common,[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보ᄋ...,This page explains [KISA Insight 2023 Vol.03] ...,{'File Name': '[KISA Insight 2023 Vol.03] Chat...
2,f67ce384-3df7-4ffe-aa13-9d187d73cb13,MANUAL,Common,[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보ᄋ...,This page explains [KISA Insight 2023 Vol.03] ...,{'File Name': '[KISA Insight 2023 Vol.03] Chat...
3,5f7bdc1f-deb2-4f3d-9afb-2eafbf5d5192,MANUAL,Common,[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보ᄋ...,This page explains [KISA Insight 2023 Vol.03] ...,{'File Name': '[KISA Insight 2023 Vol.03] Chat...
4,5edec5fa-fdde-41e4-a688-b77fb1fb75c3,MANUAL,Common,[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보ᄋ...,This page explains [KISA Insight 2023 Vol.03] ...,{'File Name': '[KISA Insight 2023 Vol.03] Chat...


In [40]:
data = read_vectordb_as_data(db_path=db_path)
print(data['ids'][:5])
print(data['metadatas'][:5])
print(data['documents'][:5])

['faace8c4-ab2c-43b4-9b4e-7fc15319bc78', '1a3d1b93-e5d3-4a96-990d-e4ba6b976e29', 'f67ce384-3df7-4ffe-aa13-9d187d73cb13', '5f7bdc1f-deb2-4f3d-9afb-2eafbf5d5192', '5edec5fa-fdde-41e4-a688-b77fb1fb75c3']
[{'File Name': '[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보안 위협과 시사점', 'File Path': '/content/drive/MyDrive/MANUAL/Common/[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보안 위협과 시사점.pdf', 'First Division': 'MANUAL', 'Page': 0, 'Second Division': 'Common'}, {'File Name': '[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보안 위협과 시사점', 'File Path': '/content/drive/MyDrive/MANUAL/Common/[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보안 위협과 시사점.pdf', 'First Division': 'MANUAL', 'Page': 1, 'Second Division': 'Common'}, {'File Name': '[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보안 위협과 시사점', 'File Path': '/content/drive/MyDrive/MANUAL/Common/[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보안 위협과 시사점.pdf', 'First Division': 'MANUAL', 'Page': 2, 'Second D

In [35]:
df["first_div"].unique(), df["second_div"].unique()

(array(['MANUAL', 'Rules', 'PORT'], dtype=object),
 array(['Common', 'Integrated Smart Ship(ISS)', 'ABS', 'DNV', 'KR',
        'MARPOL', 'SOLAS', 'BV', 'Port Regulation', 'Win GD', 'Cryostar'],
       dtype=object))

In [36]:
len(df["filename"].unique()), df["filename"].unique()

(179,
 array(['[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보안 위협과 시사점',
        'Integrated Smart Ship(ISS1.0)',
        '00-part-1A-ships-jul24_Conditions of Classification',
        '00-part-1D-alternative-jan24_ Alternative Arrangements, Novel Concepts and New Technologies',
        'DNV Rules for Classification of Ships _2016_07_User information and current rule chapters',
        'DNV Rules for Classification of Ships _2016_08_Introduction to Ship Classification',
        'DNV Rules for Classification of Ships _2016_09_Plan Approval Documentation Types – Definitions',
        'DNV Rules for Classification of Ships _2016_11_General regulations',
        'DNV Rules for Classification of Ships _2016_12_Class notations',
        'DNV Rules for Classification of Ships _2016_14_General Requirements for Materials',
        'DNV Rules for Classification of Ships _2016_15_Metallic materials',
        'DNV Rules for Classification of Ships _2016_16_Fabrication and testing of ships

In [52]:
def similarity_search(query:str, db_path:str, k:int=3):
    vector_store = Chroma(collection_name="collection_01", persist_directory=db_path, embedding_function=OllamaEmbeddings(model="bge-m3:latest"))
    results = vector_store.similarity_search_with_relevance_scores(query, k=k)
    return results

query = """
what is the noon report in iss system?
"""
res1 = similarity_search(query=query, db_path=db_path, k=3)
res1

[(Document(metadata={'File Name': 'DNV Rules for Classification of Ships _2016_55_Nautical Safety', 'File Path': '/content/drive/MyDrive/Rules/DNV/DNV Rules for Classification of Ships _2016_55_Nautical Safety.pdf', 'First Division': 'Rules', 'Page': 82, 'Second Division': 'DNV'}, page_content='This page explains DNV Rules for Classification of Ships _2016_55_Nautical Safety, that belongs to catogories of Rules and DNV./nRules for Ships, January 2014  Pt.6 Ch.8 Sec.7 Network based integration of naviga tion systems (ICS) – Page 83 DET N ORSKE V ERITAS AS — time — ENC — radar video. 302  The position, heading and speed information shall b e displayed together with the indication of its sou rce. Guidance note: Sensor data, e.g. GYR 1, GYR 2, GPS 1, GPS 2, EM log, Dop pler log, GPS, radar 1, radar 2 etc.; -or result of calculation or manual input; -unit if ambiguous, e.g. UTC for time. ---e-n-d---of---G-u-i-d-a-n-c-e---n-o-t-e-- 303  Display of sensor output data The ICS shall be capable 

In [53]:
import re
import numpy as np
from rank_bm25 import BM25Okapi
from langchain_core.documents import Document

def bm25_search(query:str, db_path:str):
    client = chromadb.PersistentClient(path=db_path)
    for collection in client.list_collections():
        data = collection.get(include=['documents', 'metadatas'])
    tokenized_corpus = [doc.split(" ") for doc in data["documents"]]
    bm25 = BM25Okapi(tokenized_corpus)

    pattern = r'"(.*?)"'  # 따옴표로 둘러싸인 단어만 검색 대상으로 리스트에 담기
    tokenized_query = re.findall(pattern, query)
    print(tokenized_query)

    doc_scores = bm25.get_scores(tokenized_query)
    top_indices = np.argsort(doc_scores)[-3:]   # 유사도 스코어 상위 3개 추리기
    top_indices = top_indices[::-1]   # 유사도 내림차순 정렬

    results = []  # 결과를 langchain Document 형태로 변경 (Similarity Search와 같은 형태로 만들어서 리턴)
    for idx in top_indices:
        document = (Document(metadata=data["metadatas"][idx], page_content=data["documents"][idx]), doc_scores[idx])
        results.append(document)
    
    return results

res2 = bm25_search(query='"what" is the "noon report" in "iss" system', db_path="./db/chroma_db_02")
res2

['what', 'noon report', 'iss']


[(Document(metadata={'File Name': '00-part-1D-alternative-jan24_ Alternative Arrangements, Novel Concepts and New Technologies', 'File Path': '/content/drive/MyDrive/Rules/ABS/00-part-1D-alternative-jan24_ Alternative Arrangements, Novel Concepts and New Technologies.pdf', 'First Division': 'Rules', 'Page': 120, 'Second Division': 'ABS'}, page_content='This page explains 00-part-1D-alternative-jan24_ Alternative Arrangements, Novel Concepts and New Technologies, that belongs to catogories of Rules and ABS./n|Qualification Stage|Item #|Question|Yes/No/NA|Evidence to support?|\n|-------------------|------|--------|---------|--------------------|\n|Feasibility Stage|1|Has what is specifically new and/or unique about the concept been clearly identified?|||\n|None|2|Has what specifically needs qualification been defined?|||\n|None|3|Have potential applications been identified?|||\n|None|4|Have goals, functional requirements, and fundamental objectives (e.g., RAM) for the identified applicat

In [63]:
semantic_top_5_matches_idx = [i[1] for i in res1]
print(semantic_top_5_matches_idx)
keyword_top_5_matches_idx = [i[1] for i in res2]
print(keyword_top_5_matches_idx)

[0.2513917228469573, 0.23287534789066622, 0.21835752874630066]
[9.76202767272458, 9.161182548835646, 9.006662281547236]


In [68]:
def rrf(all_rankings: list[list[int]]):
    """Takes in list of rankings produced by multiple retrieval algorithms,
    and returns newly of ranked and scored items."""
    scores = {} # key is the index and value is the score of that index
    # 1. Take every retrieval algorithm ranking
    for algorithm_ranks in all_rankings:
        print(algorithm_ranks)
        # 2. For each ranking, take the index and the ranked position
        for rank, idx in enumerate(algorithm_ranks):
            print(rank, idx)
            # 3. Calculate the score and add it to the index
            if idx in scores:
                scores[idx] += 1 / (60 + rank)
            else:
                scores[idx] = 1 / (60 + rank)

    # 4. Sort the indices based on accumulated scores
    sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return sorted_scores

In [69]:
new_ranks = rrf([semantic_top_5_matches_idx, keyword_top_5_matches_idx])
new_ranks

[0.2513917228469573, 0.23287534789066622, 0.21835752874630066]
0 0.2513917228469573
1 0.23287534789066622
2 0.21835752874630066
[9.76202767272458, 9.161182548835646, 9.006662281547236]
0 9.76202767272458
1 9.161182548835646
2 9.006662281547236


[(0.2513917228469573, 0.016666666666666666),
 (9.76202767272458, 0.016666666666666666),
 (0.23287534789066622, 0.01639344262295082),
 (9.161182548835646, 0.01639344262295082),
 (0.21835752874630066, 0.016129032258064516),
 (9.006662281547236, 0.016129032258064516)]