In [1]:
import chromadb
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from pprint import pprint
import pandas as pd

In [2]:
def read_vectordb_as_df(db_path:str):
    """Load the records of every Chroma collection under ``db_path`` into one DataFrame.

    Args:
        db_path: Directory of the persisted Chroma database.

    Returns:
        A DataFrame with the columns ``ids``, ``first_div``, ``second_div``,
        ``filename``, ``documents`` and ``metadatas`` (in that order). The three
        derived columns are read from the metadata keys ``"First Division"``,
        ``"Second Division"`` and ``"File Name"``; a record whose metadata is
        missing or lacks a key gets ``None`` in that column. Rows of all
        collections are stacked under a fresh 0..n-1 index. When the store holds
        no records, an empty DataFrame with the same columns is returned.
    """
    columns = ["ids", "first_div", "second_div", "filename", "documents", "metadatas"]
    derived = {"first_div": "First Division",
               "second_div": "Second Division",
               "filename": "File Name"}

    def _meta_value(meta, key):
        # Metadata that is not a dict (e.g. None) simply has no value for the key.
        return meta.get(key) if isinstance(meta, dict) else None

    client = chromadb.PersistentClient(path=db_path)
    frames = []
    for collection in client.list_collections():
        # NOTE(review): some chromadb releases appear to return collection names
        # here instead of Collection objects; resolve names to be safe.
        if isinstance(collection, str):
            collection = client.get_collection(collection)
        # Embeddings are not part of the table, so they are not requested.
        data = collection.get(include=['documents', 'metadatas'])
        frame = pd.DataFrame({"ids": data["ids"],
                              "metadatas": data["metadatas"],
                              "documents": data["documents"]})
        if frame.empty:
            continue
        for column, meta_key in derived.items():
            frame[column] = frame["metadatas"].apply(lambda meta, key=meta_key: _meta_value(meta, key))
        # Collect each collection instead of overwriting the previous one.
        frames.append(frame[columns])

    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)

In [3]:
def read_vectordb_as_data(db_path:str):
    """Return the raw records stored in the Chroma database at ``db_path``.

    Args:
        db_path: Directory of the persisted Chroma database.

    Returns:
        A dict-like result with at least the keys ``ids``, ``embeddings``,
        ``metadatas`` and ``documents``.

        * exactly one collection: that collection's ``get`` result, unchanged;
        * several collections: a plain dict whose four lists hold the records
          of all collections, in listing order;
        * no collection: a plain dict with four empty lists.
    """
    included = ['embeddings', 'documents', 'metadatas']
    client = chromadb.PersistentClient(path=db_path)

    results = []
    for collection in client.list_collections():
        # NOTE(review): some chromadb releases appear to return collection names
        # here instead of Collection objects; resolve names to be safe.
        if isinstance(collection, str):
            collection = client.get_collection(collection)
        results.append(collection.get(include=included))

    # The common single-collection case keeps the library's own result object.
    if len(results) == 1:
        return results[0]

    merged = {"ids": [], "embeddings": [], "metadatas": [], "documents": []}
    for result in results:
        for key, bucket in merged.items():
            values = result.get(key)
            # `is not None` (rather than truthiness) so array-like values are safe.
            if values is not None:
                bucket.extend(values)
    return merged

In [4]:
def delete_document(filename:str, db_path:str, collection_name:str="collection_01", embedding_model:str="bge-m3:latest"):
    """Delete every stored chunk whose ``File Name`` metadata equals ``filename``.

    Args:
        filename: Value of the ``File Name`` metadata field to match.
        db_path: Directory of the persisted Chroma database.
        collection_name: Collection to delete from (defaults to the collection
            used throughout this notebook).
        embedding_model: Ollama model name passed to ``OllamaEmbeddings``.

    Returns:
        None. Prints "Document is deleted" after a deletion, or a notice that
        nothing matched when no chunk carries the given file name.
    """
    vector_store = Chroma(collection_name=collection_name,
                          persist_directory=db_path,
                          embedding_function=OllamaEmbeddings(model=embedding_model))
    del_ids = vector_store.get(where={'File Name':filename})["ids"]
    # Without this guard an unknown file name would still report success and
    # pass an empty id list to the store.
    if not del_ids:
        print(f"No document named {filename!r} was found; nothing deleted")
        return
    vector_store.delete(del_ids)
    print("Document is deleted")

In [6]:
# Directory of the persisted Chroma database inspected by the cells below.
db_path = "./db/chroma_db_02"

# Open "collection_01" through the LangChain wrapper, using the bge-m3 Ollama
# embedding model, and print the handle to confirm it was created.
vector_store = Chroma(
    collection_name="collection_01",
    persist_directory=db_path,
    embedding_function=OllamaEmbeddings(model="bge-m3:latest"),
)
print(vector_store)

<langchain_chroma.vectorstores.Chroma object at 0x0000018C66F07410>


In [7]:
# Tabular view of the vector store; preview the first five rows.
df = read_vectordb_as_df(db_path)
df.head(5)

Unnamed: 0,ids,first_div,second_div,filename,documents,metadatas
0,faace8c4-ab2c-43b4-9b4e-7fc15319bc78,MANUAL,Common,[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보ᄋ...,This page explains [KISA Insight 2023 Vol.03] ...,{'File Name': '[KISA Insight 2023 Vol.03] Chat...
1,1a3d1b93-e5d3-4a96-990d-e4ba6b976e29,MANUAL,Common,[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보ᄋ...,This page explains [KISA Insight 2023 Vol.03] ...,{'File Name': '[KISA Insight 2023 Vol.03] Chat...
2,f67ce384-3df7-4ffe-aa13-9d187d73cb13,MANUAL,Common,[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보ᄋ...,This page explains [KISA Insight 2023 Vol.03] ...,{'File Name': '[KISA Insight 2023 Vol.03] Chat...
3,5f7bdc1f-deb2-4f3d-9afb-2eafbf5d5192,MANUAL,Common,[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보ᄋ...,This page explains [KISA Insight 2023 Vol.03] ...,{'File Name': '[KISA Insight 2023 Vol.03] Chat...
4,5edec5fa-fdde-41e4-a688-b77fb1fb75c3,MANUAL,Common,[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보ᄋ...,This page explains [KISA Insight 2023 Vol.03] ...,{'File Name': '[KISA Insight 2023 Vol.03] Chat...


In [8]:
# Raw view of the same store: show the first five ids, metadata dicts and
# page texts, one list per output line.
data = read_vectordb_as_data(db_path)
print(data['ids'][:5], data['metadatas'][:5], data['documents'][:5], sep="\n")

['faace8c4-ab2c-43b4-9b4e-7fc15319bc78', '1a3d1b93-e5d3-4a96-990d-e4ba6b976e29', 'f67ce384-3df7-4ffe-aa13-9d187d73cb13', '5f7bdc1f-deb2-4f3d-9afb-2eafbf5d5192', '5edec5fa-fdde-41e4-a688-b77fb1fb75c3']
[{'File Name': '[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보안 위협과 시사점', 'File Path': '/content/drive/MyDrive/MANUAL/Common/[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보안 위협과 시사점.pdf', 'First Division': 'MANUAL', 'Page': 0, 'Second Division': 'Common'}, {'File Name': '[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보안 위협과 시사점', 'File Path': '/content/drive/MyDrive/MANUAL/Common/[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보안 위협과 시사점.pdf', 'First Division': 'MANUAL', 'Page': 1, 'Second Division': 'Common'}, {'File Name': '[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보안 위협과 시사점', 'File Path': '/content/drive/MyDrive/MANUAL/Common/[KISA Insight 2023 Vol.03] ChatGPT(챗GPT) 보안 위협과 시사점.pdf', 'First Division': 'MANUAL', 'Page': 2, 'Second D

In [9]:
# Distinct values of both category levels, as a (first_div, second_div) pair.
tuple(df[column].unique() for column in ("first_div", "second_div"))

(array(['MANUAL', 'Rules', 'PORT'], dtype=object),
 array(['Common', 'Integrated Smart Ship(ISS)', 'ABS', 'DNV', 'KR',
        'MARPOL', 'SOLAS', 'BV', 'Port Regulation', 'Win GD', 'Cryostar',
        'LR'], dtype=object))

In [11]:
# Number of distinct source files, and the files filed under "Port Regulation".
(
    len(df["filename"].unique()),
    df.loc[df["second_div"] == "Port Regulation", "filename"].unique(),
)

(191,
 array(['Act on Port Regulations_Japan_1948',
        'Maritime and Port Authority of Singapore_2000',
        'Port Information Guide_Rotterdam_2024',
        'Port Regulations_EU_2017', 'Harbor Act_Republic of Korea_2017'],
       dtype=object))

In [12]:
# (rows, columns) of the chunks that belong to the Harbor Act document.
df.loc[df["filename"].eq("Harbor Act_Republic of Korea_2017")].shape

(72, 6)

In [15]:
# Print the text of the last five stored chunks of the Harbor Act document.
for page_text in df.loc[df["filename"] == "Harbor Act_Republic of Korea_2017", "documents"].tail(5):
    print(page_text)

This page explains Harbor Act_Republic of Korea_2017, that belongs to catogories of PORT and Port Regulation./n24.10.19. 2 12:40 Statutes of the Republic of Korea. and utilizing the harbor hinterland complex, he/she may request the Minister of Oceans and Fisheries to amend the plan to develop the harbor hinterland complex.. <Amended by Act No,11690, Mar. 23, 2013> (5)A plan to develop a harbor hinterland complex referred to in paragraph (2) shall contain the followinq: Provided, That matters referred to in subparagraph 7 may be included in a plan to develop a harbor hinterland complex after the harbor hinterland complex is designated, if it is deemed inevitable to formulate the plan: <Amended by Act No. 12545, Mar. 24, 2014; Act No. 14452, Dec. 20, 2016> 1.The name, location, and area of the harbor hinterland complex;. 2.The purposes for which the harbor hinterland complex is designated; 3.An entity that implements the development project of the harbor hinterland complex, and a period 

In [52]:
def similarity_search(query:str, db_path:str, k:int=3, collection_name:str="collection_01", embedding_model:str="bge-m3:latest"):
    """Return the ``k`` stored chunks most relevant to ``query``.

    Args:
        query: Free-text question; it is embedded with ``embedding_model``.
        db_path: Directory of the persisted Chroma database.
        k: Number of results to request.
        collection_name: Collection to search (defaults to the collection used
            throughout this notebook).
        embedding_model: Ollama model name passed to ``OllamaEmbeddings``.
            NOTE(review): presumably this must be the model the collection was
            indexed with - confirm before overriding.

    Returns:
        Whatever ``Chroma.similarity_search_with_relevance_scores`` returns; in
        this notebook that is a list of ``(Document, score)`` tuples.
    """
    embeddings = OllamaEmbeddings(model=embedding_model)
    vector_store = Chroma(collection_name=collection_name,
                          persist_directory=db_path,
                          embedding_function=embeddings)
    return vector_store.similarity_search_with_relevance_scores(query, k=k)

# Example question; the surrounding newlines are part of the text that gets embedded.
query = """
what is the noon report in iss system?
"""
# Fetch the three best-matching chunks and display them as the cell result.
res1 = similarity_search(query, db_path, k=3)
res1

[(Document(metadata={'File Name': 'DNV Rules for Classification of Ships _2016_55_Nautical Safety', 'File Path': '/content/drive/MyDrive/Rules/DNV/DNV Rules for Classification of Ships _2016_55_Nautical Safety.pdf', 'First Division': 'Rules', 'Page': 82, 'Second Division': 'DNV'}, page_content='This page explains DNV Rules for Classification of Ships _2016_55_Nautical Safety, that belongs to catogories of Rules and DNV./nRules for Ships, January 2014  Pt.6 Ch.8 Sec.7 Network based integration of naviga tion systems (ICS) – Page 83 DET N ORSKE V ERITAS AS — time — ENC — radar video. 302  The position, heading and speed information shall b e displayed together with the indication of its sou rce. Guidance note: Sensor data, e.g. GYR 1, GYR 2, GPS 1, GPS 2, EM log, Dop pler log, GPS, radar 1, radar 2 etc.; -or result of calculation or manual input; -unit if ambiguous, e.g. UTC for time. ---e-n-d---of---G-u-i-d-a-n-c-e---n-o-t-e-- 303  Display of sensor output data The ICS shall be capable 