In [7]:
import chromadb
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from pprint import pprint
import pandas as pd

In [8]:
def read_vectordb_as_df(db_path:str):
    """Load the records of every collection in a persistent Chroma DB into one DataFrame.

    Embeddings are deliberately not requested: they are not part of the
    returned frame, so fetching them would only cost time and memory.

    Args:
        db_path: Directory of the persistent Chroma database.

    Returns:
        A DataFrame with the columns ``ids``, ``first_div``, ``second_div``,
        ``filename``, ``documents`` and ``metadatas``, one row per stored
        record. Records of all collections are stacked in listing order under
        a fresh 0..n-1 index. When the database holds no records the frame is
        empty but still has these columns. A metadata key that is absent for
        a record produces a missing value instead of raising ``KeyError``.
    """
    columns = ["ids", "first_div", "second_div", "filename", "documents", "metadatas"]
    client = chromadb.PersistentClient(path=db_path)

    frames = []
    for collection in client.list_collections():
        # Some chromadb releases list collection names rather than Collection
        # objects; resolve a name to its collection before reading from it.
        if isinstance(collection, str):
            collection = client.get_collection(collection)

        data = collection.get(include=['documents', 'metadatas'])
        if not data["ids"]:
            # Nothing stored in this collection; skip it so it cannot
            # influence the dtypes of the concatenated result.
            continue

        # Treat a record whose metadata is None like one with no keys.
        metadatas = [meta or {} for meta in data["metadatas"]]
        frames.append(
            pd.DataFrame(
                {
                    "ids": data["ids"],
                    "first_div": [meta.get("First Division") for meta in metadatas],
                    "second_div": [meta.get("Second Division") for meta in metadatas],
                    "filename": [meta.get("File Name") for meta in metadatas],
                    "documents": data["documents"],
                    "metadatas": data["metadatas"],
                },
                columns=columns,
            )
        )

    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)

In [9]:
def similarity_search(query:str, db_path:str="./db/chroma_rule_db", k:int=3,
                      collection_name:str="my_collection", embedding_model:str="bge-m3:latest"):
    """Return the ``k`` stored documents most similar to ``query``.

    A Chroma vector store backed by Ollama embeddings is opened on every
    call, so nothing is cached between calls.

    Args:
        query: Natural-language text to search for.
        db_path: Directory of the persistent Chroma database.
        k: Number of results to request.
        collection_name: Name of the Chroma collection to search.
        embedding_model: Ollama model used to embed ``query``.
            NOTE(review): this presumably has to be the model the collection
            was built with -- confirm against the ingestion code.

    Returns:
        Whatever ``Chroma.similarity_search`` returns for ``query`` and ``k``.
    """
    embeddings = OllamaEmbeddings(model=embedding_model)
    vector_store = Chroma(
        collection_name=collection_name,
        persist_directory=db_path,
        embedding_function=embeddings,
    )
    return vector_store.similarity_search(query, k=k)

In [10]:
# Materialise the vector DB as a DataFrame, then report its (rows, columns).
df = read_vectordb_as_df("./db/chroma_rule_db")
df.shape

(31379, 6)

In [11]:
# Peek at the first five records.
df.head(n=5)

Unnamed: 0,ids,first_div,second_div,filename,documents,metadatas
0,28c56317-1ab8-4d21-883a-161d998b3f1e,Rules,ABS,00-part-1A-ships-jul24,"This page explains 00-part-1A-ships-jul24, tha...","{'File Name': '00-part-1A-ships-jul24', 'File ..."
1,e2ad6d33-e498-45a3-8805-c42e694c0ac9,Rules,ABS,00-part-1A-ships-jul24,"This page explains 00-part-1A-ships-jul24, tha...","{'File Name': '00-part-1A-ships-jul24', 'File ..."
2,40d4be30-b70b-447a-92df-149a3e771513,Rules,ABS,00-part-1A-ships-jul24,"This page explains 00-part-1A-ships-jul24, tha...","{'File Name': '00-part-1A-ships-jul24', 'File ..."
3,e7af57eb-9e40-4c75-a467-c4747d595b58,Rules,ABS,00-part-1A-ships-jul24,"This page explains 00-part-1A-ships-jul24, tha...","{'File Name': '00-part-1A-ships-jul24', 'File ..."
4,4e30f874-b475-4588-b936-c42a7ccafbea,Rules,ABS,00-part-1A-ships-jul24,"This page explains 00-part-1A-ships-jul24, tha...","{'File Name': '00-part-1A-ships-jul24', 'File ..."


In [12]:
# Distinct values of both category levels, shown as a (first_div, second_div) pair of arrays.
tuple(df[level].unique() for level in ("first_div", "second_div"))

(array(['Rules', 'Maker_Manuals', 'PORT'], dtype=object),
 array(['ABS', 'KR', 'MARPOL', 'SOLAS', 'Win GD', 'Cryostar',
        'Port Regulations'], dtype=object))

In [14]:
# Compute the distinct file names once and reuse them for both the count and the listing.
unique_filenames = df["filename"].unique()
len(unique_filenames), unique_filenames

(78,
 array(['00-part-1A-ships-jul24', '00-part-1B-offshore-jul24',
        '00-part-1C-lhsc-jul24', '00-part-1D-alternative-jan24',
        '00-part-2-jul24', '00-part-7-jul24',
        'Casebook on Interpretations_Eng_2022', 'Circular (E) Total_2024',
        'Guidance for Approval of Risk-based Ship Design_2015',
        'Guidance for Approval of Service Suppliers_2024',
        'Guidance for Autonomous Ships_2024',
        'Guidance for Battery Systems on Board Ships_2024',
        'Guidance for Cyber Resilience of Ships and Systems_2024',
        'Guidance for Fatigue Strength Assessment Including Springing_2020',
        'Guidance for Floating LNG Bunkering Terminal_2018',
        'Guidance for LNG Fuel Ready Ships_2021',
        'Guidance for Maritime Cyber security System_2024',
        'Guidance for Noise and Vibration_2020',
        'Guidance for Shiplift and Transfer Systems_2017',
        'Guidance for Ships for Navigation in Ice_2024',
        'Guidance for Smart Systems_2

In [34]:
# Restrict the frame to the chunks of a single source file and preview them.
df1 = df.loc[df["filename"].eq("equipment-solutions-for-lng-and-lbl")]
df1.head()

Unnamed: 0,ids,first_div,second_div,filename,documents,metadatas
21026,ecc81790-597e-46b2-9172-ed4e52213f0c,Maker_Manuals,Cryostar,equipment-solutions-for-lng-and-lbl,This page explains equipment-solutions-for-lng...,{'File Name': 'equipment-solutions-for-lng-and...
21027,12a310e5-e838-4e81-946b-585b48c18b15,Maker_Manuals,Cryostar,equipment-solutions-for-lng-and-lbl,This page explains equipment-solutions-for-lng...,{'File Name': 'equipment-solutions-for-lng-and...
21028,8417b9d7-f341-451c-893f-4007b1b19f39,Maker_Manuals,Cryostar,equipment-solutions-for-lng-and-lbl,This page explains equipment-solutions-for-lng...,{'File Name': 'equipment-solutions-for-lng-and...
21029,dcd69f26-09ce-4612-bb4f-b4ec60290ab7,Maker_Manuals,Cryostar,equipment-solutions-for-lng-and-lbl,This page explains equipment-solutions-for-lng...,{'File Name': 'equipment-solutions-for-lng-and...
21030,eb3f7ab9-96a5-4f5d-adf9-854fd36950b0,Maker_Manuals,Cryostar,equipment-solutions-for-lng-and-lbl,This page explains equipment-solutions-for-lng...,{'File Name': 'equipment-solutions-for-lng-and...


In [35]:
# Show the text and metadata of the first ten chunks. head() simply yields
# fewer rows when df1 is shorter, where positional indexing would raise.
preview = df1.head(10)
for document, metadata in zip(preview["documents"], preview["metadatas"]):
    pprint(document)
    print(metadata)

('This page explains equipment-solutions-for-lng-and-lbl, that belongs to '
 'catogories of Maker_Manuals and Cryostar./nEQUIP MENT SOL UTIONS FOR LNG AND '
 'LBG\n'
 '\n'
 '||||\n'
 '|-|||')
{'File Name': 'equipment-solutions-for-lng-and-lbl', 'File Path': './Maker_Manuals/Cryostar/equipment-solutions-for-lng-and-lbl.pdf', 'First Division': 'Maker_Manuals', 'Page': 0, 'Second Division': 'Cryostar'}
('This page explains equipment-solutions-for-lng-and-lbl, that belongs to '
 'catogories of Maker_Manuals and Cryostar./n2SAFETY AND STANDARDS Safety is '
 'an integral part of CRYOSTAR’s management and manufacturing commitments. For '
 'each new development or project, the company performs a risk analysis using '
 'approved techniques such as HAZOP (Hazard Operability) and FMEA (Failure '
 'Mode and Effects Analysis).CRYOSTAR’s equipments and solutions comply with '
 'most stringent machine and safety regulations such as the Pressure Equipment '
 'Directive 97/23/CE (Module H and H1) and A

In [33]:
%timeit
query = "what is the obligation of the master in troubled vessel in singapore port?"
res = similarity_search(query=query)

for k in res:
    pprint(k.metadata)
    pprint(k.page_content)
    print("-"*70)


{'File Name': 'Maritime and Port Authority of Singapore_2000',
 'File Path': './PORT/Port Regulations/Maritime and Port Authority of '
              'Singapore_2000.pdf',
 'First Division': 'PORT',
 'Page': 29,
 'Second Division': 'Port Regulations'}
('This page explains Maritime and Port Authority of Singapore_2000, that '
 'belongs to catogories of PORT and Port '
 'Regulations./nPortMastermayrequirevesseltoleaveport 36.ThePortMaster '
 'maydirect avessel toleave theportifheisof theopinion thatitwould '
 'notbeintheinterest oftheAuthority forthe vessel toremain inport. Damaged '
 'vesselsentering port 37.Noperson maycause orpermit adamaged vessel toenter '
 'the portwithout theprior written permission ofthePortMaster whomay grant '
 'thewritten permission subject tosuch conditions asthePort Master thinks fit. '
 '[S518/2017 wef18/09/2017] PARTVI VESSELS BERTHED ALONGSIDE Unauthorised '
 'berthing, etc. 38.Noperson maycause orpermit avessel — (a)toproceed '
 'alongside, orliealongside