In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings
from FlagEmbedding import FlagReranker
from typing import Any
import chromadb
import ollama
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
import pandas as pd
from pprint import pprint

  from .autonotebook import tqdm as notebook_tqdm


# Global Variables

In [2]:
# Embedding client for the "bge-m3:latest" model, reached through the Ollama endpoint at localhost:11434.
embeddings =  OllamaEmbeddings(base_url="http://localhost:11434", model="bge-m3:latest")
# Bare name as the last expression so the notebook displays the object's repr.
embeddings

OllamaEmbeddings(model='bge-m3:latest', base_url='http://localhost:11434', client_kwargs={})

In [3]:
# Open the persisted Chroma collection. The `embeddings` client defined in the
# earlier cell is reused so the store and the rest of the notebook always share
# one embedding configuration (endpoint and model) instead of two copies.
db_path = "./db/chroma_db_02"
vectorstore = Chroma(
    collection_name="collection_01",
    persist_directory=db_path,
    embedding_function=embeddings,
)
vectorstore

<langchain_chroma.vectorstores.Chroma at 0x28be9d57e90>

In [None]:
# Load the reranker model from a local directory.
# NOTE(review): the model path below is an absolute, machine-specific location;
# it has to be adjusted before this notebook can run on another machine.
# NOTE(review): normalize=True presumably maps scores into [0, 1] (the scores printed
# by later cells all fall in that range) - confirm against the FlagEmbedding docs.
from FlagEmbedding import FlagReranker
reranking_model_path = "D:/LLMs/bge-reranker-v2-m3"
reranker = FlagReranker(model_name_or_path=reranking_model_path, 
                        use_fp16=True,
                        batch_size=512,
                        max_length=2048,
                        normalize=True)
reranker

<FlagEmbedding.inference.reranker.encoder_only.base.BaseReranker at 0x28be7d30950>

In [5]:
# Example query: text wrapped in ^...^ is what the keyword-matching step extracts as a keyword.
query = "^Cargo Compressor^ installation"
# Remove the ^ markers and lowercase the text to get the plain query string.
refined_query = query.replace("^", "").lower()
refined_query

'cargo compressor installation'

# Semantic Search

In [6]:
def get_semantic_search_docs(query: str, k: int = 100, fetch_k: int = 200):
    """
    Retrieve documents from the global ``vectorstore`` with an MMR retriever.

    Args:
        query: Text handed to the retriever.
        k: Value forwarded as ``search_kwargs["k"]``.
        fetch_k: Value forwarded as ``search_kwargs["fetch_k"]``.

    Returns:
        The result of ``retriever.invoke(query)``. Its length is printed
        before it is returned.
    """
    global vectorstore
    print("--------------<Semantic Search>-----------------")

    # Build the retriever options first, then the retriever itself.
    mmr_options = {'k': k, "fetch_k": fetch_k}
    mmr_retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs=mmr_options)

    retrieved = mmr_retriever.invoke(query)
    hit_count = len(retrieved)
    print(f">>> Semantic Search Counts: {hit_count}")
    return retrieved


In [7]:
# result = get_semantic_search_docs(query=refined_query, k=50, fetch_k=100)
# result

# Keywords Matching

In [8]:
import re

def get_keywords_matched_docs(query: str, documents: list, and_condition: bool = True):
    """
    Keep only the documents whose text contains the keywords marked in the query.

    Keywords are the substrings wrapped in ``^...^`` inside ``query``. Matching is a
    case-insensitive substring test against each document's ``page_content``.

    Args:
        query: Query string containing zero or more ``^keyword^`` markers.
        documents: Objects exposing a ``page_content`` string attribute.
        and_condition: If True a document must contain every keyword; if False
            a single keyword is enough.

    Returns:
        The matching document objects, in their original order. Each input
        document is evaluated on its own, so documents that share identical text
        are all returned as distinct objects (with their own metadata).
        With no keywords in the query, ``and_condition=True`` keeps every
        document and ``and_condition=False`` keeps none.
    """
    print("--------------<Keywords Search>-----------------")
    pattern = r"\^([^^]+)\^"  # capture only the text wrapped in ^...^ as search keywords
    extracted_keywords = re.findall(pattern, query)
    lower_keywords = [keyword.lower() for keyword in extracted_keywords]
    print(f">>> lower_keywords: {lower_keywords}")

    # Filter the documents directly instead of looking matched text up again by
    # value: a value lookup always resolves to the first document with that text,
    # which returned the wrong object for pages with duplicate content.
    combine = all if and_condition else any
    final_matched_docs = []
    for doc in documents:
        lowered_text = doc.page_content.lower()
        if combine(keyword in lowered_text for keyword in lower_keywords):
            final_matched_docs.append(doc)

    print(f">>> 키워드 매칭칭 문서개수: {len(final_matched_docs)}")
    return final_matched_docs

In [9]:
# result = get_keywords_matched_docs(query=query, documents=result, and_condition=True)
# result

# BM25

In [10]:
import re
import numpy as np
from rank_bm25 import BM25Okapi


def get_bm25_top_docs(query: str, documents: list, top_k: int = 10):
    """
    Rank documents against the query with BM25 and return the best ``top_k``.

    Both the documents and the query are lowercased and split on whitespace,
    because ``BM25Okapi`` expects each document as a list of tokens.

    Args:
        query: Free-text query.
        documents: Objects exposing a ``page_content`` string attribute.
        top_k: Maximum number of documents to return.

    Returns:
        Up to ``top_k`` document objects ordered from highest to lowest BM25
        score (ties keep their original relative order). Returns an empty list
        when ``documents`` is empty or ``top_k`` is not positive.
    """
    print("--------------<BM25 Search>-----------------")
    if not documents or top_k <= 0:
        # Nothing to rank; also avoids building a BM25 index over an empty corpus.
        print(">>> 원 BM25 문서개수: 0")
        return []

    # One token list per document; passing raw strings would make BM25 treat
    # every single character as a term, so word queries would never match.
    tokenized_corpus = [doc.page_content.lower().split() for doc in documents]
    bm25 = BM25Okapi(tokenized_corpus)

    query_tokens = query.lower().split()
    print(f">>> bm25 query: {query_tokens}")

    doc_scores = np.asarray(bm25.get_scores(query_tokens))
    # Negate the scores so the stable ascending sort yields highest-score-first.
    ranked_indices = np.argsort(-doc_scores, kind="stable")[:top_k]
    # Select by position rather than by value lookup, so duplicate documents are
    # handled correctly and selection stays linear in the number of documents.
    top_docs = [documents[i] for i in ranked_indices]
    print(f">>> 원 BM25 문서개수: {len(top_docs)}")

    return top_docs



In [11]:
# result = get_bm25_top_docs(query=query, documents=result, top_k=20)
# result

# Reranking

In [12]:
# def reranking(query: str, docs: list, min_score: float = 0.5, top_k: int = 3):
#     """
#     doc string
#     """
#     global reranker
#     print("--------------<Reranking>-----------------")
#     reranked_docs = []
#     inputs = [[query, doc.page_content] for doc in docs]
#     scores = reranker.compute_score(inputs)
#     if not isinstance(scores, list):
#         scores = [scores]
#     score_index = [(score, idx) for idx, score in enumerate(scores) if score >= min_score]
#     sorted_score_index = sorted(score_index, key=lambda x: x[0], reverse=True)
#     sorted_indices = [idx for score, idx in sorted_score_index]
#     if len(sorted_indices) >= top_k:
#         sorted_indices = sorted_indices[:top_k]
#         sorted_score_index = sorted_score_index[:top_k]
#     reranked_docs = [docs[i] for i in sorted_indices]        
#     return sorted_score_index, reranked_docs

In [13]:
import heapq

def reranking(query: str, docs: list, min_score: float = 0.5, top_k: int = 3):
    """
    Score documents against the query with the global ``reranker`` and keep the best.

    Args:
        query: Query text paired with every document.
        docs: Objects exposing a ``page_content`` string attribute; the text is
            lowercased before scoring.
        min_score: Scores below this value are discarded.
        top_k: Maximum number of results to keep.

    Returns:
        A tuple ``(top_scores, reranked_docs)``. ``top_scores`` is a list of
        ``(score, index)`` pairs, highest score first, where ``index`` is the
        position of the document in ``docs``. ``reranked_docs`` holds the
        corresponding document objects in the same order.
    """
    global reranker
    print("--------------<Reranking>-----------------")

    # One [query, document text] pair per candidate document.
    pairs = []
    for doc in docs:
        pairs.append([query, doc.page_content.lower()])

    scores = reranker.compute_score(pairs)
    if not isinstance(scores, list):
        # A non-list result is wrapped so the code below can iterate over it.
        scores = [scores]

    print(f">>> scores: {scores}")

    # Keep (score, position) for every score that reaches the threshold.
    candidates = []
    for position, value in enumerate(scores):
        if value >= min_score:
            candidates.append((value, position))

    # Select the top_k highest-scoring candidates, best first.
    best = heapq.nlargest(top_k, candidates, key=lambda pair: pair[0])

    selected_docs = [docs[position] for _, position in best]
    return best, selected_docs


In [14]:
# sorted_score_index, reranked_docs = reranking(query=refined_query, docs=result)
# sorted_score_index, reranked_docs

In [15]:
# reranked_docs[0].page_content

# Define Main Function

In [16]:
def hybrid_search(query: str):
    """
    Run the full retrieval pipeline: semantic search, optional keyword filter, reranking.

    If the query contains at least one ``^keyword^`` marker, 50 semantic
    candidates are retrieved and filtered to those containing every marked
    keyword. Otherwise 20 semantic candidates are used directly. The remaining
    candidates are reranked (``min_score=0.5``, ``top_k=5``).

    Args:
        query: User query; text wrapped in ``^...^`` is treated as a required keyword.

    Returns:
        ``(top_scores, docs)`` as returned by ``reranking`` on success, or the
        string ``"No Retrieved Docs"`` when there is nothing to rerank.
    """
    pattern = r"\^([^^]+)\^"  # capture only the text wrapped in ^...^ as search keywords
    extracted_keywords = re.findall(pattern, query)
    query = query.lower()
    refined_query = query.replace("^", "")  # query is already lowercase here

    print(f">>> origin query: {query}")
    print(f">>> refined_query: {refined_query}")

    try:
        if len(extracted_keywords) > 0:
            docs = get_semantic_search_docs(query=refined_query, k=50, fetch_k=100)
            docs = get_keywords_matched_docs(query=query, documents=docs, and_condition=True)
        else:
            # Without marked keywords, 20 semantic candidates are enough.
            docs = get_semantic_search_docs(query=refined_query, k=20, fetch_k=100)
            # docs = get_bm25_top_docs(query=refined_query, documents=docs, top_k=10)   # optional

        # Detect the empty case explicitly rather than depending on an
        # IndexError raised while scoring an empty candidate list.
        if not docs:
            return "No Retrieved Docs"

        top_scores, docs = reranking(query=refined_query, docs=docs, min_score=0.5, top_k=5)
        return top_scores, docs
    except IndexError:
        # Kept as a fallback so callers see the same sentinel as before.
        return "No Retrieved Docs"


In [18]:
# Keyword-constrained search: the phrase wrapped in ^...^ selects the keyword-filter branch of hybrid_search.
query = "technical specification for ^cargo compressor^ in lng carriers"
result = hybrid_search(query=query)
result

>>> origin query: technical specification for ^cargo compressor^ in lng carriers
>>> refined_query: technical specification for cargo compressor in lng carriers
--------------<Semantic Search>-----------------


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


>>> Semantic Search Counts: 50
--------------<Keywords Search>-----------------
>>> lower_keywords: ['cargo compressor']
>>> 키워드 매칭칭 문서개수: 3
--------------<Reranking>-----------------
>>> scores: [0.5849905740604183, 0.1730955500070145, 0.689021202018376]


([(0.689021202018376, 2), (0.5849905740604183, 0)],
 [Document(metadata={'File Name': 'Classification of compressed natural gas carriers_517-NR_2007-04', 'File Path': '/content/drive/MyDrive/Rules/BV/Classification of compressed natural gas carriers_517-NR_2007-04.pdf', 'First Division': 'Rules', 'Page': 125, 'Second Division': 'BV'}, page_content='This page explains Classification of compressed natural gas carriers_517-NR_2007-04, that belongs to catogories of Rules and BV./nof water and/or condensate cargo in holds  2. the cargo heater low temperature alarm required in 4.2.7.2  3. the alarm signalling the presence of condensate cargo in the vent main as per 5.2.1.7  4. the indication of the pressure value in each cargo tank mentioned in 13.4.1; such indication is to give the setting pressure value of the relief valve and the minimum allowable pressure value in the cargo tank concerned  5. the high pressure and low pressure alarms, when required, for cargo tanks as per 13.4.1  6. the 

In [19]:
# Same query with different capitalization; hybrid_search lowercases the query before searching.
query = "Technical Specification for ^Cargo Compressor^ in lng carriers"
result = hybrid_search(query=query)
result

>>> origin query: technical specification for ^cargo compressor^ in lng carriers
>>> refined_query: technical specification for cargo compressor in lng carriers
--------------<Semantic Search>-----------------
>>> Semantic Search Counts: 50
--------------<Keywords Search>-----------------
>>> lower_keywords: ['cargo compressor']
>>> 키워드 매칭칭 문서개수: 3
--------------<Reranking>-----------------
>>> scores: [0.5849905740604183, 0.1730955500070145, 0.689021202018376]


([(0.689021202018376, 2), (0.5849905740604183, 0)],
 [Document(metadata={'File Name': 'Classification of compressed natural gas carriers_517-NR_2007-04', 'File Path': '/content/drive/MyDrive/Rules/BV/Classification of compressed natural gas carriers_517-NR_2007-04.pdf', 'First Division': 'Rules', 'Page': 125, 'Second Division': 'BV'}, page_content='This page explains Classification of compressed natural gas carriers_517-NR_2007-04, that belongs to catogories of Rules and BV./nof water and/or condensate cargo in holds  2. the cargo heater low temperature alarm required in 4.2.7.2  3. the alarm signalling the presence of condensate cargo in the vent main as per 5.2.1.7  4. the indication of the pressure value in each cargo tank mentioned in 13.4.1; such indication is to give the setting pressure value of the relief valve and the minimum allowable pressure value in the cargo tank concerned  5. the high pressure and low pressure alarms, when required, for cargo tanks as per 13.4.1  6. the 

In [20]:
# Three separately marked keywords; hybrid_search filters with and_condition=True, so a document must contain all of them.
query = "Technical Specification for ^Cargo^ ^Compressor^ in ^lng^ carriers"
result = hybrid_search(query=query)
result

>>> origin query: technical specification for ^cargo^ ^compressor^ in ^lng^ carriers
>>> refined_query: technical specification for cargo compressor in lng carriers
--------------<Semantic Search>-----------------
>>> Semantic Search Counts: 50
--------------<Keywords Search>-----------------
>>> lower_keywords: ['cargo', 'compressor', 'lng']
>>> 키워드 매칭칭 문서개수: 2
--------------<Reranking>-----------------
>>> scores: [0.5849905740604183, 0.7768891679396638]


([(0.7768891679396638, 1), (0.5849905740604183, 0)],
 [Document(metadata={'File Name': 'Catalogue_LNG T&T_WEB', 'File Path': '/content/drive/MyDrive/MANUAL/Cryostar/Catalogue_LNG T&T_WEB.pdf', 'First Division': 'MANUAL', 'Page': 6, 'Second Division': 'Cryostar'}, page_content='This page explains Catalogue_LNG T&T_WEB, that belongs to catogories of MANUAL and Cryostar./n7BOIL/hyphen.capOFF GAS AS A FUEL The world’s most popular compressors on LNG carriersFROM STEAM TURBINES TO MEDIUM PRESSURE 2/hyphen.capSTROKE ENGINES CRYOSTAR began producing compressors to supply boilers /f_itted to steam turbine propelled vessels. As new propulsion technologies were proposed, close co-operation with engine makers, shipyards and ship-owners led to the most versatile designs allowing /f_lexibility for operational needs.CARGO HANDLING FOR LNG CARRIERS AND LNG BUNKER VESSELS CRYOSTAR has been facilitating safe and reliable cargo handling on LNG carriers and bunker vessels since the 90’s. Tank pressure co

In [21]:
# No ^ markers: the semantic search results go straight to reranking without keyword filtering.
query = "Technical Specification for Cargo Compressor in lng carriers"
result = hybrid_search(query=query)
result

>>> origin query: technical specification for cargo compressor in lng carriers
>>> refined_query: technical specification for cargo compressor in lng carriers
--------------<Semantic Search>-----------------
>>> Semantic Search Counts: 20
--------------<Reranking>-----------------
>>> scores: [0.1569672946113173, 0.6030500380564152, 0.02893004440771399, 0.1775094835100656, 0.06520510703508027, 0.018670043747696825, 0.1730955500070145, 0.0043683073263230884, 0.013259443205033306, 0.7768893952307212, 0.6890211381606427, 0.19652044888756204, 0.008722220750727957, 0.008618786335968253, 0.0017428683638135284, 0.017912353057448146, 0.017187023857348337, 0.006424878364377303, 0.03167653318531904, 0.011079503541905747]


([(0.7768893952307212, 9), (0.6890211381606427, 10), (0.6030500380564152, 1)],
 [Document(metadata={'File Name': 'Catalogue_LNG T&T_WEB', 'File Path': '/content/drive/MyDrive/MANUAL/Cryostar/Catalogue_LNG T&T_WEB.pdf', 'First Division': 'MANUAL', 'Page': 6, 'Second Division': 'Cryostar'}, page_content='This page explains Catalogue_LNG T&T_WEB, that belongs to catogories of MANUAL and Cryostar./n7BOIL/hyphen.capOFF GAS AS A FUEL The world’s most popular compressors on LNG carriersFROM STEAM TURBINES TO MEDIUM PRESSURE 2/hyphen.capSTROKE ENGINES CRYOSTAR began producing compressors to supply boilers /f_itted to steam turbine propelled vessels. As new propulsion technologies were proposed, close co-operation with engine makers, shipyards and ship-owners led to the most versatile designs allowing /f_lexibility for operational needs.CARGO HANDLING FOR LNG CARRIERS AND LNG BUNKER VESSELS CRYOSTAR has been facilitating safe and reliable cargo handling on LNG carriers and bunker vessels since 

In [22]:
# Pretty-print the text of the highest-scoring document from the last search
# (result[1] is the list of reranked documents).
from pprint import pprint
pprint(result[1][0].page_content)

('This page explains Catalogue_LNG T&T_WEB, that belongs to catogories of '
 'MANUAL and Cryostar./n7BOIL/hyphen.capOFF GAS AS A FUEL The world’s most '
 'popular compressors on LNG carriersFROM STEAM TURBINES TO MEDIUM PRESSURE '
 '2/hyphen.capSTROKE ENGINES CRYOSTAR began producing compressors to supply '
 'boilers /f_itted to steam turbine propelled vessels. As new propulsion '
 'technologies were proposed, close co-operation with engine makers, shipyards '
 'and ship-owners led to the most versatile designs allowing /f_lexibility for '
 'operational needs.CARGO HANDLING FOR LNG CARRIERS AND LNG BUNKER VESSELS '
 'CRYOSTAR has been facilitating safe and reliable cargo handling on LNG '
 'carriers and bunker vessels since the 90’s. Tank pressure control during '
 'loading and unloading is a critical aspect of vessel operation. During '
 'loading vapour return compressors move the necessary volume of gas ashore to '
 'maintain safe tank pressure levels. FUEL GAS COMPRESSORS FOR MEDIUM

In [23]:
# Pretty-print the text of the second-ranked document from the last search.
pprint(result[1][1].page_content)

('This page explains Classification of compressed natural gas '
 'carriers_517-NR_2007-04, that belongs to catogories of Rules and BV./nof '
 'water and/or condensate cargo in holds  2. the cargo heater low temperature '
 'alarm required in 4.2.7.2  3. the alarm signalling the presence of '
 'condensate cargo in the vent main as per 5.2.1.7  4. the indication of the '
 'pressure value in each cargo tank mentioned in 13.4.1; such indication is to '
 'give the setting pressure value of the relief valve and the minimum '
 'allowable pressure value in the cargo tank concerned  5. the high pressure '
 'and low pressure alarms, when required, for cargo tanks as per 13.4.1  6. '
 'the hull structure low temperature alarm required in 13.5.2  7. the gas '
 'detection equipment alarm required in 13.6.4  8. the cargo compressor high '
 'temperature alarm required in 17.4.2.2  9. the alarm for automatic shutdown '
 'of the cargo compressor for high pressure or high temperature, as required '
 'in 