In [1]:
import torch
from langchain_community.document_loaders import PyPDFLoader, UnstructuredEPubLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from chromadb.utils.embedding_functions import OllamaEmbeddingFunction
from collections import defaultdict
import chromadb
from chromadb.config import Settings
from chromadb import HttpClient
from langchain.embeddings import HuggingFaceEmbeddings
from datetime import datetime
from tqdm import tqdm
import os
import fitz

# Recursively find all .pdf and .epub files in subdirectories

def get_docs(directory, extensions=('.pdf', '.epub')):
    """Recursively collect document files found under ``directory``.

    Args:
        directory: Root folder to walk; every subfolder is visited.
        extensions: Extension (or iterable of extensions) to collect. Matching
            is case-insensitive and done against the end of the file name.
            Defaults to PDF and EPUB files.

    Returns:
        A ``defaultdict(list)`` keyed by file name without its extension. Each
        value is a list of ``[file_path, parent_dir_name, size_in_bytes,
        extension]`` entries. Files that share a base name (different folders
        or different extensions) end up under the same key.
    """
    # Accept a single extension given as a plain string.
    if isinstance(extensions, str):
        extensions = (extensions,)
    wanted = tuple(ext.lower() for ext in extensions)

    file_dict = defaultdict(list)
    for root, _dirs, files in os.walk(directory):
        for file in files:
            # str.endswith accepts a tuple, so one call covers every extension.
            if not file.lower().endswith(wanted):
                continue
            file_path = os.path.join(root, file)
            file_name, file_ext = os.path.splitext(file)                # name without extension, original-case extension
            parent_path = os.path.basename(os.path.dirname(file_path))  # name of the containing folder
            file_size = os.path.getsize(file_path)                      # size in bytes
            file_dict[file_name].append([file_path, parent_path, file_size, file_ext])

    return file_dict

In [None]:
import chromadb

# Open the on-disk Chroma database. A raw string keeps the Windows backslashes
# literal: "\B" and "\_" are invalid escape sequences in a normal string and
# trigger a warning on recent Python versions.
client = chromadb.PersistentClient(path=r"D:\Bilgi\__Databases\Bilgi_my_chroma_db_with_langchain")

# Access the collection by name
collection = client.get_collection("my-rag-db")

total_items = collection.count()
print(f"Total number of element in chromadb: {total_items}")

batch_size = 100000  # number of chunks fetched per collection.get() call

list_items_info = []            # one row per stored chunk
counter = 0                     # chunks processed so far (progress display only)
documents = defaultdict(list)   # source path -> [[chunk_length, total_pages]] of its first seen chunk

for offset in range(0, total_items, batch_size):
    print(f"Counter: {counter} / {total_items}  ", end="\r")

    # Retrieve one page of chunks; ids are returned in batch["ids"]
    batch = collection.get(
        include=["documents", "metadatas"],
        limit=batch_size,
        offset=offset
    )

    # `chunk_id` instead of `id`: avoids shadowing the builtin for the rest of the kernel session.
    for doc, meta, chunk_id in zip(batch["documents"], batch["metadatas"], batch["ids"]):
        meta = meta or {}  # tolerate chunks stored without metadata

        path = meta.get("source", "Unknown")
        doc_name = path.split("\\")[-1]
        page = meta.get("page", "Unknown")
        total_pages = meta.get("total_pages", "Unknown")
        chunk_length = len(doc)

        # Row layout: [id, chunk_length, path, doc_name, page, total_pages, text]
        list_items_info.append([chunk_id, chunk_length, path, doc_name, page, total_pages, doc])

        # Record each source document once, on its first chunk
        if path not in documents:
            documents[path].append([chunk_length, total_pages])

        counter += 1


print(f"Number of database elements: {len(list_items_info)}")

directory = "D:\\Bilgi"
file_dict = get_docs(directory)

print(f"Number of documents  in harddisk: {len(file_dict)}")

print(f"Number of documents in database: {len(documents)}")



Total number of element in chromadb: 3076636
Number of database elements: 3076636
Number of documents  in harddisk: 1963
Number of documents in database: 1863


In [None]:
#Find missing docs after document chunks extraction
directory = "D:\\Bilgi"
file_dict = get_docs(directory)

# Column 2 of every list_items_info row is the chunk's source path. Building the
# set once replaces a full scan of all chunk rows for every file on disk.
indexed_paths = {item[2] for item in list_items_info}

missing_docs = []

counter = 0
for file, info in file_dict.items():
    print(counter, end="\r")
    counter += 1

    # NOTE(review): only the first file registered under each base name is checked;
    # same-named files in other folders (or with another extension) are skipped.
    filepath = info[0][0]
    if filepath not in indexed_paths:
        missing_docs.append(filepath)

print(f"Number of missing docs: {len(missing_docs)}")


missing_docs

Number of missing docs: 100


['E:\\__ilker\\Bilgi\\Kitaplar\\AudioProcessing\\(ebook) Prentice Hall - Digital Processing Of Speech Signals (Lawrence R. Rabiner & Ronald W. Schafer) (scanned).pdf',
 'E:\\__ilker\\Bilgi\\Kitaplar\\AudioProcessing\\[eBook ENG] [DSP-audio] - Digital Audio Signal Processing - Zolzer.pdf',
 'E:\\__ilker\\Bilgi\\Kitaplar\\Big Data\\Data Mining Process with Neural Networks - Joseph P Bigus.pdf',
 'E:\\__ilker\\Bilgi\\Kitaplar\\Biography\\Mao Tse-Tung - A Biography By Ross Terrill.pdf',
 'E:\\__ilker\\Bilgi\\Kitaplar\\Business\\Amos Tversky And Daniel Kahneman - Prospect Theory - An Analysis Of Decision Under Risk (1979).pdf',
 'E:\\__ilker\\Bilgi\\Kitaplar\\Business\\MIT_Managing_the_Total_Customer_Experience.pdf',
 'E:\\__ilker\\Bilgi\\Kitaplar\\Circuits\\(ebook - electronics) - Practical RF Circuit Design for Modern Wireless Systems - Vol. 1- Passive Circuits & Systems (Besser Gilmore 2003).pdf',
 'E:\\__ilker\\Bilgi\\Kitaplar\\Circuits\\Allen, Holberg - CMOS Analog Circuit Design secon

In [None]:
#Full pipeline for a single document -- for testing

def create_batches(data, ids, batch_size):
    """Lazily produce ``(data_slice, ids_slice)`` pairs of at most ``batch_size`` items.

    Both sequences are sliced with the same bounds, so each pair stays
    index-aligned. The number of pairs is driven by ``len(data)``.
    """
    window_starts = range(0, len(data), batch_size)
    yield from (
        (data[start:start + batch_size], ids[start:start + batch_size])
        for start in window_starts
    )


file_counter = 0

# Connect to a Chroma server that must already be running, e.g.:
#   chroma run --host localhost --port 8000 --path D:/my_chroma_db_with_langchain2
client = chromadb.HttpClient(
    host="localhost",  # Server's hostname or IP address
    port=8000,         # Port number the server is listening on
    settings=Settings(),
)
collection = client.get_or_create_collection(name="my-rag-db")

embedding_function = OllamaEmbeddingFunction(
    model_name="nomic-embed-text",
    url="http://localhost:11434/api/embeddings",
)

file_path = "D:\\Bilgi\\Kitaplar\\Startup\\Creativity - The Discipline of Innovation. By Drucker, Peter.pdf"

error_flag = False
warning_flag = False
doc = []  # stays empty when loading fails, so the steps below run instead of raising NameError
try:
    loader = PyPDFLoader(file_path, mode="single")
    doc = loader.load()
except Warning as w:
    # Only reached when warnings are configured to be raised as exceptions.
    warning_flag = True
    print(f"Warning while loading {file_path}: {w}")
except Exception as e:
    # Report the failure instead of silently dropping the exception detail.
    error_flag = True
    print(f"Error while loading {file_path}: {e}")

file_counter += 1
print(f"Error flag: {error_flag}, warning flag: {warning_flag}")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512, chunk_overlap=128, add_start_index=True
)

print(f"Text: {doc}")
chunks = text_splitter.split_documents(doc)
total_chunks = len(chunks)
print(f"Number of chunks: {total_chunks}")

# Generate an ID for each chunk.
# NOTE(review): file_counter is always 1 in this cell, so every document run through it
# produces the same IDs ("doc_1_chunk_<i>") and upsert() overwrites whatever already
# uses them in "my-rag-db" — confirm this cannot clash with the real ingestion IDs.
chunk_ids = [f"doc_{file_counter}_chunk_{i}" for i in range(total_chunks)]

batch_size = 128  # Adjust based on your system's capacity

# Add chunks in batches for speed; the result is the same as adding them one at a time.
for batch, batch_ids in create_batches(chunks, chunk_ids, batch_size):
    # Round-trip through UTF-8 with errors='replace' so unencodable characters
    # (e.g. lone surrogates from PDF extraction) cannot break the request.
    batch_documents = [chunk.page_content.encode('utf-8', errors='replace').decode('utf-8') for chunk in batch]
    batch_metadata = [chunk.metadata for chunk in batch]
    # Precompute embeddings for the current batch
    batch_embeddings = embedding_function(batch_documents)

    # Add (or overwrite) the batch in the collection
    collection.upsert(
        documents=batch_documents,
        metadatas=batch_metadata,
        ids=batch_ids,
        embeddings=batch_embeddings
    )





#---------------------------------------------

import chromadb

# Open the on-disk Chroma database. A raw string keeps the Windows backslashes
# literal: "\B" and "\_" are invalid escape sequences in a normal string and
# trigger a warning on recent Python versions.
client = chromadb.PersistentClient(path=r"D:\Bilgi\__Databases\Bilgi_my_chroma_db_with_langchain")

# Access the collection by name
collection = client.get_collection("my-rag-db")

total_items = collection.count()
print(f"Total number of element in chromadb: {total_items}")

batch_size = 100000  # number of chunks fetched per collection.get() call

list_items_info = []  # one row per stored chunk
counter = 0
for offset in range(0, total_items, batch_size):
    print(f"Counter: {counter} / {total_items}  ", end="\r")

    # Retrieve one page of chunks; ids are returned in batch["ids"]
    batch = collection.get(
        include=["documents", "metadatas"],
        limit=batch_size,
        offset=offset
    )

    # `chunk_id` instead of `id`: avoids shadowing the builtin for the rest of the kernel session.
    for doc, meta, chunk_id in zip(batch["documents"], batch["metadatas"], batch["ids"]):
        meta = meta or {}  # tolerate chunks stored without metadata

        path = meta.get("source", "Unknown")
        doc_name = path.split("\\")[-1]
        page = meta.get("page", "Unknown")
        total_pages = meta.get("total_pages", "Unknown")
        chunk_length = len(doc)

        # Row layout: [id, chunk_length, path, doc_name, page, total_pages, text]
        list_items_info.append([chunk_id, chunk_length, path, doc_name, page, total_pages, doc])

        counter += 1


print(f"Number of elements: {len(list_items_info)}")

#---------------------------------------------------------------

directory = "D:\\Bilgi"
file_dict = get_docs(directory)

# Column 2 of every row is the chunk's source path; one set makes each lookup O(1)
# instead of scanning all chunk rows for every file on disk.
indexed_paths = {item[2] for item in list_items_info}

found_docs = []

counter = 0
for file, info in file_dict.items():
    # NOTE(review): only the first file registered under each base name is checked.
    filepath = info[0][0]
    if filepath in indexed_paths:
        counter += 1
        print(counter, end="\r")
        found_docs.append(filepath)

print(f"Number of found docs: {len(found_docs)}")


found_docs



Text: [Document(metadata={'producer': "Relais Int'l and AIS ImagePDF 1.06", 'creator': 'PyPDF', 'creationdate': '2003-03-06T06:57:40-05:00', 'moddate': '2007-03-18T17:52:25+01:00', 'source': 'E:\\__ilker\\Bilgi\\Kitaplar\\Startup\\Creativity - The Discipline of Innovation. By Drucker, Peter.pdf', 'total_pages': 7}, page_content='\n\x0c\n\x0c\n\x0c\n\x0c\n\x0c\n\x0c')]
Number of chunks: 0
Total number of element in chromadb: 0
Number of elements: 0
Number of found docs: 0


[]

In [None]:
#Only find similar document chunks according to queries -- for testing
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from chromadb.utils.embedding_functions import OllamaEmbeddingFunction
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainFilter
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_ollama import ChatOllama
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from typing import List
import logging


# Embedding model served by Ollama; it is used to embed the queries below.
# NOTE(review): presumably this must be the same model the collection was indexed with — confirm.
embedding = OllamaEmbeddings(model="nomic-embed-text")

# embedding_function = OllamaEmbeddingFunction(
#     model_name="nomic-embed-text",
#     url="http://localhost:11434/api/embeddings",
# )

# LangChain wrapper around the "my-rag-db" collection stored in the local persist directory.
vector_store = Chroma(
    collection_name = "my-rag-db",
    embedding_function=embedding,
    persist_directory="D:\\Bilgi\\__Databases\\Bilgi_my_chroma_db_with_langchain"
    )




# Test queries: a long passage, a shorter prefix of the same passage, and a short question.
# The text is kept exactly as written (including line breaks and hyphenation characters).
queries = [
"""
başlamışlardı ki, tepeden inip de düzlüğe çıktıkları vakit uğursuz 
bir rüzgar ejderin püskürttüğü alevlerden artakalan yoğun duma­
nı onların bulu nduğu tarafa sürükledi ve atların bile aklını ba­
şından alacak kadar berbat bir kokuyla çevrelendiler. Sonra, sisin 
içinde ne yöne gittiklerini göremezken buna bir de ejder nefesinin 
o iğrenç kokusunun yarattığı panik eklenin ce, atları kont rol altın­
da tutmak imkansız hale geldi ve hayvanlar güzergah değiştirip
""",
"""
başlamışlardı ki, tepeden inip de düzlüğe çıktıkları vakit uğursuz 
bir rüzgar ejderin püskürttüğü alevlerden artakalan yoğun duma­
""",
"Emevi dönemi ne zaman başlar?"   
]

 
# Iterate over each query and perform similarity search with scores
for query in queries:
    results = await vector_store.asimilarity_search_with_score(query, k=1)  # Adjust 'k' as needed

    # Process and display results
    print(f"Results for query: '{query}'")
    for doc, score in results:
        print(f"Document: {doc.page_content}")  # Display first 100 characters
        print(f"Metadata: {doc.metadata}")  # Display first 100 characters
        print(f"ID: {doc.id}")  # Display first 100 characters
        print(f"Similarity Score (Distance): {score}\n")
        print("----------------------------------------------------------")

#---------------------------------------------------------------
#Create a retriever from vectorstore
retriever = vector_store.as_retriever(
    search_type="mmr", search_kwargs={ "k": 1, "lambda_mult": 0.7,  "score_threshold": 0.8, "fetch_k": 10}
)
found_chunks = retriever.invoke(queries[2])

for i in range(len(found_chunks)):
    print("Result \n")
    print(f"Chunk id: {found_chunks[i].id}, metadata: {found_chunks[i].metadata['source']} ")
    print(f"Content: {found_chunks[i].page_content}")
print(found_chunks)

print("----------------------------------------------------------")

#---------------------------------------------------------------


Results for query: '
başlamışlardı ki, tepeden inip de düzlüğe çıktıkları vakit uğursuz 
bir rüzgar ejderin püskürttüğü alevlerden artakalan yoğun duma­
nı onların bulu nduğu tarafa sürükledi ve atların bile aklını ba­
şından alacak kadar berbat bir kokuyla çevrelendiler. Sonra, sisin 
içinde ne yöne gittiklerini göremezken buna bir de ejder nefesinin 
o iğrenç kokusunun yarattığı panik eklenin ce, atları kont rol altın­
da tutmak imkansız hale geldi ve hayvanlar güzergah değiştirip
'
Document: başlamışlardı ki, tepeden inip de düzlüğe çıktıkları vakit uğursuz 
bir rüzgar ejderin püskürttüğü alevlerden artakalan yoğun duma­
nı onların bulu nduğu tarafa sürükledi ve atların bile aklını ba­
şından alacak kadar berbat bir kokuyla çevrelendiler. Sonra, sisin 
içinde ne yöne gittiklerini göremezken buna bir de ejder nefesinin 
o iğrenç kokusunun yarattığı panik eklenin ce, atları kont rol altın­
da tutmak imkansız hale geldi ve hayvanlar güzergah değiştirip
Metadata: {'creationdate': '2015-

In [1]:
#Full RAG app with hybrid search 1/2 - preparation of retrievers 

from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from chromadb.utils.embedding_functions import OllamaEmbeddingFunction
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainFilter
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_ollama import ChatOllama
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.retrievers import EnsembleRetriever
from langchain_core.retrievers import BaseRetriever
from langchain_community.retrievers import BM25Retriever
from rank_bm25 import BM25Okapi
from langchain.schema import Document
from langchain_core.runnables import Runnable, RunnableMap, RunnablePassthrough
from langchain_community.document_transformers import LongContextReorder
from pydantic import Field
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict
import logging
import json
import msgspec
from langchain.schema import Document
from rank_bm25 import BM25Okapi
from typing import List, Dict, Optional
import gc



class Data(msgspec.Struct):
    """Schema used to decode the saved BM25 JSON dump (chunk texts plus their metadata)."""
    # Text of every stored chunk
    documents: list[str]
    # One metadata dict per chunk; paired with `documents` by position when the retriever is built
    metadatas: list[dict]


#Post-processing helpers: from all retrieved reference documents (text chunks + metadata), build the list of those the LLM actually used
def keyword_matching(llm_output, documents):
    """Return the documents that share at least one keyword with ``llm_output``.

    A document is kept as soon as any keyword produced by ``extract_keywords``
    occurs inside ``llm_output`` (plain ``in`` containment). The input order of
    ``documents`` is preserved.
    """
    return [
        candidate
        for candidate in documents
        if any(term in llm_output for term in extract_keywords(candidate))
    ]

def extract_keywords(document):
    """Return the keywords of ``document``: its whitespace-separated tokens.

    Placeholder strategy; a smarter extractor (TF-IDF, RAKE, ...) could be
    dropped in here without touching the callers.
    """
    tokens = document.split()
    return tokens

def similarity_assessment(llm_output, documents, threshold=0.5):
    """Return the documents whose TF-IDF cosine similarity to ``llm_output`` reaches ``threshold``.

    Args:
        llm_output: Text produced by the LLM.
        documents: Iterable of candidate documents (a generator is fine); every
            item is converted with ``str()`` before comparison.
        threshold: Minimum cosine similarity (inclusive) for a document to be kept.

    Returns:
        The matching documents as strings, in their input order. An empty list
        is returned when there are no documents to compare against.
    """
    # Materialise the iterable and make sure every document is a string.
    documents = [str(doc) for doc in documents]
    if not documents:
        # Nothing to compare: skip vectorisation, which cannot handle zero documents.
        return []

    # Fit TF-IDF on the answer plus all documents so they share one vocabulary.
    corpus = [llm_output] + documents
    tfidf = TfidfVectorizer().fit_transform(corpus)

    # Row 0 is the LLM answer, the remaining rows are the documents. The matrix is
    # kept sparse; cosine_similarity accepts sparse input directly.
    similarities = cosine_similarity(tfidf[0:1], tfidf[1:]).flatten()

    # Keep only the documents that reach the similarity threshold.
    return [doc for doc, sim in zip(documents, similarities) if sim >= threshold]


#--------------------------------------------------------------------------------------------------
#Custom BM25 retriever: a chunk that contains the whole query string verbatim (in its text or metadata) gets its BM25 score multiplied by phrase_boost, so it ranks higher
class CustomBM25Retriever(BaseRetriever):
    """BM25 retriever over chunk text plus metadata, with an optional verbatim-query boost.

    Every document is tokenized by whitespace over its page content followed by
    its metadata rendered as ``"key: value"`` pairs. At query time the BM25
    score of a document is multiplied by ``phrase_boost`` when the raw query
    string occurs verbatim in that same text.
    """

    k1: float = Field(default=1.2)            # BM25 k1 parameter passed to BM25Okapi
    b: float = Field(default=0.75)            # BM25 b parameter passed to BM25Okapi
    phrase_boost: float = Field(default=1.5)  # score multiplier when the query occurs verbatim
    k: int = Field(default=10)  # Number of top documents to retrieve
    documents: List[Document] = Field(default_factory=list)
    tokenized_docs: List[List[str]] = Field(default_factory=list)
    bm25: Optional[BM25Okapi] = None

    def __init__(self, documents: List[Document], k: int = 10, k1: float = 1.2, b: float = 0.75, phrase_boost: float = 1.5):
        """Store the documents and parameters, then build the BM25 index over the tokenized documents."""
        super().__init__(documents=documents, k=k, k1=k1, b=b, phrase_boost=phrase_boost)
        self.tokenized_docs = [self.tokenize(doc) for doc in self.documents]
        self.bm25 = BM25Okapi(self.tokenized_docs, k1=self.k1, b=self.b)

    @classmethod
    def from_texts(cls, texts: List[str], metadatas: Optional[List[Dict]] = None, k: int = 10, bm25_params: Optional[Dict] = None):
        """Factory method to initialize from raw texts and metadata (similar to LangChain's BM25Retriever).

        ``bm25_params`` is forwarded to the constructor as keyword arguments
        (``k1``, ``b``, ``phrase_boost``). Texts and metadata are paired by position.
        """
        bm25_params = bm25_params or {"k1": 1.2, "b": 0.75}
        if not metadatas:
            # A fresh dict per text: `[{}] * n` would hand the same dict object to every document.
            metadatas = [{} for _ in texts]
        documents = [Document(page_content=text, metadata=meta) for text, meta in zip(texts, metadatas)]
        return cls(documents, k=k, **bm25_params)

    def _full_text(self, doc: Document) -> str:
        """Return the page content followed by the metadata rendered as ``key: value`` pairs."""
        metadata_str = " ".join(f"{key}: {value}" for key, value in doc.metadata.items())
        return f"{doc.page_content} {metadata_str}"

    def tokenize(self, doc: Document):
        """Tokenize content and metadata together by splitting on whitespace."""
        return self._full_text(doc).split()

    def _get_relevant_documents(self, query: str) -> List[Document]:
        """Retrieve the top ``k`` documents by BM25 score, with the verbatim-query boost applied."""
        import heapq  # local import: only this method needs it

        query_tokens = query.split()
        scores = self.bm25.get_scores(query_tokens)

        if self.phrase_boost == 1.0:
            # A multiplier of 1.0 changes nothing, so skip rebuilding every document's text.
            boosted_scores = scores
        else:
            # Boost score if the query appears verbatim in content + metadata
            boosted_scores = [
                score * (self.phrase_boost if query in self._full_text(doc) else 1.0)
                for doc, score in zip(self.documents, scores)
            ]

        # nlargest(n, it, key) is equivalent to sorted(it, key=key, reverse=True)[:n]
        # but does not sort the whole corpus for every query.
        top_pairs = heapq.nlargest(self.k, zip(self.documents, boosted_scores), key=lambda pair: pair[1])
        return [doc for doc, _score in top_pairs]

#--------------------------------------------------------------------------------------------------
# Runnable that retrieves documents for a question and formats them into prompt inputs
class RetrieveAndFormat(Runnable):
    """Runnable that fetches documents for a question and turns them into prompt inputs."""

    def __init__(self, retriever):
        self.retriever = retriever

    def invoke(self, inputs: Dict[str, str], config=None) -> Dict[str, str]:
        """Retrieve documents for ``inputs['question']`` and format them.

        Returns a dict with the keys ``context`` (chunk texts), ``metadata``
        (numbered list of distinct sources), ``question`` and
        ``context_with_metadata`` (each chunk text followed by its source).
        When nothing is retrieved, the three text fields are ``"No info"``.
        """
        question = inputs['question']
        hits: List[Document] = self.retriever.invoke(question)
        if not hits:
            return {"context": "No info", "metadata": "No info", "question": question, "context_with_metadata": "No info"}

        # Reorder the hits before building the prompt text
        ordered = LongContextReorder().transform_documents(hits)

        texts = [entry.page_content for entry in ordered]
        context = "\n\n".join(texts)

        # Source of every chunk, 'Unknown' when the metadata has none
        sources = [entry.metadata.get('source', 'Unknown') for entry in ordered]

        # dict.fromkeys keeps the first occurrence of each source, in order
        numbered_sources = [
            f"Source-{str(position)}: {source}"
            for position, source in enumerate(dict.fromkeys(sources), start=1)
        ]
        metadata = "\n\n".join(numbered_sources)

        # Each chunk text followed by its source on the next line
        context_with_metadata = "\n\n".join(
            "\n".join([text, source]) for text, source in zip(texts, sources)
        )

        return {"context": context, "metadata": metadata, "question": question, "context_with_metadata": context_with_metadata}
    

#--------------------------------------------------------------------------------------------------
# Embedding model (served by Ollama) used by the vector store below.
# NOTE(review): other cells open this same collection with "nomic-embed-text" —
# presumably the collection was re-indexed with "bge-m3"; confirm the models match.
#embedding = OllamaEmbeddings(model="nomic-embed-text")
embedding = OllamaEmbeddings(model="bge-m3")

# LangChain wrapper around the "my-rag-db" collection in the local persist directory
vector_store = Chroma(
    collection_name = "my-rag-db",
    embedding_function=embedding,
    persist_directory="D:\\Bilgi\\__Databases\\Bilgi_my_chroma_db_with_langchain"
    )

print("Vectorstore created...")

#---------------------------------------------------------------

# One-time export (kept commented out): dumps all chunk texts and metadata from the
# vector store into the JSON file that is read back below.
# #Create bm25 retriever and save it for one time
# stored_data = vector_store.get()
# doc_list =  stored_data.get('documents', [])
# metadata_list = stored_data.get('metadatas', [])
# data_to_save = {
#     'documents': doc_list,
#     'metadatas': metadata_list
# }

# print("Data to save created for BM25 retriever...")

# with open('D:\\Bilgi\\__Databases\\bm25_data_docs_and_metadata.json', 'w', encoding='utf-8') as file:
#     json.dump(data_to_save, file, ensure_ascii=False, indent=4)

# print("Data to saved for BM25 retriever...")

#------------------------
# Read the chunk texts and metadata from the JSON dump and feed them to the BM25 retriever -- heavy RAM load.
# The file is read as bytes and decoded by msgspec directly into the Data struct.
with open('D:\\Bilgi\\__Databases\\bm25_data_docs_and_metadata.json', 'rb') as file:
    data = msgspec.json.decode(file.read(), type=Data)

print("bm25_data_docs_and_metadata.json read...")

# Initialize Custom BM25 Retriever instead of default BM25Retriever.
# phrase_boost=1.0 is passed through bm25_params, i.e. the verbatim-query multiplier is neutral here.
#bm25_retriever = BM25Retriever.from_texts(data.documents, metadatas=data.metadatas, bm25_params={"k1": 0.8, "b": 0.5})
bm25_retriever = CustomBM25Retriever.from_texts(data.documents, data.metadatas, k=10, bm25_params={"k1": 0.8, "b": 0.5, "phrase_boost": 1.0})

print("bm25_retriever ready...") 

# Drop the decoded dump now that the retriever has been built from it
del data
gc.collect()
print("data is deleted from RAM...")
   

Vectorstore created...
bm25_data_docs_and_metadata.json read...
bm25_retriever ready...
data is deleted from RAM...


In [4]:
#Full RAG app with hybrid search 2/2 - applying doc search with question

# Question asked to the RAG pipeline in this cell
question = "What is stock in corporate finance?"   


# Chat model served by Ollama; it is used for query rewriting and for the final answer
model = "mistral-nemo"  #"llama3.2:1b"
llm = ChatOllama(model=model, temperature=0.1, repeat_penalty=1.1, streaming=True)  #, callbacks=[StreamingStdOutCallbackHandler()]


# Prompt that asks the LLM to rewrite the question into three alternative queries (used below)
QUERY_PROMPT = PromptTemplate(
input_variables=["question"],
template=
"""You are an AI language model assistant. Your task is taking a natural language query from a user and converting it into 
three queries for a vectorstore with the same language of the question. By generating multiple perspectives on the user question, 
your goal is to help the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines.
Original question: {question}""",
)

# Alternative single-query prompt; defined but not passed to the retriever below
DEFAULT_PROMPT = PromptTemplate(
input_variables=["question"],
template=
"""You are an assistant tasked with taking a natural language
query from a user and converting it into a query for a vectorstore with the same language of the question.
In this process, you strip out information that is not relevant for
the retrieval task. Here is the user query: {question}""",
)


#Alternative parameters for search_type="similarity" or "mmr"
#search_as = {"score_threshold": 0.6, "k": 10}  #For search_type="similarity"
# NOTE(review): "score_threshold" normally belongs to the "similarity_score_threshold"
# search type — confirm whether it has any effect with search_type="mmr".
search_as = { "k": 5, "lambda_mult": 0.8,  "score_threshold": 0.6, "fetch_k": 20} #'filter': {'paper_title':'GPT-4 Technical Report'}
# Multi-query retriever: the LLM rewrites the question with QUERY_PROMPT and each rewrite
# is run against the vector store's MMR retriever
retriever_contextual = MultiQueryRetriever.from_llm(
    #vector_store.as_retriever(search_type="similarity_score_threshold",search_kwargs=search_as), 
    vector_store.as_retriever(search_type="mmr",search_kwargs=search_as), 
    llm=llm, 
    prompt=QUERY_PROMPT   #DEFAULT_PROMPT,
)

# Log the queries generated by the multi-query retriever at INFO level
logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

#Hybrid retrieval: This is for extracting relevant text chunks and metadata according to the question. They will be sent to LLM for creating answer
# initialize the ensemble retriever (keyword-based BM25 + multi-query vector retrieval)
# NOTE(review): "rank_fusion" does not look like a documented EnsembleRetriever field —
# confirm it is accepted and actually used rather than silently ignored.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, retriever_contextual], rank_fusion="reciprocal"
)


print("ensemble_retriever created")

# RAG prompt: It contains retrieved text chunks and metadata, also chunks+metadata together
template = """Answer the question using only most relevant context-with-metadata among the provided ones with the same language of the question.
Generate concise and non-repetitive paragraphs. If there is no relevant information from the context or metadata, say only 'No info'.
Context:
{context}
Metadata:
{metadata}
Context-with-metadata:
{context_with_metadata}
Question: 
{question}
"""

prompt = ChatPromptTemplate.from_template(template)

# Create the RunnableSequence: fill the prompt, call the LLM, parse the reply to a string
chain = prompt | llm | StrOutputParser()

print("RAG chain ready...")

# Instantiate the custom Runnable with your retriever
retrieve_and_format = RetrieveAndFormat(ensemble_retriever)


# Retrieve context and metadata for the question; the result dict provides the prompt variables
retrieval_output = retrieve_and_format.invoke({"question": question})



# Generate the answer using the complete chain. Answer is streamed.
res = chain.stream(retrieval_output)


print("\n")
print("----------------------")
print("----------------------")

#Print answer to the question by LLM while collecting the streamed pieces
chunks = []
print("Answer: ")
for chunk in res:
    chunks.append(chunk)
    print(chunk, end="", flush=True)


#Get the answer of the LLM, then compare it with the retrieved chunks to estimate which chunks and docs the LLM used
res_string = ''.join(chunks)
llm_output = res_string  #LLM answer


def _split_context_entries(context_with_metadata, metadata):
    """Split the formatted context back into one "<chunk text>\\n<source>" entry per chunk.

    Entries are joined with a blank line, but a chunk's own text may contain blank
    lines too, so a plain split("\\n\\n") would cut chunks into fragments whose last
    line is ordinary text rather than a source. Fragments are therefore merged until
    one ends with a source listed in ``metadata`` ("Source-N: <source>" lines).
    """
    known_sources = {
        line.split(": ", 1)[1]
        for line in metadata.split("\n\n")
        if line.startswith("Source-") and ": " in line
    }
    entries, pending = [], []
    for fragment in context_with_metadata.split("\n\n"):
        pending.append(fragment)
        if fragment.split("\n")[-1] in known_sources:
            entries.append("\n\n".join(pending))
            pending = []
    if pending:
        # No closing source line found (e.g. the "No info" placeholder): keep the text as one entry
        entries.append("\n\n".join(pending))
    return entries


#These are doc chunks with metadata (aka. book paths), one entry per retrieved chunk
documents = _split_context_entries(retrieval_output["context_with_metadata"], retrieval_output["metadata"])


# # Keyword Matching
# matched_docs_keywords = keyword_matching(llm_output, documents)
# print("Documents matched by keyword:", len(matched_docs_keywords))

# Similarity Assessment
matched_docs_similarity = similarity_assessment(llm_output, documents, threshold=0.2) #Find similarities between LLM answer and retrieved text chunks


# Track sources already listed, so each document is printed once
seen_sources = set()

# Numbered list of unique sources
unique_metadata = []

# If multiple chunks come from the same document, that document is listed only once as a reference
counter=1
for doc in matched_docs_similarity:
    # The source path is the last line of every entry
    source = doc.split("\n")[-1]
    if source not in seen_sources:
        seen_sources.add(source)
        unique_metadata.append(f"Source-{str(counter)}: {source}")
        counter += 1

unique_metadata = "\n\n".join(unique_metadata)
print(f"\n\nRelevant documents LLM model used:\n\n{unique_metadata}") 


print("\n")
print("----------------------")
print("----------------------")

#Print all relevant documents according to the question -- for testing purpose
ref_docs =  retrieval_output['metadata']
print(f"All relevant documents:\n\n{ref_docs}")    


print("\n")
print("----------------------")
print("----------------------")

#Print all relevant doc chunks with metadata (aka. book paths) -- for testing purpose
print(retrieval_output["context_with_metadata"])

ensemble_retriever created
RAG chain ready...


INFO:langchain.retrievers.multi_query:Generated queries: ['"What does \'stock\' mean in corporate finance?"', '"How is \'stock\' defined in corporate finance?"', '"What\'s the definition of \'stock\' in corporate finance?"']




----------------------
----------------------
Answer: 
In corporate finance, stocks refer to equity securities that represent ownership in a corporation. Here are some key aspects of stocks:

1. **Ownership**: Stockholders (also known as shareholders) own a fraction of the company proportional to their holdings. They have voting rights and can participate in major corporate decisions.

2. **Dividends**: Corporations may distribute a portion of their profits as dividends to stockholders, typically on a per-share basis. However, dividends are not guaranteed and depend on the company's earnings and board of directors' decision.

3. **Capital Appreciation**: Stock prices can fluctuate based on the company's performance, market conditions, and other factors. When a stock is bought at a lower price and sold at a higher price, the owner profits from capital appreciation.

4. **Risk/Reward Trade-off**: Stocks are generally considered riskier than bonds or other debt securities because they d