In [None]:
"""
Scope: Document assistant for Langchain website documentation

Documentation sources:

https://api.python.langchain.com/en/v0.1/langchain_api_reference.html

https://python.langchain.com/docs/introduction/

https://langchain-ai.github.io/langgraph/

To mirror a site together with every page it links to, run the following command on the command line once per source (the example below targets the LangGraph docs):
wget -r -A.html -P langgraph-docs https://langchain-ai.github.io/langgraph/

"""

In [1]:
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import ChatOllama
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors.chain_filter import LLMChainFilter
from langchain import hub
from langchain.schema import Document
from langchain.retrievers import EnsembleRetriever
from langchain_core.retrievers import BaseRetriever
from langchain_community.retrievers import BM25Retriever
from rank_bm25 import BM25Okapi
from typing import List, Dict, Optional, Any
from langchain.chains.combine_documents.stuff import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from chromadb.config import Settings
from langchain_community.document_transformers import LongContextReorder
from langchain_core.runnables import Runnable
from langchain_chroma import Chroma
from pydantic import Field
import msgspec
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from nltk.stem import PorterStemmer

# Porter stemmer instance. NOTE(review): not referenced by any cell visible in this
# part of the notebook -- confirm whether it is still needed.
stemmer = PorterStemmer()

# Fetch the NLTK resources used below; presumably 'punkt' backs word_tokenize and
# 'stopwords' backs stopwords.words -- TODO confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

# Stop-word sets. stop_words_en is used by CustomBM25Retriever to filter query tokens;
# stop_words_tr is not referenced by any cell visible in this part of the notebook.
stop_words_en = set(stopwords.words('english'))
stop_words_tr = set(stopwords.words('turkish'))

import warnings
from langsmith.utils import LangSmithMissingAPIKeyWarning
warnings.filterwarnings("ignore", category=LangSmithMissingAPIKeyWarning)

class Data(msgspec.Struct):
    """Schema used when decoding the BM25 corpus JSON file with msgspec."""
    # Chunk texts; passed as `texts` to CustomBM25Retriever.from_texts.
    documents: list[str]
    # Metadata dicts; passed as `metadatas` and zipped with `documents`,
    # so presumably one entry per chunk in the same order -- verify against the writer.
    metadata: list[dict]

#--------------------------------------------------------------------------------------------------
# Custom BM25 retriever: when the whole query appears verbatim inside a text chunk (or its metadata), that chunk's BM25 score is boosted before ranking

class CustomBM25Retriever(BaseRetriever):
    """BM25 keyword retriever that indexes content plus metadata and boosts phrase hits.

    Every document is indexed as ``"<page_content> <key: value ...>"``, lowercased and
    split on whitespace. At query time the BM25 score of each document is multiplied
    by ``phrase_boost`` when the cleaned query occurs verbatim in that same text.

    Attributes:
        k1: BM25 term-frequency saturation parameter passed to ``BM25Okapi``.
        b: BM25 length-normalisation parameter passed to ``BM25Okapi``.
        phrase_boost: Multiplier applied to documents containing the whole query.
        k: Number of top-ranked documents to return.
        documents: The indexed documents.
        tokenized_docs: Token lists parallel to ``documents``.
        bm25: The fitted ``BM25Okapi`` model, or ``None`` for an empty corpus.
    """
    k1: float = Field(default=1.2)
    b: float = Field(default=0.75)
    phrase_boost: float = Field(default=1.5)
    k: int = Field(default=10)  # Number of top documents to retrieve
    documents: List[Document] = Field(default_factory=list)
    tokenized_docs: List[List[str]] = Field(default_factory=list)
    bm25: Optional[BM25Okapi] = None

    def __init__(self, documents: List[Document], k: int = 10, k1: float = 1.2, b: float = 0.75, phrase_boost: float = 1.5):
        """Tokenize ``documents`` and fit the BM25 model over them.

        Args:
            documents: Documents to index.
            k: Number of documents returned per query.
            k1: BM25 ``k1`` parameter.
            b: BM25 ``b`` parameter.
            phrase_boost: Score multiplier for documents containing the full query.
        """
        super().__init__()
        self.documents = documents
        self.k = k
        self.k1 = k1
        self.b = b
        self.phrase_boost = phrase_boost
        self.tokenized_docs = [self.tokenize(doc) for doc in self.documents]
        # BM25Okapi averages document length over the corpus size, so an empty corpus
        # would fail with a division by zero; leave the model unset in that case.
        self.bm25 = (
            BM25Okapi(self.tokenized_docs, k1=self.k1, b=self.b)
            if self.tokenized_docs
            else None
        )

    @classmethod
    def from_texts(cls, texts: List[str], metadatas: Optional[List[Dict]] = None, k: int = 10, bm25_params: Optional[Dict] = None):
        """Build a retriever from raw texts and optional per-text metadata.

        Args:
            texts: Chunk texts; each is lowercased before being stored.
            metadatas: Optional metadata dicts zipped with ``texts``.
            k: Number of documents returned per query.
            bm25_params: Keyword arguments forwarded to the constructor
                (``k1``, ``b``, ``phrase_boost``). When omitted, ``phrase_boost``
                defaults to 1.0 here, i.e. no phrase boosting.

        Returns:
            A ready-to-query ``CustomBM25Retriever``.
        """
        bm25_params = bm25_params or {"k1": 1.2, "b": 0.75, "phrase_boost": 1.0}
        documents = [Document(page_content=text.lower(), metadata=meta or {}) for text, meta in zip(texts, metadatas or [{}] * len(texts))]
        return cls(documents=documents, k=k, **bm25_params)

    def _full_text(self, doc: Document) -> str:
        """Return the searchable text of ``doc``: its content followed by its metadata."""
        metadata_str = " ".join(f"{key}: {value}" for key, value in doc.metadata.items())
        return f"{doc.page_content} {metadata_str}"

    def tokenize(self, doc: Document):
        """Tokenize content and metadata into lowercase, whitespace-separated tokens.

        Lowercasing keeps the index consistent with queries, which are always
        lowercased before scoring; without it, capitalised tokens could never match.
        """
        return self._full_text(doc).lower().split()

    def _get_relevant_documents(self, query: str, *, run_manager=None) -> List[Document]:
        """Rank documents by BM25 score with a phrase boost and return the top ``k``.

        Args:
            query: Free-text user query.
            run_manager: Callback manager supplied by LangChain (unused here).

        Returns:
            Up to ``k`` documents, best first. Empty when the corpus is empty or the
            query has no tokens left after stop-word removal.

        Raises:
            ValueError: If documents exist but the BM25 model was never built.
        """
        if not self.documents:
            return []
        if self.bm25 is None:
            raise ValueError("BM25 model not initialized. Call __init__ first.")

        query = query.lower()
        # The hyphen must be escaped: an unescaped "*-+" inside a character class is the
        # range '*'..'+', which silently dropped '-' from queries such as "multi-query".
        query = re.sub(r'[^\w\s.,%@()*\-+/!&_?#|]', '', query)

        # NOTE(review): queries are tokenized with word_tokenize while documents are split
        # on whitespace, so punctuation attached to document words can prevent matches.
        tokens = word_tokenize(query)
        filtered_query_tokens = [word for word in tokens if word not in stop_words_en]
        if not filtered_query_tokens:
            # Nothing to score: every document would tie at 0 and the "top k" would be
            # just the first k documents of the corpus.
            return []

        scores = self.bm25.get_scores(filtered_query_tokens)

        # Boost score if the whole (cleaned) query appears as a phrase
        phrase = query.strip()
        boosted_scores = []
        for doc, score in zip(self.documents, scores):
            phrase_bonus = self.phrase_boost if phrase in self._full_text(doc).lower() else 1.0
            boosted_scores.append(score * phrase_bonus)

        # Rank documents by boosted BM25 score
        ranked_docs = sorted(zip(self.documents, boosted_scores), key=lambda pair: pair[1], reverse=True)
        return [doc for doc, _ in ranked_docs[:self.k]]  # Return only top_k documents


#-----------------------------------------------------------
# Function to process retrieved documents
class RetrieveAndReorder(Runnable):
    """Runnable that retrieves documents for a question and reorders them.

    The retrieved documents are passed through ``LongContextReorder`` before being
    returned under the ``"context"`` key, next to the original ``"question"``.
    """

    def __init__(self, retriever):
        """Store the retriever to query and create the reordering transformer."""
        self.reordering = LongContextReorder()
        self.retriever = retriever

    def invoke(self, input: Dict[str, Any], config=None, **kwargs) -> Dict[str, Any]:
        """Retrieve for ``input['question']`` and return reordered context.

        Returns a dict with ``"context"`` (reordered documents, or an empty list
        when nothing was retrieved) and ``"question"`` (the original question).
        """
        question = input['question']
        docs: List[Document] = self.retriever.invoke(question)
        # Only run the reordering step when there is something to reorder.
        context = self.reordering.transform_documents(docs) if docs else []
        return {"context": context, "question": question}
    

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Initialize the Chroma client (PersistentClient for persistent storage)
from langchain import hub
from langchain.chains.combine_documents.stuff import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain


import warnings
from langsmith.utils import LangSmithMissingAPIKeyWarning
warnings.filterwarnings("ignore", category=LangSmithMissingAPIKeyWarning)

#----------------------------------------------------------
"""
Create the semantic retriever from the Chroma vectorstore
"""

# Embedding function for the vector store: the "bge-m3" model via OllamaEmbeddings.
embeddings = OllamaEmbeddings(model="bge-m3")

# Open the Chroma collection persisted at the directory below.
# NOTE(review): the path is an absolute, machine-specific Windows path.
vector_store = Chroma(
    collection_name = "my-doc-assistant-db",
    embedding_function=embeddings,
    persist_directory="D:\\Langchain-Langgraph-Doc-WebSites\\__Databases\\langchain-docs-vectordb"
    )

# Search settings handed to the retriever as search_kwargs for search_type="mmr".
# NOTE(review): "score_threshold" is presumably only honoured by
# search_type="similarity_score_threshold" and has no effect with "mmr" -- confirm
# against the LangChain VectorStoreRetriever documentation.
search_as = { "k": 5, "lambda_mult": 0.8,  "score_threshold": 0.1, "fetch_k": 10} 
semantic_retriever = vector_store.as_retriever(search_type="mmr",search_kwargs=search_as)

print("Semantic retriever from ChromaDB ready...")

#----------------------------------------------------------
"""
Create the BM25 Retriever
"""

# Load the pre-chunked texts and their metadata from disk for the BM25 retriever -- heavy RAM load
folder = "D:\\Langchain-Langgraph-Doc-WebSites\\__Databases\\langchain-docs-bm25-db\\"
with open(folder + 'bm25_data_docs_and_metadata.json', 'rb') as file:
    # Decode inside the `with` so the raw bytes are not kept alive afterwards.
    data = msgspec.json.decode(file.read(), type=Data)

print("bm25_data_docs_and_metadata.json read...")

# Build the custom phrase-boosting BM25 retriever (used in place of the stock BM25Retriever)
bm25_settings = {"k1": 1.0, "b": 0.5, "phrase_boost": 1.5}
bm25_retriever = CustomBM25Retriever.from_texts(
    texts=data.documents,
    metadatas=data.metadata,
    k=10,
    bm25_params=bm25_settings,
)

print("BM25 retriever ready...")
print(f"length of data bm25: {len(data.documents)}")

#-----------------------------------------------------------
"""
Create the ensemble retriever with BM25 retriever and semantic retriever. 
Then, reorder the input documents against position bias for llm.
"""
# Hybrid retriever: keyword (BM25) and semantic (Chroma) results, equally weighted
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, semantic_retriever], weights=[0.5, 0.5])

# Wrap the ensemble so its results are reordered before being handed on
retrieve_and_reorder = RetrieveAndReorder(retriever=ensemble_retriever)

print("Ensemble retriever and RetrieveAndReorder ready...")

#-----------------------------------------------------------
"""
Create the retrieval chain to produce outputs.
"""
# Pull the "langchain-ai/retrieval-qa-chat" prompt from the LangChain hub.
retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

# Chat model served by Ollama; it is used both for filtering and for answering.
llm = ChatOllama(model="cogito:14b", temperature=0.1, repeat_penalty=1.1, disable_streaming=False)

# Create the LLM-based filter
# (presumably drops retrieved documents the LLM judges irrelevant -- confirm in the LangChain docs)
llm_filter = LLMChainFilter.from_llm(llm)

# Create the contextual compression retriever
# NOTE(review): the base retriever here is semantic_retriever only; the ensemble_retriever
# and retrieve_and_reorder objects built earlier in this cell are not wired into the chain.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=llm_filter,
    base_retriever=semantic_retriever #ensemble_retriever #bm25_retriever #semantic_retriever
)


# Chain that stuffs the retrieved documents into the prompt and calls the LLM.
combine_docs_chain = create_stuff_documents_chain(
    llm=llm, 
    prompt=retrieval_qa_chat_prompt
)


# Final chain: retrieve via compression_retriever, then answer via combine_docs_chain.
# The query cells call it with {"input": ...} and read the "answer" and "context" keys.
retrieval_chain = create_retrieval_chain(
    retriever = compression_retriever, #bm25_retriever, #compression_retriever #semantic_retriever
    combine_docs_chain = combine_docs_chain,
)



print("Processing chain ready...")
print("Start querying...")

Semantic retriever from ChromaDB ready...
bm25_data_docs_and_metadata.json read...
BM25 retriever ready...
length of data bm25: 430370
Ensemble retriever and RetrieveAndReorder ready...
Processing chain ready...
Start querying...


In [None]:
"""
Query to search in a RAG with streaming.
"""

import warnings
from langsmith.utils import LangSmithMissingAPIKeyWarning
warnings.filterwarnings("ignore", category=LangSmithMissingAPIKeyWarning)


query = "what is the definition of langchain?"

# Collect each distinct source path once, in first-seen order, while the answer streams.
sources = []
seen_sources = set()
print("Response: ")
for chunk in retrieval_chain.stream({"input": query}):
    # Answer text arrives incrementally under the 'answer' key.
    if 'answer' in chunk:
        print(chunk['answer'], end="", flush=True)
    # Retrieved documents (and their metadata) arrive under the 'context' key.
    if 'context' in chunk:
        for doc in chunk['context']:
            source = doc.metadata["source"]
            # Do not add duplicated sources
            if source not in seen_sources:
                seen_sources.add(source)
                sources.append(source)

print("\n\nSources: ")
for i, source in enumerate(sources):
    print(f"{i} - {source}", flush=True)

# TODO (!!!):
# - Think and read about a fast and effective RAG pipeline.
# - Think about finding a phrase or keyword in a document, and show the matching documents.
# (Kept as comments: a trailing bare string literal is the cell's last expression and
#  Jupyter echoes it as cell output.)

Response: 
LangChain is an open source orchestration framework for building applications using large language models (LLMs) like chatbots and virtual agents. It was launched by Harrison Chase in October 2022 and has gained popularity as the fastest-growing open source project on Github in June 2023.

Sources: 
0 - D:\Langchain-Langgraph-Doc-WebSites\langchain-langgraph-docs\langchain-docs\python.langchain.com\v0.1\docs\modules\agents\agent_types\react\index.html
1 - D:\Langchain-Langgraph-Doc-WebSites\langchain-langgraph-docs\langchain-docs\python.langchain.com\v0.1\docs\modules\agents\agent_types\json_agent\index.html


'\n!!!\n\nThink and read about a fast and effective rag pipeline.\n'

In [9]:
"""
Query to search in a RAG without streaming.
"""

import warnings
from langsmith.utils import LangSmithMissingAPIKeyWarning
warnings.filterwarnings("ignore", category=LangSmithMissingAPIKeyWarning)


query = "what is the definition of langchain?"

# Run the whole chain in one call; the result holds both the answer and its context.
response = retrieval_chain.invoke(input={"input": query})

print("Response: ")
# Emit the answer piece by piece with no separator and no trailing newline.
print(*response["answer"], sep="", end="", flush=True)

print("\nSources: ")
all_sources = [doc.metadata["source"] for doc in response["context"]]

# dict.fromkeys keeps the first occurrence of each source and preserves order.
sources = list(dict.fromkeys(all_sources))
seen_sources = set(sources)

for i, chunk in enumerate(sources):
    print(f"{i} - {chunk}\n", end="", flush=True)

Response: 
Based on the provided context, LangChain can be defined as:

An open source orchestration framework for building applications using large language models (LLMs). It provides a library of abstractions in Python and JavaScript that help developers create various AI-powered applications like chatbots and virtual agents. Launched by Harrison Chase in October 2022, it gained significant popularity as the fastest-growing open source project on GitHub by June 2023.

The framework simplifies integration with external data sources and software workflows, supporting multiple LLM providers including OpenAI, Google, and IBM. It includes tools for language models to interact with the world through interfaces that can be invoked effectively using serializable inputs (strings and Python dict objects).
Sources: 
0 - D:\Langchain-Langgraph-Doc-WebSites\langchain-langgraph-docs\langchain-docs\python.langchain.com\v0.1\docs\modules\agents\agent_types\openai_tools\index.html
1 - D:\Langchain-La