In [111]:
from dotenv import load_dotenv 
load_dotenv()

True

In [112]:
from langchain.document_loaders import DirectoryLoader 
from langchain.document_loaders import PyPDFLoader 
data_folder = "../data"  

loader = DirectoryLoader(data_folder, glob="*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()


In [113]:
len(documents)

436

In [114]:
from langchain.text_splitter import RecursiveCharacterTextSplitter 
splitter=RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=100)

In [115]:
chunks=splitter.split_documents(documents)

In [116]:
len(chunks)

5608

In [117]:
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [118]:
from langchain.vectorstores import FAISS 
vectorstore=FAISS.from_documents(chunks,embeddings)


In [119]:
retriver=vectorstore.as_retriever()

In [12]:
retriver = vectorstore.as_retriever(search_kwargs={"k": 5})


In [25]:
query="what is Endoplasmic reticulum (ER)"
docc=retriver.get_relevant_documents(query,k=10)

In [26]:
docc

[Document(id='64bebd7c-963a-4417-afa8-4a9f847795d0', metadata={'producer': '3-Heights(TM) PDF Optimization Shell 4.8.25.2 (http://www.pdf-tools.com)', 'creator': 'PageMaker 7.0', 'creationdate': '2017-01-18T18:28:08+05:30', 'author': 'Admin', 'title': 'IX Bio_Title Free.pmd', 'moddate': '2019-05-28T07:37:38+00:00', 'source': '..\\data\\ix biology em.pdf', 'total_pages': 194, 'page': 17, 'page_label': '18'}, page_content='Cell  its structure and functions6\nsubstances from one part of the cell to\nanother. This network of membranes is\nknown as the endoplasmic reticulum.\nThe endoplasmic reticulum (ER) is a\nlarge network of membrane-bound tubes\nand sheets. The ER membrane is similar in\nstructure to the plasma membrane.\nEndoplasmic reticulum may have some\ngranule like structure on its surface which\nare called ribosomes, Such ER is called\nrough endoplasmic reticulum (RER).\nAreas/sections of ER that do not have'),
 Document(id='1b74ad9c-9271-4b6c-9159-7a0ae3fac413', metadata={'prod

In [120]:
from langchain_groq import ChatGroq 
import os 
GROQ_API_KEY=os.getenv('GROQ_API_KEY')

llm=ChatGroq(api_key=GROQ_API_KEY,model='llama-3.3-70b-versatile')

In [14]:
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever

retriever_dense = vectorstore.as_retriever(search_kwargs={"k": 10})  
bm25_retriever = BM25Retriever.from_documents(chunks)

hybrid_retriever = EnsembleRetriever(retrievers=[retriever_dense, bm25_retriever], weights=[0.5, 0.5])


In [121]:
from langchain.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful and knowledgeable biology tutor. Answer clearly and accurately.if the query is out of sylabus just give Out of syllabus"),
    ("human", "Context:\n{context}\n\nQuestion: {question}")
])


In [122]:
from langchain.memory import ConversationBufferWindowMemory

memory = ConversationBufferWindowMemory(
    memory_key="chat_history",
    return_messages=True,
    k=3
)


In [125]:
from typing import List
from langchain.schema import Document, BaseRetriever 
from sentence_transformers import CrossEncoder
from pydantic import BaseModel  
from langchain.chains import ConversationalRetrievalChain
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank_documents(query: str, retrieved_docs: List[Document]) -> List[Document]:
    docs_texts = [doc.page_content for doc in retrieved_docs]
    pairs = [(query, doc_text) for doc_text in docs_texts]
    scores = reranker.predict(pairs)
    sorted_docs = [doc for _, doc in sorted(zip(scores, retrieved_docs), key=lambda x: x[0], reverse=True)]
    return sorted_docs

class RerankRetriever(BaseRetriever, BaseModel): 
    base_retriever: BaseRetriever  
    top_k: int = 5  

    def get_relevant_documents(self, query: str) -> List[Document]:
        initial_docs = self.base_retriever.get_relevant_documents(query)
        reranked_docs = rerank_documents(query, initial_docs)
        return reranked_docs[:self.top_k]  

base_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})  

custom_retriever = RerankRetriever(base_retriever=base_retriever, top_k=5)

qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=custom_retriever,
    memory=memory,
    combine_docs_chain_kwargs={"prompt": prompt}
)
query="Explain first part in detail"
result = qa_chain({"question": query})
print(result["answer"])


  class RerankRetriever(BaseRetriever, BaseModel):


The forebrain, also known as the prosencephalon, is the most advanced part of the brain and is responsible for a wide range of higher-order functions. The components of the forebrain include:

1. Cerebrum: The cerebrum is the largest part of the brain and is divided into two hemispheres: the left hemisphere and the right hemisphere. The cerebrum is responsible for:
	* Controlling movement and sensation
	* Processing sensory information
	* Regulating emotions and speech
	* Interpreting sensations and responding to cold, heat, pain, and pressure
2. Diencephalon: The diencephalon is a rhomboidal-shaped lobe that lies between the cerebrum and the midbrain. It is divided into two main structures:
	* Thalamus: The thalamus acts as a relay center for sensory impulses, such as pain, temperature, and light. It also plays a role in regulating consciousness and sleep.
	* Hypothalamus: The hypothalamus is the master control center of the endocrine system and regulates a wide range of functions, in

In [126]:
result['chat_history']

[HumanMessage(content='Explain the parts of human brain and their functions', additional_kwargs={}, response_metadata={}),
 AIMessage(content='The human brain can be divided into three main parts: Forebrain, Midbrain, and Hindbrain.\n\n1. **Forebrain**: It consists of two main parts - Cerebrum and Olfactory lobes.\n   - **Cerebrum**: It is the largest part of the brain and is divided into two cerebral hemispheres. The cerebrum is responsible for:\n     * Controlling voluntary movements\n     * Processing sensory information\n     * Managing higher-level cognitive functions such as thought, emotion, and memory\n   - **Olfactory lobes**: These are responsible for processing sensory information related to smell.\n\n2. **Midbrain**: It acts as a relay center for auditory and visual information. It also plays a role in regulating body temperature and alertness.\n\n3. **Hindbrain**: It consists of three main parts - Cerebellum, Pons, and Medulla oblongata.\n   - **Cerebellum**: It is respons

In [102]:
from typing import List
from langchain.schema import Document
from langchain.vectorstores.base import VectorStoreRetriever
from sentence_transformers import CrossEncoder

# Load Cross-Encoder
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# Reranking function
def rerank_documents(query: str, retrieved_docs: List[Document]) -> List[Document]:
    docs_texts = [doc.page_content for doc in retrieved_docs]
    pairs = [(query, doc_text) for doc_text in docs_texts]
    scores = reranker.predict(pairs)
    sorted_docs = [doc for _, doc in sorted(zip(scores, retrieved_docs), key=lambda x: x[0], reverse=True)]
    return sorted_docs

# Custom Retriever class
class RerankRetriever(VectorStoreRetriever):
    def __init__(self, base_retriever: VectorStoreRetriever, top_k: int = 5):
        self.base_retriever = base_retriever
        self.top_k = top_k

    def get_relevant_documents(self, query: str) -> List[Document]:
        # First use the base retriever to get top_k docs
        initial_docs = self.base_retriever.get_relevant_documents(query)
        # Then rerank them using the Cross-Encoder
        reranked_docs = rerank_documents(query, initial_docs)
        return reranked_docs


  class RerankRetriever(VectorStoreRetriever):


In [103]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferWindowMemory

# Create the base retriever from your vector store
base_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})  # base docs before rerank

# Wrap with the reranker
custom_retriever = RerankRetriever(base_retriever=base_retriever, top_k=5)

# Plug into the QA chain
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=custom_retriever,
    memory=memory,
    combine_docs_chain_kwargs={"prompt": prompt}
)



ValueError: "RerankRetriever" object has no field "base_retriever"

In [101]:
# Query as normal
query="Explain the parts of human brain and their functions"
result = qa_chain({"question": query})
print(result["answer"])


AttributeError: 'RerankRetriever' object has no attribute 'tags'

In [50]:
from sentence_transformers import CrossEncoder
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.schema import Document
from typing import List

reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank_documents(query: str, retrieved_docs: List[Document]) -> List[Document]:
    """Re-rank retrieved documents using a Cross-Encoder."""
    docs_texts = [doc.page_content for doc in retrieved_docs]
    scores = reranker.predict([(query, doc) for doc in docs_texts])
    
    sorted_docs = [doc for _, doc in sorted(zip(scores, retrieved_docs), key=lambda x: x[0], reverse=True)]
    return sorted_docs

query = "Explain the parts of human brain and their functions"
retrieved_docs = retriver.get_relevant_documents(query,k=5)  
reranked_docs = rerank_documents(query, retrieved_docs)  

qa_chain = load_qa_chain(llm, chain_type="stuff")

final_answer = qa_chain.run(input_documents=reranked_docs, question=query)


print("\n💡 AI Answer:", final_answer)



💡 AI Answer: The human brain can be divided into several parts, each with distinct functions. Here's an overview of the main parts of the brain and their functions:

1. **Forebrain**: The forebrain is the most advanced part of the brain and is responsible for higher-level cognitive functions such as thought, emotion, and movement. It consists of two main parts:
	* **Cerebrum**: The cerebrum is the largest part of the brain and is divided into two hemispheres: the left hemisphere and the right hemisphere. The cerebrum is responsible for processing sensory information, controlling movement, and managing higher-level cognitive functions such as thought, emotion, and memory.
	* **Diencephalon**: The diencephalon is a small region that connects the cerebrum to the brainstem. It plays a role in relaying sensory information to the cerebrum and regulating body functions such as sleep, hunger, and thirst.
2. **Midbrain**: The midbrain is a small region that connects the forebrain to the hindbr