## Hybrid Retriever- Combining Dense And Sparse Retriever

In [1]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain.schema import Document



In [1]:
## loading the embedding model
from app.utils.model_loader import ModelLoader
from app.ingestion.file_loader import FileLoader 


model_loader = ModelLoader(model_provider="huggingface")
embedding_model = model_loader.load_llm()
 
## loading the document 
doc_path =r"app\uploads\policy-1-5.pdf"
file_loader = FileLoader()
documents = file_loader.load_pdf(doc_path)

## spliting the documetn into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter  

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents(documents)
print(f"Number of chunks: {len(docs)}")



Loading config....
LLM loading...
Loading model from provider: 
Loading model from huggingface:


  from .autonotebook import tqdm as notebook_tqdm


Number of chunks: 30


In [5]:
# Step 2: Dense Retriever (FAISS + HuggingFace)
# embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
dense_vectorstore = FAISS.from_documents(docs, embedding_model)
dense_retriever = dense_vectorstore.as_retriever()

In [None]:
## creating pincone dense vectore store 


In [7]:
dense_retriever.invoke("What is Post Hospitalization?")

[Document(id='8f50c129-ccdb-4386-b8c9-60411328fac7', metadata={'producer': 'iLovePDF', 'creator': '', 'creationdate': '', 'source': 'app\\uploads\\policy-1-5.pdf', 'file_path': 'app\\uploads\\policy-1-5.pdf', 'total_pages': 5, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-08-30T07:52:53+00:00', 'trapped': '', 'modDate': 'D:20250830075253Z', 'creationDate': '', 'page': 2}, page_content='c) it requires your rehabilitation or for you to be specially trained to cope with it \nd) it continues indefinitely \ne) it comes back or is likely to come back. \n \n2.27 In-patient Care means treatment for which the insured person has to stay in a hospital for more than 24 hours for a covered \nevent. \n \n2.28 Insured / Insured Person means person(s) named in the schedule of the Policy. \n \n2.29 Intensive Care Unit means an identified section, ward or wing of a hospital which is under the constant supervision of a \ndedicated medical practitioner(s),

In [3]:
### Sparse Retriever(BM25)
sparse_retriever=BM25Retriever.from_documents(docs)
sparse_retriever.k=3 ##top- k documents to retriever

## step 4 : Combine with Ensemble Retriever
hybrid_retriever=EnsembleRetriever(
    retrievers=[dense_retriever,sparse_retriever],
    weight=[0.7,0.3]
)


NameError: name 'dense_retriever' is not defined

In [12]:
hybrid_retriever

EnsembleRetriever(retrievers=[VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000021F1C5EF260>, search_kwargs={}), BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x0000021F306F3770>, k=3)], weights=[0.5, 0.5])

In [13]:
# Step 5: Query and get results
query = "What is Post Hospitalization?"
results = hybrid_retriever.invoke(query)

# Step 6: Print results
for i, doc in enumerate(results):
    print(f"\n🔹 Document {i+1}:\n{doc.page_content}")


🔹 Document 1:
ii. the in-patient hospitalisation claim for such hospitalisation is admissible by the Company 
Post hospitalisation shall be considered as part of the hospitalisation claim. 
 
3.1.4 Domiciliary Hospitalisation 
The Company shall Company shall indemnify the medical expenses incurred under domiciliary hospitalization, including Pre 
Hospitalisation expenses and Post Hospitalisation expenses, up to the limit as shown in the Table of Benefits. 
 
Exclusions 
Domiciliary hospitalisation shall not cover: 
i. 
Treatment of less than three days 
ii. Expenses incurred for alternative treatment 
iii. Expenses incurred for maternity or infertility 
iv. Expenses incurred for any of the following diseases; 
a) Asthma  
b) Bronchitis 
c) Chronic nephritis and nephritic syndrome 
d) Diarrhoea and all type of dysenteries including 
gastroenteritis 
e) Epilepsy 
f) 
Influenza, cough and cold 
g) All psychiatric or psychosomatic disorders 
h) Pyrexia of unknown origin for less than ten 

### RAG Pipeline with hybrid retriever

In [6]:
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
import os
from langchain_groq import ChatGroq
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv
load_dotenv()
# groq_client = ChatGroq(model="deepseek-r1-distill-llama-70b", temperature=0)
model_name = "gemini-2.5-flash"
llm = ChatGoogleGenerativeAI(
    model=model_name,
    google_api_key= os.getenv("GEMINI_API_KEY"),
)

In [None]:
# Step 5: Prompt Template
prompt = PromptTemplate.from_template("""
You are a legal/insurance domain expert and policy analyst. 
Use the following extracted clauses from policy documents to answer the question.  
If you can't find the answer, say "I don't know".
Context clauses:

{context}

Question: {input}
""")
### Create stuff Docuemnt Chain
document_chain=create_stuff_documents_chain(llm=llm,prompt=prompt)

## create Full rAg chain
rag_chain=create_retrieval_chain(retriever=hybrid_retriever,combine_docs_chain=document_chain)
rag_chain

# Step 9: Ask a question
query = {"input": "What is Post Hospitalization?"}
response = rag_chain.invoke(query)

# Step 10: Output
print("✅ Answer:\n", response["answer"])

print("\n📄 Source Documents:")
for i, doc in enumerate(response["context"]):
    print(f"\nDoc {i+1}: {doc.page_content}")


In [26]:
context = ""
for i, doc in enumerate(response["context"]):
    context += f"\nDoc {i+1}: {doc.page_content}\n"
context

'\nDoc 1: ii. the in-patient hospitalisation claim for such hospitalisation is admissible by the Company \nPost hospitalisation shall be considered as part of the hospitalisation claim. \n \n3.1.4 Domiciliary Hospitalisation \nThe Company shall Company shall indemnify the medical expenses incurred under domiciliary hospitalization, including Pre \nHospitalisation expenses and Post Hospitalisation expenses, up to the limit as shown in the Table of Benefits. \n \nExclusions \nDomiciliary hospitalisation shall not cover: \ni. \nTreatment of less than three days \nii. Expenses incurred for alternative treatment \niii. Expenses incurred for maternity or infertility \niv. Expenses incurred for any of the following diseases; \na) Asthma  \nb) Bronchitis \nc) Chronic nephritis and nephritic syndrome \nd) Diarrhoea and all type of dysenteries including \ngastroenteritis \ne) Epilepsy \nf) \nInfluenza, cough and cold \ng) All psychiatric or psychosomatic disorders \nh) Pyrexia of unknown origin 

## Evaluation of hybrid retriever

In [10]:
import os
os.environ["LANGSMITH_TRACING"]="true"
os.environ["LANGSMITH_ENDPOINT"]="https://api.smith.langchain.com"
os.environ["LANGSMITH_API_KEY"] = os.getenv("LANGSMITH_API_KEY")
from langsmith import Client, wrappers
from openevals.llm import create_llm_as_judge
from openevals.prompts import RAG_HELPFULNESS_PROMPT, RAG_GROUNDEDNESS_PROMPT, RAG_RETRIEVAL_RELEVANCE_PROMPT
from openai import OpenAI
client = Client()


In [12]:
# Define an LLM as a judge evaluator to evaluate correctness of the output
# Import a prebuilt evaluator prompt from openevals (https://github.com/langchain-ai/openevals) and create an evaluator.
from openevals.prompts import CORRECTNESS_PROMPT
def correctness_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    evaluator = create_llm_as_judge(
        prompt=CORRECTNESS_PROMPT,
        model= "gemini-2.5-flash",
        judge = llm,
        feedback_key="correctness",
    )
    eval_result = evaluator(
        inputs=inputs,
        outputs=outputs,
        reference_outputs=reference_outputs
    )
    return eval_result

from openevals.prompts import CORRECTNESS_PROMPT, RAG_GROUNDEDNESS_PROMPT, RAG_HELPFULNESS_PROMPT, RAG_RETRIEVAL_RELEVANCE_PROMPT

def groundedness_evaluator(inputs: dict, outputs: dict, reference_outputs: dict = None):
    groundedness = create_llm_as_judge(
        prompt=RAG_GROUNDEDNESS_PROMPT,
        model= "gemini-2.5-flash",
        judge = llm,
        feedback_key="groundedness",
    )
    # Extract context from outputs
    context = outputs.get('context', '')
    if not context:
        return {"key": "groundedness", "score": 0, "comment": "No context available for evaluation"}
    
    eval_result = groundedness(
        context={"context": context},
        outputs=outputs,
    )
    return eval_result

from openevals.prompts import CORRECTNESS_PROMPT, RAG_GROUNDEDNESS_PROMPT, RAG_HELPFULNESS_PROMPT, RAG_RETRIEVAL_RELEVANCE_PROMPT
def retrival_relevance_evaluator(inputs: dict, outputs: dict, reference_outputs: dict = None):
    evaluator = create_llm_as_judge(
        prompt=RAG_RETRIEVAL_RELEVANCE_PROMPT,
        model= "gemini-2.5-flash",
        judge = llm,
        feedback_key="retrieval_relevance",
    )
    # Extract context from outputs
    context = outputs.get('context', '')
    if not context:
        return {"key": "retrieval_relevance", "score": 0, "comment": "No context available for evaluation"}
    
    eval_result = evaluator(
        inputs=inputs,
        context={"context": context}
    )
    return eval_result

# Define an LLM as a judge evaluator to evaluate correctness of the output
# Import a prebuilt evaluator prompt from openevals (https://github.com/langchain-ai/openevals) and create an evaluator.
from openevals.prompts import CORRECTNESS_PROMPT
def helpfulness_evaluator(inputs: dict, outputs: dict):
    evaluator = create_llm_as_judge(
        prompt=RAG_HELPFULNESS_PROMPT,
        model= "gemini-2.5-flash",
        judge = llm,
        feedback_key="helpfulness",
    )
    eval_result = evaluator(
        inputs=inputs,
        outputs=outputs,
    )
    return eval_result




In [None]:
# Define the application logic you want to evaluate inside a target function. For example, this may be one LLM call that includes the new prompt you are testing, a part of your application or your end to end application
# The SDK will automatically send the inputs from the dataset to your target function
def target(inputs: dict) -> dict:
    
    # Fix: Create proper input format for rag_chain
    # The rag_chain expects a dict with 'input' key, not just the question string
    query_input = {"input": inputs['question']}
    print(f"Invoking RAG chain with input: {query_input}")
    # Get the answer
    response = rag_chain.invoke(query_input)
    print(f"RAG chain response: {response}")
    
    # Extract context from response and convert Document objects to text
    context_text = ""
    if 'context' in response and response['context']:
        for i, doc in enumerate(response['context']):
            # Convert Document object to string content
            if hasattr(doc, 'page_content'):
                context_text += f"Doc {i+1}: {doc.page_content}\n"
            else:
                context_text += f"Doc {i+1}: {str(doc)}\n"
    print(f"Extracted context text: {context_text}")
    
    return {
        "answer": response.get('answer', ''),
        "context": context_text  # Now it's a string, not Document objects
    }

In [None]:
# After running the evaluation, a link will be provided to view the results in langsmith
experiment_results = client.evaluate(
    target,
    data="ClariDoc Evaluation dataset",
    evaluators=[
        correctness_evaluator,groundedness_evaluator,helpfulness_evaluator,retrival_relevance_evaluator   
        ],
    experiment_prefix="hybrid_retrieval_without_metadata_filtering",
    max_concurrency=2,
)

## lets do with pinecone vectore store 

In [55]:
import os
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from datetime import datetime
from uuid import uuid4# Assuming embedding_model is correctly defined
# Assuming embedding_model is correctly defined
# from langchain_openai import OpenAIEmbeddings
# embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")

# Replace with your Pinecone API key
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Initialize Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)

# Specify your existing index name and namespace
index_name = "rag-project1"
namespace_name = "hackrx-index"



if not pc.has_index(index_name):
            pc.create_index(
                name = index_name,
                dimension=1024,
                metric="cosine",
                spec = ServerlessSpec(cloud="aws", region="us-east-1")
            )

index = pc.Index(index_name)
        # model_loader = ModelLoader(model_provider="openai")
        # embedding_model = model_loader.load_llm()
uuids = [str(uuid4()) for _ in range(len(docs))]

vector_store = PineconeVectorStore.from_documents(documents=docs,index_name=index_name, embedding=embedding_model, namespace = namespace_name)
# vector_store.add_documents(documents=docs, ids=uuids)
retriever = vector_store.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 5,"namespace": namespace_name}
        )



In [41]:
query = "National Insuarance company"
vector_store.similarity_search(query)

[Document(id='e45ef235-05d2-4614-bdf9-b577249cf552', metadata={'author': '', 'creationDate': '', 'creationdate': '', 'creator': '', 'file_path': 'app\\uploads\\policy-1-5.pdf', 'format': 'PDF 1.7', 'keywords': '', 'modDate': 'D:20250830075253Z', 'moddate': '2025-08-30T07:52:53+00:00', 'page': 0.0, 'producer': 'iLovePDF', 'source': 'app\\uploads\\policy-1-5.pdf', 'subject': '', 'title': '', 'total_pages': 5.0, 'trapped': ''}, page_content='National Insurance Co. Ltd. \nPremises No. 18-0374, Plot no. CBD-81,  \nNew Town, Kolkata - 700156 \nPage 1 of 25 \nNational Parivar Mediclaim Plus Policy \nUIN: NICHLIP25039V032425 \n \nNational Insurance Company Limited \n \n \n \n \n \nCIN - U10200WB1906GOI001713 \nIRDAI Regn. No. – 58 \n \n           Issuing Office \nNational Parivar Mediclaim Plus Policy  \n \nWhereas the Proposer designated in the schedule hereto has by a Proposal together with Declaration, which shall be the basis of \nthis contract and is deemed to be incorporated herein, has 

In [43]:
retriever = vector_store.as_retriever()

In [56]:
# Query the retriever
results = retriever.invoke("Post Hospitalization")
for doc in results:
    print(doc.page_content)


c) it requires your rehabilitation or for you to be specially trained to cope with it 
d) it continues indefinitely 
e) it comes back or is likely to come back. 
 
2.27 In-patient Care means treatment for which the insured person has to stay in a hospital for more than 24 hours for a covered 
event. 
 
2.28 Insured / Insured Person means person(s) named in the schedule of the Policy. 
 
2.29 Intensive Care Unit means an identified section, ward or wing of a hospital which is under the constant supervision of a 
dedicated medical practitioner(s), and which is specially equipped for the continuous monitoring and treatment of patients who 
are in a critical condition, or require life support facilities and where the level of care and supervision is considerably more 
sophisticated and intensive than in the ordinary and other wards. 
 
2.30 Injury means accidental physical bodily harm excluding disease solely and directly caused by external, violent and visible and
3.1.2 Pre Hospitalisatio

In [46]:


## step 4 : Combine with Ensemble Retriever
hybrid_retriever=EnsembleRetriever(
    retrievers=[retriever,sparse_retriever],
    weight=[0.7,0.3]
)



In [50]:
# Step 5: Prompt Template
prompt = PromptTemplate.from_template("""
You are a legal/insurance domain expert and policy analyst. 
Use the following extracted clauses from policy documents to answer the question.  
If you can't find the answer, say "I don't know".
Context clauses:

{context}

Question: {input}
""")
### Create stuff Docuemnt Chain
document_chain=create_stuff_documents_chain(llm=llm,prompt=prompt)

## create Full rAg chain
rag_chain=create_retrieval_chain(retriever=hybrid_retriever,combine_docs_chain=document_chain)
rag_chain

# Step 9: Ask a question
query = {"input": "What is Post Hospitalization?"}
response = rag_chain.invoke(query)

# Step 10: Output
print("✅ Answer:\n", response["answer"])

print("\n📄 Source Documents:")
for i, doc in enumerate(response["context"]):
    print(f"\nDoc {i+1}: {doc.page_content}")


✅ Answer:
 **Post Hospitalisation** refers to the medical expenses incurred up to sixty days immediately after the insured person is discharged from the hospital.

For these expenses to be covered:
i. They must be incurred for the same condition for which the insured person’s hospitalisation was required.
ii. The in-patient hospitalisation claim for such hospitalisation must be admissible by the Company.

Post hospitalisation is considered as part of the overall hospitalisation claim.

📄 Source Documents:

Doc 1: ii. the in-patient hospitalisation claim for such hospitalisation is admissible by the Company 
Post hospitalisation shall be considered as part of the hospitalisation claim. 
 
3.1.4 Domiciliary Hospitalisation 
The Company shall Company shall indemnify the medical expenses incurred under domiciliary hospitalization, including Pre 
Hospitalisation expenses and Post Hospitalisation expenses, up to the limit as shown in the Table of Benefits. 
 
Exclusions 
Domiciliary hospital

In [51]:
def target(inputs: dict) -> dict:
    
    # Fix: Create proper input format for rag_chain
    # The rag_chain expects a dict with 'input' key, not just the question string
    query_input = {"input": inputs['question']}
    print(f"Invoking RAG chain with input: {query_input}")
    # Get the answer
    response = rag_chain.invoke(query_input)
    print(f"RAG chain response: {response}")
    
    # Extract context from response and convert Document objects to text
    context_text = ""
    if 'context' in response and response['context']:
        for i, doc in enumerate(response['context']):
            # Convert Document object to string content
            if hasattr(doc, 'page_content'):
                context_text += f"Doc {i+1}: {doc.page_content}\n"
            else:
                context_text += f"Doc {i+1}: {str(doc)}\n"
    print(f"Extracted context text: {context_text}")
    
    return {
        "answer": response.get('answer', ''),
        "context": context_text  # Now it's a string, not Document objects
    }

In [None]:
# After running the evaluation, a link will be provided to view the results in langsmith
experiment_results = client.evaluate(
    target,
    data="ClariDoc Evaluation dataset",
    evaluators=[
        correctness_evaluator,groundedness_evaluator,helpfulness_evaluator,retrival_relevance_evaluator   
        ],
    experiment_prefix="hybrid_retrieval_without_metadata_filtering",
    max_concurrency=2,
)

## Evaluation with Metadata filtering and hybrid retirever

In [1]:
from app.services.RAG_service import RAGService

rag = RAGService()
doc_path = r"app\uploads\policy-1-5.pdf"
rag.load_and_split_document(type = "pdf", path = doc_path )
# rag.create_query_embedding("Post hospitalisation is part of hospitalisation claim.")    



  from .autonotebook import tqdm as notebook_tqdm


[RAGService] Initializing service...
[RAGService] Loading LLM model (gemini)...
Loading config....
LLM loading...
Loading model from provider: 
Loading model from gemini:
[RAGService] LLM model loaded.
[RAGService] Loading embedding model (huggingface)...
Loading models (one-time initialization)...
Loading config....
LLM loading...
Loading model from provider: 
Loading model from huggingface:
[RAGService] Embedding model loaded.
[RAGService] Initialization complete.
[RAGService] Loading document. Type: pdf, Path: app\uploads\policy-1-5.pdf, URL: None
[RAGService] Loading PDF from path: app\uploads\policy-1-5.pdf
[RAGService] Detecting document type scheme...
[RAGService] Document type scheme detected: document_types='Insurance'
[RAGService] Document type model: <class 'app.schemas.metadata_schema.InsuranceMetadata'>
[RAGService] Splitting document into chunks...
Processing first page, setting up metadata extraction...
doc number: 0
<class 'str'>
processing keywords update for page 1
Co

In [2]:
rag.create_vector_store()

[RAGService] Creating vector store...
[RAGService] Vector store created. Index: <pinecone.db_data.index.Index object at 0x00000201DC713860>, Namespace: rag-project2025-09-15-17-35


In [5]:
rag.result

[Document(id='434d04e7-a75b-48e2-92f5-26fdf7525f49', metadata={'added_new_keyword': True, 'author': '', 'chunk_id': 'e04c02ed-f795-40f1-a5fa-dfde98f13868_p4', 'coverage_type': ['In-patient treatment', 'Pre-hospitalisation', 'Post-hospitalisation', 'Domiciliary hospitalisation', 'Room & ICU charges', 'Medical practitioner fees', 'Anaesthesia, blood, oxygen', 'Surgical appliances', 'Medicines & drugs', 'Diagnostic procedures', 'Internal prosthetics, devices', 'Dental treatment (injury)', 'Plastic surgery (disease/injury)', 'Hormone replacement therapy', 'Vitamins & tonics (treatment)', 'Circumcision (treatment)', 'Cataract surgery', 'Hazardous sports treatment'], 'creationDate': '', 'creationdate': '', 'creator': '', 'doc_category': ['Insurance'], 'doc_id': 'e04c02ed-f795-40f1-a5fa-dfde98f13868', 'doc_type': ['Policy'], 'exclusions': ['Treatment less than three days', 'Alternative treatment expenses', 'Maternity expenses', 'Infertility expenses', 'Asthma', 'Bronchitis', 'Chronic nephriti

In [14]:
rag.retrive_documents("What is Post Hospitalization?")

[RAGService] Retrieving documents from vector store...
[RAGService] Creating query embedding...
[RAGService] Query embedding created: [-0.1565595418214798, 0.346627801656723, -0.8134770393371582, 0.4836694598197937, -1.377659797668457, -1.1484183073043823, 0.023506447672843933, 0.2133348286151886, 0.920676589012146, -0.22855184972286224, 0.22771185636520386, 0.15379832684993744, -0.06810766458511353, -0.686731219291687, -0.3311452269554138, 0.001679081004112959, -1.114246129989624, -1.0797597169876099, -0.8531802296638489, 0.5780619382858276, -0.2819533944129944, 0.7541430592536926, -0.30368295311927795, 0.01811445876955986, -0.8184762597084045, 0.3397812843322754, -0.0691937729716301, -0.05857159197330475, 0.9108926653862, 0.629870593547821, -0.08993957936763763, 0.5095597505569458, 0.23646695911884308, 0.34162047505378723, -0.24565593898296356, -0.3388043940067291, 0.6904820799827576, -1.3323283195495605, 0.30785393714904785, 0.3070054352283478, 0.22371603548526764, 0.741714298725128

In [18]:
rag.result[0].page_content

'Pre hospitalisation shall be considered as part of the Hospitalisation claim. \n \n3.1.3 Post Hospitalisation \nThe Company shall indemnify the medical expenses incurred up to sixty days immediately after the insured person is discharged \nfrom hospital, provided that: \ni. \nsuch medical expenses are incurred for the same condition for which the insured person’s hospitalisation was required, and \nii. the in-patient hospitalisation claim for such hospitalisation is admissible by the Company \nPost hospitalisation shall be considered as part of the hospitalisation claim. \n \n3.1.4 Domiciliary Hospitalisation \nThe Company shall Company shall indemnify the medical expenses incurred under domiciliary hospitalization, including Pre'

In [14]:
type(rag.retriever)

app.retrieval.retriever.Retriever

In [4]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain.schema import Document

### Sparse Retriever(BM25)
sparse_retriever=BM25Retriever.from_documents(rag.chunks)
sparse_retriever.k=3 ##top- k documents to retriever

# Fix: Use the actual Langchain retriever from rag.retriever.retriever
# The rag.retriever is a custom class, but rag.retriever.retriever is the actual Langchain retriever
hybrid_retriever = EnsembleRetriever(
    retrievers=[rag.retriever.retriever, sparse_retriever],  # Use .retriever attribute
    weights=[0.7, 0.3]  # Fix: 'weights' not 'weight'
)

In [16]:
hybrid_retriever.invoke("What is Post Hospitalization?")

[Document(id='434d04e7-a75b-48e2-92f5-26fdf7525f49', metadata={'added_new_keyword': True, 'author': '', 'chunk_id': 'e04c02ed-f795-40f1-a5fa-dfde98f13868_p4', 'coverage_type': ['In-patient treatment', 'Pre-hospitalisation', 'Post-hospitalisation', 'Domiciliary hospitalisation', 'Room & ICU charges', 'Medical practitioner fees', 'Anaesthesia, blood, oxygen', 'Surgical appliances', 'Medicines & drugs', 'Diagnostic procedures', 'Internal prosthetics, devices', 'Dental treatment (injury)', 'Plastic surgery (disease/injury)', 'Hormone replacement therapy', 'Vitamins & tonics (treatment)', 'Circumcision (treatment)', 'Cataract surgery', 'Hazardous sports treatment'], 'creationDate': '', 'creationdate': '', 'creator': '', 'doc_category': ['Insurance'], 'doc_id': 'e04c02ed-f795-40f1-a5fa-dfde98f13868', 'doc_type': ['Policy'], 'exclusions': ['Treatment less than three days', 'Alternative treatment expenses', 'Maternity expenses', 'Infertility expenses', 'Asthma', 'Bronchitis', 'Chronic nephriti

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
import os

# Step 5: Prompt Template
prompt = PromptTemplate.from_template("""
You are a legal/insurance domain expert and policy analyst. 
Use the following extracted clauses from policy documents to answer the question.  
If you can't find the answer, say "I don't know".
Context clauses:

{context}

Question: {input}
""")
### Create stuff Docuemnt Chain
document_chain=create_stuff_documents_chain(llm=llm,prompt=prompt)

## create Full rAg chain
rag_chain=create_retrieval_chain(retriever=hybrid_retriever,combine_docs_chain=document_chain)
rag_chain

# Step 9: Ask a question
query = {"input": "What is Post Hospitalization?"}
response = rag_chain.invoke(query)

# Step 10: Output
print("✅ Answer:\n", response["answer"])

print("\n📄 Source Documents:")
for i, doc in enumerate(response["context"]):
    print(f"\nDoc {i+1}: {doc.page_content}")


In [8]:
def target(inputs: dict) -> dict:
    
    # Fix: Create proper input format for rag_chain
    # The rag_chain expects a dict with 'input' key, not just the question string
    query_input = {"input": inputs['question']}
    print(f"Invoking RAG chain with input: {query_input}")
    # Get the answer
    response = rag_chain.invoke(query_input)
    print(f"RAG chain response: {response}")
    
    # Extract context from response and convert Document objects to text
    context_text = ""
    if 'context' in response and response['context']:
        for i, doc in enumerate(response['context']):
            # Convert Document object to string content
            if hasattr(doc, 'page_content'):
                context_text += f"Doc {i+1}: {doc.page_content}\n"
            else:
                context_text += f"Doc {i+1}: {str(doc)}\n"
    print(f"Extracted context text: {context_text}")
    
    return {
        "answer": response.get('answer', ''),
        "context": context_text  # Now it's a string, not Document objects
    }

In [13]:
# After running the evaluation, a link will be provided to view the results in langsmith
experiment_results = client.evaluate(
    target,
    data="ClariDoc Evaluation dataset",
    evaluators=[
        correctness_evaluator,groundedness_evaluator,helpfulness_evaluator,retrival_relevance_evaluator
        ],
    experiment_prefix="hybrid_retrieval_with_metadata_filtering",
    max_concurrency=2,
)

View the evaluation results for experiment: 'hybrid_retrieval_with_metadata_filtering-097a429a' at:
https://smith.langchain.com/o/4160afa5-3425-51be-b938-333f7d240b5b/datasets/03968ce6-4471-4d75-8b05-5338d32e32e2/compare?selectedSessions=83b57dd9-98f3-47ca-b247-ece92b98bb57




0it [00:00, ?it/s]

Invoking RAG chain with input: {'input': "What criteria must a standalone healthcare facility meet to be considered an 'AYUSH Hospital' under this policy?"}
Invoking RAG chain with input: {'input': 'If a patient needs domiciliary hospitalization for epilepsy, would the expenses be covered under this policy?'}


Key 'additionalProperties' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring


RAG chain response: {'input': "What criteria must a standalone healthcare facility meet to be considered an 'AYUSH Hospital' under this policy?", 'context': [Document(id='4022fa1e-c92e-4fe9-98f5-0c01a0a2a6fa', metadata={'added_new_keyword': True, 'author': '', 'chunk_id': 'a2bd86c6-3c32-4810-a475-2c44af8b854e_p4', 'coverage_type': ['In-patient treatment', 'Pre-hospitalisation', 'Post-hospitalisation', 'Domiciliary hospitalisation'], 'creationDate': '', 'creationdate': '', 'creator': '', 'doc_category': ['Insurance'], 'doc_id': 'a2bd86c6-3c32-4810-a475-2c44af8b854e', 'doc_type': ['Policy document'], 'exclusions': ['Domiciliary treatment under three days', 'Alternative treatment expenses', 'Maternity/infertility expenses', 'Asthma', 'Bronchitis', 'Chronic nephritis', 'Nephritic syndrome', 'Diarrhea', 'Dysenteries', 'Gastroenteritis', 'Epilepsy', 'Influenza, cough, cold', 'Psychiatric disorders', 'Psychosomatic disorders', 'Pyrexia of unknown origin', 'Tonsillitis', 'Upper respiratory tra

Key 'additionalProperties' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring


RAG chain response: {'input': 'If a patient needs domiciliary hospitalization for epilepsy, would the expenses be covered under this policy?', 'context': [Document(id='cf22ac2a-fca5-4242-b0bc-18c581d28513', metadata={'added_new_keyword': True, 'author': '', 'chunk_id': 'a2bd86c6-3c32-4810-a475-2c44af8b854e_p4', 'coverage_type': ['In-patient treatment', 'Pre-hospitalisation', 'Post-hospitalisation', 'Domiciliary hospitalisation'], 'creationDate': '', 'creationdate': '', 'creator': '', 'doc_category': ['Insurance'], 'doc_id': 'a2bd86c6-3c32-4810-a475-2c44af8b854e', 'doc_type': ['Policy document'], 'exclusions': ['Domiciliary treatment under three days', 'Alternative treatment expenses', 'Maternity/infertility expenses', 'Asthma', 'Bronchitis', 'Chronic nephritis', 'Nephritic syndrome', 'Diarrhea', 'Dysenteries', 'Gastroenteritis', 'Epilepsy', 'Influenza, cough, cold', 'Psychiatric disorders', 'Psychosomatic disorders', 'Pyrexia of unknown origin', 'Tonsillitis', 'Upper respiratory tract 

Key 'additionalProperties' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring


RAG chain response: {'input': 'Does the policy cover plastic surgery to improve my appearance?', 'context': [Document(id='5f941575-ee8c-41a2-b2aa-7379eb3a21d4', metadata={'added_new_keyword': True, 'author': '', 'chunk_id': 'a2bd86c6-3c32-4810-a475-2c44af8b854e_p4', 'coverage_type': ['In-patient treatment', 'Pre-hospitalisation', 'Post-hospitalisation', 'Domiciliary hospitalisation'], 'creationDate': '', 'creationdate': '', 'creator': '', 'doc_category': ['Insurance'], 'doc_id': 'a2bd86c6-3c32-4810-a475-2c44af8b854e', 'doc_type': ['Policy document'], 'exclusions': ['Domiciliary treatment under three days', 'Alternative treatment expenses', 'Maternity/infertility expenses', 'Asthma', 'Bronchitis', 'Chronic nephritis', 'Nephritic syndrome', 'Diarrhea', 'Dysenteries', 'Gastroenteritis', 'Epilepsy', 'Influenza, cough, cold', 'Psychiatric disorders', 'Psychosomatic disorders', 'Pyrexia of unknown origin', 'Tonsillitis', 'Upper respiratory tract infection', 'Laryngitis', 'Pharyngitis', 'Arth

Key 'additionalProperties' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
Key 'additionalProperties' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring


RAG chain response: {'input': "What is the key difference between 'Migration' and 'Portability' as defined in the policy?", 'context': [Document(id='5f941575-ee8c-41a2-b2aa-7379eb3a21d4', metadata={'added_new_keyword': True, 'author': '', 'chunk_id': 'a2bd86c6-3c32-4810-a475-2c44af8b854e_p4', 'coverage_type': ['In-patient treatment', 'Pre-hospitalisation', 'Post-hospitalisation', 'Domiciliary hospitalisation'], 'creationDate': '', 'creationdate': '', 'creator': '', 'doc_category': ['Insurance'], 'doc_id': 'a2bd86c6-3c32-4810-a475-2c44af8b854e', 'doc_type': ['Policy document'], 'exclusions': ['Domiciliary treatment under three days', 'Alternative treatment expenses', 'Maternity/infertility expenses', 'Asthma', 'Bronchitis', 'Chronic nephritis', 'Nephritic syndrome', 'Diarrhea', 'Dysenteries', 'Gastroenteritis', 'Epilepsy', 'Influenza, cough, cold', 'Psychiatric disorders', 'Psychosomatic disorders', 'Pyrexia of unknown origin', 'Tonsillitis', 'Upper respiratory tract infection', 'Laryng

Key 'additionalProperties' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
Key 'additionalProperties' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring


RAG chain response: {'input': 'Are the costs of vitamins and tonics prescribed by a doctor during hospitalization covered?', 'context': [Document(id='3b63a148-0547-41f2-8a01-b5f3c704d8e1', metadata={'added_new_keyword': True, 'author': '', 'chunk_id': 'a2bd86c6-3c32-4810-a475-2c44af8b854e_p4', 'coverage_type': ['In-patient treatment', 'Pre-hospitalisation', 'Post-hospitalisation', 'Domiciliary hospitalisation'], 'creationDate': '', 'creationdate': '', 'creator': '', 'doc_category': ['Insurance'], 'doc_id': 'a2bd86c6-3c32-4810-a475-2c44af8b854e', 'doc_type': ['Policy document'], 'exclusions': ['Domiciliary treatment under three days', 'Alternative treatment expenses', 'Maternity/infertility expenses', 'Asthma', 'Bronchitis', 'Chronic nephritis', 'Nephritic syndrome', 'Diarrhea', 'Dysenteries', 'Gastroenteritis', 'Epilepsy', 'Influenza, cough, cold', 'Psychiatric disorders', 'Psychosomatic disorders', 'Pyrexia of unknown origin', 'Tonsillitis', 'Upper respiratory tract infection', 'Laryn

Key 'additionalProperties' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
Key 'additionalProperties' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
1it [00:33, 33.63s/it]Key 'additionalProperties' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring


RAG chain response: {'input': 'If I am injured while participating in a mountaineering expedition as a hobby, what is the maximum amount the policy will cover for my treatment?', 'context': [Document(id='5f941575-ee8c-41a2-b2aa-7379eb3a21d4', metadata={'added_new_keyword': True, 'author': '', 'chunk_id': 'a2bd86c6-3c32-4810-a475-2c44af8b854e_p4', 'coverage_type': ['In-patient treatment', 'Pre-hospitalisation', 'Post-hospitalisation', 'Domiciliary hospitalisation'], 'creationDate': '', 'creationdate': '', 'creator': '', 'doc_category': ['Insurance'], 'doc_id': 'a2bd86c6-3c32-4810-a475-2c44af8b854e', 'doc_type': ['Policy document'], 'exclusions': ['Domiciliary treatment under three days', 'Alternative treatment expenses', 'Maternity/infertility expenses', 'Asthma', 'Bronchitis', 'Chronic nephritis', 'Nephritic syndrome', 'Diarrhea', 'Dysenteries', 'Gastroenteritis', 'Epilepsy', 'Influenza, cough, cold', 'Psychiatric disorders', 'Psychosomatic disorders', 'Pyrexia of unknown origin', 'Ton

Key 'additionalProperties' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
2it [00:45, 20.74s/it]

RAG chain response: {'input': 'I missed my premium payment due date. Do I have any window to pay it without losing my policy benefits for pre-existing diseases?', 'context': [Document(id='203047cd-015e-45d8-8df9-720fbc176dd2', metadata={'added_new_keyword': True, 'author': '', 'chunk_id': 'a2bd86c6-3c32-4810-a475-2c44af8b854e_p4', 'coverage_type': ['In-patient treatment', 'Pre-hospitalisation', 'Post-hospitalisation', 'Domiciliary hospitalisation'], 'creationDate': '', 'creationdate': '', 'creator': '', 'doc_category': ['Insurance'], 'doc_id': 'a2bd86c6-3c32-4810-a475-2c44af8b854e', 'doc_type': ['Policy document'], 'exclusions': ['Domiciliary treatment under three days', 'Alternative treatment expenses', 'Maternity/infertility expenses', 'Asthma', 'Bronchitis', 'Chronic nephritis', 'Nephritic syndrome', 'Diarrhea', 'Dysenteries', 'Gastroenteritis', 'Epilepsy', 'Influenza, cough, cold', 'Psychiatric disorders', 'Psychosomatic disorders', 'Pyrexia of unknown origin', 'Tonsillitis', 'Uppe

Key 'additionalProperties' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
Key 'additionalProperties' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
Key 'additionalProperties' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
Key 'additionalProperties' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
Key 'additionalProperties' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
3it [01:03, 19.43s/it]Key 'additionalProperties' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
4it [01:08, 13.92s/it]Key 'additionalProperties' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
Key 'additionalProperties' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
Key 'additio