In [1]:
from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyMuPDFLoader
from dotenv import load_dotenv

In [2]:
pdf_directory_path = "../Data"

loader = DirectoryLoader(
    path=pdf_directory_path,
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader
)

documents = loader.load()
documents

[Document(metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2016-08-14T17:35:35+05:30', 'source': '../Data/Benefitsneedandimportanceofdailyexercise.pdf', 'file_path': '../Data/Benefitsneedandimportanceofdailyexercise.pdf', 'total_pages': 7, 'format': 'PDF 1.5', 'title': '', 'author': 'Khan', 'subject': '', 'keywords': '', 'moddate': '2016-08-16T12:03:17+04:00', 'trapped': '', 'modDate': "D:20160816120317+04'00'", 'creationDate': "D:20160814173535+05'30'", 'page': 0}, page_content='See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/306118434\nBeneﬁts, need and importance of daily exercise\nArticle · January 2016\nCITATIONS\n113\nREADS\n254,801\n1 author:\nMohammed Abou Elmagd\nRAK Medical and Health Sciences University\n15 PUBLICATIONS\xa0\xa0\xa0234 CITATIONS\xa0\xa0\xa0\nSEE PROFILE\nAll content following this page was uploaded by Mohammed Abou Elmagd on 16 August 2016.\nThe user

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
final_document = text_splitter.split_documents(documents)
final_document

[Document(metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2016-08-14T17:35:35+05:30', 'source': '../Data/Benefitsneedandimportanceofdailyexercise.pdf', 'file_path': '../Data/Benefitsneedandimportanceofdailyexercise.pdf', 'total_pages': 7, 'format': 'PDF 1.5', 'title': '', 'author': 'Khan', 'subject': '', 'keywords': '', 'moddate': '2016-08-16T12:03:17+04:00', 'trapped': '', 'modDate': "D:20160816120317+04'00'", 'creationDate': "D:20160814173535+05'30'", 'page': 0}, page_content='See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/306118434\nBeneﬁts, need and importance of daily exercise\nArticle · January 2016\nCITATIONS\n113\nREADS\n254,801\n1 author:\nMohammed Abou Elmagd\nRAK Medical and Health Sciences University\n15 PUBLICATIONS\xa0\xa0\xa0234 CITATIONS\xa0\xa0\xa0\nSEE PROFILE\nAll content following this page was uploaded by Mohammed Abou Elmagd on 16 August 2016.\nThe user

In [14]:
from langchain.vectorstores import Milvus
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [15]:
import os
load_dotenv()
URI = "https://in03-96fdb1711aa6d58.serverless.gcp-us-west1.cloud.zilliz.com"
api_key = os.getenv("MILVUS_API_KEY")




In [16]:
from pymilvus import MilvusClient, CollectionSchema, FieldSchema, DataType

mc = MilvusClient(uri=URI, token=api_key)
collection_name = "LangChainCollection"
index_name = "vector"
field_name = "vector"

# Step 1: Release the collection (must be done before dropping the index)
try:
    print(f"Releasing collection '{collection_name}' from memory...")
    mc.release_collection(collection_name)
    print("Collection released.")
except Exception as e:
    print("Collection may already be released.", e)

# Step 2: Drop the index
try:
    print(f"Dropping existing index '{index_name}'...")
    mc.drop_index(collection_name=collection_name, index_name=index_name)
    print("Index dropped.")
except Exception as e:
    print("Could not drop index.", e)

fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=768),
    FieldSchema(name="page_content", dtype=DataType.VARCHAR, max_length=10000, default_value=""),
    FieldSchema(name="file_path", dtype=DataType.VARCHAR, max_length=1024),
    FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=512),
    FieldSchema(name="author", dtype=DataType.VARCHAR, max_length=512),
    FieldSchema(name="creator", dtype=DataType.VARCHAR, max_length=512),
    FieldSchema(name="producer", dtype=DataType.VARCHAR, max_length=512),
    FieldSchema(name="source", dtype=DataType.VARCHAR, max_length=1024),
    FieldSchema(name="format", dtype=DataType.VARCHAR, max_length=64),
    FieldSchema(name="subject", dtype=DataType.VARCHAR, max_length=512),
    FieldSchema(name="keywords", dtype=DataType.VARCHAR, max_length=512),
    FieldSchema(name="creationdate", dtype=DataType.VARCHAR, max_length=64),
    FieldSchema(name="moddate", dtype=DataType.VARCHAR, max_length=64),
    FieldSchema(name="modDate", dtype=DataType.VARCHAR, max_length=64),
    FieldSchema(name="creationDate", dtype=DataType.VARCHAR, max_length=64),
    FieldSchema(name="trapped", dtype=DataType.VARCHAR, max_length=64),
    FieldSchema(name="page", dtype=DataType.INT64),
    FieldSchema(name="total_pages", dtype=DataType.INT64)
]

index_types = ["FLAT", "HNSW", "IVF_FLAT"]
for index_type in index_types:
    collection_name = f"demo_{index_type.lower()}"
    
    if mc.has_collection(collection_name):
        mc.drop_collection(collection_name)

    # Define schema as a list of field dicts
    # Define schema as a list of field dicts
    
    schema = CollectionSchema(fields=fields, description="Schema for PDF vector DB",enable_dynamic_field=True)
    mc.create_collection(collection_name=collection_name, schema=schema)

    index_params = mc.prepare_index_params()
    index_params.add_index(
        field_name="vector",
        index_type=index_type,
        index_name="vector_index",
        metric_type="COSINE",
        params={
            # You can customize these per index type
            "nlist": 128 if index_type == "IVF_FLAT" else None,
            "M": 16 if index_type == "HNSW" else None,
            "efConstruction": 200 if index_type == "HNSW" else None
        }
    )

    mc.create_index(collection_name=collection_name, index_params=index_params)
    print(f"Created collection `{collection_name}` with {index_type} index.")

Releasing collection 'LangChainCollection' from memory...
Collection released.
Dropping existing index 'vector'...
Index dropped.
Created collection `demo_flat` with FLAT index.
Created collection `demo_hnsw` with HNSW index.
Created collection `demo_ivf_flat` with IVF_FLAT index.


In [17]:
print(final_document[0].dict())

{'id': None, 'metadata': {'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2016-08-14T17:35:35+05:30', 'source': '../Data/Benefitsneedandimportanceofdailyexercise.pdf', 'file_path': '../Data/Benefitsneedandimportanceofdailyexercise.pdf', 'total_pages': 7, 'format': 'PDF 1.5', 'title': '', 'author': 'Khan', 'subject': '', 'keywords': '', 'moddate': '2016-08-16T12:03:17+04:00', 'trapped': '', 'modDate': "D:20160816120317+04'00'", 'creationDate': "D:20160814173535+05'30'", 'page': 0}, 'page_content': 'See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/306118434\nBeneﬁts, need and importance of daily exercise\nArticle · January 2016\nCITATIONS\n113\nREADS\n254,801\n1 author:\nMohammed Abou Elmagd\nRAK Medical and Health Sciences University\n15 PUBLICATIONS\xa0\xa0\xa0234 CITATIONS\xa0\xa0\xa0\nSEE PROFILE\nAll content following this page was uploaded by Mohammed Abou Elmagd on 16 August 2016.\

/var/folders/qs/5xbclsgs1c50_zd37hnz6w70z4qndk/T/ipykernel_77488/2891738753.py:1: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  print(final_document[0].dict())


In [27]:
vector_store1 = Milvus.from_documents(
    documents=final_document,
    embedding=embeddings,
    collection_name='demo_flat',
    connection_args={"uri": URI, "token": api_key},
    text_field="page_content",
    drop_old=False
)

vector_store2 = Milvus.from_documents(
    documents=final_document,
    embedding=embeddings,
    collection_name='demo_hnsw',
    connection_args={"uri": URI, "token": api_key},
    text_field="page_content",
    drop_old=False
)

vector_store3 = Milvus.from_documents(
    documents=final_document,
    embedding=embeddings,
    collection_name='demo_ivf_flat',
    connection_args={"uri": URI, "token": api_key},
    text_field="page_content",
    drop_old=False
)

In [28]:
results = vector_store1.similarity_search_with_score("How good is excercise?")
for doc, score in results:
    print(f"\nFLAT Score: {score:.4f}")
    print(f"FLAT Content: {doc.page_content[:200]}...")

results = vector_store2.similarity_search_with_score("How good is excercise?")
for doc, score in results:
    print(f"\nHNSW Score: {score:.4f}")
    print(f"HNSW Content: {doc.page_content[:200]}...")

results = vector_store3.similarity_search_with_score("How good is excercise?")
for doc, score in results:
    print(f"\nIVF Score: {score:.4f}")
    print(f"IVF Content: {doc.page_content[:200]}...")


FLAT Score: 0.5734
FLAT Content: Adults should also do muscle-strengthening activities of moderate or greater intensity and 
that involve all major muscle groups on 2 or more days a week, as these activities provide 
additional healt...

FLAT Score: 0.5734
FLAT Content: Adults should also do muscle-strengthening activities of moderate or greater intensity and 
that involve all major muscle groups on 2 or more days a week, as these activities provide 
additional healt...

FLAT Score: 0.5927
FLAT Content: the body with every heartbeat and the pulmonary system to 
increase the maximum amount of oxygen that the lungs can 
take in. Exercise lowers blood pressure, slightly decreases the 
levels of total an...

FLAT Score: 0.5927
FLAT Content: the body with every heartbeat and the pulmonary system to 
increase the maximum amount of oxygen that the lungs can 
take in. Exercise lowers blood pressure, slightly decreases the 
levels of total an...

HNSW Score: 0.5734
HNSW Content: Adults should 

In [29]:
retriever1 = vector_store1.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k" : 5, "score_threshold":0.7}
)
retriever2 = vector_store2.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k" : 5, "score_threshold":0.7}
)
retriever3 = vector_store3.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k" : 5, "score_threshold":0.7}
)

In [30]:
import time

start_time = time.time()
retriever1.invoke("Health benefits of excercise?")
end_time = time.time()
print(f"Retriever time for FLAT: {end_time - start_time:.4f} seconds")

start_time = time.time()
retriever2.invoke("Health benefits of excercise?")
end_time = time.time()
print(f"Retriever time for HNSW: {end_time - start_time:.4f} seconds")

start_time = time.time()
retriever3.invoke("Health benefits of excercise?")
end_time = time.time()
print(f"Retriever time for IVF: {end_time - start_time:.4f} seconds")

NotImplementedError: 

In [91]:
#from langchain.retrievers import ContextualCompressionRetriever
#from langchain.retrievers.merger_retriever import MergerRetriever

# Assume you already have a base vector retriever:
#base_retriever1 = vector_store1.as_retriever(search_type="mmr", search_kwargs={"k": 10, "fetch_k": 20})
#results = base_retriever1.invoke("How good is exercise?")
#for doc in results:
##    print(doc.page_content)
#base_retriever2 = vector_store2.as_retriever(search_type="mmr", search_kwargs={"k": 10, "fetch_k": 20})
#results = base_retriever2.invoke("How good is exercise?")
#for doc in results:
#    print(doc.page_content)
#base_retriever3 = vector_store3.as_retriever(search_type="mmr", search_kwargs={"k": 10, "fetch_k": 20})
#results = base_retriever3.invoke("How good is exercise?")
#for doc in results:
#    print(doc.page_content)



In [68]:
from langchain_google_genai import ChatGoogleGenerativeAI
model = ChatGoogleGenerativeAI(model='gemini-1.5-flash')

In [69]:
from langchain import hub
prompt = hub.pull("rlm/rag-prompt")



In [70]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [71]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [72]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [76]:
rag_chain.invoke("Negative effects of not doing excercises?")

No index params provided. Could not determine relevance function. Use L2 distance as default.


'Not exercising increases the risk of chronic diseases like cardiovascular disease, type 2 diabetes, and certain cancers.  It also elevates the risk of anxiety, depression, poor sleep, and reduced quality of life.  Inactivity contributes to lower cardiorespiratory fitness and increases the likelihood of heart disease and stroke.'

In [92]:
from docx import Document

# Your RAG chain result
response = rag_chain.invoke("How good is exercise?")

# Create a new DOCX file
doc = Document()

# Add a title
doc.add_heading("RAG Chain Output", level=1)

# Add the question
doc.add_paragraph("**Question:** How good is exercise?")

# Add the generated answer
doc.add_paragraph("**Answer:**")
doc.add_paragraph(response)

# Optional: include source docs if using retriever
if hasattr(rag_chain, "retriever"):
    doc.add_paragraph("**Retrieved Source Chunks:**")
    sources = rag_chain.retriever.invoke("How good is exercise?")
    for i, doc_chunk in enumerate(sources, start=1):
        doc.add_paragraph(f"Source {i}:", style='List Number')
        doc.add_paragraph(doc_chunk.page_content)

# Save the file
doc.save("rag_output.docx")

ModuleNotFoundError: No module named 'exceptions'