## MultiModal PDF-RAG with Langchain and Milvus Vector database

In [1]:
import warnings
warnings.filterwarnings("ignore")

 ### Load and parse Multimodal PDF

In [2]:
import os

os.environ["PATH"] += os.pathsep + "/opt/homebrew/bin"
from unstructured.partition.pdf import partition_pdf 

raw_pdf_elements=partition_pdf(
    filename="data/SIABestPractices.pdf", 
    strategy="hi_res",                                      
    extract_images_in_pdf=True,                            
    extract_image_block_output_dir="extracted_data1",
    infer_table_structure=True,  
    )


Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [3]:
#raw_pdf_elements

each elements has properties like:
    element.text        # The actual text (if it's a text-based element)
    element.category    # Type of element (e.g., "NarrativeText", "Title", etc.)
    element.metadata    # Metadata like page number, coordinates, etc.

In [4]:
for ele in raw_pdf_elements:
    print(f"page_content: {ele.text}")
    print(f"category: {ele.category}")
    print(f"Pagenumber: {ele.metadata.page_number}")
    print(f"image: {ele.metadata.image_path}")


    print("----")


page_content: Home (/s/)
category: Title
Pagenumber: 1
image: None
----
page_content: Engage Updates Product Centers Resources
category: Header
Pagenumber: 1
image: None
----
page_content: (Pore
category: Title
Pagenumber: 1
image: None
----
page_content: 
category: Image
Pagenumber: 1
image: extracted_data1/figure-1-1.jpg
----
page_content: More
category: Title
Pagenumber: 1
image: None
----
page_content: Have a Question? Ask the Community
category: Title
Pagenumber: 1
image: None
----
page_content: (Q Search...
category: Title
Pagenumber: 1
image: None
----
page_content: Search...
category: Title
Pagenumber: 1
image: None
----
page_content: Search )
category: Title
Pagenumber: 1
image: None
----
page_content: Search
category: NarrativeText
Pagenumber: 1
image: None
----
page_content: SIA Best Practices: Architecture, Conﬁguration, Security, and Operations
category: Title
Pagenumber: 1
image: None
----
page_content: Secure Infrastructure Access (SIA) offers an agentless, SaaS solution

### Convert Elements into Langchain Documents

In [5]:
#Convert Elements into Langchain Documents

from langchain.schema import Document
documents =[]
for ele in raw_pdf_elements:
    if ele.text:
        documents.append(
            Document(
            page_content=ele.text,
            metadata={"type": ele.category, "page_number": ele.metadata.page_number}
            )
            )
    elif ele.category =="image" and ele.metadata== "image_path":
        documents.append(Document(
            page_content=f"[Image: {ele.metadata.image_path}]",
            metadata={"type": "image", "page_number":ele.metadata.page_number}
        ))

In [6]:
documents

[Document(metadata={'type': 'Title', 'page_number': 1}, page_content='Home (/s/)'),
 Document(metadata={'type': 'Header', 'page_number': 1}, page_content='Engage Updates Product Centers Resources'),
 Document(metadata={'type': 'Title', 'page_number': 1}, page_content='(Pore'),
 Document(metadata={'type': 'Title', 'page_number': 1}, page_content='More'),
 Document(metadata={'type': 'Title', 'page_number': 1}, page_content='Have a Question? Ask the Community'),
 Document(metadata={'type': 'Title', 'page_number': 1}, page_content='(Q Search...'),
 Document(metadata={'type': 'Title', 'page_number': 1}, page_content='Search...'),
 Document(metadata={'type': 'Title', 'page_number': 1}, page_content='Search )'),
 Document(metadata={'type': 'NarrativeText', 'page_number': 1}, page_content='Search'),
 Document(metadata={'type': 'Title', 'page_number': 1}, page_content='SIA Best Practices: Architecture, Conﬁguration, Security, and Operations'),
 Document(metadata={'type': 'NarrativeText', 'page_

### Embedd Documents

In [7]:
# Embedd documents
import os
os.environ["HF_TOKEN"]=os.getenv("HF_TOKEN")

from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")


### Store in Milvus

In [8]:
# Store in Milvus
# Milvus Lite 
### The easiest way to prototype is to use Milvus Lite, where everything is stored in a local vector database file. Only the Flat index can be used.

from langchain_milvus import Milvus

URI = "./milvus_example03.db"

vector_store = Milvus(
    embedding_function=embeddings,
    connection_args={"uri": URI},
    index_params={"index_type": "FLAT", "metric_type": "L2"},
)

In [9]:

from uuid import uuid4
uuids=[]
for i in range(len(documents)):
    uuids.append(str(uuid4()))

uuids


['b714d755-0bdc-4e9a-9160-cc192587b103',
 '004e9889-b257-408b-8fc9-5cf34f882a06',
 '26baf642-48d7-4b3b-91f9-7ee215ac2fa1',
 '06bb981d-816a-46b9-b8ec-fc4f6e0e0547',
 'e20a50ee-be8f-4755-b44d-df4cb5624fb2',
 '546d61b4-c601-4c7a-a57b-a4f5ad51778f',
 '8ce928b5-2def-4403-ba03-d5d62f522086',
 '06093bda-263d-4f12-92a0-f7599731ab86',
 '99e20e3c-3760-4b68-9e5b-22bb1f78ee53',
 '1b9b61c0-5db8-45c6-8d33-ba7c2095169b',
 '93964f87-271f-4273-941f-f1250fdd7634',
 '8d587493-3f86-4a13-97d0-8b9027c7bb97',
 '99b3ba45-7927-4023-a353-c943d41087e7',
 '8dc879cc-1530-4f3a-a765-354e819720b4',
 'b41e36c5-0ecb-4413-8813-835ff25c4049',
 '4b2c88b4-2ddb-4fd1-8f44-249470047945',
 'b483b2b6-977a-4640-a09c-553b9e7ceb48',
 '2ea560dc-2b04-448f-ad34-be82e7d0c047',
 'c3915964-df8b-48d4-a0ea-e3aeab6464bb',
 '8fcf0349-0f9a-45b4-99bd-3c4cffebec5c',
 '60283852-990a-481c-9712-16d3c31ee25c',
 '0a28ffa5-c920-4e32-b317-9e88a751f689',
 'afde1022-67b4-45f0-8886-759cd35bd3d1',
 'd37f590f-879c-4ea2-b8e7-053d65cbf862',
 '691cbee3-d5c0-

### Add items to vector store

In [10]:
#Add items to vector store
vector_store.add_documents(documents=documents, ids=uuids)

['b714d755-0bdc-4e9a-9160-cc192587b103',
 '004e9889-b257-408b-8fc9-5cf34f882a06',
 '26baf642-48d7-4b3b-91f9-7ee215ac2fa1',
 '06bb981d-816a-46b9-b8ec-fc4f6e0e0547',
 'e20a50ee-be8f-4755-b44d-df4cb5624fb2',
 '546d61b4-c601-4c7a-a57b-a4f5ad51778f',
 '8ce928b5-2def-4403-ba03-d5d62f522086',
 '06093bda-263d-4f12-92a0-f7599731ab86',
 '99e20e3c-3760-4b68-9e5b-22bb1f78ee53',
 '1b9b61c0-5db8-45c6-8d33-ba7c2095169b',
 '93964f87-271f-4273-941f-f1250fdd7634',
 '8d587493-3f86-4a13-97d0-8b9027c7bb97',
 '99b3ba45-7927-4023-a353-c943d41087e7',
 '8dc879cc-1530-4f3a-a765-354e819720b4',
 'b41e36c5-0ecb-4413-8813-835ff25c4049',
 '4b2c88b4-2ddb-4fd1-8f44-249470047945',
 'b483b2b6-977a-4640-a09c-553b9e7ceb48',
 '2ea560dc-2b04-448f-ad34-be82e7d0c047',
 'c3915964-df8b-48d4-a0ea-e3aeab6464bb',
 '8fcf0349-0f9a-45b4-99bd-3c4cffebec5c',
 '60283852-990a-481c-9712-16d3c31ee25c',
 '0a28ffa5-c920-4e32-b317-9e88a751f689',
 'afde1022-67b4-45f0-8886-759cd35bd3d1',
 'd37f590f-879c-4ea2-b8e7-053d65cbf862',
 '691cbee3-d5c0-

### Creation of retriever class for vector store

In [11]:
#creation of retriever class for vector store

retriever =vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={'score_threshold': 0.6,'k': 6} #hyperparameter
)

### Querying the VectorStore

In [12]:
# Query directly - Similarity search

res1= vector_store.similarity_search(query="what is strong accounts",k=3)
res1
for res in res1:
    print(f"# {res.page_content}")

# Strong Accounts
# Strong accounts are privileged accounts with elevated access rights that provide temporary accounts used for ZSP access. Strong accounts can be considered “Service Accounts” in that they are used by the SIA service to automate access for the ZSP ﬂow. To provision ephemeral users, SIA needs to access a strong account, which can create the ephemeral users on the target machines and delete them when the session ends.
# Strong accounts can either be speciﬁed in SIA directly or you can conﬁgure SIA to retrieve the account from the vault. Customers are advised to opt for vaulting and rotating the strong accounts. By vaulting strong accounts, Organizations can comply with business security objective and mitigate malicious or risky account usage. Additionally, regular rotation of strong account passwords helps to mitigate the risk of credential theft or misuse by malicious actors. This can be integrated with self-hosted deployments of CyberArk Privileged Access Management (

In [13]:
# Query directly - Similarity search with score

res2= vector_store.similarity_search_with_score(query="what is strong accounts",k=3)
res2


[(Document(metadata={'page_number': 3, 'pk': 'e00ce1d6-2fd3-4b1e-b542-e4ce9ddd4fcc', 'type': 'Title'}, page_content='Strong Accounts'),
  0.3603437840938568),
 (Document(metadata={'page_number': 3, 'pk': 'e69da931-6574-4959-80d1-f938b9e3b75c', 'type': 'NarrativeText'}, page_content='Strong accounts are privileged accounts with elevated access rights that provide temporary accounts used for ZSP access. Strong accounts can be considered “Service Accounts” in that they are used by the SIA service to automate access for the ZSP ﬂow. To provision ephemeral users, SIA needs to access a strong account, which can create the ephemeral users on the target machines and delete them when the session ends.'),
  0.6745815277099609),
 (Document(metadata={'page_number': 3, 'pk': 'a1844abd-ec22-4fc4-9bef-25a3f840644f', 'type': 'NarrativeText'}, page_content='Strong accounts can either be speciﬁed in SIA directly or you can conﬁgure SIA to retrieve the account from the vault. Customers are advised to opt

In [14]:
for res,score in res2:
    print(f"* {res.page_content},{score:3f}")
   

* Strong Accounts,0.360344
* Strong accounts are privileged accounts with elevated access rights that provide temporary accounts used for ZSP access. Strong accounts can be considered “Service Accounts” in that they are used by the SIA service to automate access for the ZSP ﬂow. To provision ephemeral users, SIA needs to access a strong account, which can create the ephemeral users on the target machines and delete them when the session ends.,0.674582
* Strong accounts can either be speciﬁed in SIA directly or you can conﬁgure SIA to retrieve the account from the vault. Customers are advised to opt for vaulting and rotating the strong accounts. By vaulting strong accounts, Organizations can comply with business security objective and mitigate malicious or risky account usage. Additionally, regular rotation of strong account passwords helps to mitigate the risk of credential theft or misuse by malicious actors. This can be integrated with self-hosted deployments of CyberArk Privileged A

In [15]:
# search_type = ""similarity"" 
# Returns the documents most similar to the query using a distance metric (like cosine similarity or Euclidean distance) between embedding vectors.

retriever =vector_store.as_retriever(
        #search_type="similarity_score_threshold",
        search_type="similarity",
        search_kwargs={'k': 6} #hyperparameter
)
response_1 =retriever.invoke("what is strong accounts?")

for doc in response_1:
    print(doc.page_content)


Strong Accounts
Strong accounts are privileged accounts with elevated access rights that provide temporary accounts used for ZSP access. Strong accounts can be considered “Service Accounts” in that they are used by the SIA service to automate access for the ZSP ﬂow. To provision ephemeral users, SIA needs to access a strong account, which can create the ephemeral users on the target machines and delete them when the session ends.
Strong accounts can either be speciﬁed in SIA directly or you can conﬁgure SIA to retrieve the account from the vault. Customers are advised to opt for vaulting and rotating the strong accounts. By vaulting strong accounts, Organizations can comply with business security objective and mitigate malicious or risky account usage. Additionally, regular rotation of strong account passwords helps to mitigate the risk of credential theft or misuse by malicious actors. This can be integrated with self-hosted deployments of CyberArk Privileged Access Management (PAM) b

In [16]:
# search_type = ""similarity_score_threshold"" 
# Only returns documents with a similarity score above a certain threshold.

retriever =vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"score_threshold": 0.6} #hyperparameter
)
response_2 =retriever.invoke("what is strong accounts?")

for doc in response_2:
    print(doc.page_content)

Strong Accounts
Strong accounts are privileged accounts with elevated access rights that provide temporary accounts used for ZSP access. Strong accounts can be considered “Service Accounts” in that they are used by the SIA service to automate access for the ZSP ﬂow. To provision ephemeral users, SIA needs to access a strong account, which can create the ephemeral users on the target machines and delete them when the session ends.
Strong accounts can either be speciﬁed in SIA directly or you can conﬁgure SIA to retrieve the account from the vault. Customers are advised to opt for vaulting and rotating the strong accounts. By vaulting strong accounts, Organizations can comply with business security objective and mitigate malicious or risky account usage. Additionally, regular rotation of strong account passwords helps to mitigate the risk of credential theft or misuse by malicious actors. This can be integrated with self-hosted deployments of CyberArk Privileged Access Management (PAM) b

In [17]:
# search_type = "mmr" 
# Maximal Marginal Relevance 
# # It tries to avoid redundancy among results.
retriever =vector_store.as_retriever(
        search_type="mmr",
        search_kwargs={'k': 6} #hyperparameter
)
res3 =retriever.invoke("what is strong accounts?")
res3
for doc in res3:
    print(doc.page_content)

Strong Accounts
Strong accounts are privileged accounts with elevated access rights that provide temporary accounts used for ZSP access. Strong accounts can be considered “Service Accounts” in that they are used by the SIA service to automate access for the ZSP ﬂow. To provision ephemeral users, SIA needs to access a strong account, which can create the ephemeral users on the target machines and delete them when the session ends.
While the strong accountant may be a member of the Domain Administrators group, this doesn't follow the principle of least privilege and is not considered best practice. To adhere to the principle of least privilege and separation of duties, organizations should subdivide strong accounts into privilege silos. This granular approach helps to ensure that the strong account does not have widespread access throughout your environment. You may also choose to select SIA strong accounts based on credential boundary or network area which may also help align access rig

### # Reranking
After the first stage of retrieval, we need to rerank the candidates to get a better result.

In [18]:
# Reranking
from langchain_openai import ChatOpenAI
from langchain.retrievers.document_compressors import LLMListwiseRerank

llm = ChatOpenAI(model="gpt-4o")
reranker = LLMListwiseRerank.from_llm(llm=llm)

# Apply reranking
reranked_docs = reranker.compress_documents(documents=res3, query="what is strong account?")

# Show results
for doc in reranked_docs:
    print(doc.page_content)

Strong accounts are privileged accounts with elevated access rights that provide temporary accounts used for ZSP access. Strong accounts can be considered “Service Accounts” in that they are used by the SIA service to automate access for the ZSP ﬂow. To provision ephemeral users, SIA needs to access a strong account, which can create the ephemeral users on the target machines and delete them when the session ends.
While the strong accountant may be a member of the Domain Administrators group, this doesn't follow the principle of least privilege and is not considered best practice. To adhere to the principle of least privilege and separation of duties, organizations should subdivide strong accounts into privilege silos. This granular approach helps to ensure that the strong account does not have widespread access throughout your environment. You may also choose to select SIA strong accounts based on credential boundary or network area which may also help align access rights with organiz

### RAG Pipeline

In [19]:
# RAG part

# LLM
from langchain_groq import ChatGroq
llm = ChatGroq(model ="llama-3.3-70b-versatile")

In [20]:
# prompt
from langchain import hub
prompt = hub.pull("rlm/rag-prompt")

In [21]:
# checking the prompt template variables...
import pprint
pprint.pprint(prompt.messages)

[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]


In [22]:
# output parser
from langchain_core.output_parsers import StrOutputParser

In [23]:
# chain creation
from langchain_core.runnables import RunnablePassthrough, RunnableLambda

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

chain =(
    { 
      "context":retriever| RunnableLambda(format_docs),
      "question":RunnablePassthrough()
     }
    |prompt
    |llm
    |StrOutputParser()
)

In [24]:
response=chain.invoke("Explain the monitoring part of SIA operation?")
print(response)

The monitoring part of SIA operation involves tracking key metrics such as CPU usage, memory utilization, and network throughput using monitoring tools. Comprehensive logging is also implemented to capture detailed information about SIA operations, including connection attempts and failures, to identify patterns or anomalies. Regular review of logs and establishment of a baseline of normal operating parameters help detect deviations that may require attention.


In [25]:
response=chain.invoke("list me the related articles for Hardening??")
print(response)


There is a related article titled "Security Best Practices for Hardening Your CyberArk Identity Deployment". This article provides security best practices for hardening CyberArk Identity deployments. The exact list of related articles is not fully provided in the given context, but the mentioned article is one of them.
