In [24]:
import fitz # PyMuPDF used for PDF processing
from langchain_core.documents import Document # used to create Document objects
from transformers import CLIPProcessor, CLIPModel # used for image and text embeddings
from PIL import Image # used for image processing
import torch # used for tensor operations
import numpy as np # used for numerical operations
from langchain.chat_models import init_chat_model #used for chat model interactions
from langchain.prompts import PromptTemplate # used for prompt templates
from langchain.schema.messages import HumanMessage # used for message formatting
from sklearn.metrics.pairwise import cosine_similarity # used for similarity calculations
import os # used for file operations
import base64 # used for encoding images
import io # used for in-memory file operations
from langchain.text_splitter import RecursiveCharacterTextSplitter # used for text splitting
from langchain_community.vectorstores import FAISS # used for vector store
from langchain_google_genai import ChatGoogleGenerativeAI


An error occurred: module 'importlib.metadata' has no attribute 'packages_distributions'




In [27]:
##clip model
import os
from dotenv import load_dotenv
load_dotenv()

#set up the environment 
os.environ["GOOGLE_API_KEY"] = os.getenv("GEMINI_API_KEY")

#initialize the model for unified embeddings
clip_model=CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor=CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [4]:
### Embedding function for images and text
def embed_image(image_data):
    """Embed an image using CLIP model."""
    if isinstance(image_data, str):# if image_data is a file path
        image=Image.open(image_data).convert("RGB") # Open image from file path
    else: # if image_data is bytes
        image=image_data
    
    input=clip_processor(images=image,return_tensors="pt") # Process the image
    with torch.no_grad():
        features=clip_model.get_image_features(**input) # Get image features
        #normalize the features to unit vector
        features=features/ features.norm(dim=-1,keepdim=True)
        return features.squeeze().numpy() # Return as numpy array
    
def embed_text(text):
    """Embed text using CLIP model."""
    input=clip_processor(text=[text],return_tensors="pt",padding=True,truncation=True,max_length=77) # Process the text
    with torch.no_grad():
        features=clip_model.get_text_features(**input) # Get text features
        #normalize the features to unit vector
        features=features/ features.norm(dim=-1,keepdim=True)
        return features.squeeze().numpy() # Return as numpy array

In [11]:
### PDF Processing and Document Creation
pdf_path="multimodal_sample.pdf" # Path to the PDF file
doc=fitz.open(pdf_path) # Open the PDF file
#storage for all documents
documents=[] # List to store document texts
all_embeddings=[] # List to store all embeddings
image_data_store={} # Dictionary to store image data

#text splitter
splitter=RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=50) # Initialize text splitter

In [12]:
doc

Document('multimodal_sample.pdf')

In [13]:
for i,page in enumerate(doc):
    ##process text
    text=page.get_text() # Extract text from the page
    if(text.strip()!=""): # If text is not empty
        #split text into chunks
        text_chunks=splitter.split_text(text)
        for chunk in text_chunks:
            documents.append(Document(page_content=chunk,metadata={"page":i})) # Create Document object and add to list
            emb=embed_text(chunk) # Embed the text chunk
            all_embeddings.append(emb) # Add embedding to list

## process images
for img_index, img in enumerate(page.get_images(full=True)):
        try:
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            
            # Convert to PIL Image
            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
            
            # Create unique identifier
            image_id = f"page_{i}_img_{img_index}"
            
            # Store image as base64 for later use with GPT-4V
            buffered = io.BytesIO()
            pil_image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()
            image_data_store[image_id] = img_base64
            
            # Embed image using CLIP
            embedding = embed_image(pil_image)
            all_embeddings.append(embedding)
            
            # Create document for image
            image_doc = Document(
                page_content=f"[Image: {image_id}]",
                metadata={"page": i, "type": "image", "image_id": image_id}
            )
            documents.append(image_doc)
            
        except Exception as e:
            print(f"Error processing image {img_index} on page {i}: {e}")
            continue

doc.close()

In [15]:
documents

[Document(metadata={'page': 0}, page_content='Annual Revenue Overview\nThis document summarizes the revenue trends across Q1, Q2, and Q3. As illustrated in the chart\nbelow, revenue grew steadily with the highest growth recorded in Q3.\nQ1 showed a moderate increase in revenue as new product lines were introduced. Q2 outperformed\nQ1 due to marketing campaigns. Q3 had exponential growth due to global expansion.'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'}, page_content='[Image: page_0_img_0]')]

In [16]:
## create FAISS vector store
embeddings_array=np.array(all_embeddings) # Convert embeddings to numpy array

#create custom FAISS vector store since we have precomputed embeddings
vector_store=FAISS.from_embeddings(
    text_embeddings=[(doc.page_content,emb) for doc,emb in zip(documents,embeddings_array)],
    embedding=None, # No embedding function since we have precomputed embeddings
    metadatas=[doc.metadata for doc in documents] # Metadata for each document
)
vector_store


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


<langchain_community.vectorstores.faiss.FAISS at 0x1649e2190>

In [None]:
llm=ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0) # Initialize Gemini Pro chat model
resp=llm.invoke("Hello, world!")
print(resp)

content="Hello, world! It's great to connect with you. How can I help you today?" additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.5-flash', 'safety_ratings': []} id='run--d0edef05-2d2e-438a-a790-64d881210576-0' usage_metadata={'input_tokens': 5, 'output_tokens': 50, 'total_tokens': 55, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 30}}


In [None]:
def retrieve_multimodal(query,top_k=3): #   Unified retrieval for text and images
    """Unified retrieval for text and images."""
    query_embedding=embed_text(query) # Embed the query text

    results=vector_store.similarity_search_by_vector(
        embedding=query_embedding,
        k=top_k
    )
    return results

In [None]:
def create_multimodal_message(query, retrieved_docs): # Create a multimodal message for the LLM
    """Create a message for the LLM including text and images."""
    message_content=query + "\n\n"
    for doc in retrieved_docs:
        if doc.metadata.get("type")=="image":
            image_id=doc.metadata["image_id"]
            image_base64=image_data_store[image_id]
            # Include image in the message in a format GPT-4V understands
            message_content+=f"<image>{image_base64}</image>\n"
        else:
            message_content+=doc.page_content + "\n\n"
    return HumanMessage(content=message_content)

In [37]:
def multimodal_pdf_rag_pipeline(query):
    """Main pipeline for multimodal RAG."""
    # Retrieve relevant documents
    context_docs = retrieve_multimodal(query, top_k=5)
    
    # Create multimodal message
    message = create_multimodal_message(query, context_docs)
    
    # Get response from GPT-4V
    response = llm.invoke([message])
    
    # Print retrieved context info
    print(f"\nRetrieved {len(context_docs)} documents:")
    for doc in context_docs:
        doc_type = doc.metadata.get("type", "unknown")
        page = doc.metadata.get("page", "?")
        if doc_type == "text":
            preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
            print(f"  - Text from page {page}: {preview}")
        else:
            print(f"  - Image from page {page}")
    print("\n")
    
    return response.content

In [38]:
if __name__ == "__main__":
    # Example queries
    queries = [
        "What does the chart on page 1 show about revenue trends?",
        "Summarize the main findings from the document",
        "What visual elements are present in the document?"
    ]
    
    for query in queries:
        print(f"\nQuery: {query}")
        print("-" * 50)
        answer = multimodal_pdf_rag_pipeline(query)
        print(f"Answer: {answer}")
        print("=" * 70)


Query: What does the chart on page 1 show about revenue trends?
--------------------------------------------------

Retrieved 2 documents:
  - Image from page 0
  - Image from page 0


Answer: The chart on page 1 shows a clear upward trend in revenue across the three quarters:

*   **Q1 Revenue:** Approximately $250,000
*   **Q2 Revenue:** Approximately $500,000
*   **Q3 Revenue:** Approximately $950,000

This indicates that revenue grew steadily from Q1 to Q2, and then experienced a much more significant, almost exponential, increase in Q3, making Q3 the quarter with the highest revenue.

Query: Summarize the main findings from the document
--------------------------------------------------

Retrieved 2 documents:
  - Image from page 0
  - Image from page 0


Answer: The document summarizes the annual revenue trends across Q1, Q2, and Q3, showing a steady increase throughout the year, with the highest growth recorded in Q3.

Key findings include:
*   **Q1:** Experienced a moderate re