In [28]:
!pip install pymupdf langchain-core transformers pillow torch numpy langchain scikit-learn langchain-community



In [29]:
import fitz  # PyMuPDF
from langchain_core.documents import Document
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import os
import base64
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

In [30]:
###Clip Model
import os
from dotenv import load_dotenv
load_dotenv()

## set up the environment
os.environ["GOOGLE_API_KEY"]=os.getenv("GOOGLE_API_KEY")

### initialize the Clip Model for unified embeddings
clip_model=CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor=CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model.eval()

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [68]:
## Embeddings Funstions
def embed_images(image_data):
    """Embed image using CLIP"""
    if isinstance(image_data,str):
        image=Image.open(image_data).convert("RGB")
    else:
        image=image_data
    inputs=clip_processor(images=image,return_tensors="pt")
    with torch.no_grad():
        features=clip_model.get_image_features(**inputs)
        # Normalize embeddings to uunit vector
        features=features/features.norm(dim=1,keepdim=True)
        return features.squeeze().numpy()
def embed_text(text):
    """Embed text using CLIP"""
    inputs=clip_processor(
        text=text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=77  # Clip max token length
    )
    with torch.no_grad():
        features=clip_model.get_text_features(**inputs)
        # Normalize embeddings
        features=features/features.norm(dim=1,keepdim=True)
        return features.squeeze().numpy()

In [99]:
# Process PDF
# pdf_path="multimodal_sample.pdf"
# doc=fitz.open(pdf_path)

all_docs=[]
all_embeddings=[]
images_data_store={} # store actual image data fro llm 

# Text Splitter
splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

In [33]:
doc

Document('Q1_2025-26_Fact_Sheet.pdf')

In [100]:
with fitz.open("Q1_2025-26_Fact_Sheet.pdf") as doc:
    for i,page in enumerate(doc):
        ## process text
        text=page.get_text()
        if text.strip():
            ##create temporary document for splitting
            temp_doc = Document(page_content=text, metadata={"page": i, "type": "text"})
            text_chunks = splitter.split_documents([temp_doc])

            #Embed each chunk using CLIP
            for chunk in text_chunks:
                embedding = embed_text(chunk.page_content)
                all_embeddings.append(embedding)
                all_docs.append(chunk)



        ## process images
        ##Three Important Actions:

        ##Convert PDF image to PIL format
        ##Store as base64 for GPT-4V (which needs base64 images)
        ##Create CLIP embedding for retrieval

        for img_index, img in enumerate(page.get_images(full=True)):
            try:
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                
                # Convert to PIL Image
                pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
                
                # Create unique identifier
                image_id = f"page_{i}_img_{img_index}"
            
                # Store image as base64 for later use with GPT-4V
                buffered = io.BytesIO()
                pil_image.save(buffered, format="PNG")
                img_base64 = base64.b64encode(buffered.getvalue()).decode()
                images_data_store[image_id] = img_base64
                
                # Embed image using CLIP
                embedding = embed_images(pil_image)
                all_embeddings.append(embedding)
                
                # Create document for image
                image_doc = Document(
                    page_content=f"[Image: {image_id}]",
                    metadata={"page": i, "type": "image", "image_id": image_id}
                )
                all_docs.append(image_doc)
            
            except Exception as e:
                print(f"Error processing image {img_index} on page {i}: {e}")
                continue





In [101]:
all_docs

[Document(metadata={'page': 0, 'type': 'text'}, page_content='TCS Financial Results\nQuarter I Ended FY 2025-26\nJuly 10, 2025'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'}, page_content='[Image: page_0_img_0]'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_1'}, page_content='[Image: page_0_img_1]'),
 Document(metadata={'page': 1, 'type': 'text'}, page_content='Disclaimer\n2\nCertain statements in this release concerning our future prospects are forward-looking statements. Forward-looking statements by \ntheir nature involve a number of risks and uncertainties that could cause actual results to differ materially from market \nexpectations. These risks and uncertainties include, but are not limited to, our ability to manage growth, intense competition'),
 Document(metadata={'page': 1, 'type': 'text'}, page_content='among global IT services companies, various factors which may affect our profitability, such as wage increases or 

In [74]:
all_embeddings

[array([-3.07186856e-03,  4.88309227e-02,  1.72758196e-02,  2.23813727e-02,
         2.65792813e-02,  5.53548113e-02, -7.11715920e-03,  1.38289079e-01,
         1.44108152e-02,  3.18832286e-02,  7.04939589e-02,  1.54625308e-02,
        -5.20862220e-03,  3.59369852e-02,  3.92487496e-02, -4.34995964e-02,
        -2.65491125e-03,  2.23267097e-02,  3.54238530e-03,  4.75970581e-02,
         2.43786629e-02, -8.34360998e-03,  6.59229308e-02, -1.89985763e-02,
         2.58217473e-02,  3.86505621e-03,  1.90904401e-02, -8.23216792e-03,
        -9.19821952e-03, -8.70313146e-04,  2.78594736e-02, -4.06720936e-02,
        -6.11858517e-02, -3.48192900e-02,  2.33721975e-02,  1.93780679e-02,
         1.04226461e-02, -3.23913097e-02,  7.88689628e-02, -9.19085182e-03,
         1.38245914e-02,  4.00150893e-03, -1.92327239e-03,  8.05021322e-04,
        -6.59380704e-02,  5.53065958e-03,  5.57880253e-02, -2.75336672e-02,
        -5.53484354e-03,  1.81391053e-02,  1.36337429e-02,  8.90739560e-02,
         2.9

In [102]:
# Create unified FAISS vector store with CLIP embeddings
embeddings_array = np.array(all_embeddings)
embeddings_array

array([[-0.00307187,  0.04883092,  0.01727582, ..., -0.00182937,
         0.00922514,  0.04888212],
       [-0.01015805, -0.01360098, -0.01759228, ...,  0.05918153,
        -0.02358762, -0.01489908],
       [ 0.04154531,  0.0721312 ,  0.00935536, ...,  0.04621322,
        -0.05699993,  0.01690328],
       ...,
       [-0.01164207,  0.01691118, -0.03496092, ...,  0.0836313 ,
        -0.03942812,  0.0017378 ],
       [-0.0101187 ,  0.02821845, -0.01209205, ...,  0.00876872,
        -0.01249511,  0.00679457],
       [-0.01015805, -0.01360098, -0.01759228, ...,  0.05918153,
        -0.02358762, -0.01489908]], shape=(98, 512), dtype=float32)

In [76]:
(all_docs,embeddings_array)


([Document(metadata={'page': 0, 'type': 'text'}, page_content='TCS Financial Results\nQuarter I Ended FY 2025-26\nJuly 10, 2025'),
  Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'}, page_content='[Image: page_0_img_0]'),
  Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_1'}, page_content='[Image: page_0_img_1]'),
  Document(metadata={'page': 1, 'type': 'text'}, page_content='Disclaimer\n2\nCertain statements in this release concerning our future prospects are forward-looking statements. Forward-looking statements by \ntheir nature involve a number of risks and uncertainties that could cause actual results to differ materially from market \nexpectations. These risks and uncertainties include, but are not limited to, our ability to manage growth, intense competition \namong global IT services companies, various factors which may affect our profitability, such as wage increases or an appreciating \nRupee, our ability to attract and retain 

In [103]:
# Create custom FAISS index since we have precomputed embeddings
vector_store = FAISS.from_embeddings(
    text_embeddings=[(doc.page_content, emb) for doc, emb in zip(all_docs, embeddings_array)],
    embedding=None,  # We're using precomputed embeddings
    metadatas=[doc.metadata for doc in all_docs]
)
vector_store

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


<langchain_community.vectorstores.faiss.FAISS at 0x2454f2f8910>

In [81]:
!pip install -U langchain-google-genai




In [84]:
import langchain_google_genai
# llm = init_chat_model("gemini-pro", model_provider="google_genai")
llm = init_chat_model("gemini-2.5-pro", model_provider="google_genai")

In [85]:
llm

ChatGoogleGenerativeAI(model='models/gemini-2.5-pro', google_api_key=SecretStr('**********'), client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x000002454EA18C10>, default_metadata=(), model_kwargs={})

In [104]:
def retrieve_multimodal(query, k=5):
    """Unified retrieval using CLIP embeddings for both text and images."""
    # Embed query using CLIP
    query_embedding = embed_text(query)
    
    # Search in unified vector store
    results = vector_store.similarity_search_by_vector(
        embedding=query_embedding,
        k=k
    )
    return results

In [105]:
def create_multimodal_message(query, retrieved_docs):
    """Create a message with both text and images for Google-gemini-2.5 pro."""
    content = []
    
    # Add the query
    content.append({
        "type": "text",
        "text": f"Question: {query}\n\nContext:\n"
    })
    
    # Separate text and image documents
    text_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "text"]
    image_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "image"]
    
    # Add text context
    if text_docs:
        text_context = "\n\n".join([
            f"[Page {doc.metadata['page']}]: {doc.page_content}"
            for doc in text_docs
        ])
        content.append({
            "type": "text",
            "text": f"Text excerpts:\n{text_context}\n"
        })
    
    # Add images
    for doc in image_docs:
        image_id = doc.metadata.get("image_id")
        if image_id and image_id in images_data_store:
            content.append({
                "type": "text",
                "text": f"\n[Image from page {doc.metadata['page']}]:\n"
            })
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{images_data_store[image_id]}"
                }
            })
    
    # Add instruction
    content.append({
        "type": "text",
        "text": "\n\nPlease answer the question based on the provided text and images."
    })
    
    return HumanMessage(content=content)

In [None]:
def multimodal_pdf_rag_pipeline(query):
    """Main pipeline for multimodal RAG."""
    # Retrieve relevant documents
    context_docs = retrieve_multimodal(query, k=5)
    
    # Create multimodal message
    message = create_multimodal_message(query, context_docs)
    
    # Get response from gemini
    response = llm.invoke([message])
    
    # Print retrieved context info
    print(f"\nRetrieved {len(context_docs)} documents:")
    for doc in context_docs:
        doc_type = doc.metadata.get("type", "unknown")
        page = doc.metadata.get("page", "?")
        if doc_type == "text":
            preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
            print(f"  - Text from page {page}: {preview}")
        else:
            print(f"  - Image from page {page}")
    print("\n")
    
    return response.content

In [106]:
if __name__ == "__main__":
    # Example queries
    queries = [
        "What visual elements are present in the document?",
        "Summarize the overall performance"
    ]
    
    for query in queries:
        print(f"\nQuery: {query}")
        print("-" * 50)
        answer = multimodal_pdf_rag_pipeline(query)
        print(f"Answer: {answer}")
        print("=" * 70)


Query: What visual elements are present in the document?
--------------------------------------------------

Retrieved 5 documents:
  - Text from page 16: Revenue
7,505
7,421
100.00
100.00
Cost of revenue
4,514
4,517
60.15
60.87
Gross margin
2,991
2,904
3...
  - Text from page 1: only as of the date on which it was made. 
The Company assumes no obligation to revise or update any...
  - Text from page 13: 76,310
75,190
12.19
11.85
Fees to external consultants
1,450
1,760
0.23
0.28
Facility expenses
3,110...
  - Text from page 17: 37
38
0.50
0.51
Depreciation
34
34
0.45
0.46
Travel
27
27
0.36
0.36
Communication
21
22
0.28
0.30
Pr...
  - Text from page 19: Revenue
62,613
           
63,437
      
100.00
      
100.00
      
Expenditure
  a) Employee Costs...


Answer: Based on the provided text excerpts, the primary visual elements present in the document are **tables**.

Here are the specific details about these tables:

*   **Structure:** They are structured in columns and rows to pres