## AI Search Multi-Modal RAG - Advanced Document Intelligence Lab

![document-intelligence](./Assets/document_intelligence.png)

### Installing Required Packages


In [None]:
%pip install openai PyMuPDF requests python-dotenv matplotlib

### Loading Variables from the .env file

In [None]:
from openai import AzureOpenAI
import os
from dotenv import load_dotenv
load_dotenv()

# Azure AI Search connection settings, read from the process environment.
# Each name is None when the corresponding variable is not set.
service_endpoint, index_name, key = (
    os.getenv(setting_name)
    for setting_name in (
        "AZURE_SEARCH_SERVICE_ENDPOINT",
        "AZURE_SEARCH_INDEX_NAME",
        "AZURE_SEARCH_API_KEY",
    )
)

### Creating Azure OpenAI Client

In [None]:
from openai import AzureOpenAI

# Endpoint and key for the Azure OpenAI resource, taken from the environment.
azure_openai_key = os.environ.get("AZURE_OPENAI_KEY")
azure_openai_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")

# One client instance, pinned to a fixed API version, is reused by the
# embedding and chat-completion calls further down.
azure_openai_client = AzureOpenAI(
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key,
    api_version="2024-02-15-preview",
)

### Creating the Embedding Generator Function

In [None]:
def generate_embeddings(client, text):
    """Return the embedding vector produced for ``text``.

    The model/deployment name is read from the ``AZURE_OPENAI_EMBEDDING_MODEL``
    environment variable on every call.

    Args:
        client: Object exposing ``embeddings.create(input=..., model=...)``.
        text: Input passed unchanged as ``input`` to the embeddings call.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data``.
    """
    deployment_name = os.getenv("AZURE_OPENAI_EMBEDDING_MODEL")

    # Convert the response object to plain dicts/lists before indexing into it.
    payload = client.embeddings.create(input=text, model=deployment_name).model_dump()

    first_item = payload['data'][0]
    return first_item['embedding']
    

In [None]:
# The natural-language question that drives both the vector search and the
# final chat completion.
user_query = (
    "can you tell me something about the invoice which has "
    "microsoft fabric implementation on invoice 1?"
)

# Embed the question so it can be compared against the indexed vectors.
vectorised_user_query = generate_embeddings(client=azure_openai_client, text=user_query)
print(vectorised_user_query)

In [None]:
# Accumulator for search hits; the search cell appends to it, so re-run this
# cell first when repeating a search to avoid collecting duplicates.
context = []

### Sending API Call to Azure AI Search Index

In [None]:
import requests
import json


# Run a pure vector query against the Azure AI Search REST API and collect the
# hits into `context` for the chat-completion step.
url = f"{service_endpoint}/indexes/{index_name}/docs/search?api-version=2023-11-01"

headers = {
    "Content-Type": "application/json",
    "api-key": key,
}

body = {
    "count": True,
    "select": "document_title, content_text, locationMetadata, image_document_id",
    "vectorQueries": [
        {
            "vector": vectorised_user_query,
            "k": 10,
            "fields": "content_embedding",
            "kind": "vector",
        }
    ],
}

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.post(url, headers=headers, data=json.dumps(body), timeout=30)
# Fail with the HTTP status (bad key, wrong index name, ...) instead of an
# opaque KeyError on 'value' further down.
response.raise_for_status()
documents = response.json()['value']

# Hits are appended, so `context` keeps growing if this cell is re-run without
# resetting it first.
context.extend(
    {
        "document_title": doc['document_title'],
        "chunk": doc['content_text'],
        "score": doc['@search.score'],
        # Optional fields: absent keys become None.
        "locationMetadata": doc.get('locationMetadata'),
        "image_document_id": doc.get('image_document_id'),
    }
    for doc in documents
)

for doc in context:
    print(doc)



### Function for Locating and Displaying Context in the PDF Docs

In [None]:
def highlight_pdf_chunk(
    document_title: str,
    storage_account_url: str,
    content_text: str = None,
    location_metadata: dict = None,
    download_dir: str = ".",
    zoom: float = 2.0
):
    """Show the PDF page containing a retrieved chunk with the chunk outlined.

    The PDF is downloaded from ``storage_account_url`` into ``download_dir``
    unless a file with that name is already there. The chunk is then located
    in one of two ways:

    1. If ``location_metadata`` is given, its ``pageNumber`` (1-based) selects
       the page and every entry of ``boundingPolygons`` is outlined in red.
    2. Otherwise, or if step 1 fails, the first 20 words of ``content_text``
       are searched page by page and the matches on the first page that has
       any are outlined in blue.

    The annotated page is displayed with matplotlib. Problems are reported
    with ``print`` rather than raised.

    Args:
        document_title: File name of the PDF; appended to the storage URL and
            used as the local file name.
        storage_account_url: Base URL the PDF is downloaded from.
        content_text: Chunk text used for the text-search fallback.
        location_metadata: Mapping with ``pageNumber`` and ``boundingPolygons``
            (a list of polygons, each a list of ``{"x": ..., "y": ...}``
            points, or the same structure as a JSON string).
        download_dir: Directory for downloaded PDFs; created if missing.
        zoom: Render scale factor applied to the page and to all boxes.

    Returns:
        None.
    """
    # Imported here because no cell of this notebook brings these names into
    # scope; without them every call fails with a NameError.
    import traceback
    from io import BytesIO

    import fitz  # PyMuPDF
    import matplotlib.pyplot as plt
    from PIL import Image, ImageDraw

    def get_clean_text(text: str) -> str:
        # split() already collapses newlines and other whitespace runs; keep
        # only the first 20 words so the search phrase stays short.
        return ' '.join(text.split()[:20])

    def render_page(page):
        # Rasterise the page at `zoom` and return the image plus a drawing handle.
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        image = Image.open(BytesIO(pix.tobytes("png")))
        return image, ImageDraw.Draw(image)

    # Build and fetch PDF
    pdf_url = f"{storage_account_url.rstrip('/')}/{document_title}"
    os.makedirs(download_dir, exist_ok=True)
    pdf_path = os.path.join(download_dir, document_title)

    if not os.path.exists(pdf_path):
        try:
            response = requests.get(pdf_url, timeout=30)
            response.raise_for_status()
            # Storage error pages come back as XML/HTML; refuse to cache those.
            if not response.content.startswith(b'%PDF'):
                raise ValueError("Downloaded file is not a valid PDF.")
            with open(pdf_path, 'wb') as f:
                f.write(response.content)
        except Exception as e:
            print(f"[ERROR] Failed to download/save PDF: {e}")
            return

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"[ERROR] Failed to open PDF: {e}")
        return

    img = None
    title = ""

    # Everything below runs under try/finally so the document is closed on
    # every exit path, including unexpected errors while plotting.
    try:
        # ========== LOCATION METADATA ========== #
        if location_metadata:
            try:
                page_number = location_metadata.get("pageNumber", 1) - 1
                page = doc.load_page(page_number)
                page_width, page_height = page.rect.width, page.rect.height
                print(f"Page dimensions: {page_width} x {page_height}")

                bounding_polygons = location_metadata.get("boundingPolygons")
                if isinstance(bounding_polygons, str):
                    bounding_polygons = json.loads(bounding_polygons)
                print(f"Bounding polygons: {bounding_polygons}")

                annotated, draw = render_page(page)

                for box_number, poly in enumerate(bounding_polygons, start=1):
                    # Pick the unit once per polygon so all of its points are
                    # scaled the same way. Heuristic: any coordinate above 1.0
                    # cannot be normalised, so treat the polygon as inches
                    # (1 inch = 72 points); otherwise assume 0..1 fractions of
                    # the page size.
                    in_inches = any(
                        point["x"] > 1.0 or point["y"] > 1.0 for point in poly
                    )
                    if in_inches:
                        x_scale, y_scale = 72, 72
                    else:
                        x_scale, y_scale = page_width, page_height
                    unit = "inches" if in_inches else "normalized"
                    print(f"Box {box_number}: interpreting coordinates as {unit}")

                    x_vals = [point["x"] * x_scale for point in poly]
                    y_vals = [point["y"] * y_scale for point in poly]

                    # Axis-aligned bounding rectangle of the polygon
                    x0, y0, x1, y1 = min(x_vals), min(y_vals), max(x_vals), max(y_vals)
                    print(f"Drawing rectangle: ({x0}, {y0}) to ({x1}, {y1})")

                    # Ensure coordinates are within page bounds
                    x0 = max(0, min(x0, page_width))
                    y0 = max(0, min(y0, page_height))
                    x1 = max(0, min(x1, page_width))
                    y1 = max(0, min(y1, page_height))

                    # Page points -> image pixels
                    draw.rectangle(
                        [x0 * zoom, y0 * zoom, x1 * zoom, y1 * zoom],
                        outline="red", width=5
                    )
                    draw.text((x0 * zoom, y0 * zoom - 25), f"Box {box_number}", fill="red")

                # Publish the image only once every polygon was drawn, so a
                # failure part-way through still triggers the text fallback
                # instead of showing a half-annotated page.
                img = annotated
                title = f"{document_title} - Page {page_number + 1} [via location metadata]"

            except Exception as e:
                print(f"[ERROR] Processing location metadata failed: {e}")
                traceback.print_exc()
                if not content_text:
                    return

        # ========== TEXT SEARCH FALLBACK ========== #
        if content_text and img is None:
            clean_text = get_clean_text(content_text)

            if clean_text:
                for display_page_number, page in enumerate(doc, start=1):
                    matches = page.search_for(clean_text)
                    if not matches:
                        continue

                    img, draw = render_page(page)
                    for rect in matches:
                        draw.rectangle(
                            [rect.x0 * zoom, rect.y0 * zoom, rect.x1 * zoom, rect.y1 * zoom],
                            outline="blue", width=3
                        )

                    title = f"{document_title} - Page {display_page_number} [via text search]"
                    break

            if img is None:
                print("[INFO] Text not found in PDF.")
                return

        # ========== DISPLAY OUTPUT ========== #
        if img is not None:
            plt.figure(figsize=(12, 12))
            plt.imshow(img)
            plt.axis("off")
            plt.title(title)
            plt.show()
        else:
            print("[ERROR] No image generated for display.")
    finally:
        doc.close()

### Sending Call to GPT Engine for Summarisation

In [None]:
# System prompt for the answering model. It is a plain (non-f) string: it has
# no placeholders, so the braces of the example structure need no escaping.
system_prompt = """You are meant to behave as a RAG chatbot that derives its context from a database stored in Azure AI Search Solution.
please answer strictly from the context from the database provided and if you dont have an answer please politely say so. dont include any extra 
information that is not in the context and dont include links as well.
the context passed to you will be in the form of a pythonic list with each object in the list having the following structure:

{
    "document_title": "the title of the document",
    "chunk": "the chunk of text from the document",
    "score": "the score of the match based on cosine similarity",
    "locationMetadata": "the location metadata if available, else None",
    "image_document_id": "the image document id if available, else None",
}

the pythonic list contains best 10 matches to the user query based on cosine similarity of the embeddings of the user query and the document chunks.
please structure your answers in a very professional manner and in such a way that the user does not get to know that its RAG working under the hood
and its as if they are talking to a human. """

# The retrieved hits are passed as the printed form of the Python list.
user_prompt = f""" the user query is: {user_query}
the context is : {context}"""

chat_completions_response = azure_openai_client.chat.completions.create(
    model=os.getenv("AZURE_OPENAI_CHAT_COMPLETIONS_MODEL"),
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ],
    temperature=0.7,
)

print(chat_completions_response.choices[0].message.content)



### Displaying the Context in the PDF Docs

In [None]:
# The blob folder URL is the same for every hit, so build it once up front.
storage_account_url = (
    f"https://{os.getenv('STORAGE_ACCOUNT_NAME')}.blob.core.windows.net/"
    f"{os.getenv('STORAGE_ACCOUNT_CONTAINER_NAME')}/"
    f"{os.getenv('STORAGE_ACCOUNT_FOLDER_NAME')}/"
)

# Show each retrieved chunk highlighted on its PDF page.
for context_item in context:
    document_title = context_item.get("document_title")
    content_text = context_item.get("chunk")
    location_metadata = context_item.get("locationMetadata")

    highlight_pdf_chunk(
        document_title=document_title,
        storage_account_url=storage_account_url,
        content_text=content_text,
        location_metadata=location_metadata,
    )
 