## Multi-Modal RAG with Azure AI Content Understanding - Data Prep

![rag_data_prep](./Assets/rag_data_prep.png)

### Install Dependencies and Libraries

In [None]:
%pip install python-dotenv pdfminer.six openai

### Setting Up the Environment 

In [None]:
import os
from dotenv import load_dotenv
import requests
import json

load_dotenv()


def _endpoint_from_env(var_name):
    """Read an endpoint URL from the environment and normalise it.

    Surrounding whitespace and trailing "/" characters are removed so that
    callers can append "/some/path" without producing a double slash.

    Args:
        var_name: Name of the environment variable holding the endpoint.

    Returns:
        The normalised endpoint string.

    Raises:
        RuntimeError: If the variable is not set. Without this check the
            failure would surface as an AttributeError on None, which does
            not say which variable is missing.
    """
    raw_value = os.getenv(var_name)
    if raw_value is None:
        raise RuntimeError(
            f"Environment variable {var_name} is not set; "
            "define it in the environment or in the .env file."
        )
    return raw_value.strip().rstrip('/')


# Content Understanding Service Connections
CONTENT_UNDERSTANDING_ENDPOINT = _endpoint_from_env("CONTENT_UNDERSTANDING_ENDPOINT")
CONTENT_UNDERSTANDING_API_KEY = os.getenv("CONTENT_UNDERSTANDING_API_KEY")

# Storage Account Connections
storage_account_endpoint = _endpoint_from_env("STORAGE_ACCOUNT_ENDPOINT")
storage_container_name = os.getenv("STORAGE_CONTAINER_NAME")

### Creating Functions for Performing Video Analysis using Prebuilt Video Analyzer

In [None]:
# Create a function to analyze video using prebuilt video analyzer - makes API calls to Content Understanding service
def analyze_video(video_url):
    """Run the prebuilt video analyzer on a video and wait for the outcome.

    The video URL is POSTed to the Content Understanding ``:analyze``
    endpoint. The ``id`` from that response is then used to poll the
    ``analyzerResults`` endpoint until the reported status is no longer a
    pending one.

    Args:
        video_url: URL of the video, sent as the ``url`` field of the request.

    Returns:
        The decoded JSON body of the last ``analyzerResults`` response, or
        ``None`` when any HTTP call fails (the error is printed, not raised).
    """
    import time  # local import: no other cell of this notebook imports time at top level

    api_version = "2025-05-01-preview"
    request_timeout_s = 30  # requests has no default timeout; avoid hanging forever
    poll_interval_s = 3
    # The service documents "NotStarted" as well as "Running" for operations
    # that are still in progress; treating only "Running" as pending would
    # stop polling too early. Compared lower-cased to tolerate casing changes.
    pending_statuses = {"notstarted", "running"}

    prebuilt_video_analyzer_url = (
        f"{CONTENT_UNDERSTANDING_ENDPOINT}/contentunderstanding/analyzers/"
        f"prebuilt-videoAnalyzer:analyze?api-version={api_version}"
    )

    try:
        response = requests.post(
            prebuilt_video_analyzer_url,
            headers={
                "Content-Type": "application/json",
                "Ocp-Apim-Subscription-Key": CONTENT_UNDERSTANDING_API_KEY,
            },
            json={"url": video_url},
            timeout=request_timeout_s,
        )
        response.raise_for_status()
        analysis_id = response.json().get("id")
        print("Analysis ID:", analysis_id)

        # Using the analysis ID to get results; polling until the analysis is complete
        get_result_url = (
            f"{CONTENT_UNDERSTANDING_ENDPOINT}/contentunderstanding/analyzerResults/"
            f"{analysis_id}?api-version={api_version}"
        )
        headers = {"Ocp-Apim-Subscription-Key": CONTENT_UNDERSTANDING_API_KEY}

        while True:
            status_response = requests.get(
                get_result_url, headers=headers, timeout=request_timeout_s
            )
            status_response.raise_for_status()
            video_analysis_result = status_response.json()
            analysis_status = video_analysis_result.get("status")
            print("Current Analysis Status:", analysis_status)
            if str(analysis_status).lower() not in pending_statuses:
                # This response comes from the same URL the result is read
                # from, so it is used directly instead of issuing another GET.
                break
            time.sleep(poll_interval_s)  # Wait before polling again

        print("Video Analysis Result:", video_analysis_result)
        return video_analysis_result

    except requests.RequestException as e:
        print(f"Error occurred: {e}")
        return None


# Creating a Function to extract relevant information from the video analysis result and prepare it for RAG
def build_simple_video_dataset(
    analysis_result,
    video_url,
    title,
    include_full_transcript_row=True,
    include_words=False,          # set True to include word-level timings in each phrase
    ensure_ascii=False
):
    """
    Convert a video analysis payload into RAG dataset rows.

    Every returned row is {"document_title": str, "content_text": <JSON string>}.

    Segments are read from analysis_result["result"]["contents"][*] in two
    shapes: a plain "segments" list, and a typed "fields" -> "Segments" ->
    "valueArray" list. Transcript phrases come from "transcriptPhrases".

    Per-segment content_text (JSON) contains, in this order:
      "about", "video_url", "segment_id", "time_window" ({"start_ms", "end_ms"}),
      "segment_description", "transcript" (phrases whose time range touches the
      segment window, boundaries inclusive), "transcript_text" (those phrases'
      stripped texts joined with spaces).

    Each phrase is {"speaker", "start_ms", "end_ms", "text", "confidence"} plus
    "words" ([{"start_ms", "end_ms", "text"}, ...]) when include_words=True.

    When no segment is found, a single "Full Video" row is returned instead
    (carrying the full transcript if there is one). Otherwise, when
    include_full_transcript_row=True and phrases exist, a final
    "Full Transcript" row is appended.

    ensure_ascii is forwarded to json.dumps for every content_text.
    """
    def _nested_value(holder, field_name, value_key):
        # holder[field_name][value_key], treating a missing/None level as absent.
        return (holder.get(field_name) or {}).get(value_key)

    def _as_phrase(raw):
        # Normalise one raw transcript phrase into the output phrase schema.
        entry = {
            "speaker": raw.get("speaker"),
            "start_ms": raw.get("startTimeMs"),
            "end_ms": raw.get("endTimeMs"),
            "text": raw.get("text"),
            "confidence": raw.get("confidence")
        }
        if include_words:
            word_entries = []
            for word in raw.get("words") or []:
                word_entries.append({
                    "start_ms": word.get("startTimeMs"),
                    "end_ms": word.get("endTimeMs"),
                    "text": word.get("text")
                })
            entry["words"] = word_entries
        return entry

    def _touches(window_start, window_end, phrase_start, phrase_end):
        # Inclusive interval overlap; any missing bound means "no overlap".
        bounds = (window_start, window_end, phrase_start, phrase_end)
        if any(bound is None for bound in bounds):
            return False
        return not (window_start > phrase_end or window_end < phrase_start)

    def _serialize(payload):
        return json.dumps(payload, ensure_ascii=ensure_ascii)

    payload_root = (analysis_result or {}).get("result", {}) or {}
    content_items = payload_root.get("contents") or []

    segments = []
    all_phrases = []
    for content in content_items:
        # Shape 1: plain "segments" list
        for raw_segment in content.get("segments") or []:
            segments.append({
                "segmentId": raw_segment.get("segmentId"),
                "startTimeMs": raw_segment.get("startTimeMs"),
                "endTimeMs": raw_segment.get("endTimeMs"),
                "description": raw_segment.get("description") or ""
            })

        # Shape 2: typed fields -> Segments -> valueArray
        typed_items = _nested_value(content.get("fields") or {}, "Segments", "valueArray") or []
        for wrapper in typed_items:
            typed = (wrapper or {}).get("valueObject") or {}
            segments.append({
                "segmentId": _nested_value(typed, "SegmentId", "valueString"),
                "startTimeMs": _nested_value(typed, "StartTimeMs", "valueInteger"),
                "endTimeMs": _nested_value(typed, "EndTimeMs", "valueInteger"),
                "description": _nested_value(typed, "Description", "valueString") or ""
            })

        for raw_phrase in content.get("transcriptPhrases") or []:
            all_phrases.append(_as_phrase(raw_phrase))

    # No segments at all: emit one fallback row, with the transcript if any.
    if not segments:
        fallback = {
            "about": "This is a JSON string representing the overall video summary.",
            "video_url": video_url,
            "note": "No segment details available."
        }
        joined = " ".join([ph.get("text") or "" for ph in all_phrases]).strip()
        if joined:
            fallback["full_transcript_text"] = joined
            fallback["full_transcript"] = all_phrases  # could be large
        return [{
            "document_title": f"{title} • Full Video",
            "content_text": _serialize(fallback)
        }]

    rows = []

    # One row per segment, carrying the phrases that fall in its time window.
    for segment in segments:
        window_start = segment.get("startTimeMs")
        window_end = segment.get("endTimeMs")
        segment_id = segment.get("segmentId") or "?"

        matching = []
        for ph in all_phrases:
            if _touches(window_start, window_end, ph.get("start_ms"), ph.get("end_ms")):
                matching.append(ph)

        segment_payload = {
            "about": "This is a JSON string representing a slice of video analysis for RAG.",
            "video_url": video_url,
            "segment_id": segment_id,
            "time_window": {"start_ms": window_start, "end_ms": window_end},
            "segment_description": segment.get("description") or "",
            "transcript": matching,
            "transcript_text": " ".join([(ph.get("text") or "").strip() for ph in matching]).strip()
        }
        rows.append({
            "document_title": f"{title} • Segment {segment_id}",
            "content_text": _serialize(segment_payload)
        })

    # Optional closing row holding the whole transcript (useful for global search).
    if include_full_transcript_row and all_phrases:
        transcript_payload = {
            "about": "This is a JSON string representing the overall video transcript.",
            "video_url": video_url,
            "transcript": all_phrases,
            "transcript_text": " ".join([(ph.get("text") or "").strip() for ph in all_phrases]).strip()
        }
        rows.append({
            "document_title": f"{title} • Full Transcript",
            "content_text": _serialize(transcript_payload)
        })

    return rows


In [None]:
# Constructing the Video URL: <storage endpoint>/<container>/<blob name>
video_url = f"{storage_account_endpoint}/{storage_container_name}/BMW_circularity.mp4"

# Analyzing the video and building the dataset.
# analyze_video returns None when an HTTP request fails; build_simple_video_dataset
# treats a falsy analysis result as empty and then yields a single fallback row.
video_analysis_result = analyze_video(video_url)
video_dataset = build_simple_video_dataset(video_analysis_result, video_url, title="BMW_circularity_video")

# print the dataset in a pretty format
# (json.dumps escapes non-ASCII by default, so the "•" in titles prints as \u2022)
print(json.dumps(video_dataset, indent=2))

### Creating Functions for Performing Audio Analysis using Prebuilt Audio Analyzer

In [None]:
# Create a function to analyze audio using prebuilt audio analyzer - makes API calls to Content Understanding service
def analyze_audio(audio_url):
    """Run the prebuilt audio analyzer on an audio file and wait for the outcome.

    The audio URL is POSTed to the Content Understanding ``:analyze``
    endpoint. The ``id`` from that response is then used to poll the
    ``analyzerResults`` endpoint until the reported status is no longer a
    pending one.

    Args:
        audio_url: URL of the audio, sent as the ``url`` field of the request.

    Returns:
        The decoded JSON body of the last ``analyzerResults`` response, or
        ``None`` when any HTTP call fails (the error is printed, not raised).
    """
    import time  # local import: no other cell of this notebook imports time at top level

    api_version = "2025-05-01-preview"
    request_timeout_s = 30  # requests has no default timeout; avoid hanging forever
    poll_interval_s = 3
    # The service documents "NotStarted" as well as "Running" for operations
    # that are still in progress; treating only "Running" as pending would
    # stop polling too early. Compared lower-cased to tolerate casing changes.
    pending_statuses = {"notstarted", "running"}

    prebuilt_audio_analyzer_url = (
        f"{CONTENT_UNDERSTANDING_ENDPOINT}/contentunderstanding/analyzers/"
        f"prebuilt-audioAnalyzer:analyze?api-version={api_version}"
    )

    try:
        response = requests.post(
            prebuilt_audio_analyzer_url,
            headers={
                "Content-Type": "application/json",
                "Ocp-Apim-Subscription-Key": CONTENT_UNDERSTANDING_API_KEY,
            },
            json={"url": audio_url},
            timeout=request_timeout_s,
        )
        response.raise_for_status()
        analysis_id = response.json().get("id")
        print("Analysis ID:", analysis_id)

        # Using the analysis ID to get results; polling until the analysis is complete
        get_result_url = (
            f"{CONTENT_UNDERSTANDING_ENDPOINT}/contentunderstanding/analyzerResults/"
            f"{analysis_id}?api-version={api_version}"
        )
        headers = {"Ocp-Apim-Subscription-Key": CONTENT_UNDERSTANDING_API_KEY}

        while True:
            status_response = requests.get(
                get_result_url, headers=headers, timeout=request_timeout_s
            )
            status_response.raise_for_status()
            audio_analysis_result = status_response.json()
            analysis_status = audio_analysis_result.get("status")
            print("Current Analysis Status:", analysis_status)
            if str(analysis_status).lower() not in pending_statuses:
                # This response comes from the same URL the result is read
                # from, so it is used directly instead of issuing another GET.
                break
            time.sleep(poll_interval_s)  # Wait before polling again

        print("Audio Analysis Result:", audio_analysis_result)
        return audio_analysis_result

    except requests.RequestException as e:
        print(f"Error occurred: {e}")
        return None

# Creating a Function to extract relevant information from the audio analysis result and prepare it for RAG
def build_simple_audio_dataset(analysis_result, audio_url, title, include_full_transcript_row=True):
    """
    Convert an audio analysis payload into RAG dataset rows.

    Each row is {"document_title": str, "content_text": <JSON string>}.
    One row is produced per entry of analysis_result["result"]["contents"],
    holding the entry's kind, time window, the first 400 characters of its
    markdown, an excerpt built from its first three transcript phrases and,
    when present, the "Summary" field's valueString.

    If include_full_transcript_row is true and any phrase was seen, a final
    "Full Transcript" row is appended. If nothing at all was produced, a
    single placeholder "Full Audio" row is returned.
    """
    def _joined_text(phrase_list):
        # Stripped phrase texts joined by single spaces (missing text -> "").
        return " ".join((ph.get("text") or "").strip() for ph in phrase_list).strip()

    payload_root = (analysis_result or {}).get("result", {}) or {}

    rows = []
    all_phrases = []

    for position, content in enumerate(payload_root.get("contents") or [], start=1):
        record = {
            "about": "This is a JSON string representing a slice of audio analysis for RAG.",
            "audio_url": audio_url,
            "content_index": position,
            "kind": content.get("kind"),
            "time_window": {
                "start_ms": content.get("startTimeMs"),
                "end_ms": content.get("endTimeMs")
            }
        }

        item_phrases = content.get("transcriptPhrases") or []
        all_phrases += item_phrases

        record["analyzer_markdown_excerpt"] = (content.get("markdown") or "")[:400]
        # Short transcript excerpt: first three phrases only
        record["transcript_excerpt"] = _joined_text(item_phrases[:3])

        # Optional summary field (only emitted when non-empty)
        summary = ((content.get("fields") or {}).get("Summary") or {}).get("valueString")
        if summary:
            record["summary"] = summary

        rows.append({
            "document_title": f"{title} • Segment {position}",
            "content_text": json.dumps(record, ensure_ascii=False)
        })

    # Optional row with the whole transcript, useful for global search
    if include_full_transcript_row and all_phrases:
        normalised = []
        for ph in all_phrases:
            normalised.append({
                "speaker": ph.get("speaker"),
                "start_ms": ph.get("startTimeMs"),
                "end_ms": ph.get("endTimeMs"),
                "text": ph.get("text"),
                "confidence": ph.get("confidence")
            })
        transcript_payload = {
            "about": "This is a JSON string representing the full audio transcript.",
            "audio_url": audio_url,
            "transcript_text": _joined_text(all_phrases),
            "transcript": normalised
        }
        rows.append({
            "document_title": f"{title} • Full Transcript",
            "content_text": json.dumps(transcript_payload, ensure_ascii=False)
        })

    if rows:
        return rows

    # Nothing produced: return a single placeholder record
    placeholder = {
        "about": "This is a JSON string representing a generic audio analysis record.",
        "audio_url": audio_url,
        "note": "No content segments found in analysis."
    }
    return [{
        "document_title": f"{title} • Full Audio",
        "content_text": json.dumps(placeholder, ensure_ascii=False)
    }]


In [None]:
# Constructing the Audio URL: <storage endpoint>/<container>/<blob name>
audio_url = f"{storage_account_endpoint}/{storage_container_name}/BMW_forwardism.mp3"

# Analyzing the audio and building the dataset.
# analyze_audio returns None when an HTTP request fails; build_simple_audio_dataset
# treats a falsy analysis result as empty and then yields a single fallback row.
audio_analysis_result = analyze_audio(audio_url)
audio_dataset = build_simple_audio_dataset(audio_analysis_result, audio_url, title="BMW_forwardism_audio")

# print the dataset in a pretty format
# (json.dumps escapes non-ASCII by default, so the "•" in titles prints as \u2022)
print(json.dumps(audio_dataset, indent=2))

### Creating Functions for Performing Image Analysis using Prebuilt Image Analyzer

In [None]:
# Create a function to analyze image using prebuilt image analyzer - makes API calls to Content Understanding service
def analyze_image(image_url):
    """Run the prebuilt image analyzer on an image and wait for the outcome.

    The image URL is POSTed to the Content Understanding ``:analyze``
    endpoint. The ``id`` from that response is then used to poll the
    ``analyzerResults`` endpoint until the reported status is no longer a
    pending one.

    Args:
        image_url: URL of the image, sent as the ``url`` field of the request.

    Returns:
        The decoded JSON body of the last ``analyzerResults`` response, or
        ``None`` when any HTTP call fails (the error is printed, not raised).
    """
    import time  # local import: no other cell of this notebook imports time at top level

    api_version = "2025-05-01-preview"
    request_timeout_s = 30  # requests has no default timeout; avoid hanging forever
    poll_interval_s = 1
    # The service documents "NotStarted" as well as "Running" for operations
    # that are still in progress; treating only "Running" as pending would
    # stop polling too early. Compared lower-cased to tolerate casing changes.
    pending_statuses = {"notstarted", "running"}

    prebuilt_image_analyzer_url = (
        f"{CONTENT_UNDERSTANDING_ENDPOINT}/contentunderstanding/analyzers/"
        f"prebuilt-imageAnalyzer:analyze?api-version={api_version}"
    )

    try:
        response = requests.post(
            prebuilt_image_analyzer_url,
            headers={
                "Content-Type": "application/json",
                "Ocp-Apim-Subscription-Key": CONTENT_UNDERSTANDING_API_KEY,
            },
            json={"url": image_url},
            timeout=request_timeout_s,
        )
        response.raise_for_status()
        analysis_id = response.json().get("id")
        print("Analysis ID:", analysis_id)

        # Using the analysis ID to get results; polling until the analysis is complete
        get_result_url = (
            f"{CONTENT_UNDERSTANDING_ENDPOINT}/contentunderstanding/analyzerResults/"
            f"{analysis_id}?api-version={api_version}"
        )
        headers = {"Ocp-Apim-Subscription-Key": CONTENT_UNDERSTANDING_API_KEY}

        while True:
            status_response = requests.get(
                get_result_url, headers=headers, timeout=request_timeout_s
            )
            status_response.raise_for_status()
            image_analysis_result = status_response.json()
            analysis_status = image_analysis_result.get("status")
            print("Current Analysis Status:", analysis_status)
            if str(analysis_status).lower() not in pending_statuses:
                # This response comes from the same URL the result is read
                # from, so it is used directly instead of issuing another GET.
                break
            time.sleep(poll_interval_s)  # Wait before polling again

        print("Image Analysis Result:", image_analysis_result)
        return image_analysis_result

    except requests.RequestException as e:
        print(f"Error occurred: {e}")
        return None

# Creating a Function to extract relevant information from the image analysis result and prepare it for RAG
def build_simple_image_dataset(analysis_result, image_url, title):
    """
    Convert an image analysis payload into RAG dataset rows.

    One row ({"document_title", "content_text"}) is produced per entry of
    analysis_result["result"]["contents"]. The JSON in content_text carries
    the entry's kind, the first 400 characters of its markdown, and either
    the "Summary" field's valueString (key "summary") or, when that is
    missing/empty, the untouched fields mapping (key "fields_raw").

    When there are no content entries, a single placeholder "Full Image"
    row is returned.
    """
    payload_root = (analysis_result or {}).get("result", {}) or {}
    content_items = payload_root.get("contents") or []

    def _row_for(position, content):
        record = {
            "about": "This is a JSON string representing a slice of image analysis for RAG.",
            "image_url": image_url,
            "content_index": position,
            "kind": content.get("kind"),
            "analyzer_markdown_excerpt": (content.get("markdown") or "")[:400],
        }
        raw_fields = content.get("fields") or {}
        summary_text = (raw_fields.get("Summary") or {}).get("valueString")
        # Prefer the compact summary; otherwise keep every field (verbose).
        if not summary_text:
            record["fields_raw"] = raw_fields
        else:
            record["summary"] = summary_text
        return {
            "document_title": f"{title} • Content {position}",
            "content_text": json.dumps(record, ensure_ascii=False)
        }

    rows = [_row_for(position, content) for position, content in enumerate(content_items, start=1)]
    if rows:
        return rows

    placeholder = {
        "about": "This is a JSON string representing a generic image analysis record.",
        "image_url": image_url,
        "note": "No content found in analysis."
    }
    return [{
        "document_title": f"{title} • Full Image",
        "content_text": json.dumps(placeholder, ensure_ascii=False)
    }]

In [None]:
# Constructing the Image URL: <storage endpoint>/<container>/<blob name>
image_url = f"{storage_account_endpoint}/{storage_container_name}/image.png"

# Analyzing the image and building the dataset.
# analyze_image returns None when an HTTP request fails; build_simple_image_dataset
# treats a falsy analysis result as empty and then yields a single fallback row.
image_analysis_result = analyze_image(image_url)
image_dataset = build_simple_image_dataset(image_analysis_result, image_url, title="Sample Image")

# print the dataset in a pretty format
# (json.dumps escapes non-ASCII by default, so the "•" in titles prints as \u2022)
print(json.dumps(image_dataset, indent=2))

### Extracting Textual Data from PDF Document and Preparing it for RAG

In [None]:
import io
import json
import re
import requests
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams


def _is_url(s: str) -> bool:
    return s.lower().startswith(("http://", "https://"))

def _download_bytes(url: str) -> bytes:
    """Fetch *url* with a GET request (30 s timeout) and return the raw body.

    Raises whatever ``raise_for_status`` raises for an error HTTP status.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    payload = response.content
    return payload

def _clean_text(text: str) -> str:
    text = text.replace("\r", "")
    text = re.sub(r"[ \t]+\n", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def extract_pdf_text(pdf_source: str):
    """
    Read a PDF and return its text page by page.

    pdf_source is either an http(s) URL (downloaded into memory first) or a
    path to a local file. The extracted text is split on the form-feed
    character and each piece is cleaned with _clean_text; an empty final
    piece is dropped.

    Returns: list of {"pageNumber": int, "text": str}, pageNumber starting at 1.
    """
    if _is_url(pdf_source):
        stream = io.BytesIO(_download_bytes(pdf_source))
    else:
        stream = open(pdf_source, "rb")

    # The context manager closes the stream even when extraction fails.
    with stream:
        text_buffer = io.StringIO()
        layout_params = LAParams()
        extract_text_to_fp(stream, text_buffer, laparams=layout_params, output_type="text")
        chunks = text_buffer.getvalue().split("\x0c")  # pdfminer page delimiter

    final_position = len(chunks)
    pages = []
    for position, chunk in enumerate(chunks, start=1):
        page_text = _clean_text(chunk)
        is_trailing_blank = position == final_position and not page_text
        if not is_trailing_blank:
            pages.append({"pageNumber": position, "text": page_text})
    return pages


def build_simple_pdf_dataset(pages, pdf_source: str, title: str):
    """
    Turn extracted PDF pages into dataset rows.

    pages is an iterable of {"pageNumber": ..., "text": ...} mappings. Each
    one becomes a row {"document_title": "<title> • Page <n>",
    "content_text": <JSON string>} whose JSON holds the source, the page
    number and the page text (non-ASCII characters are kept as-is).
    """
    def _row_for(page):
        number = page["pageNumber"]
        serialized = json.dumps(
            {
                "about": "This is a JSON string representing a slice of PDF for RAG.",
                "pdf_source": pdf_source,
                "page_number": number,
                "page_text": page["text"]
            },
            ensure_ascii=False,
        )
        return {
            "document_title": f"{title} • Page {number}",
            "content_text": serialized
        }

    return [_row_for(page) for page in pages]


In [None]:
# Constructing the PDF URL: <storage endpoint>/<container>/<blob name>
pdf_url = f"{storage_account_endpoint}/{storage_container_name}/BMW_sustainable_natural_rubber.pdf"

# Extracting text from the PDF and building the dataset.
# Because pdf_url is an http(s) URL, extract_pdf_text downloads it with a plain GET
# (no auth headers are sent), so the blob presumably has to be readable without
# credentials or the URL must carry its own token - TODO confirm for your storage account.
extracted_text = extract_pdf_text(pdf_url)
pdf_dataset = build_simple_pdf_dataset(extracted_text, pdf_url, title="BMW Sustainable Natural Rubber PDF")

# Pretty-print one row per extracted page
print(json.dumps(pdf_dataset, indent=2))

### Combining Datasets for Multi-Modal RAG

In [None]:
# Concatenate the per-modality row lists into the single dataset that is
# embedded and uploaded by the following cells.
final_dataset = video_dataset + audio_dataset + image_dataset + pdf_dataset
print(f"Total dataset rows: {len(final_dataset)}")

# Only a sample is shown here: the slice makes the output match the stated
# intent instead of dumping every row of the combined dataset.
print(json.dumps(final_dataset[:2], indent=2))  # print first 2 rows as a sample

### Creating an Azure OpenAI Client

In [None]:
from openai import AzureOpenAI

# Shared Azure OpenAI client; the endpoint and key are read from the environment.
openai_client = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_AI_ENDPOINT"),
    api_version="2024-02-15-preview",
    api_key=os.getenv("AZURE_AI_API_KEY"),
)

### Creating a Function for Generating Embeddings using Azure OpenAI

In [None]:
from openai import AzureOpenAI

def generate_embeddings(text, embedding_model_name):
    """Return the embedding for *text* produced by *embedding_model_name*.

    Calls the embeddings endpoint through the module-level ``openai_client``
    and returns the ``embedding`` entry of the first item in the response's
    ``data`` list.
    """
    payload = openai_client.embeddings.create(
        model=embedding_model_name,
        input=text,
    ).model_dump()

    first_item = payload["data"][0]
    return first_item["embedding"]

### Finalising RAG Dataset with Embeddings

In [None]:
import uuid
# The embedding model name does not change between rows, so read it once
# instead of hitting the environment on every iteration.
embedding_model_name = os.getenv("EMBEDDING_MODEL_NAME")

# Enrich every row in place with its embedding and a fresh random id.
for data in final_dataset:
    data["content_embedding"] = generate_embeddings(data["content_text"], embedding_model_name)
    data["content_id"] = uuid.uuid4().hex

# printing the final dataset with embeddings in a file
# (explicit encoding so the file does not depend on the platform default)
with open("data.json", "w", encoding="utf-8") as f:
    json.dump(final_dataset, f, indent=2)

### Creating our Azure AI Search Index in Azure Search Service

In [None]:
# Azure AI Search connection settings (trailing "/" removed so paths can be appended)
search_service_endpoint = os.getenv("SEARCH_SERVICE_ENDPOINT").strip().rstrip('/')
search_service_api_key = os.getenv("SEARCH_SERVICE_API_KEY")

index_creation_url = search_service_endpoint +"/indexes?api-version=2025-08-01-preview"

headers = {
    "Content-Type": "application/json",
    "api-key": search_service_api_key
}

# reading the index schema from index.json file
# (explicit encoding so the schema is decoded the same way on every platform)
with open("index.json", "r", encoding="utf-8") as f:
    index_schema = json.load(f)

body = index_schema

response = requests.post(index_creation_url, headers=headers, json=body)

if response.status_code == 201:
    print("Index created successfully.")
    print("Response body:", response.json())
else:
    # Only the raw text is printed here: an error response may have an empty
    # or non-JSON body, in which case response.json() would raise and hide
    # the status code and message that actually explain the failure.
    print("Error creating index:", response.status_code, response.text)


### Preparing the RAG Data for Bulk Upload to Azure AI Search Index

In [None]:
# Load the embedded dataset written by the embedding step.
with open("data.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

# The loop variable is deliberately not called `object`: in a notebook it
# would stay bound after the loop and shadow the builtin in later cells.
for document in dataset:
    document["@search.action"] = "upload" # append the action to each object

# Persist the upload-ready documents for the bulk upload step.
with open("data_with_actions.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=2)

### Pushing RAG formatted data to Azure AI Search Index

In [None]:
# Load the documents that already carry their "@search.action".
with open("data_with_actions.json", "r", encoding="utf-8") as f:
    rag_dataset = json.load(f)


bulk_upload_url = search_service_endpoint +"/indexes/multi-modal-rag-index/docs/index?api-version=2025-08-01-preview"

headers = {
    "Content-Type": "application/json",
    "api-key": search_service_api_key
}

# The loaded list is sent as-is; copying it first serves no purpose.
body = {
    "value": rag_dataset
}

response = requests.post(bulk_upload_url, headers=headers, json=body)

if response.status_code == 200:
    print("Bulk upload successful.")
    print("Response body:", response.json())
else:
    # Only the raw text is printed here: an error response may have an empty
    # or non-JSON body, in which case response.json() would raise and hide
    # the status code and message that actually explain the failure.
    print("Error during bulk upload:", response.status_code, response.text)