In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [Building Data for Multimodal Question-Answering System]

[Change the link]

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_1_5_flash.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fgetting-started%2Fintro_gemini_1_5_flash.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/getting-started/intro_gemini_1_5_flash.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_1_5_flash.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>


## Overview

[add overview]

[add what you learned in the previous notebook and link to it] - skip if this is the first notebook


[Context of this notebook compared to overall idea]

## Getting Started

### Install Dependencies


In [1]:
! pip3 install --upgrade --user --quiet google-cloud-aiplatform

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[0m

### Restart runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.

In [2]:
import sys

# Restart the kernel so the freshly installed packages are picked up.
# The restart is only triggered when the "google.colab" module is loaded;
# in any other environment this cell does nothing.
if "google.colab" in sys.modules:
    import IPython

    app = IPython.Application.instance()
    # NOTE(review): the True argument presumably requests a restart rather than
    # a plain shutdown, matching the surrounding notebook text — confirm.
    app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the cell below to authenticate your environment.


### Set Google Cloud project information and initialize Vertex AI SDK

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [1]:
import sys

# Authenticate the user only when the "google.colab" module is loaded;
# in any other environment this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

In [2]:
import vertexai
from google.cloud import storage

# --- Project settings (edit these before running) ---
PROJECT_ID = ""  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}
BUCKET_NAME = "mlops-for-genai"  # @param {type:"string"}

# --- Vertex AI SDK: bind it to the project and region above ---
vertexai.init(project=PROJECT_ID, location=LOCATION)

# --- Cloud Storage: client for the project, plus a handle to the bucket ---
storage_client = storage.Client(project=PROJECT_ID)
bucket = storage_client.bucket(BUCKET_NAME)

### Import libraries


In [3]:
import IPython.display
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

from vertexai.generative_models import (
    GenerationConfig,
    GenerativeModel,
    HarmBlockThreshold,
    HarmCategory,
    Part,
)
from typing import List
from google.cloud.storage import Bucket
import json
import pandas as pd
import os

### Load the models

Learn more about all [Gemini API models on Vertex AI](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models).


In [4]:
# Model identifiers passed to GenerativeModel below.
MODEL_ID_PRO = "gemini-1.5-pro-001"  # @param {type:"string"}
MODEL_ID_FLASH = "gemini-1.5-flash-001" # @param {type:"string"}

# One GenerativeModel handle per model ID.
model_pro = GenerativeModel(MODEL_ID_PRO)
model_flash = GenerativeModel(MODEL_ID_FLASH)

### Add data path

In [5]:
# Locations of the prototype and production unstructured data sets.
# NOTE(review): presumably object prefixes inside the BUCKET_NAME bucket — confirm.
# NOTE(review): "finanace" looks like a misspelling of "finance"; it is left as-is
# because the literal must match the real storage path — confirm before renaming.
prototype_data = "multimodal-finanace-qa/data/unstructured/prototype/"  # @param {type:"string"}
production_data = "multimodal-finanace-qa/data/unstructured/production/"  # @param {type:"string"}

## Manual Review Process for Multimodal RAG Question-Answer-Citation Pairs

**SME Review (1-5 SMEs)**

**Thorough Review:**

*   **Answer Quality:**  Assess if the generated answer is the most accurate and comprehensive representation of the expected response. Modify if necessary.
    *   **Voting:**
        *   **Accurate (1/0):** Indicate whether the answer is accurate and relevant.
        *   **Comprehensive (1/0):** Indicate whether the answer is comprehensive and addresses all aspects of the question.
        *   **Well-written (1/0):** Indicate whether the answer is well-written and easy to understand.

*   **Question Clarity:** Evaluate if the generated question effectively captures the intended query. Refine if needed.
    *   **Voting:**
        *   **Clear (1/0):** Indicate whether the question is clear and unambiguous.
        *   **Relevant (1/0):** Indicate whether the question is relevant to the context.
        *   **Concise (1/0):** Indicate whether the question is concise and to the point.

*   **Citation Validity:** Verify the authenticity and relevance of the provided citation.
    *   **Voting:**
        *   **Authentic (1/0):** Indicate whether the citation is from a reliable source.
        *   **Relevant (1/0):** Indicate whether the citation supports the answer.
        *   **Accessible (1/0):** Indicate whether the citation is easily accessible.

*   **Question Type Alignment:** Confirm that the assigned 'question_type' accurately reflects the nature of the generated question.
    *   **Voting:**
        *   **Accurate (1/0):** Indicate whether the question type is accurate.

*   **Compliance & Policy Adherence:** Flag any question-answer-citation pairs that violate internal compliance guidelines, policies, or expectations.
    *   **Voting:**
        *   **Compliant (1/0):** Indicate whether the pair adheres to internal policies.
        *   **Sensitive Content (1/0):** Indicate whether the pair contains sensitive or offensive content.
        *   **Bias (1/0):** Indicate whether the pair exhibits any bias.

*   **Strategies & Policies Adherence:** Evaluate if the question-answer-citation pair aligns with the defined strategies and policies for RAG data generation.
    *   **Voting:**
        *   **Adheres to Strategies (1/0):** Indicate whether the pair aligns with the defined strategies.
        *   **Adheres to Policies (1/0):** Indicate whether the pair adheres to the defined policies.

**Additional Voting for all methods**

*   **Drop (1/0):** Indicate whether the pair should be discarded entirely.
*   **Modify (1/0):** Signal if the pair requires modification.
*   **Correct Citation (1/0):** Specify if the citation needs correction.
*   **Confidence (1-5):** Indicate the SME's confidence in their evaluation.

**Preserve Modified Pairs:** Save the modified question-answer-citation pairs for potential use as few-shot examples in future model fine-tuning.

**Remember:**

*   The manual review process is crucial for ensuring the quality and reliability of the RAG system's outputs.
*   By incorporating diverse SME perspectives and implementing robust review strategies, you can enhance the accuracy, fairness, and overall effectiveness of your LLM-powered RAG system.

**Please note:** This section is a raw template. Customize it to align with your specific project requirements and internal workflows.

## Manual Review Process for Multimodal RAG Question-Answer-Citation Pairs (Summary Table)

**SME Review (1-5 SMEs)**

| Review Aspect | Voting Criteria | Description |
|---|---|---|
| **Answer Quality** | Accurate (1/0) | Is the answer factually correct and pertinent to the question? |
|  | Comprehensive (1/0) | Does the answer fully address all aspects of the query? |
|  | Well-written (1/0) | Is the answer clear, concise, and grammatically sound? |
| **Question Clarity** | Clear (1/0) | Is the question unambiguous and easy to understand? |
|  | Relevant (1/0) | Does the question pertain to the given context or topic? |
|  | Concise (1/0) | Is the question formulated in a brief and to-the-point manner? |
| **Citation Validity** | Authentic (1/0) | Is the citation from a reputable and trustworthy source? |
|  | Relevant (1/0) | Does the citation directly support the provided answer? |
|  | Accessible (1/0) | Can the citation be easily located and verified? |
| **Question Type Alignment** | Accurate (1/0) | Does the assigned 'question_type' correctly reflect the nature of the question? |
| **Compliance & Policy Adherence** | Compliant (1/0) | Does the pair adhere to internal guidelines and policies? |
|  | Sensitive Content (1/0) | Does the pair contain any potentially offensive or harmful material? |
|  | Bias (1/0) | Does the pair exhibit any form of prejudice or discrimination? |
| **Strategies & Policies Adherence** | Adheres to Strategies (1/0) | Is the pair aligned with the predefined data generation strategies? |
|  | Adheres to Policies (1/0) | Does the pair comply with the established data generation policies? |
| **Additional Voting** | Drop (1/0) | Should the pair be removed from the dataset entirely? |
|  | Modify (1/0) | Does the pair require adjustments or corrections? |
|  | Correct Citation (1/0) | Is the citation inaccurate or in need of revision? |
|  | Confidence (1-5) | How confident is the SME in their overall evaluation of the pair? |

**Preserve Modified Pairs:** Save the modified question-answer-citation pairs for potential use as few-shot examples in future model fine-tuning.

**Remember:**

*   The manual review process is crucial for ensuring the quality and reliability of the RAG system's outputs.
*   By incorporating diverse SME perspectives and implementing robust review strategies, you can enhance the accuracy, fairness, and overall effectiveness of your LLM-powered RAG system.

## Levels of Questions in Multimodal RAG Systems (Financial Context)

| Question Type | Description | Complexity | Reasoning | Examples |
|---|---|---|---|---|
| Factual Extraction Questions | Seek specific information or facts directly from retrieved multimodal data. | Low to Moderate | Minimal to Basic |  "What was the company's net income in Q2 2023?" (from financial report)<br> "Identify the key trends in the company's stock price over the past year." (from financial graph)<br> "Who is the CEO of the company?" (from launch video or earnings call audio) |
| Synthesis Questions | Require combining information from multiple parts or modalities of retrieved data. | Moderate | Integration of information from various sources. | "Summarize the company's financial performance for the fiscal year based on the annual report and earnings call transcript."<br> "What are the key features of the new product as highlighted in the launch video and discussed in the earnings call?" |
| Inferential Questions | Require going beyond explicit information and making deductions or drawing conclusions from multimodal data. | High | Logical reasoning, potentially involving cross-modal inference and application of financial knowledge. | "Based on the CEO's tone in the earnings call and the company's financial performance in the recent quarter, what are the potential challenges the company might be facing?"<br> "What is the company's strategic focus for the next year based on the product roadmap presented in the launch video and the discussions in the earnings call?" |
| Multi-hop Reasoning Questions | Involve chaining multiple facts or evidence across modalities to arrive at an answer. | Very High | Complex reasoning across different data types, potentially involving causal or temporal relationships. | "How did the new product launch, as detailed in the launch video, impact the company's revenue in the subsequent quarter, as reported in the financial statements?"<br> "Identify any inconsistencies between the financial projections mentioned in the earnings call and the actual results presented in the quarterly report." |
| Comparative Questions | Explicitly ask for comparisons between entities, events, or concepts across modalities. | Moderate | Identification and comparison of relevant attributes across different data types. | "Compare the company's revenue growth in the past two quarters as shown in the financial charts."<br> "How do the features of the newly launched product, as shown in the video, compare to those of the previous generation product mentioned in the last year's annual report?" |
| Temporal Questions | Involve understanding the timeline of events or changes in states across multimodal data. | Moderate | Extraction and alignment of temporal information from various sources. | "Trace the evolution of the company's product line over the past five years based on launch videos and annual reports."<br> "How has the company's profit margin changed over the last three fiscal years as shown in the financial statements?" |
| Hypothetical Questions | Ask about potential outcomes or scenarios based on multimodal data. | High | Combines information from various modalities with financial knowledge and makes predictions or inferences. | "If the company were to expand into a new market, as hinted at in the earnings call, what potential impact could it have on its revenue?"<br> "What could be the potential challenges if the company decides to discontinue one of its product lines, based on the information in the latest earnings call and product launch videos?" |
| Open-Ended Questions | Do not have a single correct answer, invite opinions, discussions, or creative responses based on multimodal data. | High | Involves synthesis, inference, and potentially generation of novel ideas, drawing from various modalities. | "What are the key strengths and weaknesses of the company's current business strategy based on all available information?"<br> "Discuss the potential risks and opportunities associated with investing in this company." |
| Code Generation/Debugging Questions (Financial Context) | Target understanding and generation of code or identification of errors, specifically related to financial analysis or modeling. | High | Combines information retrieval from various modalities with logical reasoning about financial code and data. | "Write a Python script to visualize the company's quarterly revenue trend over the past two years using the data from the financial reports."<br> "Identify any potential errors in the calculation of the company's debt-to-equity ratio in the provided financial spreadsheet." |
| Ambiguous Questions | Can have multiple interpretations or require clarification, especially when dealing with multimodal data. | High | Identifying ambiguities across different modalities, seeking clarification, or providing multiple interpretations. | "Tell me about the company's performance." (Ambiguous - could refer to financial, operational, or other aspects)<br> "What is the outlook?" (Ambiguous - could be financial outlook, product outlook, or market outlook) |
| Contextual Questions | Rely on understanding the broader context of a conversation or series of interactions, considering multimodal data. | High | Tracking previous queries and responses across modalities, identifying entities and their relationships. | "Can you show me a graph illustrating that trend?" (Contextual - assumes a previous mention of a specific trend)<br> "Explain the impact of that event on the company's stock price." (Contextual - refers to a previously discussed event) |
| Subjective Questions | Ask for opinions, preferences, or value judgments based on multimodal data, potentially requiring nuanced understanding of financial implications. | High | Identifying diverse perspectives across modalities, acknowledging subjectivity, and potentially offering a balanced or personalized response. | "Is the company's current CEO doing a good job?"<br> "Should the company consider diversifying its product portfolio?" |
| Creative Questions | Call for imaginative or innovative responses based on multimodal data, potentially involving financial storytelling or idea generation. | Very High | Combining information from various modalities with creative thinking, generating novel connections, or producing original content related to finance. | "Create a compelling narrative highlighting the company's key achievements and milestones over the years, drawing from launch videos and annual reports."<br> "Propose a new marketing campaign for the recently launched product, incorporating insights from the launch video and target audience analysis in the market research report." |
| Domain-Specific Questions (Finance) | Pertain to specialized areas within finance, requiring in-depth knowledge and understanding of specific terminology and concepts across modalities. | Varies | Necessitates access to relevant financial documents, potentially specialized language models, and understanding of financial jargon across text, audio, and visual data. | "Explain the concept of 'discounted cash flow' and how it's used in valuing the company."<br> "Analyze the company's risk exposure based on the information in the financial statements and risk factors discussed in the earnings call." |
| Counterfactual Questions | Explore "what if" scenarios by altering past facts or events. | Very High | Understanding cause-and-effect, simulating alternative realities, predicting outcomes. | "How would the company's stock price have been affected if they had not acquired that competitor last year, based on the market conditions at that time?" |
| Ethical/Moral Questions | Delve into the ethical or moral implications of a company's actions or decisions. | High | Understanding ethical frameworks, societal norms, and potentially generating nuanced arguments. |  "Was the company's decision to lay off a significant portion of its workforce during the economic downturn morally justifiable?" |
| Causal Questions | Seek to identify cause-and-effect relationships between events or factors. | High | Disentangling complex relationships, considering multiple variables, identifying confounding factors. | "What were the key factors that contributed to the company's significant increase in market share over the past two years?" |
| Prediction Questions | Ask for forecasts or predictions about future events or trends. | High | Identifying patterns, trends, and potentially utilizing predictive modeling. | "Based on the company's current financial performance and market trends, what is the likely outlook for its stock price in the next six months?" |
| Anomaly Detection Questions | Aim to identify unusual or unexpected patterns or data points. | High | Understanding normal patterns and deviations, potentially involving statistical analysis. | "Are there any unusual fluctuations in the company's cash flow statement that warrant further investigation?" |

## Data Generation

In [6]:
#@title Defining Variables

# Taxonomy of question types for a multimodal RAG system (financial context).
# Maps each question-type name to a dict with exactly four keys:
#   "Description" - what the question type asks for
#   "Complexity"  - qualitative difficulty label
#   "Reasoning"   - the kind of reasoning required
#   "Examples"    - list of sample questions
# Every entry uses the same "Examples" key (some entries previously used
# "Example"), so the dict can be consumed uniformly without per-entry
# special-casing.
question_types_dict = {
    "Factual Extraction Questions": {
        "Description": "Seek specific information or facts directly from retrieved multimodal data.",
        "Complexity": "Low to Moderate",
        "Reasoning": "Minimal to Basic",
        "Examples": [
            "What was the company's net income in Q2 2023? (from financial report)",
            "Identify the key trends in the company's stock price over the past year. (from financial graph)",
            "Who is the CEO of the company? (from launch video or earnings call audio)"
        ]
    },
    "Synthesis Questions": {
        "Description": "Require combining information from multiple parts or modalities of retrieved data.",
        "Complexity": "Moderate",
        "Reasoning": "Integration of information from various sources.",
        "Examples": [
            "Summarize the company's financial performance for the fiscal year based on the annual report and earnings call transcript.",
            "What are the key features of the new product as highlighted in the launch video and discussed in the earnings call?"
        ]
    },
    "Inferential Questions": {
        "Description": "Require going beyond explicit information and making deductions or drawing conclusions from multimodal data.",
        "Complexity": "High",
        "Reasoning": "Logical reasoning, potentially involving cross-modal inference and application of financial knowledge.",
        "Examples": [
            "Based on the CEO's tone in the earnings call and the company's financial performance in the recent quarter, what are the potential challenges the company might be facing?",
            "What is the company's strategic focus for the next year based on the product roadmap presented in the launch video and the discussions in the earnings call?"
        ]
    },
    "Multi-hop Reasoning Questions": {
        "Description": "Involve chaining multiple facts or evidence across modalities to arrive at an answer.",
        "Complexity": "Very High",
        "Reasoning": "Complex reasoning across different data types, potentially involving causal or temporal relationships.",
        "Examples": [
            "How did the new product launch, as detailed in the launch video, impact the company's revenue in the subsequent quarter, as reported in the financial statements?",
            "Identify any inconsistencies between the financial projections mentioned in the earnings call and the actual results presented in the quarterly report."
        ]
    },
    "Comparative Questions": {
        "Description": "Explicitly ask for comparisons between entities, events, or concepts across modalities.",
        "Complexity": "Moderate",
        "Reasoning": "Identification and comparison of relevant attributes across different data types.",
        "Examples": [
            "Compare the company's revenue growth in the past two quarters as shown in the financial charts.",
            "How do the features of the newly launched product, as shown in the video, compare to those of the previous generation product mentioned in the last year's annual report?"
        ]
    },
    "Temporal Questions": {
        "Description": "Involve understanding the timeline of events or changes in states across multimodal data.",
        "Complexity": "Moderate",
        "Reasoning": "Extraction and alignment of temporal information from various sources.",
        "Examples": [
            "Trace the evolution of the company's product line over the past five years based on launch videos and annual reports.",
            "How has the company's profit margin changed over the last three fiscal years as shown in the financial statements?"
        ]
    },
    "Hypothetical Questions": {
        "Description": "Ask about potential outcomes or scenarios based on multimodal data.",
        "Complexity": "High",
        "Reasoning": "Combines information from various modalities with financial knowledge and makes predictions or inferences.",
        "Examples": [
            "If the company were to expand into a new market, as hinted at in the earnings call, what potential impact could it have on its revenue?",
            "What could be the potential challenges if the company decides to discontinue one of its product lines, based on the information in the latest earnings call and product launch videos?"
        ]
    },
    "Open-Ended Questions": {
        "Description": "Do not have a single correct answer, invite opinions, discussions, or creative responses based on multimodal data.",
        "Complexity": "High",
        "Reasoning": "Involves synthesis, inference, and potentially generation of novel ideas, drawing from various modalities.",
        "Examples": [
            "What are the key strengths and weaknesses of the company's current business strategy based on all available information?",
            "Discuss the potential risks and opportunities associated with investing in this company."
        ]
    },
    "Code Generation/Debugging Questions (Financial Context)": {
        "Description": "Target understanding and generation of code or identification of errors, specifically related to financial analysis or modeling.",
        "Complexity": "High",
        "Reasoning": "Combines information retrieval from various modalities with logical reasoning about financial code and data.",
        "Examples": [
            "Write a Python script to visualize the company's quarterly revenue trend over the past two years using the data from the financial reports.",
            "Identify any potential errors in the calculation of the company's debt-to-equity ratio in the provided financial spreadsheet."
        ]
    },
    "Ambiguous Questions": {
        "Description": "Can have multiple interpretations or require clarification, especially when dealing with multimodal data.",
        "Complexity": "High",
        "Reasoning": "Identifying ambiguities across different modalities, seeking clarification, or providing multiple interpretations.",
        "Examples": [
            "Tell me about the company's performance. (Ambiguous - could refer to financial, operational, or other aspects)",
            "What is the outlook? (Ambiguous - could be financial outlook, product outlook, or market outlook)"
        ]
    },
    "Contextual Questions": {
        "Description": "Rely on understanding the broader context of a conversation or series of interactions, considering multimodal data.",
        "Complexity": "High",
        "Reasoning": "Tracking previous queries and responses across modalities, identifying entities and their relationships.",
        "Examples": [
            "Can you show me a graph illustrating that trend? (Contextual - assumes a previous mention of a specific trend)",
            "Explain the impact of that event on the company's stock price. (Contextual - refers to a previously discussed event)"
        ]
    },
    "Subjective Questions": {
        "Description": "Ask for opinions, preferences, or value judgments based on multimodal data, potentially requiring nuanced understanding of financial implications.",
        "Complexity": "High",
        "Reasoning": "Identifying diverse perspectives across modalities, acknowledging subjectivity, and potentially offering a balanced or personalized response.",
        "Examples": [
            "Is the company's current CEO doing a good job?",
            "Should the company consider diversifying its product portfolio?"
        ]
    },
    "Creative Questions": {
        "Description": "Call for imaginative or innovative responses based on multimodal data, potentially involving financial storytelling or idea generation.",
        "Complexity": "Very High",
        "Reasoning": "Combining information from various modalities with creative thinking, generating novel connections, or producing original content related to finance.",
        "Examples": [
            "Create a compelling narrative highlighting the company's key achievements and milestones over the years, drawing from launch videos and annual reports.",
            "Propose a new marketing campaign for the recently launched product, incorporating insights from the launch video and target audience analysis in the market research report."
        ]
    },
    "Domain-Specific Questions (Finance)": {
        "Description": "Pertain to specialized areas within finance, requiring in-depth knowledge and understanding of specific terminology and concepts across modalities.",
        "Complexity": "Varies",
        "Reasoning": "Necessitates access to relevant financial documents, potentially specialized language models, and understanding of financial jargon across text, audio, and visual data.",
        "Examples": [
            "Explain the concept of 'discounted cash flow' and how it's used in valuing the company.",
            "Analyze the company's risk exposure based on the information in the financial statements and risk factors discussed in the earnings call."
        ]
    },
    "Counterfactual Questions": {
        "Description": "Explore 'what if' scenarios by altering past facts or events.",
        "Complexity": "Very High",
        "Reasoning": "Understanding cause-and-effect, simulating alternative realities, predicting outcomes.",
        "Examples": [
            "How would the company's stock price have been affected if they had not acquired that competitor last year, based on the market conditions at that time?"
        ]
    },
    "Ethical/Moral Questions": {
        "Description": "Delve into the ethical or moral implications of a company's actions or decisions.",
        "Complexity": "High",
        "Reasoning": "Understanding ethical frameworks, societal norms, and potentially generating nuanced arguments.",
        "Examples": [
            "Was the company's decision to lay off a significant portion of its workforce during the economic downturn morally justifiable?"
        ]
    },
    "Causal Questions": {
        "Description": "Seek to identify cause-and-effect relationships between events or factors.",
        "Complexity": "High",
        "Reasoning": "Disentangling complex relationships, considering multiple variables, identifying confounding factors.",
        "Examples": [
            "What were the key factors that contributed to the company's significant increase in market share over the past two years?"
        ]
    },
    "Prediction Questions": {
        "Description": "Ask for forecasts or predictions about future events or trends.",
        "Complexity": "High",
        "Reasoning": "Identifying patterns, trends, and potentially utilizing predictive modeling.",
        "Examples": [
            "Based on the company's current financial performance and market trends, what is the likely outlook for its stock price in the next six months?"
        ]
    },
    "Anomaly Detection Questions": {
        "Description": "Aim to identify unusual or unexpected patterns or data points.",
        "Complexity": "High",
        "Reasoning": "Understanding normal patterns and deviations, potentially involving statistical analysis.",
        "Examples": [
            "Are there any unusual fluctuations in the company's cash flow statement that warrant further investigation?"
        ]
    }
}

# Structured-output schema for the Gemini 1.5 Pro path, written in the
# OpenAPI 3.0 schema-object style. It is handed to the model as
# `response_schema`; `question`, `answer` and all three citation arrays are
# listed as required.
# NOTE(review): `page_number` is declared as a string rather than an integer —
# confirm that is intentional before any consumer parses it numerically.
response_schema_gemini_pro = {
  "type": "object",
  "properties": {
    "question": {
      "type": "string",
      "description": "The question posed to the files."
    },
    "answer": {
      "type": "string",
      "description": "The generated answer to the question."
    },
    "text_citation": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "file_name": {
            "type": "string",
            "description": "Name of the PDF file."
          },
          "page_number": {
            "type": "string",
            "description": "Page number in the PDF file."
          },
          "text": {
            "type": "string",
            "description": "Exact text from the PDF file."
          }
        },
        "required": ["file_name", "page_number", "text"]
      },
      "description": "List of text citations from PDF files."
    },
    "audio_citation": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "file_name": {
            "type": "string",
            "description": "Name of the MP3 file."
          },
          "timestamp_range": {
            "type": "string",
            "description": "Timestamp range in the MP3 file."
          },
          "transcript": {
            "type": "string",
            "description": "Exact excerpt of transcript from the MP3 file."
          }
        },
        "required": ["file_name", "timestamp_range", "transcript"]
      },
      "description": "List of audio citations from MP3 files."
    },
    "video_citation": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "file_name": {
            "type": "string",
            "description": "Name of the MP4 file."
          },
          "timestamp_range": {
            "type": "string",
            "description": "Timestamp range in the MP4 file."
          },
          "transcript": {
            "type": "string",
            "description": "Exact excerpt of transcript from the MP4 file."
          }
        },
        "required": ["file_name", "timestamp_range", "transcript"]
      },
      "description": "List of video citations from MP4 files."
    }
  },
  "required": ["question", "answer", "text_citation", "audio_citation", "video_citation"]
}

# Free-text output-format hint for the Gemini 1.5 Flash path. It is appended to
# the prompt as plain text instead of being passed as a `response_schema`.
# NOTE(review): the doubled braces only collapse to single braces if this
# string is run through `str.format`; the prompt builder in this notebook
# concatenates it as-is, so the model sees `{{` / `}}` literally — confirm
# that is intended.
response_schema_gemini_flash = """{{
"question": "generated question",
"answer": "generate answer",
"text_citation":  [if pdf: list of all text citations. mention all the file name, page_number and exact text in dictionary.]
"audio_citation": [if mp3: list of all audio citations. mention all the file name, timestamp and exact excerpt of transcript in dictionary.]
"video_citation": [if mp4: list of all video citations. mention all the file name, timestamp and exact excerpt of transcript in dictionary.]
}}
"""

In [7]:
#@title Defining Prompts

# Prompt fragments that `get_gemini_response_json` stitches together: one task
# prompt (fusion or normal), followed by the shared guidelines.

# Task prompt used when fusion_generation=True: asks for questions whose
# answers must combine information from several files.
# NOTE(review): the identifier is spelled "fushion"; it is read under that
# exact name by `get_gemini_response_json`, so any rename must update both.
# The prompt text also contains typos ("guidlines", "a questions") that are
# sent to the model verbatim.

fushion_generation_task_prompt = """Task: I need you to generate questions where the answers require synthesizing information across multiple files (PDF, video, and audio). The answer should NOT be found within a single file; it should combine insights from at least two different file types.
Example to follow:

Consider the following files:
* "Climate Change Impacts" (PDF)
* "Renewable Energy Solutions" (Video)
* "Expert Panel on Sustainability" (Audio)

Pay attention to how the concept of 'sustainability' is discussed in both the video and the audio file.

Do not generate questions that can be answered with a simple fact or definition. The ideal question should require analysis, comparison, or evaluation across different sources.

Generate a questions that meet these criteria.

Follow the context and guidlines below:
"""

# Task prompt used when fusion_generation=False (the default generation mode).
normal_generation_task_prompt = """Task: Generate a question, its relevant context, and the corresponding answer based on the provided multimodal data.
    Follow the context and guidlines below:
    """

# Shared guidelines appended after the task prompt in both modes; they cover
# how the question, the answer and the citations should be written.
guidelines_prompt = """Guidelines:

Question:
- Formulate a clear, focused question directly targeting specific information or facts evident in the data.
- Ensure alignment with the specified question type and its intended complexity level.
- Avoid ambiguous or overly broad questions.
- Do not mention "in this video/audio/pdf" or anything that reference the filenames in the question.
- If the question is based on video, audio, then don't mention "speaker" or reference the file.
- If the question is something related to time based entity, mention the time values like year, quarters, months, day etc.

Answer:
- Provide a concise, accurate answer directly addressing the question.
- Ensure the answer is fully supported by the context.
- If reasoning/inference is involved: demonstrate the logical steps.
- If insufficient information: state "Answer not found in the provided data" .

Citations:
- Provide accurate, complete citations including source titles and identifiers (page numbers, timestamps).
- Cite multiple sources if applicable.
- Follow the Output Format.

"""


In [8]:
#@title Data Generation Helper Functions


def generate_questions_and_responses(total_run=1, total_ques_per_run=1,
                                     question_types=None, model="pro", fusion_generation=False,
                                     gs_path_list=None, response_schema=None):
    """
    Earlier variant of the question/answer generation loop.

    NOTE(review): a second `generate_questions_and_responses` is defined
    later in this same cell. That later definition rebinds the name, so this
    version is never the one callers get once the cell has run. Consider
    deleting it.

    For each run, one question type and one random set of files are chosen,
    and then `total_ques_per_run` generation calls are made against that same
    selection.

    Args:
        total_run (int): The number of outer loops to run.
        total_ques_per_run (int): The number of questions to generate per run.
            It is also used here as the upper bound on how many files are
            selected per run (the selection range is `(1, total_ques_per_run)`).
        question_types (list): Question types to choose from. If None, a
            built-in list of nine types is used.
        model (str): The model to use for generating responses ("pro" or "flash").
        fusion_generation (bool): Whether to use the fusion task prompt.
        gs_path_list (list): A list of GCS paths to select files from.
        response_schema (dict): The response schema forwarded to the model call.

    Returns:
        list: One response dict per successful generation call, each extended
        with a `file_type` list holding the extensions of the selected files.
    """

    final_data = []

    if question_types is None:
        question_types = ['Factual Extraction Questions', 'Synthesis Questions', 'Inferential Questions',
                          'Multi-hop Reasoning Questions', 'Comparative Questions', 'Temporal Questions',
                          'Ethical/Moral Questions', 'Prediction Questions', 'Anomaly Detection Questions']

    for outer_logic in range(total_run):
        print("running count: ",outer_logic+1)
        question_type = random.choice(question_types)
        print("question_type", question_type)
        # Full metadata dict for the chosen type (description, complexity, ...).
        question_type_description = question_types_dict[question_type]
        # File selection happens once per run and outside the try block, so a
        # failure here aborts the whole call instead of being skipped.
        selected_paths = select_random_gcs_paths_with_mime_types(gs_path_list, (1, total_ques_per_run))
        print("Selected Files: ", selected_paths)

        for inner_logic in range(total_ques_per_run):
            try:
                response = get_gemini_response_json(
                    question_type,
                    question_type_description,
                    model=model,
                    response_schema=response_schema,
                    fusion_generation=fusion_generation,
                    file_configs=selected_paths
                )
                # Record the extension of every file that fed this response.
                response['file_type'] = [each['gcs_path'].split(".")[-1] for each in selected_paths]
                final_data.append(response)
            except Exception as e:
                # Best effort: a failed generation is logged and skipped.
                print("Error occurred. Skipping. Error: ", e)
                continue

    return final_data

def get_blob_uri(bucket_name: str, blob_name: str) -> str:
    """Build the `gs://` URI that addresses a blob in Google Cloud Storage.

    Args:
        bucket_name: Name of the GCS bucket that holds the blob.
        blob_name: Object name (path) of the blob inside the bucket.

    Returns:
        The URI in the form `gs://<bucket_name>/<blob_name>`.
    """
    uri_template = "gs://{}/{}"
    return uri_template.format(bucket_name, blob_name)


def get_gs_paths_for_production_media(bucket: Bucket, production_data: str) -> List[str]:
  """
  Retrieves Google Storage URIs for media files (PDF, MP4, MP3)
  within a specified production data prefix.

  The prefix is passed to `list_blobs` so that only matching objects are
  listed, instead of enumerating the whole bucket and discarding most of it.

  Args:
      bucket: The Google Cloud Storage bucket to search.
      production_data: Object-name prefix; only blobs whose name starts with
          it are considered.

  Returns:
      A list of `gs://` URIs for blobs under the prefix whose name ends in
      `.pdf`, `.mp4` or `.mp3` (the suffix match is case-sensitive).
  """
  media_suffixes = (".pdf", ".mp4", ".mp3")

  return [
      get_blob_uri(bucket.name, blob.name)
      for blob in bucket.list_blobs(prefix=production_data)
      if blob.name.endswith(media_suffixes)
  ]

def select_random_gcs_paths_with_mime_types(gcs_path_list, selection_range):
    """
    Randomly selects GCS paths and returns them with their corresponding MIME types.

    The upper bound of `selection_range` is capped at the number of available
    paths, so asking for "up to N" files never fails just because fewer than N
    exist.

    Args:
        gcs_path_list: A list of GCS paths to choose from.
        selection_range: A tuple (min_files, max_files) specifying the
                         minimum and maximum number of files to select.

    Returns:
        A list of dictionaries, each containing a "gcs_path" and its "mime_type".
        Extensions other than pdf/mp3/mp4 (matched case-insensitively) map to
        'application/octet-stream'.

    Raises:
        ValueError: If `min_files` exceeds the number of available paths, or
            if the range itself is invalid (e.g. min_files > max_files).
    """

    min_files, max_files = selection_range
    available = len(gcs_path_list)
    if min_files > available:
        raise ValueError(
            f"Cannot select at least {min_files} file(s) from {available} available path(s)."
        )

    # Cap the upper bound: random.sample raises if asked for more items than exist.
    num_files_to_select = random.randint(min_files, min(max_files, available))
    selected_paths = random.sample(gcs_path_list, num_files_to_select)

    # Map file extensions to MIME types
    extension_mime_type_mapping = {
        'pdf': 'application/pdf',
        'mp3': 'audio/mpeg',
        'mp4': 'video/mp4'
    }

    result = []
    for path in selected_paths:
        # splitext keeps the leading dot; drop it and normalise the case.
        file_extension = os.path.splitext(path)[1][1:].lower()
        mime_type = extension_mime_type_mapping.get(file_extension, 'application/octet-stream')
        result.append({"gcs_path": path, "mime_type": mime_type})

    return result


import random
import os




def get_gemini_response_json(question_type, question_type_description,
                             model="flash", response_schema=None,
                             fusion_generation=False,
                             file_configs=None):
  """Generate one question/answer/citation record with Gemini and return it as a dict.

  The prompt is assembled from the task prompt (fusion or normal), the
  question type and its metadata, the list of source files and the shared
  guidelines. For the "flash" model the output format is described inside the
  prompt; for the "pro" model it is enforced through `response_schema`.

  Args:
      question_type (str): Name of the question type to generate.
      question_type_description (dict): Metadata for that type. It is
          rendered into the prompt and its 'Complexity' entry is copied into
          the result.
      model (str): "pro" (Gemini 1.5 Pro) or "flash" (Gemini 1.5 Flash).
      response_schema (dict): Schema passed to the Pro model; unused for Flash.
      fusion_generation (bool): Use the fusion task prompt instead of the
          normal one.
      file_configs (list): Dicts with "gcs_path" and "mime_type" for each file
          to attach. Defaults to no files.

  Returns:
      dict: The model's parsed JSON response plus "source_file" (list of GCS
      paths), "question_type", and "question_type_description" (which holds
      only the complexity level, not the full description).

  Raises:
      ValueError: If `model` is neither "pro" nor "flash".
      json.JSONDecodeError: If the model response text is not valid JSON.
      KeyError: If `question_type_description` has no 'Complexity' entry.
  """
  # Fail early and clearly instead of hitting an unbound `cg_model` later.
  if model not in ("pro", "flash"):
    raise ValueError(f"Unsupported model {model!r}; expected 'pro' or 'flash'.")

  # A None default avoids sharing one mutable list object between calls.
  if file_configs is None:
    file_configs = []

  if fusion_generation:
    print("Fusion Generation")
    instruction = fushion_generation_task_prompt
  else:
    print("Normal Generation")
    instruction = normal_generation_task_prompt

  question_context_answer_prompt = f"""{instruction}

Question Type: {question_type}

Question Type Description: {question_type_description}

File Sources: {file_configs}

{guidelines_prompt}

{"Output Format: "+response_schema_gemini_flash if model=='flash' else ""}
"""

  if model == "pro":
    print("Using Gemini 1.5 Pro model for Data Generation with provided OpenAPI schema")
    cg_model = GenerativeModel(
        model_name="gemini-1.5-pro",
        generation_config=GenerationConfig(
            response_mime_type="application/json", response_schema=response_schema
        ),
    )
  else:
    print("Using Gemini 1.5 Flash model for Data Generation with provided schema in prompt")
    cg_model = GenerativeModel(
        model_name="gemini-1.5-flash",
        generation_config={"response_mime_type": "application/json"},
    )

  # Request layout: prompt, then one Part per attached file, then the prompt again.
  content = [question_context_answer_prompt]
  content.extend(
      Part.from_uri(config['gcs_path'], mime_type=config['mime_type'])
      for config in file_configs
  )
  # NOTE(review): the same prompt is sent a second time after the media parts;
  # kept as in the original — confirm the repetition is deliberate.
  content.append(question_context_answer_prompt)

  response = cg_model.generate_content(content)

  json_response = json.loads(response.text)

  # Attach provenance so each record can be traced back to its inputs.
  json_response["source_file"] = [config['gcs_path'] for config in file_configs]
  json_response["question_type"] = question_type
  json_response["question_type_description"] = question_type_description['Complexity']

  return json_response

def generate_questions_and_responses(total_run=1, total_ques_per_run=1,
                                     min_file_select=1, max_file_select=2,
                                     question_types=None, model="pro", fusion_generation=False,
                                     gs_path_list=None, response_schema=None,
                                     output_format=None, output_path=None):

    """
    Generates questions and responses based on provided parameters.

    For each run, one question type and one random set of files are chosen;
    `total_ques_per_run` generation calls are then made against that same
    selection. Failed generation calls are logged and skipped.

    Args:
        total_run (int): The number of outer loops to run.
        total_ques_per_run (int): The number of questions to generate per run.
        min_file_select (int): Minimum number of files to select per run.
        max_file_select (int): Maximum number of files to select per run.
        question_types (list): A list of question types to choose from. If None,
            a built-in list of nine types is used.
        model (str): The model to use for generating responses ("pro" or "flash").
        fusion_generation (bool): Whether to use fusion generation.
        gs_path_list (list): A non-empty list of GCS paths to select files from.
        response_schema (dict): The response schema to use.
        output_format (str): "return_df" to get a DataFrame back, "save_df" to
            write a CSV to `output_path`. Any other value (including None)
            returns the raw list.
        output_path (str): Destination CSV path; required when output_format
            is "save_df".

    Returns:
        pd.DataFrame: when output_format is "return_df".
        None: when output_format is "save_df" (results are written to disk).
        list: the list of response dicts otherwise.

    Raises:
        ValueError: If `gs_path_list` is empty/None, or if output_format is
            "save_df" without an `output_path`.
    """
    # Validate before any model call so a bad argument cannot waste a full run.
    if not gs_path_list:
        raise ValueError("gs_path_list must contain at least one GCS path.")
    if output_format == "save_df" and output_path is None:
        # to_csv(None) would only return the CSV text and persist nothing.
        raise ValueError("output_path is required when output_format is 'save_df'.")

    final_data = []

    if question_types is None:
        question_types = ['Factual Extraction Questions', 'Synthesis Questions', 'Inferential Questions',
                          'Multi-hop Reasoning Questions', 'Comparative Questions', 'Temporal Questions',
                          'Ethical/Moral Questions', 'Prediction Questions', 'Anomaly Detection Questions']

    for outer_logic in range(total_run):
        print("running count: ", outer_logic + 1)
        question_type = random.choice(question_types)
        print("question_type", question_type)
        # Full metadata dict for the chosen type (description, complexity, ...).
        question_type_description = question_types_dict[question_type]
        selected_paths = select_random_gcs_paths_with_mime_types(
            gs_path_list, (min_file_select, max_file_select)
        )
        print("Selected Files: ", selected_paths)

        for _ in range(total_ques_per_run):
            try:
                response = get_gemini_response_json(
                    question_type,
                    question_type_description,
                    model=model,
                    response_schema=response_schema,
                    fusion_generation=fusion_generation,
                    file_configs=selected_paths
                )
                # Record the extension of every file that fed this response.
                response['file_type'] = [each['gcs_path'].split(".")[-1] for each in selected_paths]
                final_data.append(response)
            except Exception as e:
                # Best effort: one failed generation must not abort the batch.
                print("Error occurred. Skipping. Error: ", e)
                continue

    if output_format == "return_df":
        return pd.DataFrame(final_data)
    if output_format == "save_df":
        print("saving the final datafram at: ", output_path)
        pd.DataFrame(final_data).to_csv(output_path, index=False)
        return None
    return final_data  # Default behavior if no output_format is specified

In [9]:
# Collect the gs:// URIs of all PDF/MP4/MP3 blobs under the production prefix;
# every generation call below samples its input files from this list.
gs_path_list = get_gs_paths_for_production_media(bucket, production_data)

### Generating with Gemini 1.5 Pro

In [10]:
# Use Gemini 1.5 Pro to generate question-answer pairs with a randomly chosen
# question type: two runs, one question per run.
# Each run randomly selects 2 or 3 files.
# fusion_generation=True, since each question should fuse the selected files.

pro_results_2_3 = generate_questions_and_responses(
    total_run=2,
    total_ques_per_run=1,
    min_file_select = 2,
    max_file_select = 3,
    model="pro",
    fusion_generation=True,  # Enable fusion generation
    gs_path_list=gs_path_list,
    response_schema=response_schema_gemini_pro,
)

running count:  1
question_type Prediction Questions
Selected Files:  [{'gcs_path': 'gs://mlops-for-genai/multimodal-finanace-qa/data/unstructured/production/podcast/episode1.mp3', 'mime_type': 'audio/mpeg'}, {'gcs_path': 'gs://mlops-for-genai/multimodal-finanace-qa/data/unstructured/production/reports/2021/quaterly_report/20210728-alphabet-10q.pdf', 'mime_type': 'application/pdf'}, {'gcs_path': 'gs://mlops-for-genai/multimodal-finanace-qa/data/unstructured/production/reports/2022/quaterly_report/goog-10-k-q4-2022.pdf', 'mime_type': 'application/pdf'}]
Fusion Generation
Using Gemini 1.5 Pro model for Data Generation with provided OpenAPI schema
running count:  2
question_type Comparative Questions
Selected Files:  [{'gcs_path': 'gs://mlops-for-genai/multimodal-finanace-qa/data/unstructured/production/reports/2023/quaterly_report/20230426-alphabet-10q.pdf', 'mime_type': 'application/pdf'}, {'gcs_path': 'gs://mlops-for-genai/multimodal-finanace-qa/data/unstructured/production/reports/202

#### Returning df after generation

In [11]:
# Use Gemini 1.5 Pro to generate a single question-answer pair with a randomly
# chosen question type, from exactly one randomly selected file.
# fusion_generation is left at its default (False) because only one file is used.
# output_format='return_df' returns a DataFrame rather than a list.
# NOTE(review): the "_2_3" in the variable name no longer matches the 1-file
# selection used here.

pro_results_2_3_df = generate_questions_and_responses(
    total_run=1,
    total_ques_per_run=1,
    min_file_select = 1,
    max_file_select = 1,
    model="pro",
    # fusion_generation=True,  # Enable fusion generation
    gs_path_list=gs_path_list,
    response_schema=response_schema_gemini_pro,
    output_format='return_df',
)

running count:  1
question_type Ethical/Moral Questions
Selected Files:  [{'gcs_path': 'gs://mlops-for-genai/multimodal-finanace-qa/data/unstructured/production/reports/2020/earning_transcript/2020_Q3_Earnings_Transcript.pdf', 'mime_type': 'application/pdf'}]
Normal Generation
Using Gemini 1.5 Pro model for Data Generation with provided OpenAPI schema


In [12]:
# Preview the DataFrame returned by the run above.
pro_results_2_3_df.head()

Unnamed: 0,answer,audio_citation,question,text_citation,video_citation,source_file,question_type,question_type_description,file_type
0,Although the provided data extensively covers ...,[],Given the comparison made between the company'...,[{'file_name': 'gs://mlops-for-genai/multimoda...,[],[gs://mlops-for-genai/multimodal-finanace-qa/d...,Ethical/Moral Questions,High,[pdf]


#### Persisting df after generation

In [13]:
# Use Gemini 1.5 Pro to generate a single question-answer pair with a randomly
# chosen question type, selecting 1 or 2 files.
# fusion_generation=True asks for a question that fuses the selected files.
# NOTE(review): with min_file_select=1 a run can pick a single file, which
# conflicts with the fusion prompt's requirement to combine several files.
# output_format='save_df' persists the results as CSV at `output_path`; that
# branch returns nothing, so `pro_results_2_3_csv` is None.
# NOTE(review): '/content/...' is an absolute, Colab-specific path.

pro_results_2_3_csv = generate_questions_and_responses(
    total_run=1,
    total_ques_per_run=1,
    min_file_select = 1,
    max_file_select = 2,
    model="pro",
    fusion_generation=True,  # Enable fusion generation
    gs_path_list=gs_path_list,
    response_schema=response_schema_gemini_pro,
    output_format='save_df',
    output_path='/content/run_gen_pro_1112_fushion.csv',
)

running count:  1
question_type Multi-hop Reasoning Questions
Selected Files:  [{'gcs_path': 'gs://mlops-for-genai/multimodal-finanace-qa/data/unstructured/production/reports/2021/earning_transcript/2021_Q2_Earnings_Transcript.pdf', 'mime_type': 'application/pdf'}, {'gcs_path': 'gs://mlops-for-genai/multimodal-finanace-qa/data/unstructured/production/product_launch/gemini/Googles newest and most capable AI  Gemini.mp4', 'mime_type': 'video/mp4'}]
Fusion Generation
Using Gemini 1.5 Pro model for Data Generation with provided OpenAPI schema
saving the final datafram at:  /content/run_gen_pro_1112_fushion.csv


In [14]:
# Use Gemini 1.5 Pro to generate question-answer pairs with a randomly chosen
# question type: two runs, one question per run.
# Each run selects exactly one file.
# fusion_generation stays at its default (False), since each run uses a single file.

pro_results_1_1 = generate_questions_and_responses(
    total_run=2,
    total_ques_per_run=1,
    min_file_select = 1,
    max_file_select = 1,
    model="pro",
    gs_path_list=gs_path_list,
    response_schema=response_schema_gemini_pro
)

running count:  1
question_type Ethical/Moral Questions
Selected Files:  [{'gcs_path': 'gs://mlops-for-genai/multimodal-finanace-qa/data/unstructured/production/reports/2022/earning_transcript/2022_Q3_Earnings_Transcript.pdf', 'mime_type': 'application/pdf'}]
Normal Generation
Using Gemini 1.5 Pro model for Data Generation with provided OpenAPI schema
running count:  2
question_type Prediction Questions
Selected Files:  [{'gcs_path': 'gs://mlops-for-genai/multimodal-finanace-qa/data/unstructured/production/reports/2021/earning_transcript/2021_Q4_Earnings_Transcript.pdf', 'mime_type': 'application/pdf'}]
Normal Generation
Using Gemini 1.5 Pro model for Data Generation with provided OpenAPI schema


### Generating with Gemini 1.5 Flash

In [19]:
# Use Gemini 1.5 Flash to generate a single question-answer pair, restricted to
# the 'Synthesis Questions' type and to one randomly selected file.
# No response_schema is passed for Flash: the expected output format is
# described inside the prompt instead.

results_flash_1_1_sysnques = generate_questions_and_responses(
    total_run=1,
    total_ques_per_run=1,
    min_file_select = 1,
    max_file_select = 1,
    question_types=['Synthesis Questions'],
    model="flash",
    # fusion_generation=True,  # Enable fusion generation
    gs_path_list=gs_path_list
)

running count:  1
question_type Synthesis Questions
Selected Files:  [{'gcs_path': 'gs://mlops-for-genai/multimodal-finanace-qa/data/unstructured/production/product_launch/gemini/Math & physics with AI  Gemini.mp4', 'mime_type': 'video/mp4'}]
Normal Generation
Using Gemini 1.5 Flash model for Data Generation with provided schema in prompt


## Data Combination

In [21]:
# Concatenate the list results from the runs above (only the runs that
# returned plain lists) and load them into a single DataFrame, one row per
# generated question-answer pair.

combined_list = pro_results_2_3 + pro_results_1_1 + results_flash_1_1_sysnques
combined_df = pd.DataFrame(combined_list)

In [22]:
# Preview the first two combined rows.
combined_df.head(2)

Unnamed: 0,answer,audio_citation,question,text_citation,video_citation,source_file,question_type,question_type_description,file_type
0,Professor Ngaire Woods argues that the advance...,"[{'file_name': 'episode1.mp3', 'timestamp_rang...",How do experts envision the future of governme...,"[{'file_name': '20210728-alphabet-10q.pdf', 'p...",[],[gs://mlops-for-genai/multimodal-finanace-qa/d...,Prediction Questions,High,"[mp3, pdf, pdf]"
1,"In the first quarter of 2023, Alphabet's long-...",[],How does Alphabet's long-term debt for the fir...,"[{'file_name': '20230426-alphabet-10q.pdf', 'p...",[],[gs://mlops-for-genai/multimodal-finanace-qa/d...,Comparative Questions,Moderate,"[pdf, pdf]"


In [None]:
# # Assuming you have three DataFrames: df1, df2, df3
# combined_df = pd.concat([df1, df2, df3])

# # Display the combined DataFrame
# print(combined_df)

In [90]:
# # if you have persisted each generation as csv, then you can run this logic

# # Path to your folder containing CSV files
# folder_path = "/content/"


# # Initialize an empty DataFrame to store the combined data
# combined_df = pd.DataFrame()

# # Iterate through all files in the folder
# for filename in os.listdir(folder_path):
#     if filename.endswith(".csv"):  # Consider only CSV files
#         file_path = os.path.join(folder_path, filename)
#         df = pd.read_csv(file_path)
#         combined_df = pd.concat([combined_df, df], ignore_index=True)

# combined_df.reset_index(inplace=True)

In [24]:
# Row count = number of question-answer pairs generated across all runs.
print("Total generated row: ", combined_df.shape[0])

Total generated row:  5


In [27]:
# List the column names; the next cell reorders them.
print("Columns in the combined data: \n",
      combined_df.columns.tolist())

Columns in the combined data: 
 ['answer', 'audio_citation', 'question', 'text_citation', 'video_citation', 'source_file', 'question_type', 'question_type_description', 'file_type']


In [28]:
# Put the question/answer and its type first, then the citations, then the
# provenance columns.
new_column_order = ['question', 'answer', 'question_type',
       'question_type_description', 'audio_citation',  'text_citation',
       'video_citation', 'source_file', 'file_type' ]

# Reorder columns (raises KeyError if any listed column is missing)
combined_df = combined_df[new_column_order]

In [29]:
# Preview the first two rows after reordering the columns.
combined_df.head(2)

Unnamed: 0,question,answer,question_type,question_type_description,audio_citation,text_citation,video_citation,source_file,file_type
0,How do experts envision the future of governme...,Professor Ngaire Woods argues that the advance...,Prediction Questions,High,"[{'file_name': 'episode1.mp3', 'timestamp_rang...","[{'file_name': '20210728-alphabet-10q.pdf', 'p...",[],[gs://mlops-for-genai/multimodal-finanace-qa/d...,"[mp3, pdf, pdf]"
1,How does Alphabet's long-term debt for the fir...,"In the first quarter of 2023, Alphabet's long-...",Comparative Questions,Moderate,[],"[{'file_name': '20230426-alphabet-10q.pdf', 'p...",[],[gs://mlops-for-genai/multimodal-finanace-qa/d...,"[pdf, pdf]"


### Adding review processes flags

In [33]:
# Names of the review-flag columns that get appended to every generated row.
# The prefixes group them by review aspect: answer quality, question clarity,
# citation validity, question-type alignment, compliance, strategy/policy
# adherence, and the final voting fields.
review_process_column_names = [
    "answer_quality_accurate",
    "answer_quality_comprehensive",
    "answer_quality_well_written",
    "question_clarity_clear",
    "question_clarity_relevant",
    "question_clarity_concise",
    "citation_validity_authentic",
    "citation_validity_relevant",
    "citation_validity_accessible",
    "question_type_alignment_accurate",
    "compliance_policy_adherence_compliant",
    "compliance_policy_adherence_sensitive_content",
    "compliance_policy_adherence_bias",
    "strategies_policies_adherence_adheres_to_strategies",
    "strategies_policies_adherence_adheres_to_policies",
    "voting_drop",
    "voting_modify",
    "voting_correct_citation",
    "voting_confidence"
]


In [34]:
# Placeholder review flags: one row per generated pair, every flag starts at 0.
# Reusing combined_df's own index keeps the rows aligned in the later
# axis=1 concat even if combined_df does not carry a default 0..n-1 index.
review_process_df = pd.DataFrame(0, index=combined_df.index,
                                 columns=review_process_column_names)

In [38]:
# Attach the review-flag columns side by side with the generated data
# (axis=1 aligns rows on the index).
final_review_with_data = pd.concat([combined_df,review_process_df],axis=1)

In [40]:
# Preview the review sheet: generated data plus the zero-initialised flags.
final_review_with_data.head(2)

Unnamed: 0,question,answer,question_type,question_type_description,audio_citation,text_citation,video_citation,source_file,file_type,answer_quality_accurate,...,question_type_alignment_accurate,compliance_policy_adherence_compliant,compliance_policy_adherence_sensitive_content,compliance_policy_adherence_bias,strategies_policies_adherence_adheres_to_strategies,strategies_policies_adherence_adheres_to_policies,voting_drop,voting_modify,voting_correct_citation,voting_confidence
0,How do experts envision the future of governme...,Professor Ngaire Woods argues that the advance...,Prediction Questions,High,"[{'file_name': 'episode1.mp3', 'timestamp_rang...","[{'file_name': '20210728-alphabet-10q.pdf', 'p...",[],[gs://mlops-for-genai/multimodal-finanace-qa/d...,"[mp3, pdf, pdf]",0,...,0,0,0,0,0,0,0,0,0,0
1,How does Alphabet's long-term debt for the fir...,"In the first quarter of 2023, Alphabet's long-...",Comparative Questions,Moderate,[],"[{'file_name': '20230426-alphabet-10q.pdf', 'p...",[],[gs://mlops-for-genai/multimodal-finanace-qa/d...,"[pdf, pdf]",0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# final_review_with_data.to_csv("final_review_with_data.csv", index=False)

## Post-Analysis Process for Multimodal RAG Question-Answer-Citation Pairs (Enterprise-Level)

### Step 1: Data Consolidation and Calculation

1. **Gather SME Reviews:** Collect all completed review forms from the SMEs. Ensure every pair has been evaluated by at least the minimum (2) and at most the maximum (5) number of SMEs defined in the process.

2. **Calculate Scores and Flags:**

    * **Per-Pair Metrics:** For each question-answer-citation pair, calculate:
        * The sum or average for each review aspect (e.g., sum of 'Answer Quality - Accurate' votes).
        * The number of times specific flags were raised (e.g., count of 'Drop' votes).
        * The average 'Confidence' score.

    * **Overall Metrics:** Calculate overall statistics across the entire dataset, such as:
        * The percentage of pairs flagged for 'Drop', 'Modify', etc.
        * The distribution of 'Confidence' scores.

### Step 2: Apply Decision Rules (Threshold-Based)

**Note:** The specific thresholds mentioned below should be customized based on your enterprise's risk tolerance, quality standards, and business objectives.

1. **Drop Rule:**

    * If a pair receives a 'Drop' vote from **more than 50%** of the reviewing SMEs, **automatically drop** the pair from the dataset.
    * **Optional:** If a pair receives a substantial share of 'Drop' votes that still falls short of a majority (e.g., exactly half of the reviewers, such as 2 of 4), flag it for further review by a senior SME or a quality assurance team.

2. **Modify Rule:**

    * If a pair receives a 'Modify' vote from **more than 50%** of the reviewing SMEs, **flag it for modification**.
    * Each SME who voted for 'Modify' should provide their revised version of the question-answer-citation pair.
    * A senior SME or quality assurance team should review the proposed modifications and make the final decision on how to adjust the pair.

3. **Hold and Review Rule:**

    * If a pair receives a **majority '0' vote** for 'Adheres to Policies' or any other critical compliance aspect, **place the pair on hold**.
    * A senior SME or compliance team should thoroughly review the pair to determine whether it can be modified to meet policy requirements or if it should be dropped.

4. **Citation Correction Rule:**

    * If a pair receives a 'Correct Citation' vote from **more than 50%** of the reviewing SMEs, **flag it for citation correction**.
    * A subject matter expert should review and update the citation as needed.

5. **Confidence Threshold:**

    * If the average 'Confidence' score for a pair is **below a predefined threshold** (e.g., 3 out of 5), **flag the pair for further review**.
    * A senior SME or quality assurance team should assess the pair and decide whether to accept it, modify it, or drop it.

### Step 3: Iterate and Refine

1. **SME Feedback Loop:** Share the post-analysis results and decisions with the SMEs. Gather their feedback on the process, decision rules, and any challenges they encountered during the review.

2. **Continuous Improvement:** Use the SME feedback and the overall dataset metrics to refine the review process, adjust thresholds, and improve the data generation strategies and policies.

### Additional Considerations for Enterprise Customers

* **Scalability:** For large datasets and teams, consider using a review platform or tool to streamline the process, automate calculations, and track progress.

* **Data Security and Privacy:** Implement appropriate measures to protect sensitive information in the dataset and ensure compliance with data privacy regulations.

* **SME Training and Calibration:** Provide comprehensive training to SMEs on the review process, criteria, and any specific enterprise guidelines. Periodically calibrate the SMEs to ensure consistency in their evaluations.

* **Documentation:** Maintain detailed documentation of the review process, decision rules, thresholds, and any modifications made over time.

By following this structured post-analysis process, enterprise customers can ensure the high quality, accuracy, and compliance of their multimodal RAG question-answer-citation pairs, leading to improved performance and reliability of their RAG-based applications.

**Remember:** The success of this process relies heavily on clear communication, collaboration between SMEs and quality assurance teams, and a commitment to continuous improvement.

## Building and Tuning LLMs for RAG using the Finalized Question-Answer-Citation Pairs

Once you have the finalized dataset of question-answer-citation pairs after the rigorous review process, here's a detailed, step-by-step process to build and tune your RAG system and LLMs:

**1. Data Preparation and Structuring**

* **Format Conversion:** Ensure the data is in a suitable format for your RAG implementation and LLM fine-tuning. Commonly used formats include JSON, CSV, or specific database structures.
* **Data Splitting:** Split the dataset into three parts:
    * **Training Set:** (Largest portion, e.g., 70-80%) Used to fine-tune the LLM on the task of generating relevant and accurate answers based on the context provided in the citation.
    * **Validation Set:** (Smaller portion, e.g., 10-15%) Used to evaluate the model's performance during training and help prevent overfitting.
    * **Test Set:** (Held-out portion, e.g., 10-15%) Used for the final evaluation of the model's performance on unseen data.

**2. Retrieval System Setup (Building the RAG)**

* **Document Store/Vector Database: Vertex AI Vector Search**
    * **Create Index:** Utilize Vertex AI Vector Search to create an index optimized for efficient similarity search. Specify the dimensionality of your embeddings (determined by the chosen embedding model) and configure the index for optimal performance based on your data volume and query patterns.
    * **Data Ingestion:** Ingest your finalized question-answer-citation pairs into Vertex AI Vector Search.
        * **BigQuery Integration:** If your data is stored in BigQuery, you can seamlessly integrate it with Vertex AI Vector Search using the provided connectors. This enables efficient data loading and synchronization.
        * **Preprocessing:** Before ingestion, preprocess your citations (and potentially questions) to ensure they are clean and normalized. This might include text cleaning, tokenization, and other transformations.
        * **Embedding Generation:** Use Vertex AI's text embedding models (e.g., `text-embedding-004`) or the multimodal embedding model (`multimodalembedding`) to generate embeddings for your citations (and questions, if needed). These embeddings capture the semantic meaning of the content and are essential for effective retrieval.
        * **Index Updates:** Implement a mechanism to keep the Vertex AI Vector Search index up-to-date as new question-answer-citation pairs are added or modified. This ensures that your RAG system always reflects the latest knowledge.

* **Retrieval Logic**
    * **Query Embedding:** When a user poses a query, convert it into an embedding using the same embedding model used for the citations.
    * **Similarity Search:** Perform a similarity search in the Vertex AI Vector Search index using the query embedding. This will retrieve the top-k most relevant citations based on their semantic similarity to the query.
    * **Contextual Filtering (Optional):** If applicable, implement additional filtering or ranking mechanisms based on metadata associated with the citations (e.g., source reliability, date, etc.) to further refine the retrieved context.
    * **Cloud Run or Cloud Functions:** Consider using Cloud Run or Cloud Functions to deploy the retrieval logic as a scalable and serverless API endpoint. This enables easy integration with your LLM fine-tuning and serving components.

**3. LLM Fine-tuning**

* **Model Selection: Gemini 1.5 and Open Models (Gemma)**
    * **Gemini 1.5:** Leverage the power of Gemini 1.5 models for their multimodal capabilities and large context window (up to 2M tokens with Gemini 1.5 Pro). These models can handle both text and image inputs, enabling you to build a more versatile RAG system.
    * **Open Models (Gemma):** Explore different Gemma models (2B, 9B, 27B) to find the best balance between performance and computational resources for your specific use case. Larger models generally offer better performance but require more compute.

* **Fine-tuning Framework: Vertex AI Training**
    * **Managed Service:** Use Vertex AI Training to streamline the fine-tuning process. It provides a managed environment for training and deploying machine learning models, including LLMs.
    * **Hyperparameter Tuning:** Leverage Vertex AI's hyperparameter tuning capabilities to automatically explore different hyperparameter combinations and find the optimal settings for your LLM fine-tuning.

* **Data Formatting and Prompt Engineering**
    * **Prompt Design:** Carefully craft prompts that guide the LLM to generate relevant and accurate answers based on the retrieved context. The prompt should include:
        * Instructions for the LLM (e.g., "Answer the question based on the provided context.")
        * The user's query
        * The retrieved citations
    * **Experimentation:** Experiment with different prompt structures and formats to find the most effective approach for your specific task and LLM.
    * **Vertex AI Feature Store (Optional):** If you have additional features or metadata associated with your question-answer-citation pairs, consider using Vertex AI Feature Store, which uses BigQuery as its feature data source, to manage and incorporate them into your fine-tuning process.

* **Fine-tuning Process**
    * **Vertex AI Training Job:** Submit a training job to Vertex AI Training, specifying the pre-trained LLM, the prepared training data, and the desired hyperparameters.
    * **Monitoring and Evaluation:** Monitor the training progress and evaluate the model's performance on the validation set using relevant metrics. Adjust hyperparameters or data as needed to improve performance.



**4. Integration and Testing**

* **Combine Retrieval and LLM:** Integrate the retrieval system with the fine-tuned LLM. When a user poses a query:
    * The retrieval system fetches the most relevant citations.
    * The LLM generates an answer based on the query and the retrieved citations.
* **Thorough Testing:** Rigorously test the integrated system on the held-out test set and various real-world scenarios. Evaluate the system on key metrics like accuracy, relevance, and fluency.

**5. Deployment and Monitoring**

* **Deployment:** Deploy the RAG system to a suitable environment (cloud, on-premises, etc.) where it can be accessed by users.
* **Monitoring and Maintenance:** Continuously monitor the system's performance and gather user feedback. Periodically retrain or fine-tune the models with new data and address any issues that arise.

**Thought Process and Key Considerations**

* **Data Quality is Paramount:** The quality of your question-answer-citation pairs directly impacts the RAG system's performance. The rigorous review process you've defined is crucial to ensuring high-quality data.
* **Retrieval Effectiveness:** The retrieval system plays a critical role in providing relevant context to the LLM. Experiment with different embedding models and retrieval techniques to optimize performance.
* **LLM Fine-tuning:** Fine-tuning the LLM on your specific task and data is essential for achieving good results. Carefully consider the choice of pre-trained model, fine-tuning framework, and hyperparameters.
* **Evaluation and Iteration:** Continuous evaluation and iteration are key to improving your RAG system. Use a variety of metrics to assess performance, gather user feedback, and make data-driven improvements.
* **Enterprise-Specific Considerations:** Consider factors like scalability, security, and compliance when designing and deploying your RAG system in an enterprise environment.

Remember that building and tuning an effective RAG system is an iterative process. By carefully following these steps and continuously refining your approach based on data and feedback, you can create a powerful tool for leveraging your knowledge base and providing accurate and informative answers to your users.