In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [Building Data for Multimodal Question-Answering System]

[Change the link]

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_1_5_flash.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fgetting-started%2Fintro_gemini_1_5_flash.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/getting-started/intro_gemini_1_5_flash.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_1_5_flash.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>


## Overview

[add overview]

[add what you learned in the previous notebook and link to it] - skip if this is the first notebook in the series


[Context of this notebook compared to overall idea]

## Getting Started

### Install Dependencies


In [1]:
! pip3 install --upgrade --user --quiet google-cloud-aiplatform

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/5.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/5.1 MB[0m [31m53.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m5.1/5.1 MB[0m [31m58.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m5.1/5.1 MB[0m [31m56.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[0m

### Restart runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.

In [2]:
import sys

# Restart the kernel so the newly installed packages can be used.
# The restart is only triggered when the google.colab module has been loaded.
if "google.colab" in sys.modules:
    import IPython

    app = IPython.Application.instance()
    # Passing True requests a restart rather than a plain shutdown.
    app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the cell below to authenticate your environment.


### Set Google Cloud project information and initialize Vertex AI SDK

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [1]:
import sys

# Authentication is only attempted when the google.colab module is loaded,
# i.e. when the notebook runs on Google Colab.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

In [2]:
# Define project information
# NOTE(review): PROJECT_ID is committed as an empty string; set it to your
# Google Cloud project ID before running this cell.
PROJECT_ID = ""  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}
BUCKET_NAME = "mlops-for-genai" # @param {type:"string"}
# Initialize Vertex AI
import vertexai

vertexai.init(project=PROJECT_ID, location=LOCATION)

# Initialize cloud storage
from google.cloud import storage

# `bucket` is the handle the data-generation helpers further down receive.
storage_client = storage.Client(project=PROJECT_ID)
bucket = storage_client.bucket(BUCKET_NAME)

### Import libraries


In [28]:
import IPython.display
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

from vertexai.generative_models import (
    GenerationConfig,
    GenerativeModel,
    HarmBlockThreshold,
    HarmCategory,
    Part,
)
from typing import List
from google.cloud.storage import Bucket
import json
import pandas as pd
import os

### Load the models

Learn more about all [Gemini API models on Vertex AI](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models).


In [4]:
# Model identifiers, exposed as Colab form fields via the @param annotations.
MODEL_ID_PRO = "gemini-1.5-pro-001"  # @param {type:"string"}
MODEL_ID_FLASH = "gemini-1.5-flash-001" # @param {type:"string"}

# One GenerativeModel instance per model ID.
model_pro = GenerativeModel(MODEL_ID_PRO)
model_flash = GenerativeModel(MODEL_ID_FLASH)

### Add data path

In [5]:
# Path strings for the prototype and production datasets.
# NOTE(review): presumably object-name prefixes inside BUCKET_NAME - confirm
# against the cells that consume them.
# NOTE(review): "finanace" looks like a misspelling of "finance"; it is left
# untouched because it has to match the actual storage layout.
prototype_data = "multimodal-finanace-qa/data/unstructured/prototype/"  # @param {type:"string"}
production_data = "multimodal-finanace-qa/data/unstructured/production/"  # @param {type:"string"}

## Manual Review Process for Multimodal RAG Question-Answer-Citation Pairs

**SME Review (1-5 SMEs)**

**Thorough Review:**

*   **Answer Quality:**  Assess if the generated answer is the most accurate and comprehensive representation of the expected response. Modify if necessary.
    *   **Voting:**
        *   **Accurate (1/0):** Indicate whether the answer is accurate and relevant.
        *   **Comprehensive (1/0):** Indicate whether the answer is comprehensive and addresses all aspects of the question.
        *   **Well-written (1/0):** Indicate whether the answer is well-written and easy to understand.

*   **Question Clarity:** Evaluate if the generated question effectively captures the intended query. Refine if needed.
    *   **Voting:**
        *   **Clear (1/0):** Indicate whether the question is clear and unambiguous.
        *   **Relevant (1/0):** Indicate whether the question is relevant to the context.
        *   **Concise (1/0):** Indicate whether the question is concise and to the point.

*   **Citation Validity:** Verify the authenticity and relevance of the provided citation.
    *   **Voting:**
        *   **Authentic (1/0):** Indicate whether the citation is from a reliable source.
        *   **Relevant (1/0):** Indicate whether the citation supports the answer.
        *   **Accessible (1/0):** Indicate whether the citation is easily accessible.

*   **Question Type Alignment:** Confirm that the assigned 'question_type' accurately reflects the nature of the generated question.
    *   **Voting:**
        *   **Accurate (1/0):** Indicate whether the question type is accurate.

*   **Compliance & Policy Adherence:** Flag any question-answer-citation pairs that violate internal compliance guidelines, policies, or expectations.
    *   **Voting:**
        *   **Compliant (1/0):** Indicate whether the pair adheres to internal policies.
        *   **Sensitive Content (1/0):** Indicate whether the pair contains sensitive or offensive content.
        *   **Bias (1/0):** Indicate whether the pair exhibits any bias.

*   **Strategies & Policies Adherence:** Evaluate if the question-answer-citation pair aligns with the defined strategies and policies for RAG data generation.
    *   **Voting:**
        *   **Adheres to Strategies (1/0):** Indicate whether the pair aligns with the defined strategies.
        *   **Adheres to Policies (1/0):** Indicate whether the pair adheres to the defined policies.

**Additional Voting for all methods**

*   **Drop (1/0):** Indicate whether the pair should be discarded entirely.
*   **Modify (1/0):** Signal if the pair requires modification.
*   **Correct Citation (1/0):** Specify if the citation needs correction.
*   **Confidence (1-5):** Indicate the SME's confidence in their evaluation.

**Preserve Modified Pairs:** Save the modified question-answer-citation pairs for potential use as few-shot examples in future model fine-tuning.

**Remember:**

*   The manual review process is crucial for ensuring the quality and reliability of the RAG system's outputs.
*   By incorporating diverse SME perspectives and implementing robust review strategies, you can enhance the accuracy, fairness, and overall effectiveness of your LLM-powered RAG system.

**Please note:** This checklist is a starting template. Customize it to align with your specific project requirements and internal review workflows.

## Manual Review Process for Multimodal RAG Question-Answer-Citation Pairs (Summary Table)

**SME Review (1-5 SMEs)**

| Review Aspect | Voting Criteria | Description |
|---|---|---|
| **Answer Quality** | Accurate (1/0) | Is the answer factually correct and pertinent to the question? |
|  | Comprehensive (1/0) | Does the answer fully address all aspects of the query? |
|  | Well-written (1/0) | Is the answer clear, concise, and grammatically sound? |
| **Question Clarity** | Clear (1/0) | Is the question unambiguous and easy to understand? |
|  | Relevant (1/0) | Does the question pertain to the given context or topic? |
|  | Concise (1/0) | Is the question formulated in a brief and to-the-point manner? |
| **Citation Validity** | Authentic (1/0) | Is the citation from a reputable and trustworthy source? |
|  | Relevant (1/0) | Does the citation directly support the provided answer? |
|  | Accessible (1/0) | Can the citation be easily located and verified? |
| **Question Type Alignment** | Accurate (1/0) | Does the assigned 'question_type' correctly reflect the nature of the question? |
| **Compliance & Policy Adherence** | Compliant (1/0) | Does the pair adhere to internal guidelines and policies? |
|  | Sensitive Content (1/0) | Does the pair contain any potentially offensive or harmful material? |
|  | Bias (1/0) | Does the pair exhibit any form of prejudice or discrimination? |
| **Strategies & Policies Adherence** | Adheres to Strategies (1/0) | Is the pair aligned with the predefined data generation strategies? |
|  | Adheres to Policies (1/0) | Does the pair comply with the established data generation policies? |
| **Additional Voting** | Drop (1/0) | Should the pair be removed from the dataset entirely? |
|  | Modify (1/0) | Does the pair require adjustments or corrections? |
|  | Correct Citation (1/0) | Is the citation inaccurate or in need of revision? |
|  | Confidence (1-5) | How confident is the SME in their overall evaluation of the pair? |

**Preserve Modified Pairs:** Save the modified question-answer-citation pairs for potential use as few-shot examples in future model fine-tuning.

**Remember:**

*   The manual review process is crucial for ensuring the quality and reliability of the RAG system's outputs.
*   By incorporating diverse SME perspectives and implementing robust review strategies, you can enhance the accuracy, fairness, and overall effectiveness of your LLM-powered RAG system.

## Data Generation

In [6]:
#@title Defining Variables

# Catalogue of question types used to steer question generation.
# Each key is a question-type name; each value describes that type with the
# same four fields: "Description", "Complexity", "Reasoning" and "Examples".
# "Examples" is always a list of sample questions, even when it holds only one,
# so every entry can be consumed the same way.
question_types_dict = {
    "Factual Extraction Questions": {
        "Description": "Seek specific information or facts directly from retrieved multimodal data.",
        "Complexity": "Low to Moderate",
        "Reasoning": "Minimal to Basic",
        "Examples": [
            "What was the company's net income in Q2 2023? (from financial report)",
            "Identify the key trends in the company's stock price over the past year. (from financial graph)",
            "Who is the CEO of the company? (from launch video or earnings call audio)"
        ]
    },
    "Synthesis Questions": {
        "Description": "Require combining information from multiple parts or modalities of retrieved data.",
        "Complexity": "Moderate",
        "Reasoning": "Integration of information from various sources.",
        "Examples": [
            "Summarize the company's financial performance for the fiscal year based on the annual report and earnings call transcript.",
            "What are the key features of the new product as highlighted in the launch video and discussed in the earnings call?"
        ]
    },
    "Inferential Questions": {
        "Description": "Require going beyond explicit information and making deductions or drawing conclusions from multimodal data.",
        "Complexity": "High",
        "Reasoning": "Logical reasoning, potentially involving cross-modal inference and application of financial knowledge.",
        "Examples": [
            "Based on the CEO's tone in the earnings call and the company's financial performance in the recent quarter, what are the potential challenges the company might be facing?",
            "What is the company's strategic focus for the next year based on the product roadmap presented in the launch video and the discussions in the earnings call?"
        ]
    },
    "Multi-hop Reasoning Questions": {
        "Description": "Involve chaining multiple facts or evidence across modalities to arrive at an answer.",
        "Complexity": "Very High",
        "Reasoning": "Complex reasoning across different data types, potentially involving causal or temporal relationships.",
        "Examples": [
            "How did the new product launch, as detailed in the launch video, impact the company's revenue in the subsequent quarter, as reported in the financial statements?",
            "Identify any inconsistencies between the financial projections mentioned in the earnings call and the actual results presented in the quarterly report."
        ]
    },
    "Comparative Questions": {
        "Description": "Explicitly ask for comparisons between entities, events, or concepts across modalities.",
        "Complexity": "Moderate",
        "Reasoning": "Identification and comparison of relevant attributes across different data types.",
        "Examples": [
            "Compare the company's revenue growth in the past two quarters as shown in the financial charts.",
            "How do the features of the newly launched product, as shown in the video, compare to those of the previous generation product mentioned in the last year's annual report?"
        ]
    },
    "Temporal Questions": {
        "Description": "Involve understanding the timeline of events or changes in states across multimodal data.",
        "Complexity": "Moderate",
        "Reasoning": "Extraction and alignment of temporal information from various sources.",
        "Examples": [
            "Trace the evolution of the company's product line over the past five years based on launch videos and annual reports.",
            "How has the company's profit margin changed over the last three fiscal years as shown in the financial statements?"
        ]
    },
    "Hypothetical Questions": {
        "Description": "Ask about potential outcomes or scenarios based on multimodal data.",
        "Complexity": "High",
        "Reasoning": "Combines information from various modalities with financial knowledge and makes predictions or inferences.",
        "Examples": [
            "If the company were to expand into a new market, as hinted at in the earnings call, what potential impact could it have on its revenue?",
            "What could be the potential challenges if the company decides to discontinue one of its product lines, based on the information in the latest earnings call and product launch videos?"
        ]
    },
    "Open-Ended Questions": {
        "Description": "Do not have a single correct answer, invite opinions, discussions, or creative responses based on multimodal data.",
        "Complexity": "High",
        "Reasoning": "Involves synthesis, inference, and potentially generation of novel ideas, drawing from various modalities.",
        "Examples": [
            "What are the key strengths and weaknesses of the company's current business strategy based on all available information?",
            "Discuss the potential risks and opportunities associated with investing in this company."
        ]
    },
    "Code Generation/Debugging Questions (Financial Context)": {
        "Description": "Target understanding and generation of code or identification of errors, specifically related to financial analysis or modeling.",
        "Complexity": "High",
        "Reasoning": "Combines information retrieval from various modalities with logical reasoning about financial code and data.",
        "Examples": [
            "Write a Python script to visualize the company's quarterly revenue trend over the past two years using the data from the financial reports.",
            "Identify any potential errors in the calculation of the company's debt-to-equity ratio in the provided financial spreadsheet."
        ]
    },
    "Ambiguous Questions": {
        "Description": "Can have multiple interpretations or require clarification, especially when dealing with multimodal data.",
        "Complexity": "High",
        "Reasoning": "Identifying ambiguities across different modalities, seeking clarification, or providing multiple interpretations.",
        "Examples": [
            "Tell me about the company's performance. (Ambiguous - could refer to financial, operational, or other aspects)",
            "What is the outlook? (Ambiguous - could be financial outlook, product outlook, or market outlook)"
        ]
    },
    "Contextual Questions": {
        "Description": "Rely on understanding the broader context of a conversation or series of interactions, considering multimodal data.",
        "Complexity": "High",
        "Reasoning": "Tracking previous queries and responses across modalities, identifying entities and their relationships.",
        "Examples": [
            "Can you show me a graph illustrating that trend? (Contextual - assumes a previous mention of a specific trend)",
            "Explain the impact of that event on the company's stock price. (Contextual - refers to a previously discussed event)"
        ]
    },
    "Subjective Questions": {
        "Description": "Ask for opinions, preferences, or value judgments based on multimodal data, potentially requiring nuanced understanding of financial implications.",
        "Complexity": "High",
        "Reasoning": "Identifying diverse perspectives across modalities, acknowledging subjectivity, and potentially offering a balanced or personalized response.",
        "Examples": [
            "Is the company's current CEO doing a good job?",
            "Should the company consider diversifying its product portfolio?"
        ]
    },
    "Creative Questions": {
        "Description": "Call for imaginative or innovative responses based on multimodal data, potentially involving financial storytelling or idea generation.",
        "Complexity": "Very High",
        "Reasoning": "Combining information from various modalities with creative thinking, generating novel connections, or producing original content related to finance.",
        "Examples": [
            "Create a compelling narrative highlighting the company's key achievements and milestones over the years, drawing from launch videos and annual reports.",
            "Propose a new marketing campaign for the recently launched product, incorporating insights from the launch video and target audience analysis in the market research report."
        ]
    },
    "Domain-Specific Questions (Finance)": {
        "Description": "Pertain to specialized areas within finance, requiring in-depth knowledge and understanding of specific terminology and concepts across modalities.",
        "Complexity": "Varies",
        "Reasoning": "Necessitates access to relevant financial documents, potentially specialized language models, and understanding of financial jargon across text, audio, and visual data.",
        "Examples": [
            "Explain the concept of 'discounted cash flow' and how it's used in valuing the company.",
            "Analyze the company's risk exposure based on the information in the financial statements and risk factors discussed in the earnings call."
        ]
    },
    "Counterfactual Questions": {
        "Description": "Explore 'what if' scenarios by altering past facts or events.",
        "Complexity": "Very High",
        "Reasoning": "Understanding cause-and-effect, simulating alternative realities, predicting outcomes.",
        "Examples": [
            "How would the company's stock price have been affected if they had not acquired that competitor last year, based on the market conditions at that time?"
        ]
    },
    "Ethical/Moral Questions": {
        "Description": "Delve into the ethical or moral implications of a company's actions or decisions.",
        "Complexity": "High",
        "Reasoning": "Understanding ethical frameworks, societal norms, and potentially generating nuanced arguments.",
        "Examples": [
            "Was the company's decision to lay off a significant portion of its workforce during the economic downturn morally justifiable?"
        ]
    },
    "Causal Questions": {
        "Description": "Seek to identify cause-and-effect relationships between events or factors.",
        "Complexity": "High",
        "Reasoning": "Disentangling complex relationships, considering multiple variables, identifying confounding factors.",
        "Examples": [
            "What were the key factors that contributed to the company's significant increase in market share over the past two years?"
        ]
    },
    "Prediction Questions": {
        "Description": "Ask for forecasts or predictions about future events or trends.",
        "Complexity": "High",
        "Reasoning": "Identifying patterns, trends, and potentially utilizing predictive modeling.",
        "Examples": [
            "Based on the company's current financial performance and market trends, what is the likely outlook for its stock price in the next six months?"
        ]
    },
    "Anomaly Detection Questions": {
        "Description": "Aim to identify unusual or unexpected patterns or data points.",
        "Complexity": "High",
        "Reasoning": "Understanding normal patterns and deviations, potentially involving statistical analysis.",
        "Examples": [
            "Are there any unusual fluctuations in the company's cash flow statement that warrant further investigation?"
        ]
    }
}

# Response schema for the Pro model, written in OpenAPI 3.0 style.
def _citation_array(fields, description):
    """Return an array-of-objects schema in which every field is a required string.

    Args:
        fields: Mapping of field name to its description, in output order.
        description: Description attached to the array itself.
    """
    return {
        "type": "array",
        "items": {
            "type": "object",
            "properties": {
                field_name: {"type": "string", "description": field_description}
                for field_name, field_description in fields.items()
            },
            "required": list(fields),
        },
        "description": description,
    }


response_schema_gemini_pro = {
    "type": "object",
    "properties": {
        "question": {
            "type": "string",
            "description": "The question posed to the files.",
        },
        "answer": {
            "type": "string",
            "description": "The generated answer to the question.",
        },
        # One citation list per modality; all three share the same structure.
        "text_citation": _citation_array(
            {
                "file_name": "Name of the PDF file.",
                "page_number": "Page number in the PDF file.",
                "text": "Exact text from the PDF file.",
            },
            "List of text citations from PDF files.",
        ),
        "audio_citation": _citation_array(
            {
                "file_name": "Name of the MP3 file.",
                "timestamp_range": "Timestamp range in the MP3 file.",
                "transcript": "Exact excerpt of transcript from the MP3 file.",
            },
            "List of audio citations from MP3 files.",
        ),
        "video_citation": _citation_array(
            {
                "file_name": "Name of the MP4 file.",
                "timestamp_range": "Timestamp range in the MP4 file.",
                "transcript": "Exact excerpt of transcript from the MP4 file.",
            },
            "List of video citations from MP4 files.",
        ),
    },
    "required": ["question", "answer", "text_citation", "audio_citation", "video_citation"],
}

# Free-text description of the expected output layout; judging by the name it
# is the Flash-model counterpart of response_schema_gemini_pro.
# NOTE(review): the doubled braces suggest this text is later passed through
# str.format (or embedded in an f-string) - confirm at the point of use.
# NOTE(review): the three citation lines have no trailing commas, so the
# template is not valid JSON as written.
response_schema_gemini_flash = """{{
"question": "generated question",
"answer": "generate answer",
"text_citation":  [if pdf: list of all text citations. mention all the file name, page_number and exact text in dictionary.]
"audio_citation": [if mp3: list of all audio citations. mention all the file name, timestamp and exact excerpt of transcript in dictionary.]
"video_citation": [if mp4: list of all video citations. mention all the file name, timestamp and exact excerpt of transcript in dictionary.]
}}
"""

In [7]:
#@title Defining Prompts

# prompt

# Fusion Generation Prompt
# Task instruction used when fusion generation is requested: asks for
# questions whose answers combine information from several file types.
# NOTE(review): the variable name is misspelled ("fushion"); it is referenced
# under this spelling by get_gemini_response_json, so a rename has to touch
# both places.

fushion_generation_task_prompt = """Task: I need you to generate questions where the answers require synthesizing information across multiple files (PDF, video, and audio). The answer should NOT be found within a single file; it should combine insights from at least two different file types.
Example to follow:

Consider the following files:
* "Climate Change Impacts" (PDF)
* "Renewable Energy Solutions" (Video)
* "Expert Panel on Sustainability" (Audio)

Pay attention to how the concept of 'sustainability' is discussed in both the video and the audio file.

Do not generate questions that can be answered with a simple fact or definition. The ideal question should require analysis, comparison, or evaluation across different sources.

Generate a questions that meet these criteria.

Follow the context and guidlines below:
"""

# Task instruction used when fusion generation is not requested.
# NOTE(review): the continuation lines are indented inside the string literal,
# so that whitespace is part of the prompt text.
normal_generation_task_prompt = """Task: Generate a question, its relevant context, and the corresponding answer based on the provided multimodal data.
    Follow the context and guidlines below:
    """

# Guidelines Prompt
# Rules for how the question, the answer and the citations should be written.
guidelines_prompt = """Guidelines:

Question:
- Formulate a clear, focused question directly targeting specific information or facts evident in the data.
- Ensure alignment with the specified question type and its intended complexity level.
- Avoid ambiguous or overly broad questions.
- Do not mention "in this video/audio/pdf" or anything that reference the filenames in the question.
- If the question is based on video, audio, then don't mention "speaker" or reference the file.
- If the question is something related to time based entity, mention the time values like year, quarters, months, day etc.

Answer:
- Provide a concise, accurate answer directly addressing the question.
- Ensure the answer is fully supported by the context.
- If reasoning/inference is involved: demonstrate the logical steps.
- If insufficient information: state "Answer not found in the provided data" .

Citations:
- Provide accurate, complete citations including source titles and identifiers (page numbers, timestamps).
- Cite multiple sources if applicable.
- Follow the Output Format.

"""


In [46]:
#@title Data Generation Helper Functions


def generate_questions_and_responses(total_run=1, total_ques_per_run=1,
                                     question_types=None, model="pro", fusion_generation=False,
                                     gs_path_list=None, response_schema=None):
    """
    Collect generated question/answer/citation records from Gemini.

    Each run draws one random question type and one random set of files, then
    asks for ``total_ques_per_run`` responses for that combination. A request
    that raises is reported on stdout and skipped, so the returned list may
    hold fewer than ``total_run * total_ques_per_run`` items.

    Args:
        total_run (int): Number of runs (outer iterations).
        total_ques_per_run (int): Number of responses requested per run; also
            the upper bound on how many files are selected for the run.
        question_types (list): Question-type names to draw from. When None, a
            built-in subset of the keys of ``question_types_dict`` is used.
        model (str): Model selector forwarded to ``get_gemini_response_json``
            ("pro" or "flash").
        fusion_generation (bool): Whether to use the fusion-generation prompt.
        gs_path_list (list): GCS paths from which the files are sampled.
        response_schema (dict): Response schema forwarded to the model call.

    Returns:
        list: The collected responses. Each one gets a 'file_type' entry that
        lists the extension of every file selected for its run.
    """
    if question_types is None:
        question_types = ['Factual Extraction Questions', 'Synthesis Questions', 'Inferential Questions',
                          'Multi-hop Reasoning Questions', 'Comparative Questions', 'Temporal Questions',
                          'Ethical/Moral Questions', 'Prediction Questions', 'Anomaly Detection Questions']

    collected = []

    for run_index in range(total_run):
        print("running count: ",run_index+1)
        question_type = random.choice(question_types)
        print("question_type", question_type)
        type_details = question_types_dict[question_type]
        selected_paths = select_random_gcs_paths_with_mime_types(gs_path_list, (1, total_ques_per_run))
        print("Selected Files: ", selected_paths)

        # The same options are reused for every request of this run.
        request_options = {
            "model": model,
            "response_schema": response_schema,
            "fusion_generation": fusion_generation,
            "file_configs": selected_paths,
        }

        for _ in range(total_ques_per_run):
            try:
                response = get_gemini_response_json(question_type, type_details, **request_options)
                # Record the extension (text after the last dot) of each selected file.
                response['file_type'] = [entry['gcs_path'].rsplit(".", 1)[-1] for entry in selected_paths]
                collected.append(response)
            except Exception as e:
                # Best effort: report the failure and move on to the next request.
                print("Error occurred. Skipping. Error: ", e)

    return collected

def get_blob_uri(bucket_name: str, blob_name: str) -> str:
    """Build the ``gs://`` URI that addresses a blob in Google Cloud Storage.

    Args:
        bucket_name: Name of the GCS bucket.
        blob_name: Name of the blob inside that bucket.

    Returns:
        The URI in the form ``gs://<bucket_name>/<blob_name>``.
    """
    return "gs://{}/{}".format(bucket_name, blob_name)


def get_gs_paths_for_production_media(bucket: Bucket, production_data: str) -> List[str]:
  """
  Retrieves Google Storage URIs for media files (PDF, MP4, MP3)
  within a specified production data prefix.

  The prefix is passed to ``list_blobs`` so that only matching objects are
  listed, instead of enumerating the whole bucket and filtering afterwards.

  Args:
      bucket: The Google Cloud Storage bucket to search.
      production_data: The blob-name prefix indicating production data.

  Returns:
      A list of Google Storage URIs (gs://...) for the matching media files,
      in the order the blobs are listed.
  """
  media_extensions = (".pdf", ".mp4", ".mp3")

  return [
      get_blob_uri(bucket.name, blob.name)
      for blob in bucket.list_blobs(prefix=production_data)
      # Extension match is case-sensitive.
      if blob.name.endswith(media_extensions)
  ]

def select_random_gcs_paths_with_mime_types(gcs_path_list, selection_range):
    """
    Randomly selects GCS paths and returns them with their corresponding MIME types.

    The requested range is clamped to the number of available paths, so asking
    for more files than exist selects all of them instead of failing.

    Args:
        gcs_path_list: A list of GCS paths to choose from.
        selection_range: A tuple (min_files, max_files) specifying the
                         minimum and maximum number of files to select.

    Returns:
        A list of dictionaries, each containing a "gcs_path" and its "mime_type".
        Paths with an unrecognised extension get 'application/octet-stream'.

    Raises:
        ValueError: If gcs_path_list is empty, or if min_files is greater
            than max_files.
    """
    if not gcs_path_list:
        raise ValueError("gcs_path_list is empty; there are no files to select from")

    min_files, max_files = selection_range

    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap both bounds at the number of available paths.
    available = len(gcs_path_list)
    lower_bound = min(min_files, available)
    upper_bound = min(max_files, available)

    num_files_to_select = random.randint(lower_bound, upper_bound)
    selected_paths = random.sample(gcs_path_list, num_files_to_select)

    # Map file extensions to MIME types
    extension_mime_type_mapping = {
        'pdf': 'application/pdf',
        'mp3': 'audio/mpeg',
        'mp4': 'video/mp4'
    }

    result = []
    for path in selected_paths:
        # splitext keeps the leading dot; strip it and lowercase for the lookup.
        file_extension = os.path.splitext(path)[1][1:].lower()
        # Default to 'application/octet-stream' for unknown extensions
        mime_type = extension_mime_type_mapping.get(file_extension, 'application/octet-stream')
        result.append({"gcs_path": path, "mime_type": mime_type})

    return result


import random
import os




def get_gemini_response_json(question_type, question_type_description,
                             model="flash", response_schema=None,
                             fusion_generation=False,
                             file_configs=None):
  """Asks Gemini for one question/answer record about the given files.

  The prompt is assembled from the fusion or normal task instruction, the
  question type and its description, the file list and the shared guidelines.
  For the "pro" model the JSON shape is enforced through ``response_schema``;
  for the "flash" model the expected output format is embedded in the prompt.

  Args:
      question_type: Name of the question category to generate.
      question_type_description: Mapping describing the category. It is
          interpolated into the prompt and must contain a 'Complexity' key.
      model: Which Gemini model to use, "pro" or "flash".
      response_schema: Schema passed to GenerationConfig; only used for "pro".
      fusion_generation: If True, use the fusion task prompt instead of the
          normal one.
      file_configs: List of dicts with "gcs_path" and "mime_type" keys, one
          per file to attach. Defaults to no files.

  Returns:
      The model's JSON response parsed into a dict, with "source_file",
      "question_type" and "question_type_description" keys added.

  Raises:
      ValueError: If ``model`` is neither "pro" nor "flash".
      json.JSONDecodeError: If the model's response text is not valid JSON.
  """
  # Fail fast: without this check an unknown model only surfaced later as an
  # unbound `cg_model` variable.
  if model not in ("pro", "flash"):
    raise ValueError(f"Unsupported model {model!r}; expected 'pro' or 'flash'")

  # A None default avoids sharing one mutable list object between calls.
  if file_configs is None:
    file_configs = []

  # control generation for JSON using Gemini Flash
  if fusion_generation:
    print("Fusion Generation")
    instruction = fushion_generation_task_prompt
  else:
    print("Normal Generation")
    instruction = normal_generation_task_prompt

  question_context_answer_prompt = f"""{instruction}

Question Type: {question_type}

Question Type Description: {question_type_description}

File Sources: {file_configs}

{guidelines_prompt}

{"Output Format: "+response_schema_gemini_flash if model=='flash' else ""}
"""

  if model == "pro":
    print("Using Gemini 1.5 Pro model for Data Generation with provided OpenAPI schema")
    cg_model = GenerativeModel(
        model_name="gemini-1.5-pro",
        generation_config=GenerationConfig(
            response_mime_type="application/json", response_schema=response_schema
        ),
    )
  else:
    print("Using Gemini 1.5 Flash model for Data Generation with provided schema in prompt")
    cg_model = GenerativeModel(
        model_name="gemini-1.5-flash",
        generation_config={"response_mime_type": "application/json"},
    )

  # Dynamically create Part objects based on file configurations
  content = [question_context_answer_prompt]
  content.extend(
      Part.from_uri(config['gcs_path'], mime_type=config['mime_type'])
      for config in file_configs
  )

  # NOTE(review): the prompt is sent both before and after the file parts.
  # Kept as-is to preserve the model input; confirm the repetition is intended.
  content.append(question_context_answer_prompt)

  response = cg_model.generate_content(content)

  json_response = json.loads(response.text)

  # Attach provenance so each record can be traced back to its inputs.
  json_response["source_file"] = [config['gcs_path'] for config in file_configs]
  json_response["question_type"] = question_type
  json_response["question_type_description"] = question_type_description['Complexity']

  return json_response

def generate_questions_and_responses(total_run=1, total_ques_per_run=1,
                                     min_file_select=1, max_file_select=2,
                                     question_types=None, model="pro", fusion_generation=False,
                                     gs_path_list=None, response_schema=None,
                                     output_format=None, output_path=None):

    """
    Generates questions and responses based on provided parameters.

    Each run picks one random question type and one random set of files; every
    question generated within that run uses that same type and file set.
    A failed generation is printed and skipped, so fewer than
    total_run * total_ques_per_run records may be produced.

    Args:
        total_run (int): The number of outer loops to run.
        total_ques_per_run (int): The number of questions to generate per run.
        min_file_select (int): Minimum number of files to select per run.
        max_file_select (int): Maximum number of files to select per run.
        question_types (list): A list of question types to choose from. If None, all types will be used.
        model (str): The model to use for generating responses ("pro" or "flash").
        fusion_generation (bool): Whether to use fusion generation.
        gs_path_list (list): A list of GCS paths to select files from.
        response_schema (dict): The response schema to use.
        output_format (str): "return_df" to return a DataFrame, "save_df" to
            write a CSV to output_path. Any other value returns the raw list.
        output_path (str): The CSV path to write when output_format is "save_df".

    Returns:
        pd.DataFrame if output_format is "return_df"; None if output_format is
        "save_df" (the data is written to output_path instead); otherwise the
        list of generated response dicts.

    Raises:
        ValueError: If output_format is "save_df" but no output_path is given.
    """

    # Validate before generating anything: to_csv(None) only returns a string,
    # so a missing path would silently discard every generated record.
    if output_format == "save_df" and not output_path:
        raise ValueError("output_path is required when output_format='save_df'")

    final_data = []

    if question_types is None:
        question_types = ['Factual Extraction Questions', 'Synthesis Questions', 'Inferential Questions',
                          'Multi-hop Reasoning Questions', 'Comparative Questions', 'Temporal Questions',
                          'Ethical/Moral Questions', 'Prediction Questions', 'Anomaly Detection Questions']

    for outer_logic in range(total_run):
        print("running count: ",outer_logic+1)
        question_type = random.choice(question_types)
        print("question_type", question_type)
        question_type_description = question_types_dict[question_type]
        selected_paths = select_random_gcs_paths_with_mime_types(
            gs_path_list, (min_file_select, max_file_select)
        )
        print("Selected Files: ", selected_paths)

        for _ in range(total_ques_per_run):
            try:
                response = get_gemini_response_json(
                    question_type,
                    question_type_description,
                    model=model,
                    response_schema=response_schema,
                    fusion_generation=fusion_generation,
                    file_configs=selected_paths
                )
                # Record the extension of every source file alongside the answer.
                response['file_type'] = [each['gcs_path'].split(".")[-1] for each in selected_paths]
                final_data.append(response)
            except Exception as e:
                # Best effort: one failed generation must not abort the whole batch.
                print("Error occurred. Skipping. Error: ", e)
                continue

    if output_format == "return_df":
        return pd.DataFrame(final_data)
    if output_format == "save_df":
        print("saving the final datafram at: ", output_path)
        pd.DataFrame(final_data).to_csv(output_path, index=False)
        return None
    return final_data  # Default behavior if no output_format is specified

In [17]:
# Collect the gs:// URIs of the PDF/MP4/MP3 blobs stored under the production prefix.
gs_path_list = get_gs_paths_for_production_media(bucket, production_data)

In [38]:
# Using Gemini 1.5 Pro to generate question-answer pairs with a random question category:
# two runs, one question per run.
# Each run randomly selects 2 or 3 files.
# fusion_generation=True, since each question should fuse information across the selected files.

pro_results_2_3 = generate_questions_and_responses(
    total_run=2,
    total_ques_per_run=1,
    min_file_select = 2,
    max_file_select = 3,
    model="pro",
    fusion_generation=True,  # Enable fusion generation
    gs_path_list=gs_path_list,
    response_schema=response_schema_gemini_pro,
)

running count:  1
question_type Temporal Questions
Selected Files:  [{'gcs_path': 'gs://mlops-for-genai/multimodal-finanace-qa/data/unstructured/production/reports/2020/quaterly_report/20200429-alphabet-10q.pdf', 'mime_type': 'application/pdf'}, {'gcs_path': 'gs://mlops-for-genai/multimodal-finanace-qa/data/unstructured/production/podcast/episode2.mp3', 'mime_type': 'audio/mpeg'}, {'gcs_path': 'gs://mlops-for-genai/multimodal-finanace-qa/data/unstructured/production/earning_call/Alphabet 2023 Q3 Earnings Call (128 kbps).mp3', 'mime_type': 'audio/mpeg'}]
Fusion Generation
Using Gemini 1.5 Pro model for Data Generation with provided OpenAPI schema
running count:  2
question_type Synthesis Questions
Selected Files:  [{'gcs_path': 'gs://mlops-for-genai/multimodal-finanace-qa/data/unstructured/production/earning_call/Alphabet_2023_Q1_Earnings_Call.mp3', 'mime_type': 'audio/mpeg'}, {'gcs_path': 'gs://mlops-for-genai/multimodal-finanace-qa/data/unstructured/production/reports/2022/annual_repo

In [39]:
# Using Gemini 1.5 Pro to generate a single question-answer pair with a random question category.
# Exactly one file is selected (min_file_select = max_file_select = 1),
# so fusion_generation is left at its default (False).
# output_format='return_df' returns a DataFrame rather than a list of dicts.

pro_results_2_3_df = generate_questions_and_responses(
    total_run=1,
    total_ques_per_run=1,
    min_file_select = 1,
    max_file_select = 1,
    model="pro",
    # fusion_generation=True,  # Enable fusion generation
    gs_path_list=gs_path_list,
    response_schema=response_schema_gemini_pro,
    output_format='return_df',
)

running count:  1
question_type Factual Extraction Questions
Selected Files:  [{'gcs_path': 'gs://mlops-for-genai/multimodal-finanace-qa/data/unstructured/production/product_launch/gemini/Guessing movies with AI  Testing Gemini.mp4', 'mime_type': 'video/mp4'}]
Normal Generation
Using Gemini 1.5 Pro model for Data Generation with provided OpenAPI schema


In [40]:
pro_results_2_3_df.head()

Unnamed: 0,answer,audio_citation,question,text_citation,video_citation,source_file,question_type,question_type_description,file_type
0,The movie title guessed by Gemini using the im...,[],What is the movie title guessed by Gemini when...,[],[{'file_name': 'gs://mlops-for-genai/multimoda...,[gs://mlops-for-genai/multimodal-finanace-qa/d...,Factual Extraction Questions,Low to Moderate,[mp4]


In [47]:
# Using Gemini 1.5 Pro to generate a single question-answer pair with a random question category.
# One or two files are selected at random.
# fusion_generation=True, so the fusion task prompt is used.
# output_format='save_df' writes the results as CSV to output_path; nothing is returned,
# so pro_results_2_3_csv is None.

pro_results_2_3_csv = generate_questions_and_responses(
    total_run=1,
    total_ques_per_run=1,
    min_file_select = 1,
    max_file_select = 2,
    model="pro",
    fusion_generation=True,  # Enable fusion generation
    gs_path_list=gs_path_list,
    response_schema=response_schema_gemini_pro,
    output_format='save_df',
    output_path='/content/run_gen_pro_1112_fushion.csv',
)

running count:  1
question_type Comparative Questions
Selected Files:  [{'gcs_path': 'gs://mlops-for-genai/multimodal-finanace-qa/data/unstructured/production/earning_call/Alphabet 2024 Q2 Earnings Call (128 kbps).mp3', 'mime_type': 'audio/mpeg'}]
Fusion Generation
Using Gemini 1.5 Pro model for Data Generation with provided OpenAPI schema
saving the final datafram at:  /content/run_gen_pro_1112_fushion.csv


In [20]:
# Using Gemini 1.5 Pro to generate question-answer pairs with a random question category:
# two runs, one question per run.
# Only one file is selected per run,
# so fusion_generation is left at its default (False).

pro_results_1_1 = generate_questions_and_responses(
    total_run=2,
    total_ques_per_run=1,
    min_file_select = 1,
    max_file_select = 1,
    model="pro",
    gs_path_list=gs_path_list,
    response_schema=response_schema_gemini_pro
)

running count:  1
question_type Inferential Questions
Selected Files:  [{'gcs_path': 'gs://mlops-for-genai/multimodal-finanace-qa/data/unstructured/production/product_launch/gemini/Guessing movies with AI  Testing Gemini.mp4', 'mime_type': 'video/mp4'}]
Normal Generation
Using Gemini 1.5 Pro model for Data Generation with provided OpenAPI schema
running count:  2
question_type Synthesis Questions
Selected Files:  [{'gcs_path': 'gs://mlops-for-genai/multimodal-finanace-qa/data/unstructured/production/earning_call/Alphabet_2023_Q1_Earnings_Call.mp3', 'mime_type': 'audio/mpeg'}]
Normal Generation
Using Gemini 1.5 Pro model for Data Generation with provided OpenAPI schema


In [21]:
# Using Gemini 1.5 Flash to generate a single 'Synthesis Questions' question-answer pair from one file.
# No response_schema is passed: for the Flash model the output format is embedded in the prompt instead.

results_flash_1_1_sysnques = generate_questions_and_responses(
    total_run=1,
    total_ques_per_run=1,
    min_file_select = 1,
    max_file_select = 1,
    question_types=['Synthesis Questions'],
    model="flash",
    # fusion_generation=True,  # Enable fusion generation
    gs_path_list=gs_path_list
)

running count:  1
question_type Synthesis Questions
Selected Files:  [{'gcs_path': 'gs://mlops-for-genai/multimodal-finanace-qa/data/unstructured/production/podcast/episode4.mp3', 'mime_type': 'audio/mpeg'}]
Normal Generation
Using Gemini 1.5 Flash model for Data Generation with provided schema in prompt


## Data Combination

In [35]:
# Concatenate the result lists (lists of dicts) from the Pro and Flash runs
# and load them into a single DataFrame.

combined_list = pro_results_2_3 + pro_results_1_1 + results_flash_1_1_sysnques
combined_df = pd.DataFrame(combined_list)

In [36]:
combined_df.head(2)

Unnamed: 0,answer,audio_citation,question,text_citation,video_citation,source_file,question_type,question_type_description,file_type
0,Gemini is able to understand and explain the v...,[],What is Gemini able to do with images created ...,[],[{'file_name': 'Can AI understand new emojis ...,[gs://mlops-for-genai/multimodal-finanace-qa/d...,Factual Extraction Questions,Low to Moderate,"[mp3, mp4]"
1,The company saw a notable decrease in Sales & ...,[],Was there an unusual shift in the allocation o...,[{'file_name': '2021_Q1_Earnings_Transcript.pd...,[],[gs://mlops-for-genai/multimodal-finanace-qa/d...,Anomaly Detection Questions,High,"[mp4, pdf, pdf]"


In [None]:
# # Assuming you have three DataFrames: df1, df2, df3
# combined_df = pd.concat([df1, df2, df3])

# # Display the combined DataFrame
# print(combined_df)

In [90]:
# # if you have persisted each generation as csv, then you can run this logic

# # Path to your folder containing CSV files
# folder_path = "/content/"


# # Initialize an empty DataFrame to store the combined data
# combined_df = pd.DataFrame()

# # Iterate through all files in the folder
# for filename in os.listdir(folder_path):
#     if filename.endswith(".csv"):  # Consider only CSV files
#         file_path = os.path.join(folder_path, filename)
#         df = pd.read_csv(file_path)
#         combined_df = pd.concat([combined_df, df], ignore_index=True)

# combined_df.reset_index(inplace=True)

In [29]:
combined_df.shape

(5, 9)

In [30]:
combined_df.columns

Index(['answer', 'audio_citation', 'question', 'text_citation',
       'video_citation', 'source_file', 'question_type',
       'question_type_description', 'file_type'],
      dtype='object')

In [31]:
# Column order for the final dataset: question and answer first, then the
# question metadata, the citation columns, and finally the source-file info.
new_column_order = ['question', 'answer', 'question_type',
       'question_type_description', 'audio_citation',  'text_citation',
       'video_citation', 'source_file', 'file_type' ]

# Reorder columns
combined_df = combined_df[new_column_order]

In [34]:
combined_df.head(2)

Unnamed: 0,question,answer,question_type,question_type_description,audio_citation,text_citation,video_citation,source_file,file_type
0,What is Gemini able to do with images created ...,Gemini is able to understand and explain the v...,Factual Extraction Questions,Low to Moderate,[],[],[{'file_name': 'Can AI understand new emojis ...,[gs://mlops-for-genai/multimodal-finanace-qa/d...,"[mp3, mp4]"
1,Was there an unusual shift in the allocation o...,The company saw a notable decrease in Sales & ...,Anomaly Detection Questions,High,[],[{'file_name': '2021_Q1_Earnings_Transcript.pd...,[],[gs://mlops-for-genai/multimodal-finanace-qa/d...,"[mp4, pdf, pdf]"


In [33]:
# combined_df.to_csv("combined_data.csv", index=False)