In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

| | |
|-|-|
|Author(s) | [Lavi Nigam](https://github.com/lavinigam-gcp) |

Notebooks:
**goo.gle/io24-gemini-api**

Google AI Cookbook:
**goo.gle/google-ai-cookbook**

Vertex AI Cookbook:
**goo.gle/vertex-ai-cookbook**

## What do we want to build?



![](https://storage.googleapis.com/gemini-lavi-asset/img/output.png)

# Step 1 - Google Cloud Vertex AI Gemini API

In [None]:
# Library installation # needs restarts
! pip install --upgrade google-cloud-aiplatform
! pip install PyPDF2

In [None]:
# Authentication and project configuration

import sys
from google.cloud import storage

# Additional authentication is required for Google Colab
if "google.colab" in sys.modules:
    # Authenticate user to Google Cloud
    from google.colab import auth

    auth.authenticate_user()

# Define project information.
# This runs on every runtime, not only Colab: later cells use `bucket` and an
# initialized Vertex AI SDK, so guarding these lines behind the Colab check
# left them undefined everywhere else.
PROJECT_ID = ""  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}
BUCKET_NAME = "gemini-lavi-asset" # @param {type:"string"}

# Initialize Vertex AI
import vertexai

vertexai.init(project=PROJECT_ID, location=LOCATION)

# Initialize cloud storage
storage_client = storage.Client(project=PROJECT_ID)
bucket = storage_client.bucket(BUCKET_NAME)

In [None]:
# Library
from IPython.display import display, Markdown, Latex
import vertexai.generative_models as genai
import PyPDF2
import pandas as pd
import time
from vertexai.generative_models import (
    GenerativeModel,
    Part,
    GenerationConfig,
    HarmCategory,
    HarmBlockThreshold,
)
import PyPDF2
from io import BytesIO
from vertexai.preview.generative_models import Part
from google.cloud import storage
from io import BytesIO
from datetime import datetime, timezone
import numpy as np
import numpy as np
from typing import List
from IPython.display import display, Markdown, Latex
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
from vertexai.generative_models import (
    GenerativeModel,
    Part,
    GenerationConfig,
    HarmCategory,
    HarmBlockThreshold,
)
from typing import List
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
from google.cloud import storage
from io import BytesIO
from rich import print as rich_print
from rich.markdown import Markdown as rich_Markdown
import pickle
pd.options.mode.chained_assignment = None

## Building Metadata & Index

In [None]:
gemini_15_pro_new = genai.GenerativeModel("gemini-1.5-pro-preview-0514")
gemini_15_flash = genai.GenerativeModel("gemini-1.5-flash-preview-0514")

In [None]:
#@title Helper Functions

def get_gemini_response(model, generation_config=None,
                        safety_settings=None,
                        uri_path=None,mime_type=None, prompt=None):
  """Stream a response from `model` and return it as a single string.

  Args:
    model: Object exposing `generate_content(contents, generation_config=...,
      safety_settings=..., stream=True)`.
    generation_config: Generation settings; when falsy, a default of
      8192 max output tokens, temperature 1 and top_p 0.95 is used.
    safety_settings: Safety thresholds; when falsy, all four harm categories
      are set to BLOCK_ONLY_HIGH.
    uri_path: Optional "<bucket>/<object>" path (without the "gs://" scheme)
      of a file to send along with the prompt.
    mime_type: MIME type of the file at `uri_path`.
    prompt: The prompt. It is sent as `[file, prompt]` when `uri_path` is
      given and passed through unchanged otherwise.

  Returns:
    The concatenated text of all streamed chunks. A chunk whose `.text`
    raises ValueError contributes the literal string "blocked".
  """
  if not generation_config:
    generation_config = {
      "max_output_tokens": 8192,
      "temperature": 1,
      "top_p": 0.95,
    }

  if not safety_settings:
    safety_settings = {
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
    }

  if uri_path:
    # Attach the Cloud Storage file ahead of the prompt.
    file = genai.Part.from_uri(
      mime_type=mime_type,
      uri="gs://"+uri_path
      )
    contents = [file, prompt]
  else:
    # No file supplied: send the prompt alone instead of failing on
    # "gs://" + None.
    contents = prompt

  responses = model.generate_content(contents,
      generation_config=generation_config,
      safety_settings=safety_settings,
      stream=True,
  )
  final_response = []
  for response in responses:
    try:
      final_response.append(response.text)
    except ValueError:
      # `.text` raised, so this chunk has no usable text; keep a marker.
      final_response.append("blocked")

  return "".join(final_response)

def get_text_from_pdf(bucket):
  """Extract per-page text from every PDF under "production/" in `bucket`.

  Args:
    bucket: Object exposing `list_blobs(prefix=...)`; each blob must provide
      `name`, `bucket.name` and `download_as_bytes()`.

  Returns:
    A DataFrame with one row per page that yielded text and the columns
    'text_type' (folder path between "production/" and the file name),
    'gcs_path' (gs:// URI of the PDF), 'page_number' (1-based) and 'text'.
    The columns are present even when nothing was extracted.
  """
  columns = ['text_type', 'gcs_path', 'page_number', 'text']
  extracted_text = []
  for blob in bucket.list_blobs(prefix="production/"):
    if not blob.name.lower().endswith('.pdf'):  # Only PDFs are processed
      continue
    # Download the PDF into memory
    pdf_content = BytesIO(blob.download_as_bytes())
    text_type = "/".join(blob.name.split("/")[1:-1])
    try:
      pdf_reader = PyPDF2.PdfReader(pdf_content)
      pdf_data = []
      for page_number, page in enumerate(pdf_reader.pages, start=1):
        text = page.extract_text()
        if text:
          pdf_data.append(
              {
                  'text_type' : text_type,
                  'gcs_path': "gs://"+blob.bucket.name+"/"+blob.name,
                  'page_number': page_number,
                  'text': text
              }
          )
    except Exception as exc:
      # Best effort: skip the whole file, but report why it failed.
      print(f"Warning: Could not read PDF file '{blob.name}' (might be encrypted or corrupted): {exc}")
      continue
    # Pages are added only once the whole file has been read successfully.
    extracted_text.extend(pdf_data)
  return pd.DataFrame(extracted_text, columns=columns)

# 16-17 min
def get_text_from_video(bucket, model, prompt, time_sleep=5):
  """Describe every .mp4 under "production/" in `bucket` with `model`.

  Args:
    bucket: Object exposing `list_blobs(prefix=...)`; each blob must provide
      `name` and `bucket.name`.
    model: Model forwarded to `get_gemini_response`.
    prompt: Instruction sent together with each video.
    time_sleep: Seconds to pause after each video.

  Returns:
    A DataFrame with the columns 'video_gcs' (gs:// URI), 'video_type'
    (folder path between "production/" and the file name) and
    'video_description'. A video whose request raised gets a row with an
    empty description; a video with an empty response gets no row.
  """
  columns = ['video_gcs', 'video_type', 'video_description']
  video_metadata = []
  for blob in bucket.list_blobs(prefix="production/"):
    if not blob.name.lower().endswith('.mp4'):
      continue
    print("processing....",blob.name)
    video_type = "/".join(blob.name.split("/")[1:-1])
    # "<bucket>/<object>"; get_gemini_response is given the path without
    # the "gs://" scheme.
    object_path = blob.bucket.name+"/"+blob.name
    video_gcs = "gs://"+object_path
    try:
      video_description = get_gemini_response(uri_path=object_path,
                                              model=model,
                                              mime_type='video/mp4',
                                              prompt = prompt)
    except Exception as exc:
      print("Something Failed........", exc)
      # Record the failure under the same full URI used for successes so
      # the file can be identified and retried.
      video_metadata.append(
          {
              'video_gcs': video_gcs,
              'video_type':video_type,
              'video_description': ""
          }
      )
    else:
      if video_description:
        video_metadata.append(
            {
                'video_gcs': video_gcs,
                'video_type':video_type,
                'video_description': video_description
            }
        )
    time.sleep(time_sleep)
  return pd.DataFrame(video_metadata, columns=columns)

# ~20 min
def get_text_from_audio(bucket, model, prompt, time_sleep=5):
  """Describe every .mp3 under "production/" in `bucket` with `model`.

  Args:
    bucket: Object exposing `list_blobs(prefix=...)`; each blob must provide
      `name` and `bucket.name`.
    model: Model forwarded to `get_gemini_response`.
    prompt: Instruction sent together with each audio file.
    time_sleep: Seconds to pause after each file.

  Returns:
    A DataFrame with the columns 'audio_gcs' (gs:// URI), 'audio_type'
    (folder path between "production/" and the file name) and
    'audio_description'. A file whose request raised gets a row with an
    empty description; a file with an empty response gets no row.
  """
  columns = ['audio_gcs', 'audio_type', 'audio_description']
  audio_metadata = []
  for blob in bucket.list_blobs(prefix="production/"):
    if not blob.name.lower().endswith('.mp3'):
      continue
    print("processing....",blob.name)
    audio_type = "/".join(blob.name.split("/")[1:-1])
    # "<bucket>/<object>"; get_gemini_response is given the path without
    # the "gs://" scheme.
    object_path = blob.bucket.name+"/"+blob.name
    audio_gcs = "gs://"+object_path
    try:
      audio_description = get_gemini_response(uri_path=object_path,
                                              model=model,
                                              mime_type='audio/mpeg',
                                              prompt=prompt)
    except Exception as exc:
      print("Something Failed........", exc)
      # Record the failure under the same full URI used for successes so
      # the file can be identified and retried.
      audio_metadata.append(
          {
              'audio_gcs': audio_gcs,
              'audio_type':audio_type,
              'audio_description': ""
          }
      )
    else:
      if audio_description:
        audio_metadata.append(
            {
                'audio_gcs': audio_gcs,
                'audio_type':audio_type,
                'audio_description': audio_description
            }
        )
    time.sleep(time_sleep)
  return pd.DataFrame(audio_metadata, columns=columns)

def split_text_into_chunks(df, text_column, chunk_size):
    """Split `text_column` into fixed-size character chunks, one row per chunk.

    Args:
        df: Source DataFrame.
        text_column: Name of the column holding the text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A DataFrame with a fresh 0..n-1 index and the same columns as `df`.
        Each row repeats the source row's other values with `text_column`
        replaced by one chunk. Rows whose text is empty produce no output
        rows; an empty `df` yields an empty frame with the same columns.

    Raises:
        ValueError: If `chunk_size` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Collect plain records and build the frame once, rather than creating
    # and concatenating one single-row DataFrame per chunk.
    records = []
    for _, row in df.iterrows():
        text = row[text_column]
        for start in range(0, len(text), chunk_size):
            record = row.to_dict()  # Copy all other columns
            record[text_column] = text[start:start + chunk_size]
            records.append(record)

    return pd.DataFrame(records, columns=df.columns)

def get_text_embeddings(
    texts: List[str] = ["banana muffins? ", "banana bread? banana muffins?"],
    task: str = "RETRIEVAL_DOCUMENT",
    model_name: str = "textembedding-gecko@003",
) -> List[float]:
    """Embed `texts` and return the vector of the FIRST text only.

    Every text in `texts` is sent to the model, but only the first embedding
    is returned, so callers pass a single-element list.

    Args:
        texts: Texts to embed; must not be empty. The default list is never
            mutated here.
        task: Task type passed to `TextEmbeddingInput`.
        model_name: Name passed to `TextEmbeddingModel.from_pretrained`.

    Returns:
        The `values` of the first embedding returned by the model.

    Raises:
        ValueError: If `texts` is empty.
    """
    if not texts:
        # Fail before any model call instead of indexing an empty result.
        raise ValueError("texts must contain at least one string")
    model = TextEmbeddingModel.from_pretrained(model_name)
    inputs = [TextEmbeddingInput(text, task) for text in texts]
    embeddings = model.get_embeddings(inputs)
    return embeddings[0].values

def backup_metadata_in_pickle(extracted_text,
                              video_metadata,
                              audio_metadata,
                              index_db,
                              output_path_with_name):
  """Pickle the four metadata objects into "<output_path_with_name>.pkl".

  The file holds one dict with the keys "extracted_text", "video_metadata",
  "audio_metadata" and "index_db", in that order.

  Args:
    extracted_text: Object stored under "extracted_text".
    video_metadata: Object stored under "video_metadata".
    audio_metadata: Object stored under "audio_metadata".
    index_db: Object stored under "index_db".
    output_path_with_name: Destination path without the ".pkl" extension.
  """
  import pickle

  destination = output_path_with_name+".pkl"
  print("Backing up the metadata in: ",destination)

  payload = dict(
      extracted_text=extracted_text,
      video_metadata=video_metadata,
      audio_metadata=audio_metadata,
      index_db=index_db,
  )
  with open(f"{output_path_with_name}.pkl", "wb") as handle:
      pickle.dump(payload, handle)

# def load_backuped_metadata(backup_path_with_name):
#   import pickle
#   import pandas as pd

#   with open(f"{backup_path_with_name}.pkl", "rb") as f:
#       loaded_data = pickle.load(f)

#   extracted_text = loaded_data["extracted_text"]
#   video_metadata = loaded_data["video_metadata"]
#   audio_metadata = loaded_data["audio_metadata"]
#   index_db = loaded_data["index_db"]
#   return extracted_text, video_metadata, audio_metadata, index_db

def load_backuped_metadata(bucket):
    """Load the pickled metadata backup stored under "asset/" in `bucket`.

    Args:
        bucket: Object exposing `list_blobs(prefix=...)`; each blob must
            provide `name` and `open("rb")`.

    Returns:
        The tuple (extracted_text, video_metadata, audio_metadata, index_db)
        read from the pickled dict. If several ".pkl" blobs exist, the last
        one listed wins.

    Raises:
        FileNotFoundError: If no ".pkl" blob exists under "asset/".
    """
    loaded_data = None
    for blob in bucket.list_blobs(prefix="asset/"):
        if blob.name.lower().endswith('.pkl'):
            # SECURITY: unpickling can execute arbitrary code. Only point
            # this at a bucket whose contents you trust.
            with blob.open("rb") as f:
                loaded_data = pickle.load(f)

    if loaded_data is None:
        # Without this guard the lookups below failed with an unhelpful
        # UnboundLocalError.
        raise FileNotFoundError("No '.pkl' metadata backup found under 'asset/' in the bucket")

    extracted_text = loaded_data["extracted_text"]
    video_metadata = loaded_data["video_metadata"]
    audio_metadata = loaded_data["audio_metadata"]
    index_db = loaded_data["index_db"]

    return extracted_text, video_metadata, audio_metadata, index_db


## Step 1 - Text Extraction

### From all reports

![](https://storage.googleapis.com/gemini-lavi-asset/img/Step1.png)

In [None]:
%%time
extracted_text = get_text_from_pdf(bucket)

In [None]:
extracted_text.head()

In [None]:
print("Pages per file...",
      extracted_text['gcs_path'].value_counts())

In [None]:
print("total files: ....",
      len(extracted_text['gcs_path'].value_counts().index))

In [None]:
print("total pages: ....",
      sum(extracted_text['gcs_path'].value_counts().values))

In [None]:
rich_Markdown(extracted_text['text'][9])

### From audio files

![](https://storage.googleapis.com/gemini-lavi-asset/img/Step1-Audio.png)

In [None]:
%%time

audio_description_extraction_prompt = """Transcribe and analyze the audio, identifying key topic shifts or changes in focus. Divide the audio into segments based on these transitions.
For each segment:
* **Summarize:** Briefly describe the main topic or theme of the segment.
* **Contextualize:** Explain how this topic fits into the broader conversation or narrative.
* **Analyze:** Explore the significance of this topic, the perspectives presented, and any potential biases or underlying assumptions.
* **Synthesize:** Connect this topic to other themes or ideas mentioned in the audio, highlighting relationships and overarching patterns.
Conclude with a thematic analysis of the entire audio. Identify the most prominent themes, how they are interconnected, and the overall message or purpose of the audio.
"""

audio_metadata = get_text_from_audio(bucket, gemini_15_pro_new,
                                     audio_description_extraction_prompt,
                                    )

In [None]:
# %%time
# audio_metadata_flash = get_text_from_audio(bucket, gemini_15_flash,
#                                      audio_description_extraction_prompt,
#                                     )

In [None]:
audio_metadata.tail()

In [None]:
print("total files: ....",
      len(audio_metadata['audio_gcs'].value_counts().index))

In [None]:
rich_Markdown(audio_metadata['audio_description'][2])

### From video files

![](https://storage.googleapis.com/gemini-lavi-asset/img/Step1_Video.png)

In [None]:
%%time

video_description_extraction_prompt = """Transcribe and analyze the video, intelligently segmenting it based on shifts in topic, focus, or narrative progression.
For each identified segment:
**Concise Summary**: Distill the core theme or message in 1-2 sentences.
**Thematic Context**: How does this segment contribute to the overarching narrative or argument?
**Critical Analysis**: Delve into the segment's implications, perspectives presented, and potential biases.
**Connections**: Link this segment to other parts of the video, revealing patterns and relationships.

Conclude by synthesizing the video's main themes, their interconnections, and the overarching purpose or message.
"""

video_metadata = get_text_from_video(bucket,gemini_15_pro_new,
                                    video_description_extraction_prompt,
                                    )


In [None]:
%%time

video_metadata_flash = get_text_from_video(bucket,gemini_15_flash,
                                    video_description_extraction_prompt,
                                    )

In [None]:
video_metadata.head()

In [None]:
print("total files: ....",
      len(video_metadata['video_gcs'].value_counts().index))

In [None]:
rich_Markdown(video_metadata['video_description'][0])

## Step 2 - Text Chunking

<img src="https://storage.googleapis.com/gemini-lavi-asset/img/Step2-Chunking.png" width="500" />

In [None]:
# Step 2 - Why do we still chunk the text when the model has a long context window?
# 1) Token count: the full corpus is larger than what can be sent in one request.
# 2) Less noise during search: smaller chunks match a query more precisely.
# Latency and cost considerations: you could send everything, but would it make sense?
# Out of 5M tokens of "information", you would still want to make sure that the 1M you send are the most relevant 1M.
# chunk_size is measured in characters (split_text_into_chunks slices the string).

chunk_size =500
extracted_text_chunk_df = split_text_into_chunks(extracted_text, 'text', chunk_size)
video_metadata_chunk_df = split_text_into_chunks(video_metadata, 'video_description', chunk_size)
audio_metadata_chunk_df = split_text_into_chunks(audio_metadata, 'audio_description', chunk_size)

In [None]:
extracted_text_chunk_df.head()

In [None]:
rich_Markdown(
    extracted_text[
        (extracted_text["page_number"] == 3)
        & (
            extracted_text["gcs_path"]
            == "gs://gemini-lavi-asset/production/blogpost/Google Cloud TPU blog.pdf"
        )
    ]["text"].values[0]
)


In [None]:
rich_Markdown(extracted_text_chunk_df.iloc[2]['text'])

In [None]:
rich_Markdown(extracted_text_chunk_df.iloc[3]['text'])

In [None]:
rich_Markdown(extracted_text_chunk_df.iloc[4]['text'])

## Step 3 - Embedding



<img src="https://storage.googleapis.com/gemini-lavi-asset/img/Step2-Chunking_embedding.png" width="500" />

In [None]:
%%time
# Step 3
# Building Embeddings of the text

extracted_text_chunk_df['embeddings'] = extracted_text_chunk_df['text'].apply(lambda x: get_text_embeddings([x]))
video_metadata_chunk_df['embeddings'] = video_metadata_chunk_df['video_description'].apply(lambda x: get_text_embeddings([x]))
audio_metadata_chunk_df['embeddings'] = audio_metadata_chunk_df['audio_description'].apply(lambda x: get_text_embeddings([x]))

In [None]:
extracted_text_chunk_df.head()

In [None]:
video_metadata_chunk_df.head()

In [None]:
audio_metadata_chunk_df.head()

## Step 4 - Building Index

In [None]:
# Step 4 Building final metadata and index for the vector db
# Building index

extracted_text_chunk_df['index'] = extracted_text_chunk_df['page_number'].astype(str)+ "_"+extracted_text_chunk_df['text_type']+ "_" + extracted_text_chunk_df.index.astype(str)
video_metadata_chunk_df['index'] = video_metadata_chunk_df['video_type']+ "_" + video_metadata_chunk_df.index.astype(str)
audio_metadata_chunk_df['index'] = audio_metadata_chunk_df['audio_type']+ "_" + audio_metadata_chunk_df.index.astype(str)

#Adding source to identify type of file
print("Adding source type in the metadata......")
extracted_text_chunk_df['source'] = "text_based"
video_metadata_chunk_df['source'] = "video_based"
audio_metadata_chunk_df['source'] = "audio_based"

#Building index data
print("Building index data from the metadata......")
index_db = pd.concat([extracted_text_chunk_df[['index','source','embeddings']],
          video_metadata_chunk_df[['index','source','embeddings']],
          audio_metadata_chunk_df[['index','source','embeddings']]
                      ],
        axis=0).reset_index(drop=True)

In [None]:
index_db.head()

In [None]:
extracted_text_chunk_df.head()

In [None]:
video_metadata_chunk_df.head()

In [None]:
audio_metadata_chunk_df.head()

### Backing up the metadata

In [None]:
# backup_metadata_in_pickle(extracted_text,
#                           video_metadata,
#                           audio_metadata,
#                           index_db,
#                           output_path_with_name="/content/metadata"
#                           )

In [None]:
# download documents and images used in this notebook - will take ~30 sec
!gsutil -m -q rsync -r gs://gemini-lavi-asset/asset/ .
print("Download completed")

In [None]:
# extracted_text, video_metadata, audio_metadata, index_db = load_backuped_metadata("/content/metadata")
import pickle
extracted_text, video_metadata, audio_metadata, index_db = load_backuped_metadata(bucket)

In [None]:
extracted_text.head()

In [None]:
video_metadata.head()

In [None]:
audio_metadata.head()

In [None]:
index_db.head()

In [None]:
print(gemini_15_pro_new.count_tokens('-'.join(extracted_text['text'])))

In [None]:
print(gemini_15_pro_new.count_tokens('-'.join(video_metadata['video_description'])))

In [None]:
print(gemini_15_pro_new.count_tokens('-'.join(audio_metadata['audio_description'])))

## Retrieval & Generation

In [None]:
#@title Helper Functions

def get_cosine_score(
    dataframe: pd.DataFrame, column_name: str, input_text_embd: np.ndarray
) -> float:
    """
    Scores one stored embedding against the query embedding.

    The score is the dot product of the two vectors, rounded to two decimals.
    NOTE(review): this equals cosine similarity only when both vectors have
    unit length - confirm for the embedding model in use.

    Args:
        dataframe: Mapping-like record holding the stored embedding; in this
            notebook it is one row passed by `DataFrame.apply(..., axis=1)`.
        column_name: Key/column under which the stored embedding is found.
        input_text_embd: The query embedding (list or NumPy array).

    Returns:
        The rounded dot product, or 0 when the stored embedding is missing
        (None, a scalar, or empty).
    """
    stored = dataframe[column_name]
    if stored is None:
        return 0
    # Convert explicitly: truth-testing a multi-element NumPy array raises
    # ValueError, which is what a bare `if stored:` did for array embeddings.
    stored_vec = np.asarray(stored, dtype=float)
    if stored_vec.ndim == 0 or stored_vec.size == 0:
        return 0
    query_vec = np.asarray(input_text_embd, dtype=float)
    return round(float(np.dot(stored_vec, query_vec)), 2)

def get_timestamp_with_milliseconds():
    """Return the current UTC time as "YYYY-MM-DD-HH:MM:SS.mmm"."""
    utc_now = datetime.now(timezone.utc)
    # Integer division truncates microseconds to whole milliseconds.
    millis = utc_now.microsecond // 1000
    return f"{utc_now:%Y-%m-%d-%H:%M:%S}.{millis:03d}"

def get_pdf_from_matched_index(text_index, bucket=None):
  """Upload the matched PDF page as a one-page PDF and return prompt parts.

  Looks `text_index` up in the global `extracted_text` frame, copies the
  matching page of the source PDF into a new single-page PDF under "temp/"
  in the bucket, and returns parts that reference the new file.
  Assumes `extracted_text` has 'index', 'gcs_path' and 'page_number' columns
  and that 'page_number' is 1-based, as produced by `get_text_from_pdf`
  - TODO confirm for frames loaded from a backup.

  Args:
    text_index: Value to match against `extracted_text['index']`.
    bucket: Optional storage bucket to read from and write to. When omitted,
      the previously hard-coded project and bucket are used.

  Returns:
    `["filename: ", <gs:// URI>, <Part>]` for the uploaded page, or an empty
    list when the source PDF is not found in the bucket.
  """
  if bucket is None:
    # Defaults kept from the original hard-coded values.
    storage_client = storage.Client(project='kaggle-on-gcp')
    bucket = storage_client.bucket("gemini-lavi-asset")

  matched = extracted_text[extracted_text['index']==text_index]
  source_uri = matched['gcs_path'].values[0]
  page_number = int(matched['page_number'].values[0])
  # "gs://<bucket>/<object>" -> "<object>"
  object_name = source_uri.split("/", 3)[3]

  pdf_object_part_list = []
  # Listing by the exact object name avoids scanning every blob under
  # "production/" while still returning [] when the object is missing.
  for blob in bucket.list_blobs(prefix=object_name):
      if blob.name != object_name:
        continue
      pdf_content = BytesIO(blob.download_as_bytes())
      pdf_reader = PyPDF2.PdfReader(pdf_content)
      pdf_writer = PyPDF2.PdfWriter()
      # `pages` is 0-based while page_number is 1-based; indexing with
      # page_number directly picked the following page (and raised
      # IndexError on a document's last page).
      pdf_writer.add_page(pdf_reader.pages[page_number - 1])

      # Write the single-page PDF into an in-memory buffer
      output_buffer = BytesIO()
      pdf_writer.write(output_buffer)
      output_buffer.seek(0)  # Reset buffer position to the beginning

      # Upload the buffer content to GCS under a timestamped temp name
      new_blob_name = f"""temp/{blob.name.split("/")[-1]}_selected_pages_{page_number}_{get_timestamp_with_milliseconds()}.pdf"""
      new_blob = bucket.blob(new_blob_name)
      new_blob.upload_from_string(output_buffer.getvalue(), content_type="application/pdf")
      gcs_path = "gs://"+blob.bucket.name+"/"+new_blob_name
      pdf_object_part_list.extend(["filename: ",gcs_path, Part.from_uri(uri=gcs_path, mime_type="application/pdf")])

  return pdf_object_part_list

def get_gemini_content_list(query, vector_db, top_n_cosine_scores):
  """Assemble the prompt parts (instruction, question, context files).

  Args:
    query: The user question(s).
    vector_db: Frame with 'index' and 'source' columns; rows are selected
      positionally with `top_n_cosine_scores`.
    top_n_cosine_scores: Positions of the best-matching rows in `vector_db`.

  Returns:
    A list starting with the instruction text, "Questions:", `query` and
    "Contexual Files:", followed by "filename: ", URI and Part entries for
    each matched file. Video and audio files are attached once per URI.
  """
  instruction = """Task: Answer the following questions in detail, providing clear reasoning and evidence from the context files in bullet points.
  Instructions:

  1. **Analyze:** Carefully examine the provided images and text context.
  2. **Synthesize:** Integrate information from both the visual and textual elements.
  3. **Reason:**  Deduce logical connections and inferences to address the question.
  4. **Respond:** Provide a concise, accurate answer in the following format:

    * **Question:** [Question]
    * **Answer:** [Direct response to the question]
    * **Explanation:** [Bullet-point reasoning steps if applicable]
    * **Source** [name of the file, page, image from where the information is citied]

  5. **Ambiguity:** If the context is insufficient to answer, respond "Not enough context to answer."
  """
  gemini_content = [instruction, "Questions:", query, "Contexual Files:"]
  attached_uris = set()

  def _attach_media(metadata_frame, uri_column, mime, match_index):
    # Look up the file URI for this match and attach it once.
    uri = metadata_frame[metadata_frame['index']==match_index][uri_column].values[0]
    if uri in attached_uris:
      return
    attached_uris.add(uri)
    gemini_content.extend(["filename: ", uri, Part.from_uri(uri=uri, mime_type=mime)])

  for _, match in vector_db.iloc[top_n_cosine_scores].iterrows():
    source = match['source']
    if source == 'text_based':
      # PDF matches are uploaded as temporary single-page files.
      gemini_content.extend(get_pdf_from_matched_index(match['index']))
    elif source == 'audio_based':
      _attach_media(audio_metadata, 'audio_gcs', 'audio/mpeg', match['index'])
    elif source == 'video_based':
      _attach_media(video_metadata, 'video_gcs', 'video/mp4', match['index'])
    else:
      print ("Something has gone wrong......")

  return gemini_content

def get_gemini_response(model, generation_config=None,
                        safety_settings=None,
                        uri_path=None,mime_type=None, prompt=None):
  """Stream a response from `model` and return it as a single string.

  This definition replaces the earlier `get_gemini_response` in the
  notebook, so it honours `uri_path`/`mime_type` as well; ignoring them made
  any file-based call after this cell send the prompt without its file.

  Args:
    model: Object exposing `generate_content(contents, generation_config=...,
      safety_settings=..., stream=True)`.
    generation_config: Generation settings; when falsy, a default of
      8192 max output tokens, temperature 1 and top_p 0.95 is used.
    safety_settings: Safety thresholds; when falsy, all four harm categories
      are set to BLOCK_ONLY_HIGH.
    uri_path: Optional "<bucket>/<object>" path (without the "gs://" scheme)
      of a file to send along with the prompt.
    mime_type: MIME type of the file at `uri_path`.
    prompt: The prompt, either a string or a list of parts. It is sent as
      `[file, prompt]` when `uri_path` is given and unchanged otherwise.

  Returns:
    The concatenated text of all streamed chunks. A chunk whose `.text`
    raises ValueError contributes the literal string "blocked".
  """
  if not generation_config:
    generation_config = {
      "max_output_tokens": 8192,
      "temperature": 1,
      "top_p": 0.95,
    }

  if not safety_settings:
    safety_settings = {
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
    }

  if uri_path:
    # Attach the Cloud Storage file ahead of the prompt.
    file = Part.from_uri(uri="gs://"+uri_path, mime_type=mime_type)
    contents = [file, prompt]
  else:
    contents = prompt

  responses = model.generate_content(contents,
      generation_config=generation_config,
      safety_settings=safety_settings,
      stream=True,
  )
  final_response = []
  for response in responses:
    try:
      final_response.append(response.text)
    except ValueError:
      # `.text` raised, so this chunk has no usable text; keep a marker.
      final_response.append("blocked")

  return "".join(final_response)

def get_answer(query,vector_db, model, top_n=5):
  """Retrieve the `top_n` best-matching entries and ask `model` the query.

  Args:
    query: The user question(s).
    vector_db: Frame with 'index', 'source' and 'embeddings' columns.
    model: Model forwarded to `get_gemini_response`.
    top_n: Number of highest-scoring rows to use as context.

  Returns:
    A list `[response, gemini_content, citations]`: the model's text, the
    prompt parts that were sent, and a frame with the 'index', 'source' and
    'score' of each selected row.
  """
  # NOTE(review): get_text_embeddings defaults to task "RETRIEVAL_DOCUMENT";
  # a "RETRIEVAL_QUERY" task may suit queries better - confirm before changing.
  query_embedding = get_text_embeddings([query])

  # Score every row against the query embedding
  cosine_scores = vector_db.apply(
              lambda x: get_cosine_score(x, 'embeddings', query_embedding),
              axis=1,
          )

  # Re-key the scores by position so the selected keys are valid for `.iloc`
  # even when `vector_db` does not have a default 0..n-1 index.
  top_matches = cosine_scores.reset_index(drop=True).nlargest(top_n)
  top_n_positions = top_matches.index.tolist()

  # `assign` returns a new frame, avoiding assignment into a slice.
  citations = (
      vector_db.iloc[top_n_positions][['index','source']]
      .assign(score=top_matches.to_numpy())
  )

  gemini_content = get_gemini_content_list(query, vector_db, top_n_positions)

  response  =  get_gemini_response(model=model, prompt=gemini_content)
  return([response, gemini_content, citations])

In [None]:
query = """- What is the role of AI in accelerating the progress of UN's sustainable development goals
- What specific commitments and initiatives demonstrate Google's collaboration with the UN on AI for good?
"""

In [None]:
%%time
response, gemini_content, citation = get_answer(query, index_db, gemini_15_pro_new,top_n=5)
rich_Markdown(response)

In [None]:
%%time
response, gemini_content, citation = get_answer(query, index_db, gemini_15_flash,top_n=5)
rich_Markdown(response)

In [None]:
gemini_content

In [None]:
citation

In [None]:
%%time

# time.sleep(30)
query = "How does Gemini 1.5 long context works with video, images, text and code? Give detail examples that Google showed?"
response, gemini_content, citation = get_answer(query, index_db, gemini_15_pro_new,top_n=5)
rich_Markdown(response)

In [None]:
citation

In [None]:
%%time

# time.sleep(30)
query = "What are key achievement for Google Cloud in terms of training LLMs using their TPUs?"
response, gemini_content, citation = get_answer(query, index_db, gemini_15_pro_new,top_n=5)
rich_Markdown(response)

In [None]:
citation

In [None]:
%%time

# time.sleep(30)
query = "what is the emfu for bf16 and 128b parameter model with 1 tpu v5e pod? Cite the table and page number and explain the significance of the results"
response, gemini_content, citation = get_answer(query, index_db, gemini_15_pro_new,top_n=5)
rich_Markdown(response)

In [None]:
citation