In [59]:
import json

start=600
end=900
schema="""{
    "description": "A structured schema to represent detailed information from a video or text analysis",
    "type": "object",
    "properties": {
        "Category": {
            "type": "string",
            "description": "The category or general type of the content"
        },
        "DetailedDescriptionOfEventsAndConversations": {
            "type": "string",
            "description": "A detailed textual description of the events and conversations in the content"
        },
        "BrandsCompanyNamesLogos": {
            "type": "array",
            "items": {
                "type": "string"
            },
            "description": "A list of brands, company names, or logos appearing or mentioned in the content"
        },
        "KeyLocationsAndScenes": {
            "type": "array",
            "items": {
                "type": "string"
            },
            "description": "A list of key locations and scenes appearing or mentioned in the content"
        },
        "KeyThemes": {
            "type": "array",
            "items": {
                "type": "string"
            },
            "description": "A list of key themes discussed or portrayed in the content"
        },
        "PeopleAppearingAndMentioned": {
            "type": "array",
            "items": {
                "type": "string"
            },
            "description": "A list of people who appear or are mentioned in the content"
        }
    },
    "required": [
        "Category",
        "DetailedDescriptionOfEventsAndConversations",
        "BrandsCompanyNamesLogos",
        "KeyLocationsAndScenes",
        "KeyThemes",
        "PeopleAppearingAndMentioned"
    ]
}"""


VAR_VIDEO_SEGMENT=f"Your task is to provide a comprehensive description of this video from segment {start} seconds to {end} seconds.\n"
VAR_INSTRUCTIONS= """To complete the task you need to follow these steps:\n
                           No greetings, closing remarks, or additional comments. Begin immediately with the video analysis and provide only the requested information in the specified format.\n
                           Idenify all instances of visual product placement. Pay close attention to background details and items held by the characters. List each product placement with the following
                            information: Brand name, product name (if applicable), and a brief description. Include information about product placement into the description generated for the video\n
                           Create a transcript of all the speeches, dialogs, narration.\n
                           Scrupulously examine each scene for any and all visible brand names, logos, and products. Even if a product appears briefly or in the background, it should be included.\n"""

VAR_CONSTRAINTS= """Describe the video content objectively, avoiding any subjective opinions or assumptions.\n
                           Specify who is saying what. If a person talking can be seen, specify their name and/or occupation. If it is voice behind the scenes, then describe it as a narrator.\n
                           Be specific when describing. Include all the information that is shown or given.\n
                           Do not show timestamps.\n
                           If an unidentified person is shown in the video first, but then their name is mentioned later in the video, make sure to mention their name in the description from the start.\n
                           """

VAR_STRUCTURE= f"""Organize the description with the following properties, and give a valid json file with JSON schema.<JSONSchema>{json.dumps(schema)}</JSONSchema>:
                       \n**Category**\n
                       \n**DetailedDescriptionOfEventsAndConversations**\n
                       \n**BrandsCompanyNamesLogos**\n
                       \n**KeyLocationsAndScenes**\n
                       \n**KeyThemes**\n
                       \n**PeopleAppearingAndMentioned**\n 
                 """ 

VAR_CONDITIONS = """Identify a video as one of these categories: News, TV Shows, Live Sport Events, News Analyses. \n
                       When describing the DetailedDescriptionOfEventsAndConversations, consider the following instructions for specific video types:\n
                       * **News:** Pay close attention to transitions, graphics, and on-screen text.\n
                       * **TV Shows:** Describe facial expressions, body language, appearances, and overall mood.\n
                       * **Live Sports Events:** Focus on key moments, like goals or fouls, and describe the overall flow and momentum of the game.\n
                       * **News Analyses:** Identify different perspectives, arguments, and supporting evidence.\n
                       Make sure to mention people's names in the DetailedDescriptionOfEventsAndConversations and in PeopleAppearingAndMentioned as well as any other information about them like their age, occupation, location, etc. \n"""

VAR_EXAMPLE = """Follow this example for the format of the output:\n
              {
                "Category": "TV Show",
                "DetailedDescriptionOfEventsAndConversations": "The video starts with a man sitting at a dining table, reading a letter. Two Fiji bottles are visible on the benchtop. He has short, light brown hair and a beard. His name is Harrison. The scene changes to Melissa. Melissa says: \"I'm Melissa, and I'm a hairdresser. I'm 41 years old, and I'm from Sydney.\"",
                "BrandsCompanyNamesLogos": ["Lacoste", "Fiji"],
                "KeyLocationsAndScenes": ["Apartment"],
                "KeyThemes": ["Marriage"],
                "PeopleAppearingAndMentioned": [
                "Harrison, 32, Builder, NSW",
                "Melissa, 41, Hairdresser, NSW"
                ]
            }
               """
  
    
video_description_prompt=VAR_VIDEO_SEGMENT+VAR_INSTRUCTIONS+VAR_CONSTRAINTS+VAR_STRUCTURE+VAR_CONDITIONS+VAR_EXAMPLE
 

In [60]:
import base64
import time
import typing
import math
import numpy as np

from google.cloud import aiplatform
from google.protobuf import struct_pb2

#libraries to generate image summaries
from vertexai.vision_models import Video
from vertexai.vision_models import VideoSegmentConfig
from vertexai.vision_models import MultiModalEmbeddingModel
from vertexai.language_models import TextEmbeddingModel
from vertexai.vision_models import Image as vision_model_Image
from vertexai.preview.generative_models import (
    Content,
    GenerationConfig,
    GenerationResponse,
    GenerativeModel,
    Image,
    Part as GenerativeModelPart,
    HarmBlockThreshold,
    HarmCategory,
)
from typing import Any, Dict, List, Literal, Optional, Union

from moviepy.editor import VideoFileClip

contents= [{
                                                                    "role": "user",
                                                                    "parts": [
                                                                        {
                                                                    
                                                                        
                                                                        "file_data": {
                                                                            "mime_type":  "video/mp4",
                                                                            "file_uri": "gs://raw_nine_files/vlt_video_extract/MAAT/MAAT2024_1_A_HBB.mp4"
                                                                        } 
                                                                         ,
                                                                        "video_metadata": {
                                                                                        "start_offset": {
                                                                                        "seconds": start,
                                                                                        "nanos": 0
                                                                                        },
                                                                                        "end_offset": {
                                                                                        "seconds": end,
                                                                                        "nanos": 0
                                                                                        }
                                                                                }

                                                                        },
                                                                        { "text": video_description_prompt }
                                                                    ]
                                                                    }
                                                                ]



In [None]:
generative_multimodal_model= GenerativeModel("gemini-1.5-pro-002")
generation_config=GenerationConfig(temperature=1, top_k=40,top_p=0.95,max_output_tokens=8192)#, response_mime_type='application/json',
 
safety_settings=  {
                    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
                    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
                    }

model_response = generative_multimodal_model.generate_content(
                                    contents ,safety_settings=safety_settings,generation_config=generation_config,
                                   )   

In [4]:
model_response.candidates[0].content.parts[0].text

'```json\n{\n  "Category": "TV Show",\n  "DetailedDescriptionOfEventsAndConversations": "The video begins with Jayden, a 26-year-old professional kickboxer from Queensland, arriving at a gathering. He greets Tim, Richard, and John. Richard introduces himself to Jayden. Richard asks Jayden what he does for a living. Jayden says that kickboxing is his full-time job, training twice a day, six days a week. Jayden explains that he\'s not what he appears to be, enjoying snuggles and back tickles when he\'s not fighting. He adds that he\'s a romantic and doesn\'t have a specific type, looking for someone with a good heart. Lucinda, a 43-year-old MC and wedding celebrant from New South Wales, arrives at the same gathering, greeting the other women: Tori and Natasha. Lucinda says that she\'s looking for a confident man, a great cook who is funny and generous of spirit. She unfolds a scroll, revealing her detailed list of desired qualities in a partner, including “handsome” and “spunky,” prompti

In [5]:
print(model_response.candidates[0].content.parts[0].text)

```json
{
  "Category": "TV Show",
  "DetailedDescriptionOfEventsAndConversations": "The video begins with Jayden, a 26-year-old professional kickboxer from Queensland, arriving at a gathering. He greets Tim, Richard, and John. Richard introduces himself to Jayden. Richard asks Jayden what he does for a living. Jayden says that kickboxing is his full-time job, training twice a day, six days a week. Jayden explains that he's not what he appears to be, enjoying snuggles and back tickles when he's not fighting. He adds that he's a romantic and doesn't have a specific type, looking for someone with a good heart. Lucinda, a 43-year-old MC and wedding celebrant from New South Wales, arrives at the same gathering, greeting the other women: Tori and Natasha. Lucinda says that she's looking for a confident man, a great cook who is funny and generous of spirit. She unfolds a scroll, revealing her detailed list of desired qualities in a partner, including “handsome” and “spunky,” prompting laught

In [6]:
import json

# Provided JSON string
json_string =  model_response.candidates[0].content.parts[0].text.replace('```json','').replace('```','')
# Convert JSON string to a Python dictionary
parsed_json = json.loads(json_string, strict=False)

parsed_json
 

{'Category': 'TV Show',
 'DetailedDescriptionOfEventsAndConversations': "The video begins with Jayden, a 26-year-old professional kickboxer from Queensland, arriving at a gathering. He greets Tim, Richard, and John. Richard introduces himself to Jayden. Richard asks Jayden what he does for a living. Jayden says that kickboxing is his full-time job, training twice a day, six days a week. Jayden explains that he's not what he appears to be, enjoying snuggles and back tickles when he's not fighting. He adds that he's a romantic and doesn't have a specific type, looking for someone with a good heart. Lucinda, a 43-year-old MC and wedding celebrant from New South Wales, arrives at the same gathering, greeting the other women: Tori and Natasha. Lucinda says that she's looking for a confident man, a great cook who is funny and generous of spirit. She unfolds a scroll, revealing her detailed list of desired qualities in a partner, including “handsome” and “spunky,” prompting laughter from her 

In [7]:
# Define the text you want to write to the file
text_to_export =json_string

# Specify the file name
file_name = "output_model1.txt"

# Write the text to the file
with open(file_name, "w") as file:
    file.write(text_to_export)

print(f"The text has been successfully exported to {file_name}")

The text has been successfully exported to output_model1.txt


In [8]:
# Define the text you want to write to the file
text_to_export =model_response.candidates[0].content.parts[0].text

# Specify the file name
file_name = "output_model_original_text1.txt"

# Write the text to the file
with open(file_name, "w") as file:
    file.write(text_to_export)

print(f"The text has been successfully exported to {file_name}")

The text has been successfully exported to output_model_original_text1.txt


### Lets test with another model


In [61]:
#lets change the model

generative_multimodal_model= GenerativeModel("gemini-1.5-flash-002")
generation_config=GenerationConfig(temperature=1, top_k=40,top_p=0.95,max_output_tokens=8192 , response_mime_type='application/json')
 
safety_settings=  {
                    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
                    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
                    }

model_response = generative_multimodal_model.generate_content(
                                    contents ,safety_settings=safety_settings,generation_config=generation_config
                                   )   



 

In [62]:
# Provided JSON string
json_string =  model_response.candidates[0].content.parts[0].text.replace('```json','').replace('```','')
# Convert JSON string to a Python dictionary
parsed_json = json.loads(json_string, strict=False)

parsed_json

{'Category': 'TV Show',
 'DetailedDescriptionOfEventsAndConversations': "The video shows a group of men sitting on a couch in a modern, well-lit room with a city view at night. Jayden, a 26-year-old professional kickboxer from Queensland, enters and introduces himself. Richard comments on Jayden's striking character. Jayden shares that his full-time job is kickboxing, which surprises the others. The scene shifts to Jayden sitting in a boxing gym, talking about his work and softer side. He mentions wanting to have knockouts in front of a crowd. Back at the gathering, the men continue their conversation. Lucinda, a 43-year-old MC and wedding celebrant from New South Wales, arrives in a pink sparkly jacket and matching dress. She greets the other women present. Lucinda shares her thoughts on her ideal partner. The scene transitions to Lucinda sitting on a couch and reading a scroll with notes about the men. She laughs after reading the notes.",
 'BrandsCompanyNamesLogos': [],
 'KeyLocatio

In [64]:
# Provided JSON string
json_string =  model_response.candidates[0].content.parts[0].text.replace('```json','').replace('```','')
# Convert JSON string to a Python dictionary
parsed_json = json.loads(json_string, strict=False)

parsed_json

{'Category': 'TV Show',
 'DetailedDescriptionOfEventsAndConversations': "The video shows a group of men sitting on a couch in a modern, well-lit room with a city view at night. Jayden, a 26-year-old professional kickboxer from Queensland, enters and introduces himself. Richard comments on Jayden's striking character. Jayden shares that his full-time job is kickboxing, which surprises the others. The scene shifts to Jayden sitting in a boxing gym, talking about his work and softer side. He mentions wanting to have knockouts in front of a crowd. Back at the gathering, the men continue their conversation. Lucinda, a 43-year-old MC and wedding celebrant from New South Wales, arrives in a pink sparkly jacket and matching dress. She greets the other women present. Lucinda shares her thoughts on her ideal partner. The scene transitions to Lucinda sitting on a couch and reading a scroll with notes about the men. She laughs after reading the notes.",
 'BrandsCompanyNamesLogos': [],
 'KeyLocatio

In [65]:
# Define the text you want to write to the file
text_to_export =json_string

# Specify the file name
file_name = "output_model2.txt"

# Write the text to the file
with open(file_name, "w") as file:
    file.write(text_to_export)

print(f"The text has been successfully exported to {file_name}")

The text has been successfully exported to output_model2.txt


In [66]:
# Define the text you want to write to the file
text_to_export =model_response.candidates[0].content.parts[0].text

# Specify the file name
file_name = "output_model_original_text2.txt"

# Write the text to the file
with open(file_name, "w") as file:
    file.write(text_to_export)

print(f"The text has been successfully exported to {file_name}")

The text has been successfully exported to output_model_original_text2.txt


### compare the result of the two model

In [77]:

with open('output_model1.txt', 'r') as file:
    response1 = json.dumps(json.load(file))
with open('output_model2.txt', 'r') as file:
    response2 = json.dumps(json.load(file))


In [78]:
def get_autorater_response(metric_prompt: list) -> dict:
    metric_response_schema = {
        "type": "OBJECT",
        "properties": {
            "score": {"type": "NUMBER"},
            "explanation": {"type": "STRING"},
        },
        "required": ["score", "explanation"],
    }

    autorater = GenerativeModel(
        "gemini-1.5-pro",
        generation_config=GenerationConfig(
            response_mime_type="application/json",
            response_schema=metric_response_schema,
        ),
        safety_settings={
            HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
        },
    )

    response = autorater.generate_content(metric_prompt)

    response_json = {}

    if response.candidates and len(response.candidates) > 0:
        candidate = response.candidates[0]
        if (
            candidate.content
            and candidate.content.parts
            and len(candidate.content.parts) > 0
        ):
            part = candidate.content.parts[0]
            if part.text:
                response_json = json.loads(part.text)

    return response_json

def custom_coverage_fn(instance):

    video_uri = instance["reference"]["video_uri"]
    video_metadata = instance["reference"]["video_metadata"]
    response = instance["response"]

    eval_instruction_template = """
  # Instruction
  You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models. We will provide you with the user prompt,  and an AI-generated responses, video and the segment (in seconds) for which this response is generated.
  You should first read the user input carefully for analyzing the task, then look into video segment, and then evaluate the quality of the responses based on the Criteria provided in the Evaluation section below.
  You will assign the response a rating following the Rating Rubric and Evaluation Steps. Give step by step explanations for your rating, and only choose ratings from the Rating Rubric.

  # Evaluation
  ## Metric Definition
  You will be assessing coverage, which measures the ability to provide a detailed response based on a the given video segment and requested properties.
  

  ## Criteria
  Coverage: It is the quality of capturing all required detail for each requested property.
  In the context of video content capturing, it refers to the way that all the details of the following properties are captured and presented throughly:
  
  - Category 
  - Detailed Description Of Events And Conversations 
  - Brands,CompanyNames, and Logos 
  - KeyLocations And Scenes" 
  - Key Themes 
  - People Appearing And Mentioned 
  
  This AI-generated responses will be used for data retrieval. So, it has to be able to capture all the details of a scene.

  ## Rating Rubric
  5: (Perfectly Aligned) The details of the video segment is captured properly for all the requested properties thoroughly.
  4: (Highly Aligned) The descriptions captured for each property generally supports the details in the video segment and can be used for data retrieval.
  3: (Moderately Aligned) The descriptions captured for each property is captured well, but there might be minor inconsistencies or missed details, and the response is broadly relevant but not entirely specific.
  2: (Poorly Aligned)  The descriptions captured for each property has significant inconsistencies with the video segment, raising doubts about the validity and quality of text to be usable for data retrieval.
  1: (Misaligned) The descriptions captured for each property has major inconsistencies with the video segment and many information and details are missed casuing a very poor quality of text for data retrieval.

  ## Evaluation Steps
  STEP 1:  Assess User Instruction:  Carefully read the user input prompt to understand the user's request and requested information.
  STEP 2:  Analyze Video Segment: Examine the video segment for each requested property to to make sure all the requested information are captured in detail.
  STEP 3: Evaluate Accuracy:  For each requested property, Check if the generated response correctly identifies the information and details described in the video segment.
  STEP 4:  Identify Inconsistencies: for each requested property, look for any discrepancies between the video segment details and the captured information in the AI-generated responses. For example, if any information is missed, or not captured right.
  STEP 5:  Determine Overall Coverage: Based on the previous steps, assign a coverage score using the 1-5 rubric.  Consider the severity of any inconsistencies and their potential impact on the data retrieval.
  """
 
    # generate the eval
    evaluation_prompt = [
        eval_instruction_template,       
        "VIDEO URI: ",
        video_uri,
        "VIDEO METADATA: ",
        video_metadata,
        "GENERATED RESPONSE: ",
        response,
    ]

   
    
    evaluation_response = get_autorater_response(evaluation_prompt)
    return {
       "custom_coverage":  evaluation_response.get("score", ""),
         "explanation": evaluation_response.get("explanation", "") 
    }
       

In [79]:
import pandas as pd
context = [
    {
        "video_uri": "gs://raw_nine_files/vlt_video_extract/MAAT/MAAT2024_1_A_HBB.mp4",
        "video_metadata":json.dumps({"start_offset": {"seconds": start, "nanos": 0}, "end_offset": {"seconds": end, "nanos": 0}}, indent=2)
    }
]

generated_response = [
    response1,
    response2
]

eval_dataset = pd.DataFrame(
    {
        "instruction": video_description_prompt,
        "context":[None]*len(generated_response) ,
        "response": generated_response,
        "reference":context*len(generated_response) ,
    
    }
)

In [80]:
eval_dataset

Unnamed: 0,instruction,context,response,reference
0,Your task is to provide a comprehensive descri...,,"{""Category"": ""TV Show"", ""DetailedDescriptionOf...",{'video_uri': 'gs://raw_nine_files/vlt_video_e...
1,Your task is to provide a comprehensive descri...,,"{""Category"": ""TV Show"", ""DetailedDescriptionOf...",{'video_uri': 'gs://raw_nine_files/vlt_video_e...


In [81]:
from vertexai.evaluation import CustomMetric, EvalTask

custom_coverage_metric = CustomMetric(
    name="custom_coverage",
    metric_function=custom_coverage_fn,
)

In [82]:
metrics = [custom_coverage_metric]

experiment_name = "eval-multimodal-metric"

eval_task = EvalTask(
    dataset=eval_dataset,
    metrics=metrics,
    experiment=experiment_name,
)

In [83]:
eval_result = eval_task.evaluate()


Associating projects/494586852359/locations/us-central1/metadataStores/default/contexts/eval-multimodal-metric-08e02d44-b9cc-40f8-8a73-af1fbe869fa8 to Experiment: eval-multimodal-metric


Computing metrics with a total of 2 Vertex Gen AI Evaluation Service API requests.


100%|██████████| 2/2 [00:07<00:00,  3.67s/it]

All 2 metric requests are successfully computed.
Evaluation Took:7.35588538297452 seconds





In [84]:
# @title
import json
import logging
import warnings

import pandas as pd
from IPython.display import HTML, Markdown, display

logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

# pd.set_option('display.max_colwidth', None)



def display_eval_result(
    eval_result: dict | object,
    title: str | None = None,
    metrics: list[str] | None = None,
) -> None:
    """Display the evaluation results."""
    summary_metrics, metrics_table = (
        eval_result.summary_metrics,
        eval_result.metrics_table,
    )

    metrics_df = pd.DataFrame.from_dict(summary_metrics, orient="index").T
    if metrics:
        metrics_df = metrics_df.filter(
            [
                metric
                for metric in metrics_df.columns
                if any(selected_metric in metric for selected_metric in metrics)
            ]
        )
        metrics_table = metrics_table.filter(
            [
                metric
                for metric in metrics_table.columns
                if any(selected_metric in metric for selected_metric in metrics)
            ]
        )

    if title:
        # Display the title with Markdown for emphasis
        display(Markdown(f"## {title}"))
    # Display the summary metrics DataFrame
    display(Markdown("### Summary Metrics"))
    display(metrics_df)
    # Display the metrics table DataFrame
    display(Markdown("### Row-based Metrics"))
    display(metrics_table)


def display_explanations(
    eval_result: dict | object, metrics: list[str] | None = None, n: int = 1
) -> None:
    """Display the explanations."""
    style = "white-space: pre-wrap; width: 1500px; overflow-x: auto;"
    metrics_table = eval_result.metrics_table
    df = metrics_table.sample(n=n)

    if metrics:
        df = df.filter(
            ["response", "baseline_model_response"]
            + [
                metric
                for metric in df.columns
                if any(selected_metric in metric for selected_metric in metrics)
            ]
        )
    for index, row in df.iterrows():
        for col in df.columns:
            display(HTML(f"{col}:{row[col]}"))
        display(HTML(""))
     
    

display_eval_result(eval_result, title="Evaluation Results")
display_explanations(eval_result, metrics=["custom_coverage"])


## Evaluation Results

### Summary Metrics

Unnamed: 0,row_count,custom_coverage/mean,custom_coverage/std
0,2.0,2.0,0.0


### Row-based Metrics

Unnamed: 0,instruction,context,response,reference,custom_coverage/score,custom_coverage/explanation
0,Your task is to provide a comprehensive descri...,,"{""Category"": ""TV Show"", ""DetailedDescriptionOf...",{'video_uri': 'gs://raw_nine_files/vlt_video_e...,2.0,The response captures some accurate details ab...
1,Your task is to provide a comprehensive descri...,,"{""Category"": ""TV Show"", ""DetailedDescriptionOf...",{'video_uri': 'gs://raw_nine_files/vlt_video_e...,2.0,The response captures some accurate details ab...


In [85]:
eval_result.metrics_table.to_csv('test20250117.csv')

In [86]:
eval_result.metrics_table['custom_coverage/explanation'].to_list()

['The response captures some accurate details about events and conversations, but it also includes information not present within the provided video segment. For example, the response mentions Jayden\'s training regimen, Lucinda\'s desire for a "high-functioning erection," and Tori\'s preference for a nerdy guy, none of which are depicted or discussed within the given timeframe. Additionally, some key details from the video are missing, such as the conversation about Sara wanting tattoos and how many people each of them are meeting.  The location identification is inaccurate, as the Sydney skyline is not visible in the video segment.  Also, there is no mention of Fiji water in the video. Lastly, while it accurately lists some individuals present, it omits several others who appear on screen. This mix of fabricated details and omission of existing information results in a poorly aligned response.',
 "The response captures some accurate details about the video segment, such as the introd

In [87]:
eval_result.metrics_table['response'].to_list()

['{"Category": "TV Show", "DetailedDescriptionOfEventsAndConversations": "The video begins with Jayden, a 26-year-old professional kickboxer from Queensland, arriving at a gathering. He greets Tim, Richard, and John. Richard introduces himself to Jayden. Richard asks Jayden what he does for a living. Jayden says that kickboxing is his full-time job, training twice a day, six days a week. Jayden explains that he\'s not what he appears to be, enjoying snuggles and back tickles when he\'s not fighting. He adds that he\'s a romantic and doesn\'t have a specific type, looking for someone with a good heart. Lucinda, a 43-year-old MC and wedding celebrant from New South Wales, arrives at the same gathering, greeting the other women: Tori and Natasha. Lucinda says that she\'s looking for a confident man, a great cook who is funny and generous of spirit. She unfolds a scroll, revealing her detailed list of desired qualities in a partner, including \\u201chandsome\\u201d and \\u201cspunky,\\u201