### LLM Evaluation 

This notebook uses the GCP Gen AI evaluation service to evaluate content generated by a generative AI API in terms of:

<b>Built-in GCP evaluation service evaluation using user-provided metrics:</b><br>
- safety and sexual harm
- coherence and fluency
- verbosity and repetition<br>

<b>Mathematical metrics:</b><br>
- perplexity
- entropy<br>
    
<b>LLM as a judge using user-provided metrics:</b><br>
- multimodal content coverage


Define the requested metrics and their rating rubrics in the JSON file `PointWiseEvaluationMetrics.json`.

In [None]:
%pip install --upgrade --user --quiet google-cloud-aiplatform[evaluation]

In [20]:
#import libraries
import time
import random
from google.cloud import bigquery
import json
from datetime import datetime
import pandas as pd
from sklearn.utils import shuffle
from LLM_PointWiseEval_cls import PointWiseEvaluationClient


### Load Prediction Data
This cell loads the previously generated predictions. The sample data for this test are stored in BigQuery.

Replace the code to load your own data.

In [31]:
def get_predictions(table, dataset, project_id, filter_query="",
                    article_table="vlt_media_content_prelanding.vlt_article_content"):
    """Load generated video/image descriptions and their article image context from BigQuery.

    Rows whose ``asset_type`` contains 'video' or 'image' are read from
    ``{dataset}.{table}`` and left-joined to the captions/plain text of
    article images whose ``mediaId`` equals the asset id with everything
    from the first '.' removed. The rows are de-duplicated, and a
    ``time_lines`` column is added that lists every
    start/end offset pair found for the same ``asset_id``.

    Args:
        table: Name of the table holding the generated content.
        dataset: BigQuery dataset that contains ``table``.
        project_id: Project passed to ``bigquery.Client``.
        filter_query: Optional raw SQL appended to the WHERE clause
            (for example ``"AND brand_type = 'x'"``). It is interpolated
            verbatim into the query, so it must come from a trusted source.
        article_table: ``dataset.table`` holding the article bodies used to
            look up image captions and context. Defaults to the table the
            query previously hard-coded.

    Returns:
        A pandas DataFrame with the columns listed in ``output_columns``.
        If fetching or post-processing the result fails, the error is
        printed and an empty DataFrame is returned, so callers always
        receive a DataFrame.
    """
    # Columns that identify a distinct row before ``time_lines`` exists.
    dedup_columns = [
        'asset_id', 'headline', 'description',
        'startOffset_seconds', 'endOffset_seconds', 'fileUri', 'asset_type',
        'first_published_timestamp', 'brand_type', 'primary_category_name',
        'author_name', 'image_license_type', 'publisher_type', 'photographer',
        'date_published', 'dxcId', 'avgLogprobs', 'image_context',
        'image_caption', 'modelVersion', 'prompt_text',
    ]
    # Columns (and column order) of the returned frame.
    output_columns = [
        'asset_id', 'headline', 'description',
        'fileUri', 'asset_type',
        'first_published_timestamp', 'brand_type', 'primary_category_name',
        'author_name', 'image_license_type', 'publisher_type', 'photographer',
        'date_published', 'dxcId', 'time_lines', 'avgLogprobs',
        'image_context', 'image_caption', 'modelVersion',
        'startOffset_seconds', 'endOffset_seconds', 'prompt_text',
    ]

    # NOTE: the backslash in the REGEXP_REPLACE pattern is doubled because
    # this is a regular (non-raw) f-string; BigQuery still receives r'\..*'.
    sql = f"""
        WITH SEARCH_RESULT AS
         (SELECT
                        asset_id,
                        content,
                        headline,
                        html_safe_text,
                        description,
                        startOffset_seconds,
                        endOffset_seconds,
                        fileUri,
                        asset_type,
                        first_published_timestamp,
                        brand_type,
                        primary_category_name,
                        byline,
                        image_license_type,
                        publisher_type,
                        photographer,
                        date_published,
                        dxcId,
                        text_embedding_result ,
                        byline[SAFE_OFFSET(0)].author_name ,
                        modelVersion,
                        prompt_text,
                        CAST(JSON_EXTRACT_SCALAR(media_jsonbody, '$.response.candidates[0].avgLogprobs') AS FLOAT64) AS  avgLogprobs
                 FROM  `{dataset}.{table}` WHERE 1=1 and (LOWER(asset_type) LIKE '%video%' OR LOWER(asset_type) LIKE '%image%' ) {filter_query}
        ),
          IMAGE_CONTEXT AS (
                   SELECT
                          pd.asset_id,
                          plain_text_column,
                          JSON_EXTRACT_SCALAR(entry, '$.image.mediaId') AS image_id,
                          JSON_EXTRACT_SCALAR(entry, '$.image.caption') AS image_caption
                        FROM
                          (SELECT
                              asset_id,
                              plain_text_column,
                              JSON_EXTRACT_ARRAY(article_body_json) AS article_body_json_array
                            FROM
                              `{article_table}`
                            WHERE
                              article_body_json IS NOT NULL
                          ) pd,
                          UNNEST(pd.article_body_json_array) AS entry -- Unnest the article body JSON array
                        WHERE
                          UPPER(JSON_EXTRACT_SCALAR(entry, '$.type')) = 'IMAGE' -- Filter to only 'IMAGE' type
                          AND JSON_EXTRACT_SCALAR(entry, '$.image.mediaId') IS NOT NULL -- Ensure there's an image ID

          )

        SELECT sr.*,    plain_text_column as image_context ,  image_caption
        FROM SEARCH_RESULT   sr
        LEFT JOIN IMAGE_CONTEXT imgcnxt
        on REGEXP_REPLACE( sr.asset_id, r'\\..*', '') =imgcnxt.image_id
    """

    bq_client = bigquery.Client(project_id)

    # Run the query
    query_job = bq_client.query(sql)
    # Same type as the success path, so callers can always use DataFrame methods.
    output = pd.DataFrame()
    try:
        # Fetch results
        df = query_job.result().to_dataframe()

        df = df.drop_duplicates(subset=dedup_columns)
        print(len(df))
        # Sort by asset_id and startOffset_seconds to ensure proper order
        df = df.sort_values(by=['asset_id', 'startOffset_seconds'])

        # One "{start, end}" string per row. A plain comprehension (instead of
        # a row-wise DataFrame.apply) also works when the query returns no rows.
        df['time_lines'] = [
            f"{{'startOffset_seconds': {seg_start}, 'endOffset_seconds': {seg_end}}}"
            for seg_start, seg_end in zip(df['startOffset_seconds'], df['endOffset_seconds'])
        ]

        # Concatenate the per-row strings of every asset, in sorted order.
        # Descriptions are intentionally NOT aggregated across time-stamps.
        time_lines = df.groupby(['asset_id'])['time_lines'].agg(', '.join).reset_index()

        # Replace the per-row value with the per-asset concatenation.
        df = df.drop(columns='time_lines').merge(time_lines, on=['asset_id'], how='left')

        df = df.drop_duplicates(subset=output_columns)[output_columns]

        # Convert datetime to string using astype(str)
        df['date_published'] = df['date_published'].astype(str)
        df['first_published_timestamp'] = df['first_published_timestamp'].astype(str)

        output = df

    except Exception as e:
        # Best effort: report the problem and fall back to the empty frame.
        print('error'+str(e))
    return output


In [None]:
# BigQuery location of the sample predictions.
project_id = 'nine-quality-test'
dataset = "vlt_media_embeddings_integration"
content_table = "vlt_all_media_content_text_embeddings"

# Fetch the predictions (no extra filter) and renumber the rows from zero.
df = get_predictions(
    content_table,
    dataset,
    project_id,
    filter_query="",
).reset_index(drop=True)

### Pick Samples of Data
Due to the costs, we just execute the evaluations on samples of data. 

The required columns for evaluations:

- <b>AI-generated description:</b> if the final description should be composed of combinations of several columns, this should be done in advance as part of pre-processing and data preparation in previous step <br>
- <b>AI-generated Average Log Probabilities:</b> used for calculating perplexity; set to None if not available.<br>
- <b>LLM model version</b> <br>

Replace the number of samples if required.

In [26]:
# Display the column names of the loaded predictions.
df.columns

Index(['asset_id', 'headline', 'description', 'fileUri', 'asset_type',
       'first_published_timestamp', 'brand_type', 'primary_category_name',
       'author_name', 'image_license_type', 'publisher_type', 'photographer',
       'date_published', 'dxcId', 'time_lines', 'avgLogprobs', 'image_context',
       'image_caption', 'modelVersion', 'startOffset_seconds',
       'endOffset_seconds'],
      dtype='object')

In [27]:
# Number of random rows to evaluate.
n_sample = 1

# Columns of the predictions that are kept for the evaluation.
sample_columns = [
    'asset_id',
    'description',
    "avgLogprobs",
    "modelVersion",
    'fileUri',
    'asset_type',
    'startOffset_seconds',
    'endOffset_seconds',
]

# Shuffle the predictions, then draw the sample and keep only the needed columns.
df = shuffle(df)
items = df.sample(n_sample)[sample_columns]

# Names of the `items` columns used by the evaluation.
response_column_name = 'description'    # AI-generated description
response_avgLogprobs = 'avgLogprobs'    # AI-generated average log probabilities
response_modelVersion = 'modelVersion'  # LLM model version

### Load the user defined metrics

In [28]:
import json

experiment_name = "content-generation-qa-quality"
file_path = 'PointWiseEvaluationMetrics.json'

# Load the user-defined metrics and rating rubrics.
# The encoding is given explicitly: without it open() decodes with the
# platform locale, which can mis-read non-ASCII characters in the JSON file.
with open(file_path, 'r', encoding='utf-8') as file:
    eval_metrics = json.load(file)
 


In [9]:
# Settings for the pointwise evaluation of the sampled descriptions.
text_evaluation_settings = {
    "project": 'nine-quality-test',
    "location": 'us-central1',
    "items": items,
    "response_desc_column_name": response_column_name,
    "response_llm_model_column_name": response_modelVersion,
    "response_avgLogprobs_column_name": response_avgLogprobs,
    "eval_metrics": eval_metrics,
    "experiment_name": "pointwise-evaluation-experiment",
    "evaluation_prompt": "Evaluate the AI's contribution to a meaningful content generation. For rating and evaluationtion of the response on a 1-5 scale, use the given rubric criteria.",
    # To save costs, delete the evaluation experiment once the evaluation is finished.
    "delete_experiment": True,
    # Also calculate the mathematical metrics: entropy, perplexity.
    "sys_metrics": True,
}

pointwise_evaluation_client = PointWiseEvaluationClient(**text_evaluation_settings)
evaluations = pointwise_evaluation_client.get_evaluations()
 

Associating projects/494586852359/locations/us-central1/metadataStores/default/contexts/pointwise-evaluation-experiment-pointwise-evaluation-experiment-90cc7754-6700-4558-96f0-c16f2576c5a7 to Experiment: pointwise-evaluation-experiment


Computing metrics with a total of 3 Vertex Gen AI Evaluation Service API requests.


100%|██████████| 3/3 [00:05<00:00,  1.91s/it]

All 3 metric requests are successfully computed.
Evaluation Took:5.752329819020815 seconds





Experiment run pointwise-evaluation-experiment-90cc7754-6700-4558-96f0-c16f2576c5a7 skipped backing tensorboard run deletion.
To delete backing tensorboard run, execute the following:
tensorboard_run_artifact = aiplatform.metadata.artifact.Artifact(artifact_name=f"pointwise-evaluation-experiment-pointwise-evaluation-experiment-90cc7754-6700-4558-96f0-c16f2576c5a7-tb-run")
tensorboard_run_resource = aiplatform.TensorboardRun(tensorboard_run_artifact.metadata["resourceName"])
tensorboard_run_resource.delete()
tensorboard_run_artifact.delete()
Deleting Context : projects/494586852359/locations/us-central1/metadataStores/default/contexts/pointwise-evaluation-experiment-pointwise-evaluation-experiment-90cc7754-6700-4558-96f0-c16f2576c5a7
Context deleted. . Resource name: projects/494586852359/locations/us-central1/metadataStores/default/contexts/pointwise-evaluation-experiment-pointwise-evaluation-experiment-90cc7754-6700-4558-96f0-c16f2576c5a7
Deleting Context resource: projects/4945868523

In [10]:
# Preview up to the first 10 evaluation rows.
evaluations.head(10)

Unnamed: 0,response,evaluation_prompt,avgLogprobs,reference,response_llm_model,run_experiment_name,run_experiment_date,perplexity,entropy,safety_explanation,safety_score,coherence and fluency_explanation,coherence and fluency_score,verbosity_explanation,verbosity_score
0,"The image features a portrait of a woman, like...",Evaluate the AI's contribution to a meaningful...,-0.18972,{},gemini-1.5-pro-002@default,pointwise-evaluation-experiment-90cc7754-6700-...,2025-01-23,1.208911,7.703023,The AI response provides a thorough and compre...,4.0,The AI response demonstrates a comprehensive a...,5.0,The AI response demonstrates significant verbo...,2.0


### Prepare Data Sample for Multimodal Coverage Evaluation
The assumption is that the generated content is JSON containing the fields that the LLM was asked to extract from the content.<br>
Because we did not have such data in our environment, we create some sample data below.

# Sample User Prompt
This is basically the prompt text that will be used to generate the content for each video segment or image during batch/online content generation.
Here, we used this prompt to generate the content of a sample video from 600s to 900s using two different models 'gemini-1.5-pro-002', 'gemini-1.5-flash-002'. The generated content is recorded in json format in output_model1.txt and output_model2.txt files.

In [29]:
# Video segment (in seconds) that the sample content was generated for.
start=600
end=900

# JSON schema (kept as JSON text) that the generated description must follow.
schema="""{
    "description": "A structured schema to represent detailed information from a video or text analysis",
    "type": "object",
    "properties": {
        "Category": {
            "type": "string",
            "description": "The category or general type of the content"
        },
        "DetailedDescriptionOfEventsAndConversations": {
            "type": "string",
            "description": "A detailed textual description of the events and conversations in the content"
        },
        "BrandsCompanyNamesLogos": {
            "type": "array",
            "items": {
                "type": "string"
            },
            "description": "A list of brands, company names, or logos appearing or mentioned in the content"
        },
        "KeyLocationsAndScenes": {
            "type": "array",
            "items": {
                "type": "string"
            },
            "description": "A list of key locations and scenes appearing or mentioned in the content"
        },
        "KeyThemes": {
            "type": "array",
            "items": {
                "type": "string"
            },
            "description": "A list of key themes discussed or portrayed in the content"
        },
        "PeopleAppearingAndMentioned": {
            "type": "array",
            "items": {
                "type": "string"
            },
            "description": "A list of people who appear or are mentioned in the content"
        }
    },
    "required": [
        "Category",
        "DetailedDescriptionOfEventsAndConversations",
        "BrandsCompanyNamesLogos",
        "KeyLocationsAndScenes",
        "KeyThemes",
        "PeopleAppearingAndMentioned"
    ]
}"""


# The user prompt is assembled from the parts below, in this order.
VAR_VIDEO_SEGMENT=f"Your task is to provide a comprehensive description of this video from segment {start} seconds to {end} seconds.\n"
VAR_INSTRUCTIONS= """To complete the task you need to follow these steps:\n
                           No greetings, closing remarks, or additional comments. Begin immediately with the video analysis and provide only the requested information in the specified format.\n
                           Idenify all instances of visual product placement. Pay close attention to background details and items held by the characters. List each product placement with the following
                            information: Brand name, product name (if applicable), and a brief description. Include information about product placement into the description generated for the video\n
                           Create a transcript of all the speeches, dialogs, narration.\n
                           Scrupulously examine each scene for any and all visible brand names, logos, and products. Even if a product appears briefly or in the background, it should be included.\n"""

VAR_CONSTRAINTS= """Describe the video content objectively, avoiding any subjective opinions or assumptions.\n
                           Specify who is saying what. If a person talking can be seen, specify their name and/or occupation. If it is voice behind the scenes, then describe it as a narrator.\n
                           Be specific when describing. Include all the information that is shown or given.\n
                           Do not show timestamps.\n
                           If an unidentified person is shown in the video first, but then their name is mentioned later in the video, make sure to mention their name in the description from the start.\n
                           """

# `schema` is already JSON text, so it is embedded as-is. Passing it through
# json.dumps() would wrap it in quotes and escape every quote and newline,
# leaving an encoded string literal (not a schema) between the tags.
VAR_STRUCTURE= f"""Organize the description with the following properties, and give a valid json file with JSON schema.<JSONSchema>{schema}</JSONSchema>:
                       \n**Category**\n
                       \n**DetailedDescriptionOfEventsAndConversations**\n
                       \n**BrandsCompanyNamesLogos**\n
                       \n**KeyLocationsAndScenes**\n
                       \n**KeyThemes**\n
                       \n**PeopleAppearingAndMentioned**\n 
                 """

VAR_CONDITIONS = """Identify a video as one of these categories: News, TV Shows, Live Sport Events, News Analyses. \n
                       When describing the DetailedDescriptionOfEventsAndConversations, consider the following instructions for specific video types:\n
                       * **News:** Pay close attention to transitions, graphics, and on-screen text.\n
                       * **TV Shows:** Describe facial expressions, body language, appearances, and overall mood.\n
                       * **Live Sports Events:** Focus on key moments, like goals or fouls, and describe the overall flow and momentum of the game.\n
                       * **News Analyses:** Identify different perspectives, arguments, and supporting evidence.\n
                       Make sure to mention people's names in the DetailedDescriptionOfEventsAndConversations and in PeopleAppearingAndMentioned as well as any other information about them like their age, occupation, location, etc. \n"""

VAR_EXAMPLE = """Follow this example for the format of the output:\n
              {
                "Category": "TV Show",
                "DetailedDescriptionOfEventsAndConversations": "The video starts with a man sitting at a dining table, reading a letter. Two Fiji bottles are visible on the benchtop. He has short, light brown hair and a beard. His name is Harrison. The scene changes to Melissa. Melissa says: \"I'm Melissa, and I'm a hairdresser. I'm 41 years old, and I'm from Sydney.\"",
                "BrandsCompanyNamesLogos": ["Lacoste", "Fiji"],
                "KeyLocationsAndScenes": ["Apartment"],
                "KeyThemes": ["Marriage"],
                "PeopleAppearingAndMentioned": [
                "Harrison, 32, Builder, NSW",
                "Melissa, 41, Hairdresser, NSW"
                ]
            }
               """


# Full user prompt used to generate the content for the video segment.
video_description_prompt=VAR_VIDEO_SEGMENT+VAR_INSTRUCTIONS+VAR_CONSTRAINTS+VAR_STRUCTURE+VAR_CONDITIONS+VAR_EXAMPLE
 

In [12]:
# Load the pre-executed predictions and re-serialize each one as a JSON string.
# The encoding is given explicitly: without it open() decodes with the
# platform locale, which can mis-read non-ASCII characters in the files.
with open('output_model1.txt', 'r', encoding='utf-8') as file:
    response1 = json.dumps(json.load(file))
with open('output_model2.txt', 'r', encoding='utf-8') as file:
    response2 = json.dumps(json.load(file))

In [13]:
# Judge prompt used for content generated from a video segment.
video_evaluation_prompt="""
  # Instruction
  You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models. We will provide you with the user prompt,  and an AI-generated responses, video and the segment (in seconds) for which this response is generated.
  You should first read the user input carefully for analyzing the task, then look into video segment, and then evaluate the quality of the responses based on the Criteria provided in the Evaluation section below.
  You will assign the response a rating following the Rating Rubric and Evaluation Steps. Give step by step explanations for your rating, and only choose ratings from the Rating Rubric.

  # Evaluation
  ## Metric Definition
  You will be assessing coverage, which measures the ability to provide a detailed response based on a the given video segment and requested properties.
  

  ## Criteria
  Coverage: It is the quality of capturing all required detail for each requested property.
  In the context of video content capturing, it refers to the way that all the details of the following properties are captured and presented throughly:
  
  - Category 
  - Detailed Description Of Events And Conversations 
  - Brands,CompanyNames, and Logos 
  - KeyLocations And Scenes" 
  - Key Themes 
  - People Appearing And Mentioned 
  
  This AI-generated responses will be used for data retrieval. So, it has to be able to capture all the details of a scene.

  ## Rating Rubric
  5: (Perfectly Aligned) The details of the video segment is captured properly for all the requested properties thoroughly.
  4: (Highly Aligned) The descriptions captured for each property generally supports the details in the video segment and can be used for data retrieval.
  3: (Moderately Aligned) The descriptions captured for each property is captured well, but there might be minor inconsistencies or missed details, and the response is broadly relevant but not entirely specific.
  2: (Poorly Aligned)  The descriptions captured for each property has significant inconsistencies with the video segment, raising doubts about the validity and quality of text to be usable for data retrieval.
  1: (Misaligned) The descriptions captured for each property has major inconsistencies with the video segment and many information and details are missed casuing a very poor quality of text for data retrieval.

  ## Evaluation Steps
  STEP 1:  Assess User Instruction:  Carefully read the user input prompt to understand the user's request and requested information.
  STEP 2:  Analyze Video Segment: Examine the video segment for each requested property to to make sure all the requested information are captured in detail.
  STEP 3: Evaluate Accuracy:  For each requested property, Check if the generated response correctly identifies the information and details described in the video segment.
  STEP 4:  Identify Inconsistencies: for each requested property, look for any discrepancies between the video segment details and the captured information in the AI-generated responses. For example, if any information is missed, or not captured right.
  STEP 5:  Determine Overall Coverage: Based on the previous steps, assign a coverage score using the 1-5 rubric.  Consider the severity of any inconsistencies and their potential impact on the data retrieval.
  """


# Judge prompt used for content generated from an image.
# It used to be a verbatim copy of the video prompt, which told the judge to
# inspect a "video segment" even when the media was an image. It is now derived
# from the video prompt so both share one rubric, with the wording adapted.
image_evaluation_prompt = (
    video_evaluation_prompt
    .replace(
        "video and the segment (in seconds) for which this response is generated",
        "and the image for which this response is generated",
    )
    .replace("Video Segment", "Image")
    .replace("video segment", "image")
    .replace("video content", "image content")
)

# Prompts keyed by media kind, passed to the evaluation client.
multimodal_evaluation_promt={'video_prompt': video_evaluation_prompt,'image_prompt':image_evaluation_prompt}


# Generated contents and, in the same order, the models that produced them.
llm_models = ['gemini-1.5-pro-002', 'gemini-1.5-flash-002']
generated_response = [response1, response2]
n_responses = len(generated_response)

# One row per generated response. During batch generation each value should
# come from the batch data instead of the sample constants used here.
sample_columns = {
    # user prompt text used for the generation
    "prompt_text": video_description_prompt,
    # URI of the media file
    "fileUri": 'gs://raw_nine_files/vlt_video_extract/MAAT/MAAT2024_1_A_HBB.mp4',
    # generated content
    "description": generated_response,
    # asset_type / mime_type of the media
    "asset_type": 'video/mp4',
    "startOffset_seconds": [int(start)] * n_responses,
    "endOffset_seconds": [int(end)] * n_responses,
    "modelVersion": llm_models,
    "avgLogprobs": [-0.25] * n_responses,
}
items = pd.DataFrame(sample_columns)

In [14]:
 # Display the sample rows that will be evaluated.
 items

Unnamed: 0,prompt_text,fileUri,description,asset_type,startOffset_seconds,endOffset_seconds,modelVersion,avgLogprobs
0,Your task is to provide a comprehensive descri...,gs://raw_nine_files/vlt_video_extract/MAAT/MAA...,"{""Category"": ""TV Show"", ""DetailedDescriptionOf...",video/mp4,600,900,gemini-1.5-pro-002,-0.25
1,Your task is to provide a comprehensive descri...,gs://raw_nine_files/vlt_video_extract/MAAT/MAA...,"{""Category"": ""TV Show"", ""DetailedDescriptionOf...",video/mp4,600,900,gemini-1.5-flash-002,-0.25


In [18]:
# Names of the metadata columns in the `items` data frame.
media_column_metadata = {
    'fileUri': 'fileUri',
    'startOffset': 'startOffset_seconds',
    'endOffset': 'endOffset_seconds',
    'mediaType': 'asset_type',
}

pointwise_evaluation_client = PointWiseEvaluationClient(
    project='nine-quality-test',
    location='us-central1',
    items=items,
    response_desc_column_name=response_column_name,
    response_llm_model_column_name=response_modelVersion,
    response_avgLogprobs_column_name=response_avgLogprobs,
    eval_metrics=eval_metrics,
    experiment_name="pointwise-evaluation-experiment",
    evaluation_prompt="Evaluate the AI's contribution to a meaningful content generation. For rating and evaluationtion of the response on a 1-5 scale, use the given rubric criteria.",
    # To save costs, delete the evaluation experiment once the evaluation is finished.
    delete_experiment=True,
    # Also calculate the mathematical metrics: entropy, perplexity.
    sys_metrics=True,
    # Prompts used to evaluate content generated from videos and images.
    multimodal_evaluation_promt=multimodal_evaluation_promt,
    # Column of `items` holding the user prompt used when generating the content.
    response_userPrompt_column_name="prompt_text",
    response_media_column_metadata=media_column_metadata,
    response_mediaType_column_name='asset_type',
)
evaluations = pointwise_evaluation_client.get_evaluations()

Associating projects/494586852359/locations/us-central1/metadataStores/default/contexts/pointwise-evaluation-experiment-pointwise-evaluation-experiment-025769da-8935-48c3-a571-95951107d8ac to Experiment: pointwise-evaluation-experiment


Computing metrics with a total of 8 Vertex Gen AI Evaluation Service API requests.



  0%|          | 0/8 [00:00<?, ?it/s][A
 12%|█▎        | 1/8 [00:02<00:17,  2.44s/it][A
 25%|██▌       | 2/8 [00:05<00:15,  2.63s/it][A
 38%|███▊      | 3/8 [00:06<00:10,  2.11s/it][A
 50%|█████     | 4/8 [00:06<00:05,  1.33s/it][A
 62%|██████▎   | 5/8 [00:08<00:04,  1.52s/it][A
 75%|███████▌  | 6/8 [00:11<00:03,  1.97s/it][A
 88%|████████▊ | 7/8 [00:13<00:01,  1.85s/it][A
100%|██████████| 8/8 [00:13<00:00,  1.73s/it][A

All 8 metric requests are successfully computed.
Evaluation Took:13.823837678995915 seconds





Experiment run pointwise-evaluation-experiment-025769da-8935-48c3-a571-95951107d8ac skipped backing tensorboard run deletion.
To delete backing tensorboard run, execute the following:
tensorboard_run_artifact = aiplatform.metadata.artifact.Artifact(artifact_name=f"pointwise-evaluation-experiment-pointwise-evaluation-experiment-025769da-8935-48c3-a571-95951107d8ac-tb-run")
tensorboard_run_resource = aiplatform.TensorboardRun(tensorboard_run_artifact.metadata["resourceName"])
tensorboard_run_resource.delete()
tensorboard_run_artifact.delete()
Deleting Context : projects/494586852359/locations/us-central1/metadataStores/default/contexts/pointwise-evaluation-experiment-pointwise-evaluation-experiment-025769da-8935-48c3-a571-95951107d8ac
Context deleted. . Resource name: projects/494586852359/locations/us-central1/metadataStores/default/contexts/pointwise-evaluation-experiment-pointwise-evaluation-experiment-025769da-8935-48c3-a571-95951107d8ac
Deleting Context resource: projects/4945868523

In [19]:
# Display the evaluation results.
evaluations

Unnamed: 0,response,evaluation_prompt,mediaType,avgLogprobs,multimodal_evaluation_promt,instruction,reference,response_llm_model,run_experiment_name,run_experiment_date,perplexity,entropy,custom_coverage_score,custom_coverage_explanation,safety_explanation,safety_score,coherence and fluency_explanation,coherence and fluency_score,verbosity_explanation,verbosity_score
0,"{""Category"": ""TV Show"", ""DetailedDescriptionOf...",Evaluate the AI's contribution to a meaningful...,video/mp4,-0.25,\n # Instruction\n You are an expert evaluat...,Your task is to provide a comprehensive descri...,"{""fileuri"": ""gs://raw_nine_files/vlt_video_ext...",gemini-1.5-pro-002,pointwise-evaluation-experiment-025769da-8935-...,2025-01-23,1.284025,7.384875,1,"The response captures some details accurately,...",The AI-generated response is safe and appropri...,5.0,The AI response demonstrates good coherence an...,4.0,The response is verbose and contains redundant...,3.0
1,"{""Category"": ""TV Show"", ""DetailedDescriptionOf...",Evaluate the AI's contribution to a meaningful...,video/mp4,-0.25,\n # Instruction\n You are an expert evaluat...,Your task is to provide a comprehensive descri...,"{""fileuri"": ""gs://raw_nine_files/vlt_video_ext...",gemini-1.5-flash-002,pointwise-evaluation-experiment-025769da-8935-...,2025-01-23,1.284025,6.755835,2,The response shows some alignment with the vid...,The AI-generated content is safe and appropria...,5.0,The response demonstrates good coherence and f...,4.0,The AI response provides a concise and informa...,4.0
