### LLM Evaluation 

This code uses gcp evaluation service to evaluate the generated content by a generative AI API in terms of 

<b> built-in gcp evaluation service evaluation using user provided metrics:<br>
- safety and sextural harmness
- coherence and fluency
- verbosity and repeatation<br>

<b> mathematical metrics:<br>
- perplexity
- entropy<br>
    
<b> llm as a judge using user provided metrics:<br>
- multimodal content coverage


Use PointWiseEvaluationMetrics.json as a json file for the requested metrics and rating rubric

In [None]:
%pip install --upgrade --user --quiet google-cloud-aiplatform[evaluation]

In [1]:
#import libraries
import time
import random
from google.cloud import bigquery
import json
from datetime import datetime
import pandas as pd
from sklearn.utils import shuffle
#from LLM_PointWiseEval_cls import PointWiseEvaluationClient

### Load Prediction Data
This block of code gets the executed predictions. The sample data for this test are stored in bigquery.

Replace the code to load your own data.

In [2]:
def get_predictions(table, dataset,project_id,filter_query=""):
    """Query nearest neighbors using cosine similarity in BigQuery for text embeddings."""
  
    sql = f"""  
        WITH SEARCH_RESULT AS
         (SELECT 

                        asset_id, 
                        content,
                        headline,
                        html_safe_text,
                        description,
                        startOffset_seconds,
                        endOffset_seconds,
                        fileUri,
                        asset_type,
                        first_published_timestamp,
                        brand_type,
                        primary_category_name,
                        byline,
                        image_license_type,
                        publisher_type,
                        photographer,
                        date_published,
                        dxcId,
                        text_embedding_result ,
                        byline[SAFE_OFFSET(0)].author_name ,  
                        modelVersion,
                        CAST(JSON_EXTRACT_SCALAR(media_jsonbody, '$.response.candidates[0].avgLogprobs') AS FLOAT64) AS  avgLogprobs
                 FROM  `{dataset}.{table}` WHERE 1=1 and (LOWER(asset_type) LIKE '%video%' OR LOWER(asset_type) LIKE '%image%' ) {filter_query} 
        ),
          IMAGE_CONTEXT AS (
                   SELECT
                          pd.asset_id,
                          plain_text_column,
                          JSON_EXTRACT_SCALAR(entry, '$.image.mediaId') AS image_id,
                          JSON_EXTRACT_SCALAR(entry, '$.image.caption') AS image_caption
                        FROM
                          (SELECT
                              asset_id,
                              plain_text_column,
                              JSON_EXTRACT_ARRAY(article_body_json) AS article_body_json_array
                            FROM
                              `vlt_media_content_prelanding.vlt_article_content` -- change to vlt
                            WHERE
                              article_body_json IS NOT NULL
                          ) pd,
                          UNNEST(pd.article_body_json_array) AS entry -- Unnest the article body JSON array
                        WHERE
                          UPPER(JSON_EXTRACT_SCALAR(entry, '$.type')) = 'IMAGE' -- Filter to only 'IMAGE' type
                          AND JSON_EXTRACT_SCALAR(entry, '$.image.mediaId') IS NOT NULL -- Ensure there's an image ID
                       
          ) 
        
        SELECT sr.*,    plain_text_column as image_context ,  image_caption
        FROM SEARCH_RESULT   sr
        LEFT JOIN IMAGE_CONTEXT imgcnxt
        on REGEXP_REPLACE( sr.asset_id, r'\..*', '') =imgcnxt.image_id
    """       
 
    #print(sql)
    bq_client = bigquery.Client(project_id)
  
    # Run the query
    query_job = bq_client.query(sql)
    output=[]
    try:
        # Fetch results
        results = query_job.result()  
        df = results.to_dataframe()
       
        #drop duplicates
        df = df.drop_duplicates(subset=['asset_id', 'headline', 'description',
            'startOffset_seconds', 'endOffset_seconds', 'fileUri', 'asset_type',
            'first_published_timestamp', 'brand_type', 'primary_category_name',
            'author_name', 'image_license_type', 'publisher_type', 'photographer',
            'date_published', 'dxcId','avgLogprobs', 'image_context','image_caption','modelVersion' ])
        print(len(df))
        # Sort by asset_id and startOffset_seconds to ensure proper order
        df = df.sort_values(by=['asset_id', 'startOffset_seconds'])
        
     
        # Aggregate descriptions for each asset_id, ordered by startOffset_seconds
        # I dont want to aggregate different time-stamps
        #df['description'] = df.groupby('asset_id')['description'].transform(lambda x: '\n'.join(x))

        # Aggregate and concatenate segments for each asset_id
        df['time_lines'] = df.apply(
            lambda row: f"{{'startOffset_seconds': {row['startOffset_seconds']}, 'endOffset_seconds': {row['endOffset_seconds']}}}", axis=1)
            
        # Now group by 'asset_id' and concatenate the strings in 'time_lines'
        time_lines = df.groupby(['asset_id'])['time_lines'].apply(lambda x: ', '.join(x)).reset_index()
        
        df.drop('time_lines', axis=1, inplace=True)
        # Merge the time_lines into the original DataFrame
        df = df.merge(time_lines, on=['asset_id'], how='left')
    
        #drop duplicates
        df = df.drop_duplicates(subset=['asset_id', 'headline', 'description',
                'fileUri', 'asset_type',
            'first_published_timestamp', 'brand_type', 'primary_category_name',
            'author_name', 'image_license_type', 'publisher_type', 'photographer',
            'date_published', 'dxcId',  'time_lines','avgLogprobs' ,'image_context','image_caption','modelVersion' ])[['asset_id', 'headline', 'description',
                'fileUri', 'asset_type',
            'first_published_timestamp', 'brand_type', 'primary_category_name',
            'author_name', 'image_license_type', 'publisher_type', 'photographer',
            'date_published', 'dxcId',  'time_lines','avgLogprobs' ,'image_context','image_caption','modelVersion' ]]
            
        # Convert datetime to string using astype(str)
        df['date_published'] = df['date_published'].astype(str)
        df['first_published_timestamp'] = df['first_published_timestamp'].astype(str) 
        
        #set the output
        output = df#.to_dict(orient='records') 
 
    except Exception as e:
        print('error'+str(e))
    return output


In [3]:
dataset= "vlt_media_embeddings_integration"
content_table="vlt_all_media_content_text_embeddings"
project_id='nine-quality-test'
df=get_predictions(content_table, dataset,project_id,filter_query="")
df=df.reset_index(drop=True)

1568


### Pick Samples of Data
Due to the costs, we just execute the evaluations on samples of data. 

The required columns for evaluations:

- <b>AI-generated description:</b> if the final description should be composed of combinations of several columns, this should be done in advance as part of pre-processing and data preparation in previous step <br>
- <b>AI-generated Average Log Probabilities:</b> will be used for calculating perplexity-set to None if not available.<br>
- <b>LLM model version</b> <br>

Replace the number of samples if required.

In [4]:
n_sample=1#pick n random samples
df = shuffle(df)
items=df.sample(n_sample)
items=items[['description',"avgLogprobs","modelVersion"]]
#set the respective column names for 
response_column_name='description'  #AI-generated description column name
response_avgLogprobs='avgLogprobs' #AI-generated Average Log Probabilities column name
response_modelVersion='modelVersion' # LLM model version column name

### Load the user defined metrics

In [5]:
import json
experiment_name = "content-generation-qa-quality"
file_path = 'PointWiseEvaluationMetrics.json'

# Open and load the JSON file
with open(file_path, 'r') as file:
    eval_metrics = json.load(file)
 


In [8]:
pointwise_evaluation_client=PointWiseEvaluationClient(project='nine-quality-test',
                          location='us-central1',
                          items=items,
                          response_desc_column_name=response_column_name,
                          response_llm_model_column_name=response_modelVersion,
                          response_avgLogprobs_column_name=response_avgLogprobs,
                          eval_metrics=eval_metrics,
                         experiment_name="pointwise-evaluation-experiment",    
                         evaluation_prompt= "Evaluate the AI's contribution to a meaningful content generation. For rating and evaluationtion of the response on a 1-5 scale, use the given rubric criteria.",
                         delete_experiment=True, # to save the costs, delete the evaluation experiment after the evaluation is finished
                         sys_metrics=True #calculate some mathematical metrics: entropy, perplexity
                         )
evaluations=pointwise_evaluation_client.get_evaluations()
 

TypeError: 'NoneType' object is not subscriptable

In [7]:
evaluations.head(10)

Unnamed: 0,response,avgLogprobs,response_llm_model,run_experiment_name,run_experiment_date,perplexity,entropy,safety-explanation,safety-score,coherence and fluency-explanation,coherence and fluency-score,verbosity-explanation,verbosity-score
0,The image features a middle-aged man with a fa...,-0.153841,modelVersion,pointwise-evaluation-experiment-c661aae5-1af3-...,2025-01-22,1.166305,7.8817,The AI response provides a detailed and compre...,5.0,The AI response demonstrates excellent coheren...,5.0,The AI response is excessively verbose and con...,2.0


### Prepare Data Sample for Multimodal Coverage Evaluation
The assumption is that the generated content is in the form of json including the fields that are requested from llm models to be extracted from the content.<br>
Because we did not have data in our environment, we make some sample data

In [7]:
from google.cloud import aiplatform
import vertexai
import pandas as pd
import json
import math
from collections import Counter

from vertexai.evaluation import (
    EvalTask, 
    PointwiseMetric,
    PointwiseMetricPromptTemplate
)
 
import uuid 
from google.cloud import bigquery
from google.api_core.exceptions import NotFound
from datetime import datetime

class PointWiseEvaluationClient:
    """Wrapper around Pointwise Evaluation Client."""

    def __init__(
        self,
        project: str=None,
        location: str = "us-central1",
        items: pd.core.frame.DataFrame = None,
        response_desc_column_name: str= 'description',
        response_llm_model_column_name: str= None,
        response_avgLogprobs_column_name: str=None,
        response_mediaType_column_name: str=None,
        response_media_column_metadata : dict=None,
        response_userPrompt_column_name: str=None,
        multimodal_evaluation_promt: dict=None,
        eval_metrics: list[dict] =None,
        experiment_name: str="pointwise-evaluation-experiment",
        evaluation_prompt: str="Evaluate the AI's contribution to a meaningful content generation",  
        delete_experiment: bool= True,
        sys_metrics: bool= True,
        ):
        """
        Initis the hyper parameters
        
        Args:
         str project:  project id 
         str locations: project location         
         Dataframe items: dataframe of AI-generated responses
         str response_desc_column_name: the name of the column in the 'items' dataframe that includes the AI-generated response
         str response_llm_model_column_name: the name of the column in the 'items' dataframe that includes the name of the model that is used for extracting AI-generated responses
         str response_avgLogprobs_column_name:  the name of the column in the 'items' dataframe that includes AI-generated response average probability log values
         str response_mediaType_column_name:  the name of the column in the 'items' dataframe that represent media type
         str response_userPrompt_column_name: the name of the column in the 'items' dataframe that represent user prompt using which the AI model generated the response
         dict response_media_column_metadata: dictionary including the name of fileuri, start and endoffset of the media if available
                                              e.g. {'fileUri':'fileUri', 'startOffset':'startOffset_seconds','endOffset':'endOffset_seconds', 'mediaType':'mediaType'}           
         dict multimodal_evaluation_promt: dictionary including prompts for multimodal content evaluations.
                                           e.g. {"video_prompt":"...","image_prompt":"..."}
         list[dict] eval_metrics: user defined evaluation metrics along with their rating rubric
                                  e.g.  [ {  "metric": "safety", "criteria": "..." }]
         str experiment_name: name of the evaluation experiment
         str evaluation_prompt: the prompt text which will be used as a prompt to evaluate the eval_metrics        
         bool delete_experiment: delete the generated experience after the evaluation are done if True. Will save costs.
         bool sys_metrics: calculates some mathematical metrics including perplexity, entropy if set to True.
        """
        
        #set the parameters
        self.location = location  
        self.project = project   
        self.items =items  
        self.eval_metrics=eval_metrics #user defined metrics along with their rubric ratings
        self.experiment_name=experiment_name
        self.evaluation_prompt=evaluation_prompt
        self.multimodal_evaluation_promt=multimodal_evaluation_promt
        self.response_userPrompt_column_name=response_userPrompt_column_name
        self.response_llm_model_column_name=response_llm_model_column_name
        self.response_media_column_metadata=response_media_column_metadata
        self.response_mediaType_column_name=response_mediaType_column_name
        self.response_desc_column_name=response_desc_column_name
        self.delete_experiment=delete_experiment
        self.response_avgLogprobs_column_name= response_avgLogprobs_column_name
        self.sys_metrics=sys_metrics
        self.run_experiment_name=self.experiment_name+"-"+ str(uuid.uuid4())
        
        #initialize Vertex AI
        vertexai.init(project=self.project, location= self.location )
         

    def set_evaluation_data(self):
        """
        Prepare the input data as in a dataframe for evaluation

        """
            
        eval_dataset = pd.DataFrame(
                        {
                            "response": self.items[self.response_desc_column_name].to_list(),
                            **({"avgLogprobs": self.items[self.response_avgLogprobs_column_name].to_list()} if 
                               self.response_avgLogprobs_column_name !=None else {}),
#                             **({"multimodal_evaluation_promt": [
#                                 self.multimodal_evaluation_promt['video_prompt'] if 'video' in str(self.items[self.response_mediaType_column_name][i]).lower() else 
#                                 self.multimodal_evaluation_promt['image_prompt'] if 'image' in str(self.items[self.response_mediaType_column_name][i]).lower() else None
#                                 for i in range(len(self.items))
#                             ]} if self.response_mediaType_column_name!=None and self.multimodal_evaluation_promt!=None else {}),
                       
                             **({"instruction": self.items[self.response_userPrompt_column_name].to_list()} if 
                               self.response_userPrompt_column_name !=None else {}),                            
                            
                            "refrence": [
                                            {
                                                "fileuri": self.items[self.response_media_column_metadata['fileUri']][i],
                                                "metadata": {
                                                                "start_offset": {"seconds":self.items[self.response_media_column_metadata['startOffset']][i] , "nanos": 0},
                                                                "end_offset": {"seconds":self.items[self.response_media_column_metadata['endOffset']][i],"nanos": 0}
                                                            } if self.response_media_column_metadata['startOffset'] in self.items.columns and self.response_media_column_metadata['endOffset'] in self.items.columns else {}
                                            } if self.response_media_column_metadata['fileUri'] !=None else {}
                                            for i in range(len(self.items))
                                        ],                            
                            "response_llm_model": self.items[self.response_llm_model_column_name],
                            "run_experiment_name": [self.run_experiment_name] * len(self.items),
                            "run_experiment_date": [datetime.today().strftime('%Y-%m-%d')] * len(self.items),
                        }
                    )
        
        return eval_dataset

    def log_evaluations(self, result):
        """
        Log the evaluation result into BigQuery, altering the table schema if needed.

        Args:
            dataframe result : The evaluation result to be recorded into the database.
        """
        # Load configuration from config.json
        with open('config.json') as config_file:
            config = json.load(config_file)

        table_id = config['pointwise_eval_table']
        dataset_id = config['eval_dataset']
        project_id = config["project"]
        location_id=config["project_location"]
        table_full_id = f"{project_id}.{dataset_id}.{table_id}"
        dataset_full_id = f"{project_id}.{dataset_id}"

        #remove unwanted characters from column name
        result.columns = result.columns.str.replace("/", "-")

        # Initialize BigQuery Client
        client = bigquery.Client()


        # Ensure the dataset exists
        try:
            client.get_dataset(dataset_full_id)
            print(f"Dataset {dataset_full_id} exists.")
        except NotFound:
            print(f"Dataset {dataset_full_id} not found. Creating dataset...")
            dataset = bigquery.Dataset(dataset_full_id)
            dataset.location = location_id 
            client.create_dataset(dataset)
            print(f"Dataset {dataset_full_id} created successfully.")




        try:
            # Fetch the existing table
            table = client.get_table(table_full_id)
            existing_schema = {field.name: field.field_type for field in table.schema}
            print(f"Table {table_full_id} exists. Checking schema...")

            # Infer schema from DataFrame
            new_schema = {
                name: bigquery.enums.SqlTypeNames.DATE if (dtype == 'object'  and name=='run_experiment_date')
                else bigquery.enums.SqlTypeNames.STRING if dtype == 'object'
                else bigquery.enums.SqlTypeNames.FLOAT if dtype in ['float64', 'float32']
                else bigquery.enums.SqlTypeNames.INTEGER if dtype in ['int64', 'int32']
                else bigquery.enums.SqlTypeNames.BOOLEAN if dtype == 'bool'
                else bigquery.enums.SqlTypeNames.TIMESTAMP if dtype == 'datetime64[ns]'
                else bigquery.enums.SqlTypeNames.STRING
                for name, dtype in zip(result.columns, result.dtypes)
            }

            # Identify schema differences
            schema_changes = []
            for col, dtype in new_schema.items():
                if col not in existing_schema:
                    # Add new column
                    schema_changes.append(bigquery.SchemaField(col, dtype))
                elif existing_schema[col] != dtype:
                    print(f"Type change detected for column '{col}' from {existing_schema[col]} to {dtype}.")
                    # BigQuery doesn't allow direct type changes; handle as needed.

            if schema_changes:
                print("Altering schema to add new columns...")
                table.schema = table.schema + schema_changes
                table = client.update_table(table, ["schema"])
                print(f"Table {table_full_id} schema updated successfully.")
            else:
                print("Schema is already up-to-date.")

        except NotFound:
            print(f"Table {table_full_id} not found. Creating table...")
            # Infer schema from DataFrame
            schema = [
                bigquery.SchemaField(name, bigquery.enums.SqlTypeNames.DATE if (dtype == 'object'  and name=='run_experiment_date')
                                     else bigquery.enums.SqlTypeNames.STRING if dtype == 'object' 
                                     else bigquery.enums.SqlTypeNames.FLOAT if dtype in ['float64', 'float32']
                                     else bigquery.enums.SqlTypeNames.INTEGER if dtype in ['int64', 'int32']
                                     else bigquery.enums.SqlTypeNames.BOOLEAN if dtype == 'bool'
                                     else bigquery.enums.SqlTypeNames.TIMESTAMP if dtype == 'datetime64[ns]'
                                     else bigquery.enums.SqlTypeNames.STRING)
                for name, dtype in zip(result.columns, result.dtypes)
            ]

            # Create the table
            table = bigquery.Table(table_full_id, schema=schema)
            table = client.create_table(table)
            print(f"Table {table_full_id} created successfully.")

        # Define job configuration
        job_config = bigquery.LoadJobConfig(
            write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
        )

        # Save DataFrame to BigQuery
        job = client.load_table_from_dataframe(result, table_full_id, job_config=job_config)
        job.result()  # Wait for the job to complete

        # Additional error inspection after the job completes
        if job.errors:
            print("The job completed with the following errors:")
            for error in job.errors:
                print(f" - {error['message']}")
        else:
            print(f"Evaluations have successfully been loaded into {table_full_id}.")

    def perplexity(self,prob: float):    
        """Extract perplexity- models confidence in predicting next token using average log probablity

          Args:
          float prob: average log probability

          Returns:
          float:  perplexity value

          """
        return math.exp(-prob)
    
    
    def entropy(self,text: str):
        """Extracts entropy of a texts, higher entropy means diverse range of tokens have been choosen

        Args:
        str text: the input text

        Returns:
        float entropy: entropy value of input text
        """

        # Tokenize the text into words (ignoring punctuation)
        words = text.lower().split()

        # Get the frequency of each word
        word_count = Counter(words)

        # Total number of words
        total_words = len(words)

        # Calculate the probability of each word
        probabilities = [count / total_words for count in word_count.values()]

        # Calculate entropy using the formula
        entrpy = -sum(p * math.log2(p) for p in probabilities)

        return entrpy


    def get_evaluations(self):
        """
        Extracts the evaluation metricsusing:
            1-user defined metrics and rating criteria
            2-pre-defined mathematical metrics: perplexity, entropy

        """
        # set evaluation data
        eval_dataset=self.set_evaluation_data()
        
        #calculate the system defined metrics
        if self.sys_metrics:
            # the evrage prob column is given in the data, calculate perplexity
            if self.response_avgLogprobs_column_name:
                eval_dataset['perplexity']=eval_dataset[self.response_avgLogprobs_column_name].apply(self.perplexity)
            
            #calculate entropy
            eval_dataset['entropy']=eval_dataset['response'].apply(self.entropy)
            eval_results=eval_dataset
        
        #calcualte user defined metrics
        if self.eval_metrics:
            metrics=[]
            # Define  pointwise quality metric(s)
            for metric in self.eval_metrics:
                # Define a pointwise quality metric
                pointwise_quality_metric_prompt = f"""{self.evaluation_prompt}; evaluate {metric['metric']}.
                # Rubric rating criteria
                {metric['criteria']}
                # AI-generated Response
                {{response}}
                """
                pointwise_metric=PointwiseMetric(
                    metric=metric['metric'],
                    metric_prompt_template=pointwise_quality_metric_prompt,
                )
                metrics.append(pointwise_metric)
                
            # Create the evaluation task
            eval_task = EvalTask(
                dataset=eval_dataset,
                metrics=metrics,
                experiment=self.experiment_name,
            )
            # Run evaluation on the data using the evaluation service
            results = eval_task.evaluate( 

                    experiment_run_name=self.run_experiment_name,
                ) 
            #Delete the experiment after getting the result
            if self.delete_experiment:
                experiment = aiplatform.Experiment(self.experiment_name)
                experiment.delete()
                
            eval_results=results.metrics_table
            
        #log the statistics into bigquery
        self.log_evaluations(eval_results)
            
        return eval_results 

    

# Sample User Prompt
This is basically the prompt text that will be used to generate the content for each video segment or image during batch/online content generation.
Here, we used this prompt to generate the content of a sample video from 600s to 900s using two different models 'gemini-1.5-pro-002', 'gemini-1.5-flash-002'. The generated content is recorded in json format in output_model1.txt and output_model2.txt files.

In [28]:
start=600
end=900
schema="""{
    "description": "A structured schema to represent detailed information from a video or text analysis",
    "type": "object",
    "properties": {
        "Category": {
            "type": "string",
            "description": "The category or general type of the content"
        },
        "DetailedDescriptionOfEventsAndConversations": {
            "type": "string",
            "description": "A detailed textual description of the events and conversations in the content"
        },
        "BrandsCompanyNamesLogos": {
            "type": "array",
            "items": {
                "type": "string"
            },
            "description": "A list of brands, company names, or logos appearing or mentioned in the content"
        },
        "KeyLocationsAndScenes": {
            "type": "array",
            "items": {
                "type": "string"
            },
            "description": "A list of key locations and scenes appearing or mentioned in the content"
        },
        "KeyThemes": {
            "type": "array",
            "items": {
                "type": "string"
            },
            "description": "A list of key themes discussed or portrayed in the content"
        },
        "PeopleAppearingAndMentioned": {
            "type": "array",
            "items": {
                "type": "string"
            },
            "description": "A list of people who appear or are mentioned in the content"
        }
    },
    "required": [
        "Category",
        "DetailedDescriptionOfEventsAndConversations",
        "BrandsCompanyNamesLogos",
        "KeyLocationsAndScenes",
        "KeyThemes",
        "PeopleAppearingAndMentioned"
    ]
}"""


VAR_VIDEO_SEGMENT=f"Your task is to provide a comprehensive description of this video from segment {start} seconds to {end} seconds.\n"
VAR_INSTRUCTIONS= """To complete the task you need to follow these steps:\n
                           No greetings, closing remarks, or additional comments. Begin immediately with the video analysis and provide only the requested information in the specified format.\n
                           Idenify all instances of visual product placement. Pay close attention to background details and items held by the characters. List each product placement with the following
                            information: Brand name, product name (if applicable), and a brief description. Include information about product placement into the description generated for the video\n
                           Create a transcript of all the speeches, dialogs, narration.\n
                           Scrupulously examine each scene for any and all visible brand names, logos, and products. Even if a product appears briefly or in the background, it should be included.\n"""

VAR_CONSTRAINTS= """Describe the video content objectively, avoiding any subjective opinions or assumptions.\n
                           Specify who is saying what. If a person talking can be seen, specify their name and/or occupation. If it is voice behind the scenes, then describe it as a narrator.\n
                           Be specific when describing. Include all the information that is shown or given.\n
                           Do not show timestamps.\n
                           If an unidentified person is shown in the video first, but then their name is mentioned later in the video, make sure to mention their name in the description from the start.\n
                           """

VAR_STRUCTURE= f"""Organize the description with the following properties, and give a valid json file with JSON schema.<JSONSchema>{json.dumps(schema)}</JSONSchema>:
                       \n**Category**\n
                       \n**DetailedDescriptionOfEventsAndConversations**\n
                       \n**BrandsCompanyNamesLogos**\n
                       \n**KeyLocationsAndScenes**\n
                       \n**KeyThemes**\n
                       \n**PeopleAppearingAndMentioned**\n 
                 """ 

VAR_CONDITIONS = """Identify a video as one of these categories: News, TV Shows, Live Sport Events, News Analyses. \n
                       When describing the DetailedDescriptionOfEventsAndConversations, consider the following instructions for specific video types:\n
                       * **News:** Pay close attention to transitions, graphics, and on-screen text.\n
                       * **TV Shows:** Describe facial expressions, body language, appearances, and overall mood.\n
                       * **Live Sports Events:** Focus on key moments, like goals or fouls, and describe the overall flow and momentum of the game.\n
                       * **News Analyses:** Identify different perspectives, arguments, and supporting evidence.\n
                       Make sure to mention people's names in the DetailedDescriptionOfEventsAndConversations and in PeopleAppearingAndMentioned as well as any other information about them like their age, occupation, location, etc. \n"""

VAR_EXAMPLE = """Follow this example for the format of the output:\n
              {
                "Category": "TV Show",
                "DetailedDescriptionOfEventsAndConversations": "The video starts with a man sitting at a dining table, reading a letter. Two Fiji bottles are visible on the benchtop. He has short, light brown hair and a beard. His name is Harrison. The scene changes to Melissa. Melissa says: \"I'm Melissa, and I'm a hairdresser. I'm 41 years old, and I'm from Sydney.\"",
                "BrandsCompanyNamesLogos": ["Lacoste", "Fiji"],
                "KeyLocationsAndScenes": ["Apartment"],
                "KeyThemes": ["Marriage"],
                "PeopleAppearingAndMentioned": [
                "Harrison, 32, Builder, NSW",
                "Melissa, 41, Hairdresser, NSW"
                ]
            }
               """
  
    
video_description_prompt=VAR_VIDEO_SEGMENT+VAR_INSTRUCTIONS+VAR_CONSTRAINTS+VAR_STRUCTURE+VAR_CONDITIONS+VAR_EXAMPLE
 

In [32]:
#load pre-executed predictions
with open('output_model1.txt', 'r') as file:
    response1 = json.dumps(json.load(file))
with open('output_model2.txt', 'r') as file:
    response2 = json.dumps(json.load(file))

In [33]:
video_evaluation_prompt="""
  # Instruction
  You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models. We will provide you with the user prompt,  and an AI-generated responses, video and the segment (in seconds) for which this response is generated.
  You should first read the user input carefully for analyzing the task, then look into video segment, and then evaluate the quality of the responses based on the Criteria provided in the Evaluation section below.
  You will assign the response a rating following the Rating Rubric and Evaluation Steps. Give step by step explanations for your rating, and only choose ratings from the Rating Rubric.

  # Evaluation
  ## Metric Definition
  You will be assessing coverage, which measures the ability to provide a detailed response based on a the given video segment and requested properties.
  

  ## Criteria
  Coverage: It is the quality of capturing all required detail for each requested property.
  In the context of video content capturing, it refers to the way that all the details of the following properties are captured and presented throughly:
  
  - Category 
  - Detailed Description Of Events And Conversations 
  - Brands,CompanyNames, and Logos 
  - KeyLocations And Scenes" 
  - Key Themes 
  - People Appearing And Mentioned 
  
  This AI-generated responses will be used for data retrieval. So, it has to be able to capture all the details of a scene.

  ## Rating Rubric
  5: (Perfectly Aligned) The details of the video segment is captured properly for all the requested properties thoroughly.
  4: (Highly Aligned) The descriptions captured for each property generally supports the details in the video segment and can be used for data retrieval.
  3: (Moderately Aligned) The descriptions captured for each property is captured well, but there might be minor inconsistencies or missed details, and the response is broadly relevant but not entirely specific.
  2: (Poorly Aligned)  The descriptions captured for each property has significant inconsistencies with the video segment, raising doubts about the validity and quality of text to be usable for data retrieval.
  1: (Misaligned) The descriptions captured for each property has major inconsistencies with the video segment and many information and details are missed casuing a very poor quality of text for data retrieval.

  ## Evaluation Steps
  STEP 1:  Assess User Instruction:  Carefully read the user input prompt to understand the user's request and requested information.
  STEP 2:  Analyze Video Segment: Examine the video segment for each requested property to to make sure all the requested information are captured in detail.
  STEP 3: Evaluate Accuracy:  For each requested property, Check if the generated response correctly identifies the information and details described in the video segment.
  STEP 4:  Identify Inconsistencies: for each requested property, look for any discrepancies between the video segment details and the captured information in the AI-generated responses. For example, if any information is missed, or not captured right.
  STEP 5:  Determine Overall Coverage: Based on the previous steps, assign a coverage score using the 1-5 rubric.  Consider the severity of any inconsistencies and their potential impact on the data retrieval.
  """


image_evaluation_prompt="""
  # Instruction
  You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models. We will provide you with the user prompt,  and an AI-generated responses, video and the segment (in seconds) for which this response is generated.
  You should first read the user input carefully for analyzing the task, then look into video segment, and then evaluate the quality of the responses based on the Criteria provided in the Evaluation section below.
  You will assign the response a rating following the Rating Rubric and Evaluation Steps. Give step by step explanations for your rating, and only choose ratings from the Rating Rubric.

  # Evaluation
  ## Metric Definition
  You will be assessing coverage, which measures the ability to provide a detailed response based on a the given video segment and requested properties.
  

  ## Criteria
  Coverage: It is the quality of capturing all required detail for each requested property.
  In the context of video content capturing, it refers to the way that all the details of the following properties are captured and presented throughly:
  
  - Category 
  - Detailed Description Of Events And Conversations 
  - Brands,CompanyNames, and Logos 
  - KeyLocations And Scenes" 
  - Key Themes 
  - People Appearing And Mentioned 
  
  This AI-generated responses will be used for data retrieval. So, it has to be able to capture all the details of a scene.

  ## Rating Rubric
  5: (Perfectly Aligned) The details of the video segment is captured properly for all the requested properties thoroughly.
  4: (Highly Aligned) The descriptions captured for each property generally supports the details in the video segment and can be used for data retrieval.
  3: (Moderately Aligned) The descriptions captured for each property is captured well, but there might be minor inconsistencies or missed details, and the response is broadly relevant but not entirely specific.
  2: (Poorly Aligned)  The descriptions captured for each property has significant inconsistencies with the video segment, raising doubts about the validity and quality of text to be usable for data retrieval.
  1: (Misaligned) The descriptions captured for each property has major inconsistencies with the video segment and many information and details are missed casuing a very poor quality of text for data retrieval.

  ## Evaluation Steps
  STEP 1:  Assess User Instruction:  Carefully read the user input prompt to understand the user's request and requested information.
  STEP 2:  Analyze Video Segment: Examine the video segment for each requested property to to make sure all the requested information are captured in detail.
  STEP 3: Evaluate Accuracy:  For each requested property, Check if the generated response correctly identifies the information and details described in the video segment.
  STEP 4:  Identify Inconsistencies: for each requested property, look for any discrepancies between the video segment details and the captured information in the AI-generated responses. For example, if any information is missed, or not captured right.
  STEP 5:  Determine Overall Coverage: Based on the previous steps, assign a coverage score using the 1-5 rubric.  Consider the severity of any inconsistencies and their potential impact on the data retrieval.
  """

multimodal_evaluation_promt={'video_prompt': video_evaluation_prompt,'image_prompt':image_evaluation_prompt}


generated_response = [
    response1,
    response2
]
llm_models=['gemini-1.5-pro-002', 'gemini-1.5-flash-002']

items = pd.DataFrame(
    {
        "prompt_text": video_description_prompt, #this should be set with the prompt_text when doing batch generation
        "fileUri":'gs://raw_nine_files/vlt_video_extract/MAAT/MAAT2024_1_A_HBB.mp4' , #this should be set to file uri when doing batch generation
        "description": generated_response, #this should be set the generated content when doing batch generation
        "asset_type": 'video/mp4' #this should be set to asset_type/mime_type when doing batch generation
    
    }
)

In [36]:
 items

Unnamed: 0,prompt_text,fileUri,description,asset_type
0,Your task is to provide a comprehensive descri...,gs://raw_nine_files/vlt_video_extract/MAAT/MAA...,"{""Category"": ""TV Show"", ""DetailedDescriptionOf...",video/mp4
1,Your task is to provide a comprehensive descri...,gs://raw_nine_files/vlt_video_extract/MAAT/MAA...,"{""Category"": ""TV Show"", ""DetailedDescriptionOf...",video/mp4


In [None]:
pointwise_evaluation_client=PointWiseEvaluationClient(project='nine-quality-test',
                          location='us-central1',
                          items=items,
                          response_desc_column_name=response_column_name,
                          response_llm_model_column_name=response_modelVersion,
                          response_avgLogprobs_column_name=response_avgLogprobs,
                          eval_metrics=eval_metrics,
                         experiment_name="pointwise-evaluation-experiment",    
                         evaluation_prompt= "Evaluate the AI's contribution to a meaningful content generation. For rating and evaluationtion of the response on a 1-5 scale, use the given rubric criteria.",
                         delete_experiment=True, # to save the costs, delete the evaluation experiment after the evaluation is finished
                         sys_metrics=True #calculate some mathematical metrics: entropy, perplexity
                         )
evaluations=pointwise_evaluation_client.get_evaluations()