In [147]:
import time
import random
from google.cloud import bigquery
import json
from datetime import datetime
import pandas as pd
 
    
  
    
def get_predictions(table, dataset,project_id,filter_query=""):
    """Load video/image asset rows from BigQuery and collapse them to one row per asset.

    Despite the function name, no similarity search is performed here: the query
    only filters `{dataset}.{table}` to rows whose asset_type contains 'video' or
    'image', and left-joins the article text / caption of any article body entry
    of type IMAGE whose mediaId equals the asset_id with everything from the
    first '.' onwards removed.

    Post-processing done in pandas:
      * exact duplicate segment rows are dropped,
      * rows are ordered by asset_id and startOffset_seconds,
      * all descriptions of an asset are joined with newlines (null descriptions
        are skipped instead of aborting the whole run),
      * the start/end offsets of an asset are joined into one `time_lines` string,
      * the frame is reduced to the output columns and de-duplicated again,
      * the two date columns are converted to strings.

    Args:
        table: name of the BigQuery table to read.
        dataset: name of the BigQuery dataset containing `table`.
        project_id: project passed to `bigquery.Client`.
        filter_query: optional SQL fragment appended verbatim to the WHERE clause
            (e.g. "AND brand_type = 'x'"). It is interpolated into the query text
            without escaping, so it must never come from untrusted input.

    Returns:
        pandas.DataFrame with one row per distinct asset/context combination.
        If fetching or post-processing fails, the error is printed and an empty
        DataFrame with the same columns is returned, so callers always receive
        a DataFrame.
    """
    # Raw f-string: the regex below contains `\.`, which is an invalid escape
    # sequence in a non-raw literal. The SQL text sent to BigQuery is unchanged.
    sql = rf"""  
        WITH SEARCH_RESULT AS
         (SELECT 

                        asset_id, 
                        content,
                        headline,
                        html_safe_text,
                        description,
                        startOffset_seconds,
                        endOffset_seconds,
                        fileUri,
                        asset_type,
                        first_published_timestamp,
                        brand_type,
                        primary_category_name,
                        byline,
                        image_license_type,
                        publisher_type,
                        photographer,
                        date_published,
                        dxcId,
                        text_embedding_result ,
                        byline[SAFE_OFFSET(0)].author_name ,                    
                        CAST(JSON_EXTRACT_SCALAR(media_jsonbody, '$.response.candidates[0].avgLogprobs') AS FLOAT64) AS  avgLogprobs
                 FROM  `{dataset}.{table}` WHERE 1=1 and (LOWER(asset_type) LIKE '%video%' OR LOWER(asset_type) LIKE '%image%' ) {filter_query} 
        ),
          IMAGE_CONTEXT AS (
                   SELECT
                          pd.asset_id,
                          plain_text_column,
                          JSON_EXTRACT_SCALAR(entry, '$.image.mediaId') AS image_id,
                          JSON_EXTRACT_SCALAR(entry, '$.image.caption') AS image_caption
                        FROM
                          (SELECT
                              asset_id,
                              plain_text_column,
                              JSON_EXTRACT_ARRAY(article_body_json) AS article_body_json_array
                            FROM
                              `vlt_media_content_prelanding.vlt_article_content` -- change to vlt
                            WHERE
                              article_body_json IS NOT NULL
                          ) pd,
                          UNNEST(pd.article_body_json_array) AS entry -- Unnest the article body JSON array
                        WHERE
                          UPPER(JSON_EXTRACT_SCALAR(entry, '$.type')) = 'IMAGE' -- Filter to only 'IMAGE' type
                          AND JSON_EXTRACT_SCALAR(entry, '$.image.mediaId') IS NOT NULL -- Ensure there's an image ID
                       
          ) 
        
        SELECT sr.*,    plain_text_column as image_context ,  image_caption
        FROM SEARCH_RESULT   sr
        LEFT JOIN IMAGE_CONTEXT imgcnxt
        on REGEXP_REPLACE( sr.asset_id, r'\..*', '') =imgcnxt.image_id
    """

    # Columns that make two segment rows "the same" before aggregation.
    segment_key_columns = [
        'asset_id', 'headline', 'description',
        'startOffset_seconds', 'endOffset_seconds', 'fileUri', 'asset_type',
        'first_published_timestamp', 'brand_type', 'primary_category_name',
        'author_name', 'image_license_type', 'publisher_type', 'photographer',
        'date_published', 'dxcId', 'avgLogprobs', 'image_context', 'image_caption',
    ]
    # Columns (and their order) of the returned frame; per-segment offsets are
    # replaced by the aggregated `time_lines` string.
    output_columns = [
        'asset_id', 'headline', 'description',
        'fileUri', 'asset_type',
        'first_published_timestamp', 'brand_type', 'primary_category_name',
        'author_name', 'image_license_type', 'publisher_type', 'photographer',
        'date_published', 'dxcId', 'time_lines', 'avgLogprobs',
        'image_context', 'image_caption',
    ]

    bq_client = bigquery.Client(project_id)

    # Submitting the job stays outside the try block, as before, so submission
    # errors still propagate to the caller.
    query_job = bq_client.query(sql)

    # Fallback result: same type and columns as a successful run.
    output = pd.DataFrame(columns=output_columns)
    try:
        # Fetch results
        df = query_job.result().to_dataframe()

        # Drop exact duplicate segment rows
        df = df.drop_duplicates(subset=segment_key_columns)
        print(len(df))

        # Sort by asset_id and startOffset_seconds so the joins below are in
        # playback order
        df = df.sort_values(by=['asset_id', 'startOffset_seconds'])
        print(len(df))

        # One newline-joined description per asset, broadcast to all of its rows.
        # Null descriptions are skipped; str.join would raise TypeError on them.
        df['description'] = df.groupby('asset_id')['description'].transform(
            lambda texts: '\n'.join(texts.dropna().astype(str)))

        # Per-row segment string, built column-wise instead of with a row-wise apply
        df['time_lines'] = [
            f"{{'startOffset_seconds': {start}, 'endOffset_seconds': {end}}}"
            for start, end in zip(df['startOffset_seconds'], df['endOffset_seconds'])
        ]

        # Concatenate the segment strings of each asset
        time_lines = (
            df.groupby(['asset_id'])['time_lines']
            .apply(lambda x: ', '.join(x))
            .reset_index()
        )

        # Swap the per-row strings for the per-asset aggregate
        df = df.drop(columns='time_lines').merge(time_lines, on=['asset_id'], how='left')

        # After aggregation the segment rows of an asset are identical: keep one
        df = df.drop_duplicates(subset=output_columns)[output_columns]

        # Convert the date columns to plain strings
        df = df.astype({'date_published': str, 'first_published_timestamp': str})

        # set the output
        output = df

    except Exception as e:
        print('error'+str(e))
    return output


In [148]:
# BigQuery coordinates of the media content table to analyse.
project_id = 'nine-quality-test'
dataset = "vlt_media_embeddings_integration"
content_table = "vlt_all_media_content_text_embeddings"

# Fetch the aggregated asset rows (no extra filter), then renumber them 0..n-1.
df = get_predictions(
    content_table,
    dataset,
    project_id,
    filter_query="",
)
df = df.reset_index(drop=True)

1568
1568


In [149]:
import math
from collections import Counter


def e_confidence(entropy):
    """Map a word-entropy value to a coarse quality label.

    Args:
        entropy (float): entropy of a text.

    Returns:
        str: "Good" above 6, "Average" from 3 to 6 inclusive, "Poor" otherwise
        (which includes values below 3 and NaN).
    """
    # Guard clauses, highest band first.
    if entropy > 6:
        return "Good"
    if 3 <= entropy <= 6:
        return "Average"
    return "Poor"

def word_entropy(text):
    """Compute the Shannon entropy (in bits) of the word distribution of a text.

    The text is lower-cased and split on whitespace only; punctuation is NOT
    stripped, so "cat" and "cat," count as different words. A higher value
    means the words are spread over a more diverse vocabulary.

    Args:
        text (str): the input text. Missing values (None, NaN, or any other
            non-string) are treated as containing no words.

    Returns:
        float: entropy of the word frequencies; 0.0 for empty or missing text
        and for text that repeats a single word.
    """
    # A null description coming out of a DataFrame has no words to measure.
    if not isinstance(text, str):
        return 0.0

    # Tokenize on whitespace, case-insensitively
    words = text.lower().split()
    if not words:
        return 0.0

    total_words = len(words)

    # Relative frequency of each distinct word
    probabilities = [count / total_words for count in Counter(words).values()]

    # Summing the negated terms (instead of negating the sum) gives the same
    # magnitude but yields 0.0 rather than -0.0 when there is one distinct word.
    return float(sum(-p * math.log2(p) for p in probabilities))



def perpelexity(prob: float):
    """Turn an average log-probability into a perplexity value.

    Args:
        prob (float): average log probability.

    Returns:
        float: exp(-prob).
    """
    negated_log_prob = -prob
    return math.exp(negated_log_prob)

def p_confidence(perplexity: float):
    """Map a perplexity value to a coarse confidence label.

    Args:
        perplexity (float): the perplexity.

    Returns:
        'Very Good' for [0, 2), 'Good' for [2, 5), 'Average' for [5, 10),
        'poor' for 10 and above, and None for anything else (negative or NaN).
    """
    if 0 <= perplexity < 2:
        return 'Very Good'
    if 2 <= perplexity < 5:
        return 'Good'
    if 5 <= perplexity < 10:
        return 'Average'
    if perplexity >= 10:
        return 'poor'
    # Negative and NaN inputs match no band.
    return None

        
def extract_measures (args):
    """Compute the quality measures for one row of the asset DataFrame.

    Args:
        args: a row (mapping / pandas Series) providing 'avgLogprobs', the
            average log-probability extracted from the model response, and
            'description', the text to score.

    Returns:
        pandas.Series indexed by 'perplexity', 'perplexity_confidence',
        'entropy' and 'entropy_confidence'.
    """
    # perpelexity() negates its argument itself (it returns exp(-prob)), so the
    # raw average log-probability is passed through. Negating it here as well
    # produced exp(avgLogprobs), a value in (0, 1], not a perplexity.
    perplexity = perpelexity(args['avgLogprobs'])
    perplexity_confidence = p_confidence(perplexity)

    entropy = word_entropy(args['description'])
    entropy_confidence = e_confidence(entropy)

    return pd.Series(
        [perplexity, perplexity_confidence, entropy, entropy_confidence],
        index=['perplexity', 'perplexity_confidence', 'entropy', 'entropy_confidence'],
    )
 
# Score every row, then attach the four measures as new columns of df.
_row_measures = df.apply(extract_measures, axis=1)
df[['perplexity', 'perplexity_confidence', 'entropy', 'entropy_confidence']] = _row_measures

In [150]:
# General
import inspect
import logging
import random
import string
import warnings

from IPython.display import HTML, Markdown, display
import pandas as pd
import plotly.graph_objects as go

# Main
from vertexai.evaluation import EvalTask, MetricPromptTemplateExamples
from vertexai.generative_models import GenerativeModel, HarmBlockThreshold, HarmCategory
     

In [151]:
# Exploration: list the attribute names available on MetricPromptTemplateExamples.Pointwise.
dir(MetricPromptTemplateExamples.Pointwise)

['COHERENCE',
 'FLUENCY',
 'GROUNDEDNESS',
 'INSTRUCTION_FOLLOWING',
 'MULTI_TURN_CHAT_QUALITY',
 'MULTI_TURN_SAFETY_QUALITY',
 'QUESTION_ANSWERING_QUALITY',
 'SAFETY',
 'SUMMARIZATION_QUALITY',
 'TEXT_QUALITY',
 'VERBOSITY',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__']