In [48]:
import time
import random
from google.cloud import bigquery
import json
from datetime import datetime
import pandas as pd
 
# Asset-level columns returned by get_predictions, in output order.
_ASSET_COLUMNS = [
    'asset_id', 'headline', 'description',
    'fileUri', 'asset_type',
    'first_published_timestamp', 'brand_type', 'primary_category_name',
    'author_name', 'image_license_type', 'publisher_type', 'photographer',
    'date_published', 'dxcId', 'time_lines', 'perplexity',
]


def _collapse_segments(segments):
    """Collapse per-segment rows into asset-level rows.

    For every ``asset_id`` the segment descriptions are joined with newlines
    (ordered by ``startOffset_seconds``) and the segment offsets are rendered
    into a single comma-separated ``time_lines`` string. Rows are then
    de-duplicated on ``_ASSET_COLUMNS`` and the two date columns are converted
    to strings.
    """
    if segments.empty:
        # Nothing to aggregate; still hand back the asset-level column layout.
        return segments.reindex(columns=_ASSET_COLUMNS)

    # A segment is identified by the asset-level columns plus its offsets.
    segment_keys = [col for col in _ASSET_COLUMNS if col != 'time_lines']
    segment_keys += ['startOffset_seconds', 'endOffset_seconds']

    ordered = (
        segments
        .drop_duplicates(subset=segment_keys)
        # Sort so the joined descriptions / time lines follow playback order.
        .sort_values(by=['asset_id', 'startOffset_seconds'])
    )
    print(len(ordered))

    ordered = ordered.assign(time_lines=[
        f"{{'startOffset_seconds': {start}, 'endOffset_seconds': {end}}}"
        for start, end in zip(ordered['startOffset_seconds'],
                              ordered['endOffset_seconds'])
    ])

    by_asset = ordered.groupby('asset_id')
    ordered = ordered.assign(
        # Skip NULL descriptions: str.join raises TypeError on None/NaN, which
        # previously sent the whole call down the error path.
        description=by_asset['description'].transform(
            lambda parts: '\n'.join(parts.dropna().astype(str))),
        time_lines=by_asset['time_lines'].transform(
            lambda parts: ', '.join(parts)),
    )

    # NOTE(review): 'perplexity' is part of the de-duplication key, so an asset
    # whose segments carry different perplexity values keeps one row per value
    # - confirm this is intended.
    assets = (
        ordered
        .drop_duplicates(subset=_ASSET_COLUMNS)[_ASSET_COLUMNS]
        .reset_index(drop=True)
    )
    return assets.assign(
        date_published=assets['date_published'].astype(str),
        first_published_timestamp=assets['first_published_timestamp'].astype(str),
    )


def get_predictions(table, dataset,project_id,filter_query=""):
    """Load video assets from a BigQuery table and aggregate their segments.

    Runs a SELECT against ``{dataset}.{table}`` restricted to rows whose
    ``asset_type`` contains "video" (case-insensitive), then collapses the
    per-segment rows so that the descriptions and start/end offsets of each
    ``asset_id`` are concatenated.

    Args:
        table: Table name inside ``dataset``.
        dataset: BigQuery dataset name.
        project_id: Project passed to ``bigquery.Client``.
        filter_query: Optional raw SQL appended verbatim to the WHERE clause
            (e.g. ``"AND brand_type = 'x'"``).

    Returns:
        A pandas DataFrame with the columns listed in ``_ASSET_COLUMNS`` and a
        fresh 0..n-1 index; ``date_published`` and
        ``first_published_timestamp`` are strings. If fetching or aggregating
        the results raises, the error is printed and an empty list is returned
        instead (kept for backward compatibility with existing callers).

    Note:
        ``table``, ``dataset`` and ``filter_query`` are interpolated directly
        into the SQL text; only pass trusted values.
    """
    # NOTE(review): content, html_safe_text, byline and text_embedding_result
    # are selected but not part of the returned frame.
    # perplexity is EXP(-avgLogprobs) of the first candidate in media_jsonbody.
    sql = f"""  
        SELECT 

                        asset_id, 
                        content,
                        headline,
                        html_safe_text,
                        description,
                        startOffset_seconds,
                        endOffset_seconds,
                        fileUri,
                        asset_type,
                        first_published_timestamp,
                        brand_type,
                        primary_category_name,
                        byline,
                        image_license_type,
                        publisher_type,
                        photographer,
                        date_published,
                        dxcId,
                        text_embedding_result ,
                        byline[SAFE_OFFSET(0)].author_name ,
                        EXP(-CAST(JSON_EXTRACT_SCALAR(media_jsonbody, '$.response.candidates[0].avgLogprobs') AS FLOAT64)) AS perplexity
                 FROM  `{dataset}.{table}` WHERE 1=1 and (LOWER(asset_type) LIKE '%video%' ) {filter_query} 

    """
    bq_client = bigquery.Client(project_id)

    # Run the query
    query_job = bq_client.query(sql)
    output = []
    try:
        # Fetch the rows and aggregate them per asset.
        output = _collapse_segments(query_job.result().to_dataframe())
    except Exception as e:
        print('error'+str(e))
    return output


In [None]:
# BigQuery location of the media-content text-embeddings table.
project_id = 'nine-quality-test'
dataset = "vlt_media_embeddings_integration"
content_table = "vlt_all_media_content_text_embeddings"

# Load the aggregated video assets; no extra WHERE-clause filter is applied.
df = get_predictions(
    table=content_table,
    dataset=dataset,
    project_id=project_id,
    filter_query="",
)

In [40]:
# Number of distinct asset ids in the result.
len(set(df['asset_id'].to_list()))

20

In [46]:
# Renumber the rows 0..n-1 (discarding the old index) and display the frame.
df = df.reset_index(drop=True)
df

Unnamed: 0,asset_id,headline,description,fileUri,asset_type,first_published_timestamp,brand_type,primary_category_name,author_name,image_license_type,publisher_type,photographer,date_published,dxcId,time_lines
0,vlt_video_extract_MAAT_Full_MAAT2023_10_A_HBB.mp4,,Here's a detailed description of the video you...,gs://nineshowcaseassets/VIDEOS/MAFS/vlt_video_...,video/mp4,NaT,,,,,,,NaT,,"{'startOffset_seconds': 0, 'endOffset_seconds'..."
1,vlt_video_extract_MAAT_Full_MAAT2023_11_A_HBB.mp4,,Of course! Here's a detailed description of th...,gs://nineshowcaseassets/VIDEOS/MAFS/vlt_video_...,video/mp4,NaT,,,,,,,NaT,,"{'startOffset_seconds': 0, 'endOffset_seconds'..."
2,vlt_video_extract_MAAT_Full_MAAT2023_12_A_HBB.mp4,,"Sure, here's a detailed description of the vid...",gs://nineshowcaseassets/VIDEOS/MAFS/vlt_video_...,video/mp4,NaT,,,,,,,NaT,,"{'startOffset_seconds': 0, 'endOffset_seconds'..."
3,vlt_video_extract_MAAT_Full_MAAT2023_13_A_HBB.mp4,,"Sure, here's a detailed description of the vid...",gs://nineshowcaseassets/VIDEOS/MAFS/vlt_video_...,video/mp4,NaT,,,,,,,NaT,,"{'startOffset_seconds': 0, 'endOffset_seconds'..."
4,vlt_video_extract_MAAT_Full_MAAT2023_14_A_HBB.mp4,,Here is a detailed description of the video pr...,gs://nineshowcaseassets/VIDEOS/MAFS/vlt_video_...,video/mp4,NaT,,,,,,,NaT,,"{'startOffset_seconds': 0, 'endOffset_seconds'..."
5,vlt_video_extract_NINE_NEWS_SYD-NINE_NNNT23_10...,,"Sure, here is a detailed description of the vi...",gs://nineshowcaseassets/VIDEOS/NEWS/vlt_video_...,video/mp4,NaT,,,,,,,NaT,,"{'startOffset_seconds': 0, 'endOffset_seconds'..."
6,vlt_video_extract_NINE_NEWS_SYD-NINE_NNNT23_10...,,"Sure, here is a detailed description of the vi...",gs://nineshowcaseassets/VIDEOS/NEWS/vlt_video_...,video/mp4,NaT,,,,,,,NaT,,"{'startOffset_seconds': 0, 'endOffset_seconds'..."
7,vlt_video_extract_NINE_NEWS_SYD-NINE_NNNT23_10...,,"Sure, here is a detailed description of the vi...",gs://nineshowcaseassets/VIDEOS/NEWS/vlt_video_...,video/mp4,NaT,,,,,,,NaT,,"{'startOffset_seconds': 0, 'endOffset_seconds'..."
8,vlt_video_extract_NINE_NEWS_SYD-NINE_NNNT23_10...,,"Sure, here's a detailed description of the vid...",gs://nineshowcaseassets/VIDEOS/NEWS/vlt_video_...,video/mp4,NaT,,,,,,,NaT,,"{'startOffset_seconds': 0, 'endOffset_seconds'..."
9,vlt_video_extract_NINE_NEWS_SYD-NINE_NNNT23_10...,,Sure! Here is a detailed description of the vi...,gs://nineshowcaseassets/VIDEOS/NEWS/vlt_video_...,video/mp4,NaT,,,,,,,NaT,,"{'startOffset_seconds': 0, 'endOffset_seconds'..."
