In [201]:

import functions_framework
import time
import random
from EmbeddingPredictionClient import EmbeddingPredictionClient  
from google.cloud import bigquery
import json
import asyncio

async def exponential_backoff_retries(client, text=None, image_file=None, max_retries=5, embedding_type=None):
    """
    This function applies exponential backoff with retries to the API calls.
    """
    attempt = 0
    while attempt < max_retries:
        try:
            # Try to get the embedding from the client
            if embedding_type=="multimodal_embedding":
                    return client.get_multimodal_embedding(text, image_file)
            elif embedding_type=="text_embedding":
                    return client.get_text_embedding(text)
        except Exception as e:
            attempt += 1
            backoff_delay = min(2 ** attempt + random.uniform(0, 1), 32)  # Exponential backoff with jitter
            print(f"Attempt {attempt} failed with error {e}. Retrying in {backoff_delay:.2f} seconds...")
            time.sleep(backoff_delay)  # Wait before retrying

    raise Exception("Max retries reached. Could not complete the request.")

    
async def generate_query_embedding(client,text=None,image_file=None, embedding_type=None):
    try:
        # Retry logic with exponential backoff to calculate query embeddings
        result = exponential_backoff_retries(embedding_client, text, image_file, embedding_type)
        
        # Respond with the successful embedding response
        return {
            "text_embedding": result.text_embedding,
            "image_embedding": result.image_embedding
        }, 200

    except Exception as e:
        # Handle failure after max retries
        return f"Error: {str(e)}", 500


async def get_media_nearest_neighbors(query_embedding, table, dataset,source_embedding_column,project_id,top_k=50, filter_query=""):
    """Query nearest neighbors using cosine similarity in BigQuery for multimodal embeddings."""
    
    # Record the start time
    start_time = time.time()
    options="""'{"fraction_lists_to_search": 1}'"""
    #options="""'{"use_brute_force":true}' """

    sql = f"""  
         WITH search_results AS
         (
              SELECT
              search_results.base.uri as fileUri,  
              search_results.base.combined_multimodal_id as unique_id,
              search_results.distance,  -- The computed distance (similarity score) between the embeddings
              search_results.base.asset_id ,
              search_results.base.ml_generate_embedding_start_sec as startOffset_seconds,
              search_results.base.ml_generate_embedding_end_sec as endOffset_seconds,  
              search_results.base.content_type as asset_type,
              ROW_NUMBER() OVER (PARTITION BY search_results.base.asset_id ORDER BY distance ASC) AS rank_within_document  -- Rank by distance within each document
              
            FROM
              VECTOR_SEARCH(     
                ( SELECT * FROM  `{dataset}.{table}` WHERE 1=1 {filter_query}), --source embedding table
                '{source_embedding_column}',  -- Column with the embedding vectors in the base table

                -- Use the query embedding computed in the previous step
                 (SELECT {json.dumps(query_embedding)} query_embedding),  -- The query embedding from the CTE (query_embedding)

                -- Return top-k closest matches (adjust k as necessary)
                top_k =>{ top_k  }, -- Top k most similar matches based on distance
                distance_type => 'COSINE',
                options => {options}                
              ) search_results
              
          ),   
            -- Step 2: Find the minimum distance per asset_id
             ranked_documents AS (
                SELECT
                    asset_id,        
                    MIN(distance) AS min_distance  -- Alternatively, you can use the average distance
                FROM search_results
                GROUP BY asset_id
            )

            -- Step 3: Retrieve the top-k ranked documents based on relevance
            SELECT * FROM (
              SELECT  
                sr.asset_id,  
                sr.fileUri,  
                sr.asset_type,
                rd.min_distance,
                ROW_NUMBER() OVER (PARTITION BY SR.asset_id ORDER BY min_distance ASC) AS IDX,
                STRING_AGG(CONCAT("""+"'{startOffset_seconds:', sr.startOffset_seconds, ',endOffset_seconds:', sr.endOffset_seconds, '}')"""+f""", ", " ) 
                OVER (PARTITION BY sr.asset_id ORDER BY sr.startOffset_seconds) AS time_lines
               -- sr.distance,
               -- final_rank--,
               -- rank_within_document
            FROM search_results sr
            JOIN ranked_documents rd ON sr.asset_id = rd.asset_id
            ORDER BY min_distance ASC  
            )
            WHERE IDX=1
    """       
    #print(sql)
    bq_client = bigquery.Client(project_id)
  
    # Run the query
    query_job = bq_client.query(sql)

    # Fetch results
    results = query_job.result()
    
    output=[]
    for row in results:
        output.append({'asset_id':row['asset_id'], 'fileUri':row['fileUri'], "time_lines":row['time_lines'], "asset_type":row["asset_type"], "distance":row['min_distance']})

    
    end_time = time.time()

    # Calculate the elapsed time
    elapsed_time = end_time - start_time
    print(elapsed_time)
    return output

async def get_content_nearest_neighbors(query_embedding, table, dataset,source_embedding_column,project_id,top_k=50,filter_query=""):
    """Query nearest neighbors using cosine similarity in BigQuery for text embeddings."""
    
    # Record the start time
    start_time = time.time()
    options="""'{"fraction_lists_to_search": 1}'"""
    #options="""'{"use_brute_force":true}' """
    
    sql = f"""  
         WITH search_results AS
         (
              SELECT
              search_results.base.content as content,  
              search_results.base.combined_id as combined_id,
              search_results.base.unique_id,
              distance,  -- The computed distance (similarity score) between the embeddings
              search_results.base.asset_id,
              search_results.base.headline,
              ifnull(search_results.base.html_safe_text,search_results.base.description) as description,
              search_results.base.startOffset_seconds,
              search_results.base.endOffset_seconds,
              search_results.base.fileUri,
              search_results.base.asset_type,
              ROW_NUMBER() OVER (PARTITION BY  search_results.base.asset_id ORDER BY distance ASC) AS rank_within_document  -- Rank by distance within each document
              
            FROM
              VECTOR_SEARCH(     
                ( SELECT * FROM  `{dataset}.{table}` WHERE 1=1 {filter_query}), --source embedding table
                '{source_embedding_column}',  -- Column with the embedding vectors in the base table

                -- Use the query embedding computed in the previous step
                 (SELECT {json.dumps(query_embedding)} query_embedding),  -- The query embedding from the CTE (query_embedding)

                -- Return top-k closest matches (adjust k as necessary)
                top_k =>{ top_k  }, -- Top k most similar matches based on distance
                distance_type => 'COSINE',
                options => {options}                   
              ) search_results              
          ),          

             -- Step 2: Aggregate relevance per document (original_document_id)
            ranked_documents AS (
                SELECT
                    asset_id,        
                    MIN(distance) AS min_distance  -- Alternatively, you can use the average distance
                FROM search_results
                GROUP BY asset_id
            )

            -- Step 4: Retrieve the top-k ranked documents based on relevance
            SELECT * FROM (
              SELECT  
                sr.asset_id,  
                sr.headline,
                sr.description,
                sr.combined_id,
                sr.unique_id,
                sr.fileUri,
                sr.asset_type,
                rd.min_distance,
                ROW_NUMBER() OVER (PARTITION BY SR.asset_id ORDER BY min_distance ASC) AS IDX,
                STRING_AGG(CONCAT("""+"'{startOffset_seconds:', sr.startOffset_seconds, ',endOffset_seconds:', sr.endOffset_seconds, '}')"""+f""", ", " ) 
                OVER (PARTITION BY sr.asset_id ORDER BY sr.startOffset_seconds) AS time_lines               
                --sr.distance,
                --final_rank--,
               -- rank_within_document
            FROM search_results sr
            JOIN ranked_documents rd ON sr.asset_id = rd.asset_id           
            --and sr.asset_id like '%00261507986b0faf31c775597d2d24beb4381e43%'
            ORDER BY min_distance ASC
            )
            WHERE IDX=1
    """       
    print(sql)
    bq_client = bigquery.Client(project_id)
  
    # Run the query
    query_job = bq_client.query(sql)

    # Fetch results
    results = query_job.result()
    
    output=[]
    for row in results:
        output.append({'asset_id':row['asset_id'], 'headline':row['headline'],'description':row['description'],'fileUri':row['fileUri'], "time_lines":row['time_lines'], "asset_type":row["asset_type"], 
                      "distance":row['min_distance']})

    
    end_time = time.time()

    # Calculate the elapsed time
    elapsed_time = end_time - start_time
    print(elapsed_time)
    return output

def merge_result(combined_list):
    # Step 2: Create a dictionary to merge by 'id'
    merged_dict = {}

    # Step 3: Iterate through the combined list and merge dictionaries by 'id'
    for d in combined_list:
        id_value = d['asset_id']

        # If the id already exists in merged_dict, update it
        if id_value in merged_dict:
            merged_dict[id_value].update(d)
        else:
            # If the id doesn't exist, add the dictionary as it is
            merged_dict[id_value] = d.copy()

    # Step 4: Convert the merged dictionary back into a list
    final_merged_list = list(merged_dict.values())
    
    return final_merged_list



async def get_nearest_contet(request):
    """
    Cloud Function entry point. This function handles the incoming request, 
    performs exponential backoff retries, and returns the embedding response.
    """ 
    # Parse the incoming request to extract text or image file
    request_json = request.get_json(silent=True)
    text = request_json.get('search_query')
    image_file = request_json.get('image_file')  # Assume it's the path or base64 string of the image
    project_id = request_json.get('project')  
    region = request_json.get('region')  
    filter_image ="True"# request_json.get('filter_image') 
    filter_video ="True"# request_json.get('filter_video') 
    filter_article="True"#request_json.get('filter_article')
    
    # Load configuration from config.json
    with open('config.json') as config_file:
         config = json.load(config_file)
    
    
    top_k=int(config['top_k'])  
    dataset= config['dataset']
    content_table=config['content_table']
    mm_table=config['mm_table']
    content_source_embedding_column=config['content_source_embedding_column']
    mm_source_embedding_column=config['mm_source_embedding_column'] 
    if image_file=="" or image_file=="None":
        image_file=None
        
    article_filter_query=""
    if filter_article=="True" or filter_article=="1":
        article_filter_query= article_filter_query+f" AND lower(asset_type) like '%article%' " 
        
    image_filter_query=""
    if filter_image=="True" or filter_image=="1":
        image_filter_query= image_filter_query+f" AND lower(asset_type) like '%image%' "  
        
    video_filter_query=""
    if filter_video=="True" or filter_image=="1":
        video_filter_query= video_filter_query+f" AND lower(asset_type) like '%video%' "
        
#     project_id='nine-quality-test'
#     region="us-central1"
#     text='curtis sittenfeld'
#     image_file=None
    
#     top_k=50     
#     dataset='vlt_media_embeddings_integration'
#     content_table='vlt_all_media_content_text_embeddings'
#     mm_table='vlt_imgvdo_multimodal_embeddings'
#     content_source_embedding_column='text_embedding_result'
#     mm_source_embedding_column='ml_generate_embedding_result'

    # Initialize the EmbeddingPredictionClient outside the function for reuse
    embedding_client = EmbeddingPredictionClient(project=project_id , location=region,api_regional_endpoint=region+"-aiplatform.googleapis.com")
        
    if not text and not image_file:
        print('you are here')
        return 'Error: At least one of "text" or "image_file" must be provided.', 400
     
    content_result_article=[]
    content_result_image=[]
    content_result_video=[]
    #media_text_result=[]
    media_image_result=[]
    if text:
        #if a text is given, calculate both multiomdal embedding and text embedding of the search query
        txtembding_for_text_result =  await asyncio.create_task(exponential_backoff_retries(embedding_client, text, embedding_type='text_embedding'))
        #mmembding_for_text_result =  await asyncio.create_task(exponential_backoff_retries(embedding_client, text, embedding_type='multimodal_embedding')) 
        txtembding_for_text_result=txtembding_for_text_result .text_embedding
        #mmembding_for_text_result=mmembding_for_text_result.text_embedding
        #find nearest neighbours both from text embedding and multimodal embedding
        if article_filter_query!="":
            content_result_article = await asyncio.create_task(get_content_nearest_neighbors(txtembding_for_text_result, content_table, dataset,content_source_embedding_column,project_id,top_k=top_k, filter_query=article_filter_query))
        if image_filter_query!="":
            content_result_image = await asyncio.create_task(get_content_nearest_neighbors(txtembding_for_text_result, content_table, dataset,content_source_embedding_column,project_id,top_k=top_k,filter_query=image_filter_query))
        if video_filter_query!="":
            content_result_video = await asyncio.create_task(get_content_nearest_neighbors(txtembding_for_text_result, content_table, dataset,content_source_embedding_column,project_id,top_k=top_k, filter_query=video_filter_query))
        
        #media_text_result = await asyncio.create_task(get_media_nearest_neighbors(mmembding_for_text_result, mm_table, dataset,mm_source_embedding_column,project_id,top_k=top_k))
               
    if image_file:
        #if an image is given convert image to 64bytestring and extract embedding
        mmembding_for_image_result = await asyncio.create_task(exponential_backoff_retries(embedding_client, image_file, embedding_type='multimodal_embedding'))
        mmembding_for_image_result=mmembding_for_text_result.image_embedding
        #find nearest neighbours both from multimodal embedding
        media_image_result = await asyncio.create_task(get_media_nearest_neighbors(mmembding_for_image_result, mm_table, dataset,mm_source_embedding_column,project_id,top_k=top_k))
        media_image_result=media_image_result
        
   
    
    final_merged_list=merge_result(content_result_article+content_result_image+content_result_video+media_image_result)
    return final_merged_list,content_result_article,content_result_image,content_result_video 


@functions_framework.http
async def search_content_function(request):
 
    result = await get_nearest_contet(request) 
    return result#[0],result[1],result[2]

# @functions_framework.http
# def search_content_function(request):
#     """This is the entry point for the Cloud Function."""
#     try:
#         loop = asyncio.get_event_loop()
#     except RuntimeError as e:
#         # If no event loop is running, create a new event loop for this thread
#         loop = asyncio.new_event_loop()
#         asyncio.set_event_loop(loop)
#     result = loop.run_until_complete(get_nearest_contet(request))
#     return result
     

In [202]:
from unittest.mock import Mock
import json

# Your input data as a dictionary
data = {"search_query":"curtis sittenfeld","image_file":"","project":"nine-quality-test","region":"us-central1",
        "filter_image":"1",
         "filter_video":"1",
        "filter_article":"1"
       }

# Simulating an HTTP request with the mock object
mock_request = Mock()
mock_request.get_json.return_value = data  # Mock the get_json method to return your data


In [216]:
x,a,b,c= await search_content_function(mock_request)


  
         WITH search_results AS
         (
              SELECT
              search_results.base.content as content,  
              search_results.base.combined_id as combined_id,
              search_results.base.unique_id,
              distance,  -- The computed distance (similarity score) between the embeddings
              search_results.base.asset_id,
              search_results.base.headline,
              ifnull(search_results.base.html_safe_text,search_results.base.description) as description,
              search_results.base.startOffset_seconds,
              search_results.base.endOffset_seconds,
              search_results.base.fileUri,
              search_results.base.asset_type,
              ROW_NUMBER() OVER (PARTITION BY  search_results.base.asset_id ORDER BY distance ASC) AS rank_within_document  -- Rank by distance within each document
              
            FROM
              VECTOR_SEARCH(     
                ( SELECT * FROM  `vlt_media_embeddings_in

In [193]:
len(a)

33

In [194]:
len(b)

5

In [195]:
len(c)

5