In [None]:

import functions_framework
import time
import random
from EmbeddingPredictionClient import EmbeddingPredictionClient  
from google.cloud import bigquery
import json
import asyncio

async def exponential_backoff_retries(client, text=None, image_file=None, max_retries=5, embedding_type=None):
    """
    This function applies exponential backoff with retries to the API calls.
    """
    attempt = 0
    while attempt < max_retries:
        try:
            # Try to get the embedding from the client
            if embedding_type=="multimodal_embedding":
                    return client.get_multimodal_embedding(text, image_file)
            elif embedding_type=="text_embedding":
                    return client.get_text_embedding(text)
        except Exception as e:
            attempt += 1
            backoff_delay = min(2 ** attempt + random.uniform(0, 1), 32)  # Exponential backoff with jitter
            print(f"Attempt {attempt} failed with error {e}. Retrying in {backoff_delay:.2f} seconds...")
            time.sleep(backoff_delay)  # Wait before retrying

    raise Exception("Max retries reached. Could not complete the request.")

    
async def generate_query_embedding(client,text=None,image_file=None, embedding_type=None):
    try:
        # Retry logic with exponential backoff to calculate query embeddings
        result = exponential_backoff_retries(embedding_client, text, image_file, embedding_type)
        
        # Respond with the successful embedding response
        return {
            "text_embedding": result.text_embedding,
            "image_embedding": result.image_embedding
        }, 200

    except Exception as e:
        # Handle failure after max retries
        return f"Error: {str(e)}", 500


async def get_media_nearest_neighbors(query_embedding, table, dataset,source_embedding_column,project_id,top_k=50):
    """Query nearest neighbors using cosine similarity in BigQuery for multimodal embeddings."""
    
    # Record the start time
    start_time = time.time()
    #option="""'{"fraction_lists_to_search": 0.01}'"""
    sql = f"""  
         WITH search_results AS
         (
              SELECT
              search_results.base.uri as uri,  
              search_results.base.combined_multimodal_id as combined_id,
              search_results.distance,  -- The computed distance (similarity score) between the embeddings
              search_results.base.asset_id ,
              ROW_NUMBER() OVER (PARTITION BY search_results.base.asset_id ORDER BY distance) AS rank_within_document  -- Rank by distance within each document
              
            FROM
              VECTOR_SEARCH(     
                TABLE `{dataset}.{table}`, --source embedding table
                '{source_embedding_column}',  -- Column with the embedding vectors in the base table

                -- Use the query embedding computed in the previous step
                 (SELECT {json.dumps(query_embedding)} query_embedding),  -- The query embedding from the CTE (query_embedding)

                -- Return top-k closest matches (adjust k as necessary)
                top_k =>{ top_k  }, -- Top k most similar matches based on distance
                distance_type => 'COSINE'                 
              ) search_results
              
          )
          -- Step 2: Aggregate relevance per document (original_document_id)
            ,aggregated_results AS (
                SELECT
                    asset_id,
                    COUNT(*) AS chunk_count,  -- The number of chunks for this document
                    SUM(distance) AS total_distance,  -- Sum of the distances for this document's chunks
                    AVG(distance) AS avg_distance  -- Alternatively, you can use the average distance
                FROM search_results
                GROUP BY asset_id
            ),

            -- Step 3: Rank the documents by relevance (number of chunks and sum of distances)
            ranked_documents AS (
                SELECT
                    asset_id,
                    chunk_count,
                    total_distance,
                    avg_distance,
                    ROW_NUMBER() OVER (ORDER BY chunk_count DESC, total_distance ASC) AS final_rank  -- Rank by chunk_count and then distance

                FROM aggregated_results
            )

            -- Step 4: Retrieve the top-k ranked documents based on relevance
            SELECT * FROM (
              SELECT  
                sr.asset_id,  
                sr.uri,              
                ROW_NUMBER() OVER (PARTITION BY SR.asset_id) AS IDX,
               -- sr.distance,
                final_rank--,
               -- rank_within_document
            FROM search_results sr
            JOIN ranked_documents rd ON sr.asset_id = rd.asset_id
            WHERE rd.final_rank <= {top_k} -- Return the top-k documents based on chunk relevance
            ORDER BY rd.final_rank, sr.rank_within_document  -- Order by document relevance and chunk rank
            )
            WHERE IDX=1
    """       
    #print(sql)
    bq_client = bigquery.Client(project_id)
  
    # Run the query
    query_job = bq_client.query(sql)

    # Fetch results
    results = query_job.result()
    
    output=[]
    for row in results:
        output.append({'asset_id':row['asset_id'], 'uri':row['uri']})

    
    end_time = time.time()

    # Calculate the elapsed time
    elapsed_time = end_time - start_time
    print(elapsed_time)
    return output

async def get_content_nearest_neighbors(query_embedding, table, dataset,source_embedding_column,project_id,top_k=50):
    """Query nearest neighbors using cosine similarity in BigQuery for text embeddings."""
    
    # Record the start time
    start_time = time.time()
    #option="""'{"fraction_lists_to_search": 0.01}'"""
    sql = f"""  
         WITH search_results AS
         (
              SELECT
              search_results.base.content as content,  
              search_results.base.combined_id as combined_id,
              search_results.distance,  -- The computed distance (similarity score) between the embeddings
              b.asset_id,
              b.headline,
              b.description,
              ROW_NUMBER() OVER (PARTITION BY b.asset_id ORDER BY distance) AS rank_within_document  -- Rank by distance within each document
              
            FROM
              VECTOR_SEARCH(     
                TABLE `{dataset}.{table}`, --source embedding table
                '{source_embedding_column}',  -- Column with the embedding vectors in the base table

                -- Use the query embedding computed in the previous step
                 (SELECT {json.dumps(query_embedding)} query_embedding),  -- The query embedding from the CTE (query_embedding)

                -- Return top-k closest matches (adjust k as necessary)
                top_k =>{ top_k  }, -- Top k most similar matches based on distance
                distance_type => 'COSINE'                  
              ) search_results
              --this part should be removed later
              inner join   `nine-quality-test.vlt_media_content_prelanding.vlt_combined_media_content` b
              on search_results.base.combined_id =b.combined_id  
          )
          -- Step 2: Aggregate relevance per document (original_document_id)
            ,aggregated_results AS (
                SELECT
                    asset_id,
                    COUNT(*) AS chunk_count,  -- The number of chunks for this document
                    SUM(distance) AS total_distance,  -- Sum of the distances for this document's chunks
                    AVG(distance) AS avg_distance  -- Alternatively, you can use the average distance
                FROM search_results
                GROUP BY asset_id
            ),

            -- Step 3: Rank the documents by relevance (number of chunks and sum of distances)
            ranked_documents AS (
                SELECT
                    asset_id,
                    chunk_count,
                    total_distance,
                    avg_distance,
                    ROW_NUMBER() OVER (ORDER BY chunk_count DESC, total_distance ASC) AS final_rank  -- Rank by chunk_count and then distance

                FROM aggregated_results
            )

            -- Step 4: Retrieve the top-k ranked documents based on relevance
            SELECT * FROM (
              SELECT  
                sr.asset_id,  
                sr.headline,
                sr.description,
                sr.combined_id,
                ROW_NUMBER() OVER (PARTITION BY SR.asset_id) AS IDX,
                sr.distance,
                final_rank--,
               -- rank_within_document
            FROM search_results sr
            JOIN ranked_documents rd ON sr.asset_id = rd.asset_id
            WHERE rd.final_rank <= {top_k} -- Return the top-k documents based on chunk relevance      
            --and sr.asset_id like '%00261507986b0faf31c775597d2d24beb4381e43%'
            ORDER BY rd.final_rank, sr.rank_within_document  -- Order by document relevance and chunk rank
            )
            WHERE IDX=1
    """       
    #print(sql)
    bq_client = bigquery.Client(project_id)
  
    # Run the query
    query_job = bq_client.query(sql)

    # Fetch results
    results = query_job.result()
    
    output=[]
    for row in results:
        output.append({'asset_id':row['asset_id'], 'headline':row['headline'],'description':row['description']})

    
    end_time = time.time()

    # Calculate the elapsed time
    elapsed_time = end_time - start_time
    print(elapsed_time)
    return output

def merge_result(combined_list):
    # Step 2: Create a dictionary to merge by 'id'
    merged_dict = {}

    # Step 3: Iterate through the combined list and merge dictionaries by 'id'
    for d in combined_list:
        id_value = d['asset_id']

        # If the id already exists in merged_dict, update it
        if id_value in merged_dict:
            merged_dict[id_value].update(d)
        else:
            # If the id doesn't exist, add the dictionary as it is
            merged_dict[id_value] = d.copy()

    # Step 4: Convert the merged dictionary back into a list
    final_merged_list = list(merged_dict.values())
    
    return final_merged_list



async def get_nearest_contet(request):
    """
    Cloud Function entry point. This function handles the incoming request, 
    performs exponential backoff retries, and returns the embedding response.
    """
    # Parse the incoming request to extract text or image file
    # request_json = request.get_json(silent=True)
    # text = request_json.get('text')
    # image_file = request_json.get('image_file')  # Assume it's the path or base64 string of the image
    # project = request_json.get('project')  
    # region = request_json.get('region')  

    project_id='nine-quality-test'
    region="us-central1"
    text='Curtis Sittenfeld'
    image_file=None
    
    top_k=50     
    dataset='langchain_dataset'
    content_table='vlt_media_content_text_test_for_search'
    mm_table='vlt_imgvdo_multimodal_embeddings'
    source_embedding_column='ml_generate_embedding_result'

    # Initialize the EmbeddingPredictionClient outside the function for reuse
    embedding_client = EmbeddingPredictionClient(project=project_id , location=region,api_regional_endpoint=region+"-aiplatform.googleapis.com")
        
    if not text and not image_file:
        print('you are here')
        return 'Error: At least one of "text" or "image_file" must be provided.', 400
     
    content_result=[]
    media_text_result=[]
    media_image_result=[]
    if text:
        #if a text is given, calculate both multiomdal embedding and text embedding of the search query
        txtembding_for_text_result =  await asyncio.create_task(exponential_backoff_retries(embedding_client, text, embedding_type='text_embedding'))
        mmembding_for_text_result =  await asyncio.create_task(exponential_backoff_retries(embedding_client, text, embedding_type='multimodal_embedding')) 
        txtembding_for_text_result=txtembding_for_text_result .text_embedding
        mmembding_for_text_result=mmembding_for_text_result.text_embedding
        #find nearest neighbours
        content_result = await asyncio.create_task(get_content_nearest_neighbors(txtembding_for_text_result, content_table, dataset,source_embedding_column,project_id,top_k=top_k))
        dataset='vlt_media_embeddings_integration'
        media_text_result = await asyncio.create_task(get_media_nearest_neighbors(mmembding_for_text_result, mm_table, dataset,source_embedding_column,project_id,top_k=top_k))
        print('search is done')
            
    if image_file:
        #if an image is given convert image to 64bytestring and extract embedding
        mmembding_for_image_result = await asyncio.create_task(exponential_backoff_retries(embedding_client, image_file, embedding_type='multimodal_embedding').image_embedding)
        dataset='vlt_media_embeddings_integration'
        media_image_result = await asyncio.create_task(get_media_nearest_neighbors(mmembding_for_image_result, mm_table, dataset,source_embedding_column,project_id,top_k=top_k))
        media_image_result=media_image_result
        
    final_merged_list=merge_result(content_result+media_text_result+media_image_result)
    return final_merged_list#, content_result, media_text_result, media_image_result

# @functions_framework.http
# async def search_content_function(request):
 
#     result = await get_nearest_contet(request) 
#     return result#[0],result[1],result[2]

@functions_framework.http
def search_content_function(request):
    # """This is the entry point for the Cloud Function."""
    # try:
    #     loop = asyncio.get_event_loop()
    # except RuntimeError as e:
    #     # If no event loop is running, create a new event loop for this thread
    #     loop = asyncio.new_event_loop()
    #     asyncio.set_event_loop(loop)
    # result = loop.run_until_complete(get_nearest_contet(request))
    # return result
     
    if asyncio.get_event_loop().is_running():  # Check if an event loop is running
        result = asyncio.ensure_future(get_nearest_contet(request))  # Schedule the coroutine
        return result
    else:
        result = asyncio.run(get_nearest_contet(request))  # If no event loop, use asyncio.run()
        return result

In [None]:
x =search_content_function('')

In [337]:
import functions_framework
import time
import random
from EmbeddingPredictionClient import EmbeddingPredictionClient  
from google.cloud import bigquery
import json
import asyncio

def exponential_backoff_retries(client, text=None, image_file=None, max_retries=5, embedding_type=None):
    """
    This function applies exponential backoff with retries to the API calls.
    """
    attempt = 0
    while attempt < max_retries:
        try:
            # Try to get the embedding from the client
            if embedding_type=="multimodal_embedding":
                    return client.get_multimodal_embedding(text, image_file)
            elif embedding_type=="text_embedding":
                    return client.get_text_embedding(text)
        except Exception as e:
            attempt += 1
            backoff_delay = min(2 ** attempt + random.uniform(0, 1), 32)  # Exponential backoff with jitter
            print(f"Attempt {attempt} failed with error {e}. Retrying in {backoff_delay:.2f} seconds...")
            time.sleep(backoff_delay)  # Wait before retrying

    raise Exception("Max retries reached. Could not complete the request.")

    
def generate_query_embedding(client,text=None,image_file=None, embedding_type=None):
    try:
        # Retry logic with exponential backoff to calculate query embeddings
        result = exponential_backoff_retries(embedding_client, text, image_file, embedding_type)
        
        # Respond with the successful embedding response
        return {
            "text_embedding": result.text_embedding,
            "image_embedding": result.image_embedding
        }, 200

    except Exception as e:
        # Handle failure after max retries
        return f"Error: {str(e)}", 500



async def get_media_nearest_neighbors(query_embedding, table, dataset,source_embedding_column,project_id,top_k=50):
    """Query nearest neighbors using cosine similarity in BigQuery for multimodal embeddings."""
    
    # Record the start time
    start_time = time.time()
    #option="""'{"fraction_lists_to_search": 0.01}'"""
    sql = f"""  
         WITH search_results AS
         (
              SELECT
              search_results.base.uri as fileUri,  
              search_results.base.combined_multimodal_id as combined_id,
              search_results.distance,  -- The computed distance (similarity score) between the embeddings
              search_results.base.asset_id ,
              search_results.base.ml_generate_embedding_start_sec as startOffset_seconds,
              search_results.base.ml_generate_embedding_end_sec as endOffset_seconds,  
              ROW_NUMBER() OVER (PARTITION BY search_results.base.asset_id ORDER BY distance) AS rank_within_document  -- Rank by distance within each document
              
            FROM
              VECTOR_SEARCH(     
                TABLE `{dataset}.{table}`, --source embedding table
                '{source_embedding_column}',  -- Column with the embedding vectors in the base table

                -- Use the query embedding computed in the previous step
                 (SELECT {json.dumps(query_embedding)} query_embedding),  -- The query embedding from the CTE (query_embedding)

                -- Return top-k closest matches (adjust k as necessary)
                top_k =>{ top_k  }, -- Top k most similar matches based on distance
                distance_type => 'COSINE'                 
              ) search_results
              
          )
          -- Step 2: Aggregate relevance per document (original_document_id)
            ,aggregated_results AS (
                SELECT
                    asset_id,
                    COUNT(*) AS chunk_count,  -- The number of chunks for this document
                    SUM(distance) AS total_distance,  -- Sum of the distances for this document's chunks
                    AVG(distance) AS avg_distance  -- Alternatively, you can use the average distance
                FROM search_results
                GROUP BY asset_id
            ),

            -- Step 3: Rank the documents by relevance (number of chunks and sum of distances)
            ranked_documents AS (
                SELECT
                    asset_id,
                    chunk_count,
                    total_distance,
                    avg_distance,
                    ROW_NUMBER() OVER (ORDER BY chunk_count DESC, total_distance ASC) AS final_rank  -- Rank by chunk_count and then distance

                FROM aggregated_results
            )

            -- Step 4: Retrieve the top-k ranked documents based on relevance
            SELECT * FROM (
              SELECT  
                sr.asset_id,  
                sr.fileUri,              
                ROW_NUMBER() OVER (PARTITION BY SR.asset_id) AS IDX,
                STRING_AGG(CONCAT("""+"'{startOffset_seconds:', sr.startOffset_seconds, ',endOffset_seconds:', sr.endOffset_seconds, '}')"""+f""", ", " ) 
                OVER (PARTITION BY sr.asset_id ORDER BY sr.startOffset_seconds) AS time_lines
               -- sr.distance,
               -- final_rank--,
               -- rank_within_document
            FROM search_results sr
            JOIN ranked_documents rd ON sr.asset_id = rd.asset_id
            WHERE rd.final_rank <= {top_k} -- Return the top-k documents based on chunk relevance
            ORDER BY rd.final_rank, sr.rank_within_document  -- Order by document relevance and chunk rank
            )
            WHERE IDX=1
    """       
    print(sql)
    bq_client = bigquery.Client(project_id)
  
    # Run the query
    query_job = bq_client.query(sql)

    # Fetch results
    results = query_job.result()
    
    output=[]
    for row in results:
        output.append({'asset_id':row['asset_id'], 'fileUri':row['fileUri'], "time_lines":row['time_lines']})

    
    end_time = time.time()

    # Calculate the elapsed time
    elapsed_time = end_time - start_time
    print(elapsed_time)
    return output

async def get_content_nearest_neighbors(query_embedding, table, dataset,source_embedding_column,project_id,top_k=50):
    """Query nearest neighbors using cosine similarity in BigQuery for text embeddings."""
    
    # Record the start time
    start_time = time.time()
    #option="""'{"fraction_lists_to_search": 0.01}'"""
    options="""'{"use_brute_force":true}' """
    sql = f"""  
         WITH search_results AS
         (
              SELECT
              search_results.base.content as content,  
              search_results.base.combined_id as combined_id,
              search_results.distance,  -- The computed distance (similarity score) between the embeddings
              b.asset_id,
              b.headline,
              ifnull(b.html_safe_text,b.description) as description,
              b.startOffset_seconds,
              b.endOffset_seconds,
              fileUri,
              ROW_NUMBER() OVER (PARTITION BY b.asset_id ORDER BY distance) AS rank_within_document  -- Rank by distance within each document
              
            FROM
              VECTOR_SEARCH(     
                TABLE `{dataset}.{table}`, --source embedding table
                '{source_embedding_column}',  -- Column with the embedding vectors in the base table

                -- Use the query embedding computed in the previous step
                 (SELECT {json.dumps(query_embedding)} query_embedding),  -- The query embedding from the CTE (query_embedding)

                -- Return top-k closest matches (adjust k as necessary)
                top_k =>{ top_k  }, -- Top k most similar matches based on distance
                distance_type => 'COSINE',
               options => {options} 
              ) search_results
              --this part should be removed later
              inner join   `nine-quality-test.vlt_media_content_prelanding.vlt_combined_media_content` b
              on search_results.base.combined_id =b.combined_id  
          )
          -- Step 2: Aggregate relevance per document (original_document_id)
            ,aggregated_results AS (
                SELECT
                    asset_id,
                    COUNT(*) AS chunk_count,  -- The number of chunks for this document
                    SUM(distance) AS total_distance,  -- Sum of the distances for this document's chunks
                    AVG(distance) AS avg_distance  -- Alternatively, you can use the average distance
                FROM search_results
                GROUP BY asset_id
            ),

            -- Step 3: Rank the documents by relevance (number of chunks and sum of distances)
            ranked_documents AS (
                SELECT
                    asset_id,
                    chunk_count,
                    total_distance,
                    avg_distance,
                    ROW_NUMBER() OVER (ORDER BY chunk_count DESC, total_distance ASC) AS final_rank  -- Rank by chunk_count and then distance

                FROM aggregated_results
            )

            -- Step 4: Retrieve the top-k ranked documents based on relevance
            SELECT * FROM (
              SELECT  
                sr.asset_id,  
                sr.headline,
                sr.description,
                sr.combined_id,
                sr.fileUri,
                ROW_NUMBER() OVER (PARTITION BY SR.asset_id) AS IDX,
                STRING_AGG(CONCAT("""+"'{startOffset_seconds:', sr.startOffset_seconds, ',endOffset_seconds:', sr.endOffset_seconds, '}')"""+f""", ", " ) 
                OVER (PARTITION BY sr.asset_id ORDER BY sr.startOffset_seconds) AS time_lines
                --sr.distance,
                --final_rank--,
               -- rank_within_document
            FROM search_results sr
            JOIN ranked_documents rd ON sr.asset_id = rd.asset_id
            WHERE rd.final_rank <= {top_k} -- Return the top-k documents based on chunk relevance      
            --and sr.asset_id like '%00261507986b0faf31c775597d2d24beb4381e43%'
            ORDER BY rd.final_rank, sr.rank_within_document  -- Order by document relevance and chunk rank
            )
            WHERE IDX=1
    """       
    #print(sql)
    bq_client = bigquery.Client(project_id)
  
    # Run the query
    query_job = bq_client.query(sql)

    # Fetch results
    results = query_job.result()
    
    output=[]
    for row in results:
        output.append({'asset_id':row['asset_id'], 'headline':row['headline'],'description':row['description'],'fileUri':row['fileUri'], "time_lines":row['time_lines']})

    
    end_time = time.time()

    # Calculate the elapsed time
    elapsed_time = end_time - start_time
    print(elapsed_time)
    return output

In [None]:
response_textembedding=exponential_backoff_retries(embedding_client, 'Curtis Sittenfeld', embedding_type='text_embedding').text_embedding


In [None]:
#response_multimodal_embedding=exponential_backoff_retries(embedding_client, 'Curtis sittenfeld', embedding_type='multimodal_embedding').text_embedding

In [None]:
response_textembedding1=exponential_backoff_retries(embedding_client, 'curtis sittenfeld', embedding_type='text_embedding').text_embedding


In [309]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Example vectors
vector1 = np.array(response_textembedding)
vector2 = np.array(response_textembedding1)

# Reshape vectors to 2D arrays for cosine_similarity function
vector1 = vector1.reshape(1, -1)
vector2 = vector2.reshape(1, -1)

# Compute cosine similarity
cos_sim = cosine_similarity(vector1, vector2)[0][0]

# Compute cosine distance (1 - cosine similarity)
cos_distance = 1 - cos_sim

print(f"Cosine Similarity: {cos_sim}")
print(f"Cosine Distance: {cos_distance}")

Cosine Similarity: 0.9725177820375048
Cosine Distance: 0.02748221796249517


In [None]:
PROJECT = !gcloud config get-value project
PROJECT_ID = PROJECT[0]
REGION = "us-central1" 

In [None]:
query_embedding=response_textembedding
top_k=50
project_id=PROJECT_ID
dataset='langchain_dataset'
table='vlt_media_content_text_test_for_search'
source_embedding_column='ml_generate_embedding_result'

In [341]:
import asyncio
 
if 1==1:
    
    query_embedding=response_textembedding
    top_k=50
    project_id=PROJECT_ID
    dataset='langchain_dataset'
    table='vlt_media_content_text_test_for_search'
    source_embedding_column='ml_generate_embedding_result'

    
    #content_result= 
    
    # Create tasks and run them concurrently
    content_result = await asyncio.create_task(get_content_nearest_neighbors(query_embedding, table, dataset,source_embedding_column,project_id,top_k=50))
    
    # query_embedding=response_multimodal_embedding
    # top_k=50
    # project_id=PROJECT_ID
    # dataset='vlt_media_embeddings_integration'
    # table='vlt_imgvdo_multimodal_embeddings'
    # source_embedding_column='ml_generate_embedding_result'
    # media_result = await asyncio.create_task(get_media_nearest_neighbors(query_embedding, table, dataset,source_embedding_column,project_id,top_k=50))


0.7401032447814941


In [None]:
import pandas as pd
df1 = pd.DataFrame(content_result1)
df1

In [342]:
import pandas as pd
df = pd.DataFrame(content_result)
df

Unnamed: 0,asset_id,headline,description,fileUri,time_lines
0,p5d2tw,Curtis Sittenfeld’s celebrity romcom didn’t sw...,<p>Curtis Sittenfeld likes to pose questions w...,,
1,p5cxoi,The ‘anonymous mogul’ who’s behind your favour...,<p>Bill Lawrence has a gift for TV comedy and ...,,
2,p5dgrj,"Yes, the men of Sex and the City were awful, b...",<p>The interaction of the past with the presen...,,
3,p5e9zq,‘Hollywood is a hellhole’: The book digging up...,<p><strong>HOLLYWOOD</strong><br/><em><strong>...,,
4,p5dz8t,What to read next: A sinister psychothriller a...,<p><strong>FICTION PICK OF THE WEEK</strong><b...,,
5,p5erzr,Top CEOs say economy’s soft landing on track,<p>Australia’s top chief executives almost una...,,
6,p5eiuj,The novel tackling the myth of Australia’s ‘bl...,<p><strong>FICTION</strong><br/><em><strong>Ed...,,
7,p5e49l,The 20 best shows to stream on Disney+ right now,<p>Disney may be better known for its theme pa...,,
8,p5cwq9,The comedy festival has kicked off – here’s a ...,<p>This wrap of shows across the Melbourne Int...,,
9,p5dvjv,What to read: First Nations sci-fi and secrets...,<p><strong>FICTION PICK OF THE WEEK</strong></...,,


In [None]:
query_embedding=response_multimodal_embedding
top_k=50
project_id=PROJECT_ID
dataset='vlt_media_embeddings_integration'
table='vlt_imgvdo_multimodal_embeddings'
source_embedding_column='ml_generate_embedding_result'
media_result = await asyncio.create_task(get_media_nearest_neighbors(query_embedding, table, dataset,source_embedding_column,project_id,top_k=50))


In [343]:
from google.cloud import bigquery
from google.cloud.aiplatform_v1.types import NearestNeighborQuery
from vertexai.resources.preview import (FeatureOnlineStore, FeatureView,
                                        FeatureViewBigQuerySource)
from vertexai.resources.preview.feature_store import utils

#set project info
PROJECT = !gcloud config get-value project
PROJECT_ID = PROJECT[0]
REGION = "us-central1" 

FEATURE_ONLINE_STORE_ID = "vlt_searchcontent_text_embedding_featurestore"  # @param {type: "string"}
FEATURE_VIEW_ID = "vlt_textembedding_feature_view1"  # @param {type: "string"}

nine_fs=FeatureOnlineStore(FEATURE_ONLINE_STORE_ID)

#Verify that the FeatureView instance is created by getting the feature view.
nine_fv=FeatureView(
    FEATURE_VIEW_ID, feature_online_store_id=FEATURE_ONLINE_STORE_ID
) 
 

result=nine_fv.search(
    embedding_value=response_textembedding,
    neighbor_count=50,
    #string_filters=[country_filter],#for multimodal embedding this can be set to None, unless having a description column
    return_full_entity=True,  # returning entities with metadata
)

Public endpoint for the optimized online store vlt_searchcontent_text_embedding_featurestore is 3036209001126690816.us-central1-494586852359.featurestore.vertexai.goog


In [344]:
result=result.to_dict()

In [345]:
result['neighbors'][4]

{'entity_id': 'p5e9zq',
 'distance': 1.634165644645691,
 'entity_key_values': {'key_values': {'features': [{'name': 'combined_id',
     'value': {'string_value': 'p5e9zq'}},
    {'name': 'unique_id', 'value': {'string_value': 'p5e9zq-0'}},
    {'name': 'asset_type', 'value': {'string_value': 'article'}},
    {'name': 'brand_type', 'value': {'string_value': 'BRAND_TYPE_SMH'}},
    {'name': 'headline',
     'value': {'string_value': '‘Hollywood is a hellhole’: The book digging up the dirt on Tinseltown'}},
    {'name': 'first_published_timestamp',
     'value': {'int64_value': '1696996800000000'}},
    {'name': 'primary_category_id', 'value': {'string_value': '9'}},
    {'name': 'primary_category_name', 'value': {'string_value': 'Culture'}},
    {'name': 'secondary_category_id', 'value': {'string_value': 'b'}},
    {'name': 'secondary_category_name', 'value': {'string_value': 'Books'}},
    {'name': 'public_tag_id', 'value': {'string_value': '1q4'}},
    {'name': 'primary_tag_name', 'val

In [321]:
entity_id,  chunk, asset_id
headline
description
{'name': 'startOffset_seconds'},
      {'name': 'endOffset_seconds'},
      {'name': 'fileUri'}]}}},

IndentationError: unexpected indent (470175976.py, line 5)

In [347]:
result['neighbors']

[{'entity_id': 'p5d2tw',
  'distance': 1.7462705373764038,
  'entity_key_values': {'key_values': {'features': [{'name': 'combined_id',
      'value': {'string_value': 'p5d2tw'}},
     {'name': 'unique_id', 'value': {'string_value': 'p5d2tw-0'}},
     {'name': 'asset_type', 'value': {'string_value': 'article'}},
     {'name': 'brand_type', 'value': {'string_value': 'BRAND_TYPE_SMH'}},
     {'name': 'headline',
      'value': {'string_value': 'Curtis Sittenfeld’s celebrity romcom didn’t sweep me off my feet'}},
     {'name': 'first_published_timestamp',
      'value': {'int64_value': '1682661600000000'}},
     {'name': 'primary_category_id', 'value': {'string_value': '9'}},
     {'name': 'primary_category_name', 'value': {'string_value': 'Culture'}},
     {'name': 'secondary_category_id', 'value': {'string_value': 'b'}},
     {'name': 'secondary_category_name', 'value': {'string_value': 'Books'}},
     {'name': 'public_tag_id', 'value': {'string_value': '1q4'}},
     {'name': 'primary_ta

In [351]:
nearest_neighbours=[]
i=0
for neighbour in  result['neighbors'] :
    nearest_neighbour={}
    nearest_neighbour['entity_id']=neighbour['entity_id']
    nearest_neighbour['distance']=neighbour['distance']
    print(neighbour['entity_id'],i)
    i=i+1
    
    for feature in neighbour['entity_key_values']['key_values']['features']:
        if feature['name']=='text_embedding_result':
            if type(list(feature['value'].values())[0]) is dict:
                nearest_neighbour[feature['name']]=[]#list(list(feature['value'].values())[0].values())[0]             
            else:
                nearest_neighbour[feature['name']]=list(feature['value'].values())[0]             
        else :
            nearest_neighbour[feature['name']]=None
   
    nearest_neighbours.append(nearest_neighbour)
    

p5d2tw 0
p5cn38 1
p5dgrj 2
p5d2tw 3


KeyError: 'key_values'

In [354]:
result['neighbors'][3]

{'entity_id': 'p5d2tw',
 'distance': 1.6744652390480042,
 'entity_key_values': {}}

In [330]:
nearest_neighbours

[{'entity_id': 'p5d2tw',
  'distance': -0.7462674975395203,
  'chunk': '0',
  'ml_generate_embedding_result': [],
  'asset_id': 'p5d2tw',
  'headline': 'Curtis Sittenfeld’s celebrity romcom didn’t sweep me off my feet',
  'description': '<p>Curtis Sittenfeld likes to pose questions with her novels. What if Hillary had refused to marry Bill? (<em>Rodham.</em>) What if <em>Pride and Prejudice</em> was set in contemporary times? (<em>Eligible.</em>) Why did Democrat-raised librarian Laura Bush marry boisterous Republican George W? (<em>American Wife</em>.)</p> <p>There’s an entertainment tradition of ordinary looking talented men pairing with spectacularly beautiful women. But it would never happen if the genders were reversed – would it? That’s the premise of Sittenfeld’s sixth novel.</p> <p>Sally Milner is a writer for <em>TNO</em>, a fictional <em>Saturday Night Live</em>. When her colleague Danny becomes engaged to a gorgeous celebrity, Sally writes a mocking sketch – The Danny Horst 