In [3]:
import time
import random
from google.cloud import functions_v2
from google.cloud import aiplatform
from EmbeddingPredictionClient import EmbeddingPredictionClient  
from google.cloud import bigquery
import json
import time
 

# Build the embedding client once at module scope so every call below
# (and every function invocation) shares the same instance.
embedding_client = EmbeddingPredictionClient(
    project='nine-quality-test',
    location="us-central1",
    api_regional_endpoint="us-central1-aiplatform.googleapis.com",
)

def exponential_backoff_retries(client, text=None, image_file=None, max_retries=5, embedding_type=None):
    """
    Call an embedding method on ``client``, retrying with exponential backoff.

    Args:
        client: Object exposing ``get_multimodal_embedding(text, image_file)``
            and ``get_text_embedding(text)``.
        text: Text to embed (may be ``None`` for an image-only multimodal call).
        image_file: Image argument forwarded to the multimodal call.
        max_retries: Maximum number of attempts before giving up.
        embedding_type: ``"multimodal_embedding"`` or ``"text_embedding"``.

    Returns:
        Whatever the selected client method returns.

    Raises:
        ValueError: If ``embedding_type`` is not one of the supported values.
            (Previously this case looped forever, because no call was made
            and the attempt counter never advanced.)
        Exception: If every attempt failed; the last underlying error is
            attached as ``__cause__``.
    """
    # Validate up front: an unsupported type can never succeed, so retrying is pointless.
    if embedding_type == "multimodal_embedding":
        def request_embedding():
            return client.get_multimodal_embedding(text, image_file)
    elif embedding_type == "text_embedding":
        def request_embedding():
            return client.get_text_embedding(text)
    else:
        raise ValueError(
            f"Unsupported embedding_type {embedding_type!r}; "
            "expected 'multimodal_embedding' or 'text_embedding'."
        )

    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            return request_embedding()
        except Exception as e:
            last_error = e
            if attempt == max_retries:
                # No further attempt will be made, so do not waste time sleeping.
                print(f"Attempt {attempt} failed with error {e}. No retries left.")
                break
            # Exponential backoff with jitter, capped at 32 seconds.
            backoff_delay = min(2 ** attempt + random.uniform(0, 1), 32)
            print(f"Attempt {attempt} failed with error {e}. Retrying in {backoff_delay:.2f} seconds...")
            time.sleep(backoff_delay)

    raise Exception("Max retries reached. Could not complete the request.") from last_error

def search_content_function(request):
    """
    Cloud Function entry point: compute query embeddings for a search request.

    Args:
        request: Object exposing ``get_json(silent=True)`` (presumably a Flask
            request, as used by HTTP Cloud Functions). The JSON body may carry
            ``text`` and/or ``image_file``.

    Returns:
        A ``(body, status)`` tuple:
        * ``({"text_embedding": ..., "image_embedding": ...}, 200)`` on success,
        * ``(message, 400)`` when the body is not a JSON object or carries
          neither ``text`` nor ``image_file``,
        * ``(message, 500)`` when the embedding call fails.
    """
    # get_json(silent=True) yields None for a missing/invalid body, and valid
    # JSON may still be a list or scalar; only a mapping supports .get().
    request_json = request.get_json(silent=True)
    if not isinstance(request_json, dict):
        return 'Error: Request body must be a JSON object.', 400

    text = request_json.get('text')
    image_file = request_json.get('image_file')  # Assume it's the path or base64 string of the image

    if not text and not image_file:
        return 'Error: At least one of "text" or "image_file" must be provided.', 400

    try:
        # The response exposes both text and image embeddings, so request the
        # multimodal embedding explicitly; the embedding type must be named
        # for the retry helper to issue a call at all.
        result = exponential_backoff_retries(
            embedding_client,
            text,
            image_file,
            embedding_type="multimodal_embedding",
        )

        # Respond with the successful embedding response
        return {
            "text_embedding": result.text_embedding,
            "image_embedding": result.image_embedding
        }, 200

    except Exception as e:
        # Handle failure after max retries
        return f"Error: {str(e)}", 500

In [31]:
# Embed the sample query with the text-only model and keep just the vector.
response_textembedding = exponential_backoff_retries(
    embedding_client,
    'biden president of usa',
    embedding_type='text_embedding',
).text_embedding


In [32]:
# Embed the same query with the multimodal model; the text-side vector of that
# response is what gets stored here.
response_multimodal_embedding = exponential_backoff_retries(
    embedding_client,
    'biden president of usa',
    embedding_type='multimodal_embedding',
).text_embedding

In [37]:
###text embedding
import asyncio

async def get_media_nearest_neighbors(query_embedding, table, dataset,source_embedding_column,project_id,top_k=50):
    """Find the media assets nearest to ``query_embedding`` via BigQuery VECTOR_SEARCH.

    The query searches ``dataset.table`` with cosine distance, groups the hits
    by ``asset_id``, ranks assets by hit count (descending) and then summed
    distance (ascending), and returns one row per asset, best-ranked first.

    Args:
        query_embedding: JSON-serialisable embedding vector; it is inlined into
            the SQL text with ``json.dumps``.
        table: Name of the embedding table (interpolated into the SQL).
        dataset: Dataset containing ``table`` (interpolated into the SQL).
        source_embedding_column: Name of the embedding column to search.
        project_id: Project passed to ``bigquery.Client``.
        top_k: Number of nearest rows requested from VECTOR_SEARCH.

    Returns:
        ``list[dict]`` with keys ``asset_id`` and ``uri``.

    Notes:
        * ``table``, ``dataset`` and ``source_embedding_column`` are formatted
          straight into the statement; only pass trusted identifiers.
        * The blocking BigQuery round trip runs in the default executor so the
          event loop stays free while awaiting the result.
    """
    started = time.perf_counter()

    # Passed verbatim as the VECTOR_SEARCH ``options`` JSON string.
    option = """'{"fraction_lists_to_search": 0.01}'"""
    sql = f"""
         WITH search_results AS
         (
              SELECT
              search_results.base.uri as uri,
              search_results.base.combined_multimodal_id as combined_id,
              search_results.distance,  -- The computed distance between the embeddings
              search_results.base.asset_id ,
              ROW_NUMBER() OVER (PARTITION BY search_results.base.asset_id ORDER BY distance) AS rank_within_document  -- Rank by distance within each document

            FROM
              VECTOR_SEARCH(
                TABLE `{dataset}.{table}`, --source embedding table
                '{source_embedding_column}',  -- Column with the embedding vectors in the base table

                -- The query embedding, inlined as an array literal
                 (SELECT {json.dumps(query_embedding)} query_embedding),

                -- Return top-k closest matches (adjust k as necessary)
                top_k =>{top_k}, -- Top k most similar matches based on distance
                distance_type => 'COSINE',
                options => {option}

              ) search_results

          )
          -- Step 2: Aggregate relevance per document (asset_id)
            ,aggregated_results AS (
                SELECT
                    asset_id,
                    COUNT(*) AS chunk_count,  -- The number of chunks for this document
                    SUM(distance) AS total_distance,  -- Sum of the distances for this document's chunks
                    AVG(distance) AS avg_distance  -- Alternatively, you can use the average distance
                FROM search_results
                GROUP BY asset_id
            ),

            -- Step 3: Rank the documents by relevance (number of chunks and sum of distances)
            ranked_documents AS (
                SELECT
                    asset_id,
                    chunk_count,
                    total_distance,
                    avg_distance,
                    ROW_NUMBER() OVER (ORDER BY chunk_count DESC, total_distance ASC) AS final_rank  -- Rank by chunk_count and then distance

                FROM aggregated_results
            )

            -- Step 4: One row per asset (its closest chunk), best-ranked asset first.
            -- Filtering on rank_within_document makes the per-asset pick deterministic,
            -- and the outermost ORDER BY is what fixes the order of the returned rows.
            SELECT
                sr.asset_id,
                sr.uri,
                rd.final_rank
            FROM search_results sr
            JOIN ranked_documents rd ON sr.asset_id = rd.asset_id
            WHERE rd.final_rank <= {top_k} -- Return the top-k documents based on chunk relevance
              AND sr.rank_within_document = 1
            ORDER BY rd.final_rank
    """

    def _run_query():
        """Execute the statement synchronously and materialise the rows."""
        bq_client = bigquery.Client(project_id)
        return [
            {'asset_id': row['asset_id'], 'uri': row['uri']}
            for row in bq_client.query(sql).result()
        ]

    # Off-load the blocking client call so other coroutines can make progress.
    output = await asyncio.get_running_loop().run_in_executor(None, _run_query)

    # Report the wall-clock duration of the search.
    elapsed_time = time.perf_counter() - started
    print(elapsed_time)
    return output

async def get_content_nearest_neighbors(query_embedding, table, dataset,source_embedding_column,project_id,top_k=50):
    """Find the content items nearest to ``query_embedding`` via BigQuery VECTOR_SEARCH.

    The query searches ``dataset.table`` with cosine distance, joins each hit
    to ``vlt_combined_media_content`` on ``combined_id`` to pick up
    ``asset_id``, ``headline`` and ``description``, groups hits by
    ``asset_id``, ranks assets by hit count (descending) and then summed
    distance (ascending), and returns one row per asset, best-ranked first.

    Args:
        query_embedding: JSON-serialisable embedding vector; it is inlined into
            the SQL text with ``json.dumps``.
        table: Name of the embedding table (interpolated into the SQL).
        dataset: Dataset containing ``table`` (interpolated into the SQL).
        source_embedding_column: Name of the embedding column to search.
        project_id: Project passed to ``bigquery.Client``.
        top_k: Number of nearest rows requested from VECTOR_SEARCH.

    Returns:
        ``list[dict]`` with keys ``asset_id``, ``headline`` and ``description``.

    Notes:
        * ``table``, ``dataset`` and ``source_embedding_column`` are formatted
          straight into the statement; only pass trusted identifiers.
        * The joined table name is hard-coded in the SQL below.
        * The blocking BigQuery round trip runs in the default executor so the
          event loop stays free while awaiting the result.
    """
    started = time.perf_counter()

    # Passed verbatim as the VECTOR_SEARCH ``options`` JSON string.
    option = """'{"fraction_lists_to_search": 0.01}'"""
    sql = f"""
         WITH search_results AS
         (
              SELECT
              search_results.base.content as content,
              search_results.base.combined_id as combined_id,
              search_results.distance,  -- The computed distance between the embeddings
              b.asset_id,
              b.headline,
              b.description,
              ROW_NUMBER() OVER (PARTITION BY b.asset_id ORDER BY distance) AS rank_within_document  -- Rank by distance within each document

            FROM
              VECTOR_SEARCH(
                TABLE `{dataset}.{table}`, --source embedding table
                '{source_embedding_column}',  -- Column with the embedding vectors in the base table

                -- The query embedding, inlined as an array literal
                 (SELECT {json.dumps(query_embedding)} query_embedding),

                -- Return top-k closest matches (adjust k as necessary)
                top_k =>{top_k}, -- Top k most similar matches based on distance
                distance_type => 'COSINE',
                options => {option}

              ) search_results
              --this part should be removed later
              inner join   `nine-quality-test.vlt_media_content_prelanding.vlt_combined_media_content` b
              on search_results.base.combined_id =b.combined_id
          )
          -- Step 2: Aggregate relevance per document (asset_id)
            ,aggregated_results AS (
                SELECT
                    asset_id,
                    COUNT(*) AS chunk_count,  -- The number of chunks for this document
                    SUM(distance) AS total_distance,  -- Sum of the distances for this document's chunks
                    AVG(distance) AS avg_distance  -- Alternatively, you can use the average distance
                FROM search_results
                GROUP BY asset_id
            ),

            -- Step 3: Rank the documents by relevance (number of chunks and sum of distances)
            ranked_documents AS (
                SELECT
                    asset_id,
                    chunk_count,
                    total_distance,
                    avg_distance,
                    ROW_NUMBER() OVER (ORDER BY chunk_count DESC, total_distance ASC) AS final_rank  -- Rank by chunk_count and then distance

                FROM aggregated_results
            )

            -- Step 4: One row per asset (its closest chunk), best-ranked asset first.
            -- Filtering on rank_within_document makes the per-asset pick deterministic,
            -- and the outermost ORDER BY is what fixes the order of the returned rows.
            SELECT
                sr.asset_id,
                sr.headline,
                sr.description,
                sr.combined_id,
                rd.final_rank
            FROM search_results sr
            JOIN ranked_documents rd ON sr.asset_id = rd.asset_id
            WHERE rd.final_rank <= {top_k} -- Return the top-k documents based on chunk relevance
              AND sr.rank_within_document = 1
            ORDER BY rd.final_rank
    """

    def _run_query():
        """Execute the statement synchronously and materialise the rows."""
        bq_client = bigquery.Client(project_id)
        return [
            {
                'asset_id': row['asset_id'],
                'headline': row['headline'],
                'description': row['description'],
            }
            for row in bq_client.query(sql).result()
        ]

    # Off-load the blocking client call so other coroutines can make progress.
    output = await asyncio.get_running_loop().run_in_executor(None, _run_query)

    # Report the wall-clock duration of the search.
    elapsed_time = time.perf_counter() - started
    print(elapsed_time)
    return output

In [6]:
# IPython shell capture: run gcloud and keep its output lines as a list.
PROJECT = !gcloud config get-value project
# The first captured line is used as the project id
# (assumes gcloud prints the id before anything else — TODO confirm).
PROJECT_ID = PROJECT[0]
# Region constant; not referenced by the other cells visible in this notebook.
REGION = "us-central1" 

In [22]:
# Search settings for the text-content lookup.
# The text embedding computed earlier is stored as `response_textembedding`;
# no variable called `response` is defined anywhere in this notebook, so the
# old assignment raised NameError on a fresh kernel.
query_embedding = response_textembedding
top_k = 50
project_id = PROJECT_ID
dataset = 'langchain_dataset'
table = 'vlt_media_content_text_test_for_search'
source_embedding_column = 'ml_generate_embedding_result'

In [23]:
output=get_content_nearest_neighbors(query_embedding, table, dataset,source_embedding_column,project_id,top_k=50)



5.59295916557312


In [42]:
import asyncio
 
if 1==1:
    
    query_embedding=response_textembedding
    top_k=50
    project_id=PROJECT_ID
    dataset='langchain_dataset'
    table='vlt_media_content_text_test_for_search'
    source_embedding_column='ml_generate_embedding_result'

    
    #content_result= 
    
    # Create tasks and run them concurrently
    content_result = await asyncio.create_task(get_content_nearest_neighbors(query_embedding, table, dataset,source_embedding_column,project_id,top_k=50))
    
    query_embedding=response_multimodal_embedding
    top_k=50
    project_id=PROJECT_ID
    dataset='vlt_media_embeddings_integration'
    table='vlt_imgvdo_multimodal_embeddings'
    source_embedding_column='ml_generate_embedding_result'
    media_result = await asyncio.create_task(get_media_nearest_neighbors(query_embedding, table, dataset,source_embedding_column,project_id,top_k=50))

    print(content_result)
    print(media_result)
    


    



0.7552201747894287
0.5199756622314453
[{'asset_id': '0788b3187449fd8a5b84170ea7be2b68c645ef7d.jpeg', 'uri': 'gs://nineshowcaseassets/IMAGES/0788b3187449fd8a5b84170ea7be2b68c645ef7d.jpeg'}, {'asset_id': '0c6db30d6ca4f8bbcd4b27fdd23ceaaa39905635.jpeg', 'uri': 'gs://nineshowcaseassets/IMAGES/0c6db30d6ca4f8bbcd4b27fdd23ceaaa39905635.jpeg'}, {'asset_id': '080963f1ee94dbe52d671f97c598c16533510b26.jpeg', 'uri': 'gs://nineshowcaseassets/IMAGES/080963f1ee94dbe52d671f97c598c16533510b26.jpeg'}, {'asset_id': '037e80795c410c1244c335279f3a8375ae4b73b9.jpeg', 'uri': 'gs://nineshowcaseassets/IMAGES/037e80795c410c1244c335279f3a8375ae4b73b9.jpeg'}, {'asset_id': '08ff182d2306429ad3ef0c692b7ec9f5ba02e865.jpeg', 'uri': 'gs://nineshowcaseassets/IMAGES/08ff182d2306429ad3ef0c692b7ec9f5ba02e865.jpeg'}, {'asset_id': '0a56982d523d367a50fcebb2e6299e273eed758d.jpeg', 'uri': 'gs://nineshowcaseassets/IMAGES/0a56982d523d367a50fcebb2e6299e273eed758d.jpeg'}, {'asset_id': '0a15e082a756514a7cdc21cddcc0d3cc648d44af.jpeg',

In [39]:
print(media_result)

[{'asset_id': '0788b3187449fd8a5b84170ea7be2b68c645ef7d.jpeg', 'uri': 'gs://nineshowcaseassets/IMAGES/0788b3187449fd8a5b84170ea7be2b68c645ef7d.jpeg'}, {'asset_id': '0c6db30d6ca4f8bbcd4b27fdd23ceaaa39905635.jpeg', 'uri': 'gs://nineshowcaseassets/IMAGES/0c6db30d6ca4f8bbcd4b27fdd23ceaaa39905635.jpeg'}, {'asset_id': '080963f1ee94dbe52d671f97c598c16533510b26.jpeg', 'uri': 'gs://nineshowcaseassets/IMAGES/080963f1ee94dbe52d671f97c598c16533510b26.jpeg'}, {'asset_id': '037e80795c410c1244c335279f3a8375ae4b73b9.jpeg', 'uri': 'gs://nineshowcaseassets/IMAGES/037e80795c410c1244c335279f3a8375ae4b73b9.jpeg'}, {'asset_id': '08ff182d2306429ad3ef0c692b7ec9f5ba02e865.jpeg', 'uri': 'gs://nineshowcaseassets/IMAGES/08ff182d2306429ad3ef0c692b7ec9f5ba02e865.jpeg'}, {'asset_id': '0a56982d523d367a50fcebb2e6299e273eed758d.jpeg', 'uri': 'gs://nineshowcaseassets/IMAGES/0a56982d523d367a50fcebb2e6299e273eed758d.jpeg'}, {'asset_id': '0a15e082a756514a7cdc21cddcc0d3cc648d44af.jpeg', 'uri': 'gs://nineshowcaseassets/IMAGE