In [25]:
import bigframes.pandas as bpd

PROJECT_ID = 'wortz-project-352116'
DATASET = 'ecomm-embedding'
bpd.options.bigquery.project = PROJECT_ID
BUCKET = 'gs://ecomm-query-product-pairs'
USER_PROMPT = 'User query: '
PRODUCT_PROMPT = 'Product title: '

In [2]:
from datasets import load_dataset

raw_data = load_dataset("tasksource/esci")

In [39]:
import pandas as pd
from typing import Dict

LIMIT = 3000


def get_input_dataframe(
    raw_data: Dict,
    user_prompt: str = USER_PROMPT,
    product_prompt: str = PRODUCT_PROMPT,
    limit: int = LIMIT,
) -> pd.DataFrame:

    for i, row in enumerate(raw_data["train"]):
        if i == limit - 1:
            break
        elif i == 0:
            query_prod_pairs = pd.DataFrame(
                {"content": [f'{user_prompt}{row["query"]}'], "type": ["query"]}
            )
        else:
            query_prod_pairs = pd.concat(
                [
                    query_prod_pairs,
                    pd.DataFrame(
                        {"content": [f'{user_prompt}{row["query"]}'], "type": ["query"]}
                    ),
                ]
            )
        query_prod_pairs = pd.concat(
            [
                query_prod_pairs,
                pd.DataFrame(
                    {
                        "content": [f'{product_prompt}{row["product_title"]}'],
                        "type": ["product_title"],
                    }
                ),
            ]
        )
    return query_prod_pairs

In [40]:
query_prod_pairs = get_input_dataframe(raw_data)

In [41]:
query_prod_pairs = query_prod_pairs.reset_index()
query_prod_pairs.head()

Unnamed: 0,index,content,type
0,0,User query: revent 80 cfm,query
1,0,Product title: Panasonic FV-20VQ3 WhisperCeili...,product_title
2,0,User query: bathroom fan without light,query
3,0,Product title: Panasonic FV-20VQ3 WhisperCeili...,product_title
4,0,User query: revent 80 cfm,query


# Batch prediction

https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/batch-prediction-genai-embeddings#request_a_batch_response

____

In [42]:
! gsutil mb $BUCKET

Creating gs://ecomm-query-product-pairs/...
ServiceException: 409 A Cloud Storage bucket named 'ecomm-query-product-pairs' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


In [43]:
output_file = 'batch_prediction_inputs.jsonl'

with open(output_file, 'w') as f:
    f.write(query_prod_pairs[['content']].to_json(lines=True, orient='records'))

In [44]:
! gsutil cp $output_file $BUCKET

Copying file://batch_prediction_inputs.jsonl [Content-Type=application/octet-stream]...
/ [1 files][585.1 KiB/585.1 KiB]                                                
Operation completed over 1 objects/585.1 KiB.                                    


In [45]:
from datetime import datetime
now = datetime.now()
now_string_tag = now.strftime("%Y-%m-%d-%H-%M-%S")
print("Tag for this run: ", now_string_tag)

Tag for this run:  2024-03-19-03-12-42


In [46]:
from vertexai.preview.language_models import TextEmbeddingModel
textembedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko")
batch_prediction_job = textembedding_model.batch_predict(
  dataset=[f"{BUCKET}/{output_file}"],
  destination_uri_prefix=f"{BUCKET}/batch-predict-{now_string_tag}",
)
print(batch_prediction_job.display_name)
print(batch_prediction_job.resource_name)
print(batch_prediction_job.state)

Creating BatchPredictionJob


BatchPredictionJob created. Resource name: projects/679926387543/locations/us-central1/batchPredictionJobs/8242719746345140224
To use this BatchPredictionJob in another session:
bpj = aiplatform.BatchPredictionJob('projects/679926387543/locations/us-central1/batchPredictionJobs/8242719746345140224')
View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/batch-predictions/8242719746345140224?project=679926387543
BatchPredictionJob projects/679926387543/locations/us-central1/batchPredictionJobs/8242719746345140224 current state:
JobState.JOB_STATE_QUEUED
BatchPredictionJob projects/679926387543/locations/us-central1/batchPredictionJobs/8242719746345140224 current state:
JobState.JOB_STATE_QUEUED
BatchPredictionJob projects/679926387543/locations/us-central1/batchPredictionJobs/8242719746345140224 current state:
JobState.JOB_STATE_QUEUED
BatchPredictionJob projects/679926387543/locations/us-central1/batchPredictionJobs/8242719746345140224 current sta

#### When complete you should see something like this

<img src='../img/bp-job.png' width=600px />

<img src='../img/output-data.png' width=600px />

### Visualize the embeddings with Tensorboard

Following this guide https://www.tensorflow.org/tensorboard/tensorboard_projector_plugin