# Getting batch prediction EDA with Tensor Board and Text Gecko

In [42]:
import pandas as pd
from typing import Dict
from datasets import load_dataset
from typing import Dict, List
import tensorflow as tf
import numpy as np
import os
from tensorboard.plugins import projector

LIMIT = 3000
PROJECT_ID = 'wortz-project-352116'
DATASET = 'ecomm-embedding'
BUCKET = 'gs://ecomm-query-product-pairs'
USER_PROMPT = 'User query: '
PRODUCT_PROMPT = 'Product title: '

#### Load the data from huggingface

In [6]:

raw_data = load_dataset("tasksource/esci")

In [11]:
def get_input_dataframe(
    raw_data: Dict,
    user_prompt: str = USER_PROMPT,
    product_prompt: str = PRODUCT_PROMPT,
    limit: int = LIMIT,
) -> pd.DataFrame:
    """
    This function returns batch prediction data for embeddings
    """

    for i, row in enumerate(raw_data["train"]):
        if i == limit - 1:
            break
        elif i == 0:
            query_prod_pairs = pd.DataFrame(
                {"content": [f'{user_prompt}{row["query"]}'], "type": ["query"]}
            )
        else:
            query_prod_pairs = pd.concat(
                [
                    query_prod_pairs,
                    pd.DataFrame(
                        {"content": [f'{user_prompt}{row["query"]}'], "type": ["query"]}
                    ),
                ]
            )
        query_prod_pairs = pd.concat(
            [
                query_prod_pairs,
                pd.DataFrame(
                    {
                        "content": [f'{product_prompt}{row["product_title"]}'],
                        "type": ["product_title"],
                    }
                ),
            ]
        )
    return query_prod_pairs

In [12]:
query_prod_pairs = get_input_dataframe(raw_data)

In [13]:
query_prod_pairs = query_prod_pairs.reset_index()
query_prod_pairs.head()

Unnamed: 0,index,content,type
0,0,User query: revent 80 cfm,query
1,0,Product title: Panasonic FV-20VQ3 WhisperCeili...,product_title
2,0,User query: bathroom fan without light,query
3,0,Product title: Panasonic FV-20VQ3 WhisperCeili...,product_title
4,0,User query: revent 80 cfm,query


# Batch prediction

https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/batch-prediction-genai-embeddings#request_a_batch_response

____

In [14]:
! gsutil mb $BUCKET

Creating gs://ecomm-query-product-pairs/...
ServiceException: 409 A Cloud Storage bucket named 'ecomm-query-product-pairs' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


In [15]:
output_file = 'batch_prediction_inputs.jsonl'

with open(output_file, 'w') as f:
    f.write(query_prod_pairs[['content']].to_json(lines=True, orient='records'))

In [16]:
! gsutil cp $output_file $BUCKET

Copying file://batch_prediction_inputs.jsonl [Content-Type=application/octet-stream]...
/ [1 files][585.1 KiB/585.1 KiB]                                                
Operation completed over 1 objects/585.1 KiB.                                    


In [17]:
from datetime import datetime
now = datetime.now()
now_string_tag = now.strftime("%Y-%m-%d-%H-%M-%S")
print("Tag for this run: ", now_string_tag)

Tag for this run:  2024-03-19-03-20-01


In [18]:
from vertexai.preview.language_models import TextEmbeddingModel

textembedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko")
batch_prediction_job = textembedding_model.batch_predict(
    dataset=[f"{BUCKET}/{output_file}"],
    destination_uri_prefix=f"{BUCKET}/batch-predict-{now_string_tag}",
)
print(batch_prediction_job.display_name)
print(batch_prediction_job.resource_name)
print(batch_prediction_job.state)

Creating BatchPredictionJob
BatchPredictionJob created. Resource name: projects/679926387543/locations/us-central1/batchPredictionJobs/3781904315434663936
To use this BatchPredictionJob in another session:
bpj = aiplatform.BatchPredictionJob('projects/679926387543/locations/us-central1/batchPredictionJobs/3781904315434663936')
View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/batch-predictions/3781904315434663936?project=679926387543
BatchPredictionJob projects/679926387543/locations/us-central1/batchPredictionJobs/3781904315434663936 current state:
JobState.JOB_STATE_QUEUED
BatchPredictionJob projects/679926387543/locations/us-central1/batchPredictionJobs/3781904315434663936 current state:
JobState.JOB_STATE_QUEUED
BatchPredictionJob projects/679926387543/locations/us-central1/batchPredictionJobs/3781904315434663936 current state:
JobState.JOB_STATE_QUEUED
BatchPredictionJob projects/679926387543/locations/us-central1/batchPredictionJobs/378

#### When complete you should see something like this

<img src='../img/bp-job.png' width=600px />

<img src='../img/output-data.png' width=600px />

### Visualize the embeddings with Tensorboard

Following this guide https://www.tensorflow.org/tensorboard/tensorboard_projector_plugin

In [4]:
bp_output_gcs_folder = batch_prediction_job.output_info.gcs_output_directory

! gsutil cp $bp_output_gcs_folder/* .

NameError: name 'batch_prediction_job' is not defined

In [9]:
predictions = pd.read_json(path_or_buf='000000000000.jsonl', lines=True)
predictions.head()

Unnamed: 0,instance,predictions,status
0,{'content': 'Product title: # This Is A Sharp ...,[{'embeddings': {'statistics': {'token_count':...,
1,{'content': 'Product title: # This Is A Sharp ...,[{'embeddings': {'statistics': {'token_count':...,
2,{'content': 'Product title: #10 Security Self-...,[{'embeddings': {'statistics': {'token_count':...,
3,{'content': 'Product title: #10 Security Self-...,[{'embeddings': {'statistics': {'token_count':...,
4,{'content': 'Product title: #10 Security Self-...,[{'embeddings': {'statistics': {'token_count':...,


In [35]:
def get_predictions(df: pd.DataFrame) -> List[List[float]]:
    embedding_list = []
    for _, row in df.iterrows():
        single_emb = row['predictions'][0]['embeddings']['values']
        embedding_list.append(single_emb)
    return embedding_list

embedding_list = get_predictions(predictions)
len(embedding_list)

5998

In [43]:
# Set up a logs directory, so Tensorboard knows where to look for files.
log_dir = "logs/ecomm-example/"
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# Save Labels separately on a line-by-line manner.
with open(os.path.join(log_dir, "metadata.tsv"), "w") as f:
    for instance in predictions.instance:
        f.write(f"{instance['content']}\n")


# Save the weights we want to analyze as a variable. Note that the first
# value represents any unknown word, which is not in the metadata, here
# we will remove this value.
weights = tf.Variable(embedding_list)
# Create a checkpoint from embedding, the filename and key are the
# name of the tensor.
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))

# Set up config.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`.
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
embedding.metadata_path = "metadata.tsv"
projector.visualize_embeddings(log_dir, config)

In [45]:
# Now run tensorboard against on log data we just saved.
! tensorboard --logdir logs/ecomm-example/

2024-03-19 18:05:20.357926: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-19 18:05:20.424112: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-19 18:05:20.424949: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.16.2 at http://localhost:6006/ (Press CTRL+C to quit)
E0319 18:05:28.382687 137618436110080 _internal.py:96] Error on

#### The above will run until you stop it

You should be able to investigate the embedding space via PCA. Note the total variance captured to understand how complete the veiw investigate

<img src="../img/tensorboard.png" width=600px />


#### Also a great way to understand performance is to select a point of interest and top k neighbors appear

Below, we see natural hair dye query and it's associated nearest product description in the embedding space:


<img src="../img/knn-analysis.png" width=900px />


