# Getting batch prediction EDA with Tensor Board and Text Gecko

In [158]:
import pandas as pd
from typing import Dict
from datasets import load_dataset
from datasets import Dataset
from typing import Dict, List
import tensorflow as tf
import numpy as np
import os
from tensorboard.plugins import projector

LIMIT = 3000
PROJECT_ID = 'wortz-project-352116'
DATASET = 'ecomm-embedding'
BUCKET = 'gs://ecomm-query-product-pairs'
USER_PROMPT = 'User query: '
PRODUCT_PROMPT = 'Product title: '

#### Load the data from huggingface

In [152]:
raw_data = load_dataset("tasksource/esci")#, split=['train[:10%]','test'])

#### Quick examination of the data

In [159]:
def create_jsonl_from_raw_data(jsonl_path: str, id_field: str, text_field: str, raw_data = raw_data, **kwargs) -> None:
    # if 'limit' in kwargs:
    #     raw_data['train'] = raw_data['train'][:, kwargs['limit']]
    
    if 'product_id' in id_field:
        concat_list = raw_data['train'].map(lambda example: {'concat_id': f"{example[id_field]}_|_{example[text_field]}_|_{example['product_title']}"})                     
        unique_item_list = concat_list.unique('concat_id') 
        filtered_dataset_dict = {'_id': [example.split('_|_')[0] for example in unique_item_list],
                                 'title': [example.split('_|_')[2] for example in unique_item_list],
                                 'text': [example.split('_|_')[1] for example in unique_item_list],
                                 }
        filtered_dataset = pd.DataFrame(filtered_dataset_dict)
    else:
        concat_list = raw_data['train'].map(lambda example: {'concat_id': f"{example[id_field]}_|_{example[text_field]}"})                                         
        unique_item_list = concat_list.unique('concat_id')
        filtered_dataset_dict = {'_id': [example.split('_|_')[0] for example in unique_item_list],
                                 'text': [example.split('_|_')[1] for example in unique_item_list],
                                 }
        filtered_dataset = pd.DataFrame(filtered_dataset_dict)
    output_file = jsonl_path

    with open(output_file, 'w') as f:
        f.write(filtered_dataset.to_json(lines=True, orient='records'))
    
def create_jsonl_training_labels(tsv_train_path: str, tsv_test_path: str, raw_data = raw_data) -> None:
    tsv_options = {'sep': '\t'}
    train_data = raw_data['train'].map(lambda example: {'query-id': example['query_id'], 'corpus-id': example['product_id']})
    test_data = raw_data['test'].map(lambda example: {'query-id': example['query_id'], 'corpus-id': example['product_id']})
    train_data.to_csv(tsv_train_path, **tsv_options)
    test_data.to_csv(tsv_test_path, **tsv_options)

In [None]:
if not os.path.exists('tuning_data/'):
    os.mkdir('tuning_data/')
    
#the corpus file https://cloud.google.com/vertex-ai/generative-ai/docs/models/tune-embeddings#prepare-tuning
create_jsonl_from_raw_data(jsonl_path='tuning_data/query.jsonl', id_field='query_id', text_field='query', limit=100)

# the query file
create_jsonl_from_raw_data(jsonl_path='tuning_data/corpus.jsonl', id_field='product_id', text_field='product_description', limit=100)

Map:   0%|          | 4966/2027874 [00:01<09:00, 3743.56 examples/s]

#### Lastly, get training labels

In [None]:
create_jsonl_training_labels(tsv_train_path = 'tuning_data/corpus-train.TSV', tsv_test_path = 'tuning_data/corpus-test.TSV'

In [None]:
# def get_fine_tuning_dataframe(raw_data: datasets.dataset_dict.DatasetDict, unique_queries: Dict, unique_products: Dict, split: str = 'train') -> Tuple[pd.Dataframe, pd.Dataframe]:
#     ### https://cloud.google.com/vertex-ai/generative-ai/docs/models/tune-embeddings#generative-ai-tune-embedding-drest
#     data_dict_corpus = {"_id": [], "text": []}
#     data_dict_query = data_dict_corpus.copy()
#     raw_data = raw_data[split]
#     for row in raw_data:
        

In [58]:
def get_input_dataframe(
    raw_data: Dict,
    user_prompt: str = USER_PROMPT,
    product_prompt: str = PRODUCT_PROMPT,
    limit: int = LIMIT,
) -> pd.DataFrame:
    """
    This function returns batch prediction data for embeddings
    """

    for i, row in enumerate(raw_data["train"]):
        if i == limit - 1:
            break
        elif i == 0:
            query_prod_pairs = pd.DataFrame(
                {"content": [f'{user_prompt}{row["query"]}'], "type": ["query"], "id": [row["query_id"]]}
            )
        else:
            query_prod_pairs = pd.concat(
                [
                    query_prod_pairs,
                    pd.DataFrame(
                        {"content": [f'{user_prompt}{row["query"]}'], "type": ["query"], "id": [row["query_id"]]}
                    ),
                ]
            )
        query_prod_pairs = pd.concat(
            [
                query_prod_pairs,
                pd.DataFrame(
                    {
                        "content": [f'{product_prompt}{row["product_title"]}'],
                        "type": ["product_title"], "id": [row["product_id"]]}
                    
                ),
            ]
        )
    return query_prod_pairs

In [59]:
query_prod_pairs = get_input_dataframe(raw_data)

In [60]:
query_prod_pairs = query_prod_pairs.reset_index()
query_prod_pairs.head()

Unnamed: 0,index,content,type,id
0,0,User query: revent 80 cfm,query,0
1,0,Product title: Panasonic FV-20VQ3 WhisperCeili...,product_title,B000MOO21W
2,0,User query: bathroom fan without light,query,13723
3,0,Product title: Panasonic FV-20VQ3 WhisperCeili...,product_title,B000MOO21W
4,0,User query: revent 80 cfm,query,0


In [None]:
#### Get unique product ids

# Batch prediction

https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/batch-prediction-genai-embeddings#request_a_batch_response

____

In [None]:
! gsutil mb $BUCKET

In [None]:
output_file = 'batch_prediction_inputs.jsonl'

with open(output_file, 'w') as f:
    f.write(query_prod_pairs[['content']].to_json(lines=True, orient='records'))

In [None]:
! gsutil cp $output_file $BUCKET

In [None]:
from datetime import datetime
now = datetime.now()
now_string_tag = now.strftime("%Y-%m-%d-%H-%M-%S")
print("Tag for this run: ", now_string_tag)

In [None]:
from vertexai.preview.language_models import TextEmbeddingModel

textembedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko")
batch_prediction_job = textembedding_model.batch_predict(
    dataset=[f"{BUCKET}/{output_file}"],
    destination_uri_prefix=f"{BUCKET}/batch-predict-{now_string_tag}",
)
print(batch_prediction_job.display_name)
print(batch_prediction_job.resource_name)
print(batch_prediction_job.state)

#### When complete you should see something like this

<img src='../img/bp-job.png' width=600px />

<img src='../img/output-data.png' width=600px />

### Visualize the embeddings with Tensorboard

Following this guide https://www.tensorflow.org/tensorboard/tensorboard_projector_plugin

In [None]:
bp_output_gcs_folder = batch_prediction_job.output_info.gcs_output_directory

! gsutil cp $bp_output_gcs_folder/* .

In [None]:
predictions = pd.read_json(path_or_buf='000000000000.jsonl', lines=True)
predictions.head()

In [None]:
def get_predictions(df: pd.DataFrame) -> List[List[float]]:
    embedding_list = []
    for _, row in df.iterrows():
        single_emb = row['predictions'][0]['embeddings']['values']
        embedding_list.append(single_emb)
    return embedding_list

embedding_list = get_predictions(predictions)
len(embedding_list)

In [29]:
# Set up a logs directory, so Tensorboard knows where to look for files.
log_dir = "logs/ecomm-example/"
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# Save Labels separately on a line-by-line manner.
with open(os.path.join(log_dir, "metadata.tsv"), "w") as f:
    #header for columns
    f.write("data_type\tdata\n")
    for instance in predictions.instance:
        data_type = instance['content'].split(': ')[0]
        # data_type = data_type
        data = ''.join(instance['content'].split(': ')[1:])
        f.write(f"{data_type}\t{data}\n")


# Save the weights we want to analyze as a variable. Note that the first
# value represents any unknown word, which is not in the metadata, here
# we will remove this value.
weights = tf.Variable(embedding_list)
# Create a checkpoint from embedding, the filename and key are the
# name of the tensor.
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))

# Set up config.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`.
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
embedding.metadata_path = "metadata.tsv"
projector.visualize_embeddings(log_dir, config)

In [30]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [31]:
# Now run tensorboard against on log data we just saved.
%tensorboard --logdir logs/ecomm-example/

Reusing TensorBoard on port 6006 (pid 30352), started 0:08:44 ago. (Use '!kill 30352' to kill it.)

#### The above will run until you stop it

You should be able to investigate the embedding space via PCA. Note the total variance captured to understand how complete the veiw investigate

<img src="../img/tensorboard.png" width=600px />


#### Also a great way to understand performance is to select a point of interest and top k neighbors appear

Below, we see natural hair dye query and it's associated nearest product description in the embedding space:


<img src="../img/knn-analysis.png" width=900px />


#### Lastly, you can analyze and color by data type to get a feel for how well the queryies releate to the products



<img src="../img/analysis-by-type.png" width=900px />


