In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Overview

This notebook demonstrates how to enrich your data using Generative AI with Vertex AI on Google Cloud.

The specific example is a retail use case for improving product description metadata. Better product descriptions lead to more user engagement and higher conversion rates.


The workflow includes:
* Importing the data
* Analyzing the product metadata
* Enriching the data with an LLM
* Updating the data

# Getting Started

In [None]:
# Install the Vertex AI SDK

# This step currently includes an additional dependency, to avoid a conflict

!pip install --upgrade google-cloud-aiplatform shapely"<2"

**Restart** the runtime to use these package versions.

# Import data

This notebook requires downloading the [Flipkart Products](https://www.kaggle.com/datasets/PromptCloudHQ/flipkart-products?resource=download) dataset from Kaggle.

After downloading the data, upload the CSV file to the notebook using the Files feature of Colab.

In [None]:
import pandas as pd

# Load the CSV export and discard rows in which every column is empty
df = pd.read_csv("flipkart_com-ecommerce_sample.csv", engine="python").dropna(how="all")

# Preview the first five rows of the data
df.head()

# Data Analysis

Let's now look at the product description field. To keep things simple, we'll use the length of the description as a proxy for data quality.

We'll look for short descriptions, where it's likely that useful details are omitted.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde

def get_num_chars_in_description(dataframe: pd.DataFrame) -> pd.Series:
  """Return the character count of every non-missing product description.

  Args:
    dataframe: Frame that contains a "description" column.

  Returns:
    An integer Series of description lengths that keeps the original row
    labels. Rows whose description is missing are left out.

  Raises:
    KeyError: If the frame has no "description" column.
  """
  # `.str.len()` already yields numbers (NaN where the description is
  # missing), so no string clean-up or numeric conversion is needed.
  lengths = dataframe["description"].str.len()

  # Drop the missing entries before the integer cast, which cannot hold NaN
  return lengths.dropna().astype(int)

def plot_distribution(series: pd.Series) -> None:
  """Plot a kernel density estimate of `series` on a log-scaled x axis.

  Args:
    series: Numeric values whose distribution should be drawn.
  """
  # Fit the Gaussian KDE, then sample it at 1000 evenly spaced points
  # between 0 and the largest value in the series
  estimator = gaussian_kde(series)
  grid = np.linspace(0, max(series), 1000)
  density = estimator(grid)

  # Draw the smoothed curve with a logarithmic x axis
  plt.xscale("log")
  plt.plot(grid, density)

  # Label the figure so it can be read on its own
  plt.xlabel("Number of characters in description")
  plt.ylabel("Probability density")
  plt.title("Product Description Length KDE Plot")
  plt.show()

# Measure every description, then plot how the lengths are distributed
num_chars_in_description = get_num_chars_in_description(df)
plot_distribution(num_chars_in_description)

Let's now set a threshold to improve the shortest 0.05% of our descriptions. We see that 93 characters should be that threshold, and there are 13 descriptions of that length or less.

In [None]:
# 0.0005 is the 0.05% quantile of the description lengths; int() truncates it to a whole character count
threshold = int(num_chars_in_description.quantile(0.0005))

# Display the threshold as the cell output
threshold

In [None]:
def get_rows_under_threshold(series: pd.Series, threshold) -> pd.Series:
  """Return the entries of `series` whose value is at or below `threshold`.

  Args:
    series: Numeric values to filter, e.g. description lengths.
    threshold: Inclusive upper bound for the values to keep.

  Returns:
    The matching entries of `series`, with their original index labels.
  """
  # Filter the series that was passed in, not a notebook-level global, so
  # the function gives the right values for any series it is called with.
  return series.loc[series <= threshold]

# Select the descriptions whose length is at or below the threshold
rows_with_description_under_threshold = get_rows_under_threshold(num_chars_in_description, threshold)

# Report how many descriptions were selected
print(rows_with_description_under_threshold.shape[0])

# Data Transformation

Next, let's collect the relevant details from each row that has a short description. We'll put them into a JSON structure that we'll pass to the large language model.

In [None]:
import json

# Build one dictionary per short-description row, keeping only the fields
# that are passed on to the model
json_objects = [
    {
        column: row[column]
        for column in ["product_name", "description", "brand", "product_category_tree", "pid"]
    }
    for _, row in df.loc[rows_with_description_under_threshold.index].iterrows()
]

# Serialize the list of dictionaries as a JSON array
json_array = json.dumps(json_objects)

# Show the serialized array
print(json_array)

We can now include this JSON array in a prompt to send to the LLM.

In [None]:
# Build the prompt: the instructions come first, then a "===" separator,
# then the JSON array of products
prompt = f"""
Generate a compelling and accurate product description
for each of the products provided in the JSON data structure below.
This description should be included in the output.

The output should be a JSON array consisting only of the
original `pid` and updated `description` fields for each product.
===
{json_array}
"""

# Show the full prompt text, including the interpolated JSON
print(prompt)

# Query LLM

In this step, we'll connect to a Vertex AI LLM with our prompt and return a result.

First, let's define some project parameters:

In [None]:
# Replace the placeholder project ID with your own before running the cells below
project_id = 'YOUR_PROJECT_ID' # @param {type:"string"}
location = 'us-central1' # @param {type:"string"}


In [None]:
# Authenticate the Colab user before using the SDK

import google.colab.auth
google.colab.auth.authenticate_user()

In [None]:
import vertexai
from vertexai.language_models import TextGenerationModel

# Initialize the Vertex AI SDK with the project and location defined earlier
vertexai.init(project=project_id, location=location)

# Use the text-bison model from the Vertex Model Garden
model = TextGenerationModel.from_pretrained("text-bison@001")

# Request up to 1024 output tokens instead of the model's default limit
parameters = {"max_output_tokens": 1024}

# Send the prompt to the model
response = model.predict(prompt, **parameters)

# Display the generated text as the cell output
response.text

# Update Data

With these new descriptions, let's update the original dataframe and analyze the result.

In [None]:
# Load the string response into a JSON object

def clean_array(string):
  """Tidy a model response so that it can be parsed as a JSON array.

  The clean-up is deliberately conservative:
    * surrounding whitespace is removed;
    * a Markdown code fence (``` or ```json) around the answer is removed;
    * a stray `{` directly before the opening `[`, or a stray `}` directly
      after the closing `]`, is removed.

  Braces that belong to a JSON object are left alone, and an empty or
  whitespace-only response yields an empty string instead of raising.

  Args:
    string: Raw text returned by the model.

  Returns:
    The cleaned text, ready to be handed to `json.loads`.
  """
  string = string.strip()

  # Drop a Markdown code fence if the model wrapped its answer in one
  if string.startswith("```"):
    first_newline = string.find("\n")
    # The opening fence line may carry a language tag such as "json"
    string = string[first_newline + 1:] if first_newline != -1 else string[3:]
    string = string.rstrip()
    if string.endswith("```"):
      string = string[:-3]
    string = string.strip()

  # Only remove a brace when it wraps an array; removing the braces of a
  # real JSON object would make the text unparseable
  if string.startswith("{") and string[1:].lstrip().startswith("["):
    string = string[1:].lstrip()
  if string.endswith("}") and string[:-1].rstrip().endswith("]"):
    string = string[:-1].rstrip()
  return string

# Parse the cleaned response text as JSON
products = json.loads(clean_array(response.text))

In [None]:
# Look-up table from product ID to the description returned for it

pid_to_description = {product['pid']: product['description'] for product in products}

In [None]:
# Work on a copy so the original data frame stays untouched
updated_df = df.copy()

# Flag the products for which an updated description is available
mask = updated_df['pid'].isin(list(pid_to_description))

# Replace the description of every flagged product with its updated text
updated_df.loc[mask, 'description'] = updated_df.loc[mask, 'pid'].map(pid_to_description.get)

In [None]:
# Measure the description lengths again, this time on the updated data frame
updated_num_chars_in_description = get_num_chars_in_description(updated_df)

# Count how many descriptions are still at or below the threshold after the update
print(get_rows_under_threshold(updated_num_chars_in_description, threshold).shape[0])

Finally, let's plot our result!

In [None]:
# Histogram settings: bin width and the visible x-axis window
bin_width = 10
xlim_left = 60
xlim_right = 120

# Both histograms share the same bin edges so that their bars line up
bin_edges = np.arange(0, xlim_right + bin_width, bin_width)

# Overlay the original and updated length distributions on one set of axes
fig, ax = plt.subplots()
for lengths, label in (
    (num_chars_in_description, 'Original'),
    (updated_num_chars_in_description, 'Updated'),
):
  ax.hist(lengths, bins=bin_edges, alpha=0.5, label=label)

# Restrict the view to the short-description range and add the legend
ax.set_xlim(xlim_left, xlim_right)
ax.legend()

# Title and axis labels
ax.set(
    title='Product Description Length',
    xlabel='Number of Characters',
    ylabel='Frequency',
)

# Render the figure
plt.show()


Thanks for walking through this tutorial. I hope you've learned some new things, and enjoyed the experience!