In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Using the Vertex AI PaLM API to explain BQML Clustering

This example demostrates how to use the Vertex AI PaLM API to explain BQML clustering. For more information see the doc

Let's log in with Google, load the Vertex AI libraries and restart the runtime

In [None]:
#!pip install git+https://github.com/googleapis/python-aiplatform.git "shapely<2.0.0" "protobuf==3.19.6"
%pip install google-cloud-aiplatform --upgrade --user

---

#### ⚠️ Do not forget to click the "RESTART RUNTIME" button above.

---

Let's define some variables that will be used throughout this notebook.

These are the GCP Project ID `project_id`, the Model name `model_name` which is any name you prefer, and finally the Dataset name `dataset_name`.
The dataset needs to exist in the same Project as `project_id` and you'll need appropriate access to create and delete.

In [2]:
import pandas as pd
from typing import Union
import sys
from google.cloud import bigquery


In [25]:
# @param {type:"string"}
PROJECT_ID = !gcloud config get-value project


In [8]:
#@title Setup Project Variables { run: "auto", display-mode: "form" }
REGION = "us-central1"  # @param {type:"string"}

project_id = PROJECT_ID
dataset_name = "jm_us" #@param {type:"string"}
model_name = "ecommerce_customer_segment_cluster5" #@param {type:"string"}
eval_name = model_name + "_eval"
LOCATION = "us-central1"  # @param {type:"string"}
client = bigquery.Client(project=project_id)

## Create a K-means model to cluster ecommerce data

First let's look at our data quickly before we create the model. This query can be run in BigQuery on its own. Try it out!

In [9]:
query = """
SELECT
  user_id,
  order_id,
  sale_price,
  created_at as order_created_date
FROM `bigquery-public-data.thelook_ecommerce.order_items`
WHERE created_at BETWEEN CAST('2020-01-01 00:00:00' AS TIMESTAMP)
AND CAST('2023-01-01 00:00:00' AS TIMESTAMP)
"""
df = client.query(query).to_dataframe()
df.head()


Unnamed: 0,user_id,order_id,sale_price,order_created_date
0,24972,30861,2.5,2021-05-19 13:02:47+00:00
1,72138,89851,2.5,2020-09-07 15:07:16+00:00
2,17913,22090,2.5,2022-12-16 16:19:12+00:00
3,57884,72030,3.0,2022-05-22 03:43:43+00:00
4,69467,86584,3.0,2022-02-08 21:27:26+00:00


## `CREATE MODEL` using `KMEANS`

Create a query then start the model creation job, using a python loop to wait for the job to complete. Please note, if you've created the model already, there's no need to rerun this step to create the clustering using model on the data.

In [10]:
query = """
CREATE OR REPLACE MODEL `{0}.{1}`
OPTIONS (
  MODEL_TYPE = "KMEANS",
  NUM_CLUSTERS = 5,
  KMEANS_INIT_METHOD = "KMEANS++",
  STANDARDIZE_FEATURES = TRUE )
AS (
SELECT * EXCEPT (user_id)
FROM (
  SELECT user_id,
    DATE_DIFF(CURRENT_DATE(), CAST(MAX(order_created_date) as DATE), day) AS days_since_order, -- RECENCY
    COUNT(order_id) AS count_orders, -- FREQUENCY
    AVG(sale_price) AS avg_spend -- MONETARY
  FROM (
    SELECT user_id,
      order_id,
      sale_price,
      created_at as order_created_date
    FROM `bigquery-public-data.thelook_ecommerce.order_items`
    WHERE created_at BETWEEN CAST('2020-01-01 00:00:00' AS TIMESTAMP)
    AND CAST('2023-01-01 00:00:00' AS TIMESTAMP)
  )
  GROUP BY user_id, order_id
 )
)
""".format(dataset_name, model_name)


In [11]:
# Wrapper to use BigQuery client to run query/job, return job ID or result as DF
def run_bq_query(sql: str) -> Union[str, pd.DataFrame]:
    """
    Input: SQL query, as a string, to execute in BigQuery
    Returns the query results as a pandas DataFrame, or error, if any
    """

    # Try dry run before executing query to catch any errors
    job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
    client.query(sql, job_config=job_config)

    # If dry run succeeds without errors, proceed to run query
    job_config = bigquery.QueryJobConfig()
    client_result = client.query(sql, job_config=job_config)

    job_id = client_result.job_id

    # Wait for query/job to finish running. then get & return data frame
    df = client_result.result().to_arrow().to_pandas()
    print(f"Finished job_id: {job_id}")
    return df

In [12]:
print(query)

# this should take under 5 minutes to create the model
run_bq_query(query)


CREATE OR REPLACE MODEL `jm_us.ecommerce_customer_segment_cluster5`
OPTIONS (
  MODEL_TYPE = "KMEANS",
  NUM_CLUSTERS = 5,
  KMEANS_INIT_METHOD = "KMEANS++",
  STANDARDIZE_FEATURES = TRUE )
AS (
SELECT * EXCEPT (user_id)
FROM (
  SELECT user_id,
    DATE_DIFF(CURRENT_DATE(), CAST(MAX(order_created_date) as DATE), day) AS days_since_order, -- RECENCY
    COUNT(order_id) AS count_orders, -- FREQUENCY
    AVG(sale_price) AS avg_spend -- MONETARY
  FROM (
    SELECT user_id,
      order_id,
      sale_price,
      created_at as order_created_date
    FROM `bigquery-public-data.thelook_ecommerce.order_items`
    WHERE created_at BETWEEN CAST('2020-01-01 00:00:00' AS TIMESTAMP)
    AND CAST('2023-01-01 00:00:00' AS TIMESTAMP)
  )
  GROUP BY user_id, order_id
 )
)

Finished job_id: 719796aa-3625-42fb-b2f1-5301aa664f9b


Let's take a look at the model's clustering performance, using these metrics - [Davies Bouldin Index](https://en.wikipedia.org/wiki/Davies%E2%80%93Bouldin_index) and Mean Squared Distance

In [13]:
query = """
SELECT *
FROM ML.EVALUATE(MODEL `{0}.{1}`)
""".format(dataset_name, model_name)
run_bq_query(query)


Finished job_id: f137a216-4529-4297-aba4-8843157accd8


Unnamed: 0,davies_bouldin_index,mean_squared_distance
0,1.503612,1.468655


Now let's get the cluster (centroid) information

In [14]:
query = """
SELECT
  CONCAT('cluster ', CAST(centroid_id as STRING)) as centroid,
  avg_spend as average_spend,
  count_orders as count_of_orders,
  days_since_order
FROM (
  SELECT centroid_id, feature, ROUND(numerical_value, 2) as value
  FROM ML.CENTROIDS(MODEL `{0}.{1}`)
)
PIVOT (
  SUM(value)
  FOR feature IN ('avg_spend',  'count_orders', 'days_since_order')
)
ORDER BY centroid_id
""".format(dataset_name, model_name)
run_bq_query(query)

Finished job_id: d77f52a1-4246-4695-ac7e-c60f05ebafac


Unnamed: 0,centroid,average_spend,count_of_orders,days_since_order
0,cluster 1,41.69,1.44,629.42
1,cluster 2,597.38,1.15,563.59
2,cluster 3,43.0,1.54,294.58
3,cluster 4,163.18,1.2,457.94
4,cluster 5,52.85,1.4,1012.31


Whew! That's a lot of metrics and cluster info. How about we explain this to our colleagues using the magic of LLMs.

In [15]:
df = client.query(query).to_dataframe()
df.to_string(header=False, index=False)

cluster_info = []
for i, row in df.iterrows():
  cluster_info.append("{0}, average spend ${2}, count of orders per person {1}, days since last order {3}"
    .format(row["centroid"], row["count_of_orders"], row["average_spend"], row["days_since_order"]) )

print(str.join("\n", cluster_info))

cluster 1, average spend $41.69, count of orders per person 1.44, days since last order 629.42
cluster 2, average spend $597.38, count of orders per person 1.15, days since last order 563.59
cluster 3, average spend $43.0, count of orders per person 1.54, days since last order 294.58
cluster 4, average spend $163.18, count of orders per person 1.2, days since last order 457.94
cluster 5, average spend $52.85, count of orders per person 1.4, days since last order 1012.31


## Explain with Vertex AI PaLM API

Install the python library and restart the runtime

In [16]:
from google.cloud import aiplatform
#from google.cloud.aiplatform.private_preview.language_models import TextGenerationModel, ChatModel
from vertexai.language_models._language_models import TextGenerationModel, ChatModel

aiplatform.init(project=project_id, location=LOCATION)

Generate a text prediction

In [17]:
#from google.cloud.aiplatform.private_preview.language_models import TextGenerationModel
from vertexai.language_models._language_models import TextGenerationModel

model = TextGenerationModel.from_pretrained("text-bison@001")

clusters = str.join("\n", cluster_info)

prompt = f"""
You're a creative brand strategist, given the following clusters, come up with creative brand persona, a catchy title, and next marketing action, explained step by step.

Clusters:
{clusters}

For each Cluster:
* Title:
* Persona:
* Next Marketing Step:
"""

print(model.predict(
    prompt,
    max_output_tokens=1024,
    temperature=0.55,
    top_p=0.8,
    top_k=40,
))

**Cluster 1**

* Title: The Loyal Customers
* Persona: These customers are loyal and spend a moderate amount of money on our products. They typically order once every few months.
* Next Marketing Step: We should focus on keeping these customers happy by offering them special deals and promotions. We could also send them personalized emails with product recommendations.

**Cluster 2**

* Title: The Big Spenders
* Persona: These customers spend a lot of money on our products and order frequently. They are typically looking for the latest and greatest products.
* Next Marketing Step: We should focus on marketing our new products to these customers. We could also offer them exclusive access to new products or services.

**Cluster 3**

* Title: The Occasional Buyers
* Persona: These customers buy from us only occasionally, but they typically spend a moderate amount of money. They are typically looking for products that are specific to their needs.
* Next Marketing Step: We should focus on m

Voila! We've now used k-means clustering to create groups of spenders and explain their profiles.

Sometimes, though, you want a little bit [extra](https://cloud.google.com/blog/transform/prompt-debunking-five-generative-ai-misconceptions).

In [19]:
from vertexai.language_models._language_models import TextGenerationModel

model = TextGenerationModel.from_pretrained("text-bison@001")

cluster_info = str.join('\n', cluster_info)

prompt = f"""
Pretend you're a creative strategist, analyse the following clusters and come up with \
creative brand persona for each that includes the detail of which Taylor Swift song is \
likely to be their favorite, a summary of how this relates to their purchasing behavior, \
and a witty e-mail headline for marketing campaign targeted to their group.

Clusters:
{cluster_info}
"""

print(model.predict(
    prompt,
    max_output_tokens=1024,
    temperature=0.45,
    top_p=0.8, top_k=40,
))

**Cluster 1: The Swifties**

Taylor Swift fans are known for their loyalty and devotion, and this cluster is no exception. They are typically young women who are passionate about music and fashion. They are also very social and love to share their love of Taylor Swift with their friends and family. The Swifties are typically high-spenders, and they are likely to buy Taylor Swift's merchandise, concert tickets, and albums. Their favorite Taylor Swift song is likely to be "Shake It Off," which is about being confident and not letting anyone bring you down.

**E-mail headline:** Taylor Swift fans, we have something special for you!

**Cluster 2: The Royals**

This cluster is made up of affluent, middle-aged women who are looking for high-quality products. They are typically well-educated and successful in their careers. They are also very stylish and are always looking for the latest trends. The Royals are typically low-volume shoppers, but they spend a lot of money when they do shop. The