# AutoML Propensity to Purchase with code

Use the Vertex AI Python client to recreate the no-code approach with code (Python). This notebook trains a custom model with AutoML and then runs a batch prediction.


## Setup
Inputs:

In [20]:
# --- Project and naming ---
PROJECT_ID = "demos-vertex-ai"
REGION = "us-central1"
# DATANAME names both the BigQuery dataset and the table inside it.
DATANAME = "propensity"
# NOTEBOOK is reused for resource labels, display names and the temp directory.
NOTEBOOK = "automl-propensity-code"

# --- Resources ---
# Machine type referenced by the endpoint deployment cell.
DEPLOY_COMPUTE = "n1-standard-4"

# --- Model training ---
# Column the model learns to predict.
VAR_TARGET = "will_buy_on_return_visit"
# Columns excluded from the features; whitespace-delimited, e.g. "colA colB".
VAR_OMIT = "fullVisitorId"

packages:

In [21]:
import json
import os
import shutil
from datetime import datetime

import numpy as np
from google.cloud import aiplatform
from google.cloud import bigquery
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value

clients: 

In [22]:
# BigQuery client used for every SQL statement in this notebook.
bq = bigquery.Client(project=PROJECT_ID)

# Set the default project and region for the Vertex AI SDK calls that follow.
aiplatform.init(location=REGION, project=PROJECT_ID)

parameters:

In [25]:
# Run timestamp (local clock, YYYYMMDDHHMMSS) appended to resource display names.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"

# Scratch directory for this notebook, relative to the working directory.
DIR = f"temp/{NOTEBOOK}"

environment:

In [26]:
# Start from an empty scratch directory.
# Pure-Python calls replace `!rm -rf {DIR}` / `!mkdir -p {DIR}` so the path is
# never re-parsed by a shell (safe with spaces or special characters) and the
# cell does not depend on those shell commands being available.
shutil.rmtree(DIR, ignore_errors=True)  # no error when DIR does not exist yet
os.makedirs(DIR, exist_ok=True)  # also creates the parent "temp" directory

## Create BigQuery Dataset
List the BigQuery datasets in the project first to see which ones already exist:

In [8]:
# Enumerate the datasets (schemas) that currently exist in the project.
query = f"""
SELECT schema_name
FROM `{PROJECT_ID}.INFORMATION_SCHEMA.SCHEMATA`
"""
schemata_job = bq.query(query)
schemata_job.to_dataframe()

Unnamed: 0,schema_name
0,customer_segmentation
1,bank_marketing
2,propensity
3,demo_demandforecasting
4,propensity_demo
5,auditlog_dataset


Create dataset if missing

In [9]:
# Create the dataset only when it is absent (IF NOT EXISTS), in location 'US',
# labelled with this notebook's name.
query = f"""
CREATE SCHEMA IF NOT EXISTS `{PROJECT_ID}.{DATANAME}`
OPTIONS(
    location = 'US',
    labels = [('notebook','{NOTEBOOK}')]
)
"""
job = bq.query(query)
# Wait for the DDL statement to finish; `job` is reused by the timing cell.
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7fdd878caa10>

In [10]:
# Seconds elapsed between the job's recorded start and end timestamps.
elapsed = job.ended - job.started
elapsed.total_seconds()

0.104

list BQ datasets again to confirm creation:

In [6]:
# Same listing as before the CREATE SCHEMA step; DATANAME should now appear.
query = f"""
SELECT schema_name
FROM `{PROJECT_ID}.INFORMATION_SCHEMA.SCHEMATA`
"""
schemata_after_job = bq.query(query)
schemata_after_job.to_dataframe()

Unnamed: 0,schema_name
0,customer_segmentation
1,bank_marketing
2,propensity
3,demo_demandforecasting
4,propensity_demo
5,auditlog_dataset


## Create BigQuery Table 

Submit job to save query results to a table via Python [Writing query results  |  BigQuery  |  Google Cloud](https://cloud.google.com/bigquery/docs/writing-results#writing_query_results)

In [46]:
# Destination table: <project>.<dataset>.<table>; dataset and table share DATANAME.
table_id = ".".join([PROJECT_ID, DATANAME, DATANAME])

# Write the query result into `table_id`, replacing any existing contents.
job_config = bigquery.QueryJobConfig(
    destination=table_id,
    write_disposition='WRITE_TRUNCATE',
)

# Features: one row per first-time visit (bounces, time on site).
# Label: 1 when the same visitor has any later (non-new) visit with a transaction.
# NOTE(review): features are read from `data-to-insights.ecommerce.web_analytics`
# while labels are read from `bigquery-public-data.google_analytics_sample.*` --
# confirm that both sources cover the same visitors and period.
sql = """
  SELECT
    fullVisitorId,
    bounces,
    time_on_site,
    will_buy_on_return_visit
  FROM (
        # select features
        SELECT
          fullVisitorId,
          IFNULL(totals.bounces, 0) AS bounces,
          IFNULL(totals.timeOnSite, 0) AS time_on_site
        FROM
          `data-to-insights.ecommerce.web_analytics`
        WHERE
          totals.newVisits = 1
        AND date BETWEEN '20160801' # train on first 9 months of data
        AND '20170430'
       )
  JOIN (
        SELECT
          fullvisitorid,
          IF (
              COUNTIF (
                       totals.transactions > 0
                       AND totals.newVisits IS NULL
                      ) > 0,
              1,
              0
             ) AS will_buy_on_return_visit
        FROM
          `bigquery-public-data.google_analytics_sample.*`
        GROUP BY
          fullvisitorid
       )
  USING (fullVisitorId)
  ORDER BY time_on_site DESC
"""

# Submit the query with the destination configuration and block until it finishes.
query_job = bq.query(sql, job_config=job_config)
query_job.result()

print(f"Query results loaded to the table {table_id}")

Query results loaded to the table demos-vertex-ai.propensity.propensity


# Create AutoML Dataset (link to BigQuery table)

In [7]:
# Register the BigQuery table as a Vertex AI tabular dataset (via `bq_source`).
dataset_display_name = f'{NOTEBOOK}_{DATANAME}_{TIMESTAMP}'
dataset_bq_uri = f'bq://{PROJECT_ID}.{DATANAME}.{DATANAME}'

dataset = aiplatform.TabularDataset.create(
    display_name=dataset_display_name,
    bq_source=dataset_bq_uri,
    labels={'notebook': NOTEBOOK},
)

INFO:google.cloud.aiplatform.datasets.dataset:Creating TabularDataset
INFO:google.cloud.aiplatform.datasets.dataset:Create TabularDataset backing LRO: projects/746038361521/locations/us-central1/datasets/652586539364122624/operations/1682442524972548096
INFO:google.cloud.aiplatform.datasets.dataset:TabularDataset created. Resource name: projects/746038361521/locations/us-central1/datasets/652586539364122624
INFO:google.cloud.aiplatform.datasets.dataset:To use this TabularDataset in another session:
INFO:google.cloud.aiplatform.datasets.dataset:ds = aiplatform.TabularDataset('projects/746038361521/locations/us-central1/datasets/652586539364122624')


# Train Model with AutoML 

In [15]:
# Feature columns = every dataset column except the omitted ones, the target,
# and a 'splits' column if present. Filtering in dataset order (instead of set
# arithmetic, whose iteration order for strings can change between kernel
# sessions) keeps the resulting column order reproducible across runs.
excluded_columns = set(VAR_OMIT.split()) | {VAR_TARGET, 'splits'}
column_specs = [name for name in dataset.column_names if name not in excluded_columns]

In [16]:
# Map every feature column to the 'auto' transformation spec.
column_specs = {feature: 'auto' for feature in column_specs}

In [17]:
# Show the feature -> transformation mapping that is passed to the training job.
print(column_specs)

{'bounces': 'auto', 'time_on_site': 'auto'}


Define a Job:

* Consider Weighting
* Model Type
* Optimization Objective

https://googleapis.dev/python/aiplatform/latest/aiplatform.html#google.cloud.aiplatform.AutoMLTabularTrainingJob

In [19]:
# Define the AutoML tabular classification job; training itself is started by
# the `.run(...)` call in the next cell.
training_display_name = f'{NOTEBOOK}_{DATANAME}_{TIMESTAMP}'

tabular_classification_job = aiplatform.AutoMLTabularTrainingJob(
    display_name=training_display_name,
    optimization_prediction_type='classification',
    optimization_objective='maximize-au-prc',
    column_specs=column_specs,
    labels={'notebook': NOTEBOOK},
)

In [20]:
# Training options. No split arguments are passed here; the alternatives left
# disabled by the author are:
#     predefined_split_column_name = 'splits'
#     training_fraction_split = 0.8, validation_fraction_split = 0.1, test_fraction_split = 0.1
run_options = dict(
    dataset=dataset,
    target_column=VAR_TARGET,
    budget_milli_node_hours=1000,  # milli node hours, i.e. 1 node hour
    model_display_name=f'{NOTEBOOK}_{DATANAME}_{TIMESTAMP}',
    disable_early_stopping=False,
    model_labels={'notebook': NOTEBOOK},
)

# Start training and keep the resulting model for the evaluation and batch
# prediction cells below.
model = tabular_classification_job.run(**run_options)

INFO:google.cloud.aiplatform.training_jobs:No dataset split provided. The service will use a default split.
INFO:google.cloud.aiplatform.training_jobs:View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/6321304004072570880?project=746038361521
INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/746038361521/locations/us-central1/trainingPipelines/6321304004072570880 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/746038361521/locations/us-central1/trainingPipelines/6321304004072570880 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/746038361521/locations/us-central1/trainingPipelines/6321304004072570880 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/746038361521/locations/us-central1

# Evaluation 
One can evaluate the model in 2 ways 

1. within the Cloud Console under [Vertex AI > Models](https://console.cloud.google.com/vertex-ai/models) 
2. via the API 


Set up a model client for the model created by this notebook:

In [13]:
# To reuse an already trained model in a new session instead of retraining,
# uncomment the next line and load the model by its resource name:
# model = aiplatform.Model('projects/746038361521/locations/us-central1/models/298666940522561536')
# Display the resource name of the model; it is used as the parent when
# listing evaluations below.
model.resource_name

'projects/746038361521/locations/us-central1/models/298666940522561536'

In [14]:
# Low-level (GAPIC) model service client pointed at the regional API endpoint.
regional_api_endpoint = f'{REGION}-aiplatform.googleapis.com'

model_client = aiplatform.gapic.ModelServiceClient(
    client_options=dict(api_endpoint=regional_api_endpoint)
)

Retrieve the aggregate model evaluation metrics for the model as a whole. First, use `.list_model_evaluations` to retrieve the evaluation id, then call `.get_model_evaluation` with that evaluation id:

In [15]:
# List the evaluations attached to the model and take the first one returned.
evaluations = model_client.list_model_evaluations(parent=model.resource_name)
evals = iter(evaluations)
first_evaluation = next(evals)
eval_id = first_evaluation.name

# Fetch the full evaluation record (metrics included) by its resource name.
geteval = model_client.get_model_evaluation(name=eval_id)

In [16]:
# Aggregate 'auPrc' metric of the fetched evaluation (the training job above
# was configured with the 'maximize-au-prc' objective).
geteval.metrics['auPrc']

0.99612117

In [17]:
# Print one confusion-matrix row per true label: the label's display name
# followed by the row stored at the same index.
confusion_matrix = geteval.metrics['confusionMatrix']
for row_index, annotation_spec in enumerate(confusion_matrix['annotationSpecs']):
    print('True Label = ',
          annotation_spec['displayName'],
          ' has Predicted labels = ',
          confusion_matrix['rows'][row_index])


True Label =  0  has Predicted labels =  [55224.0, 0.0]
True Label =  1  has Predicted labels =  [384.0, 0.0]


In [18]:
# Per-slice metrics for the evaluation: print each slice's value and its auPrc.
# The loop variable is not named `slice`: notebook cells share one global
# namespace, so that name would shadow the builtin `slice` for every cell
# executed afterwards.
slices = model_client.list_model_evaluation_slices(parent = eval_id)
for eval_slice in slices:
    print('Label = ', eval_slice.slice_.value, 'has auPrc = ', eval_slice.metrics['auPrc'])

Label =  0 has auPrc =  0.9971852
Label =  1 has auPrc =  0.036081985


# Endpoint and Deployment

In [34]:
# endpoint = aiplatform.Endpoint.create(
#     display_name = f'{NOTEBOOK}_{DATANAME}_{TIMESTAMP}',
#     labels = {'notebook':f'{NOTEBOOK}'}
# )

INFO:google.cloud.aiplatform.models:Creating Endpoint
INFO:google.cloud.aiplatform.models:Create Endpoint backing LRO: projects/746038361521/locations/us-central1/endpoints/6232603652280811520/operations/822817946098204672
INFO:google.cloud.aiplatform.models:Endpoint created. Resource name: projects/746038361521/locations/us-central1/endpoints/6232603652280811520
INFO:google.cloud.aiplatform.models:To use this Endpoint in another session:
INFO:google.cloud.aiplatform.models:endpoint = aiplatform.Endpoint('projects/746038361521/locations/us-central1/endpoints/6232603652280811520')


In [None]:
# endpoint.deploy(
#     model = model,
#     deployed_model_display_name = f'{NOTEBOOK}_{DATANAME}_{TIMESTAMP}',
#     traffic_percentage = 100,
#     machine_type = DEPLOY_COMPUTE,
#     min_replica_count = 1,
#     max_replica_count = 1
# )

INFO:google.cloud.aiplatform.models:Deploying Model projects/746038361521/locations/us-central1/models/298666940522561536 to Endpoint : projects/746038361521/locations/us-central1/endpoints/6232603652280811520
INFO:google.cloud.aiplatform.models:Deploy Endpoint model backing LRO: projects/746038361521/locations/us-central1/endpoints/6232603652280811520/operations/8848232482072428544


# Batch Prediction

## Create sample batch input (BigQuery table)
From original dataset for a simplified demonstration

In [30]:
# Build a tiny batch-prediction input table by randomly sampling rows from the
# training table.
table_id = f"{PROJECT_ID}.{DATANAME}.batch_01"
source_table = f"{PROJECT_ID}.{DATANAME}.{DATANAME}"

# Expected (not exact) number of sampled rows; each row is kept independently
# with probability target_rows / total_rows.
target_rows = 10

# Write the sample into `table_id`, replacing any existing contents.
job_config = bigquery.QueryJobConfig(destination=table_id,
                                    write_disposition = 'WRITE_TRUNCATE')

# The sampling rate is derived from the table's actual row count instead of a
# hard-coded total, so it stays correct when the source table changes.
# SAFE_DIVIDE yields NULL (no rows selected) rather than an error on an empty
# table. The table reference is backtick-quoted like the other queries here.
sql = f"""
  SELECT *
  FROM `{source_table}`
  WHERE RAND() < SAFE_DIVIDE({target_rows}, (SELECT COUNT(*) FROM `{source_table}`))
"""

# Start the query, passing in the extra configuration.
query_job = bq.query(sql, job_config=job_config)  # Make an API request.
query_job.result()  # Wait for the job to complete.

print("Query results loaded to the table {}".format(table_id))

Query results loaded to the table demos-vertex-ai.propensity.batch_01


## Batch Prediction from BigQuery source to BigQuery Destination, with Explanations

In [28]:
# Batch prediction: read instances from the BigQuery sample table and write
# predictions (with explanations requested) back to BigQuery under this project.
batch_input_uri = f'bq://{PROJECT_ID}.{DATANAME}.batch_01'

batch = aiplatform.BatchPredictionJob.create(
    job_display_name=f'{NOTEBOOK}_{DATANAME}_{TIMESTAMP}',
    model_name=model.name,
    instances_format="bigquery",
    predictions_format="bigquery",
    bigquery_source=batch_input_uri,
    bigquery_destination_prefix=PROJECT_ID,
    generate_explanation=True,
    labels={'notebook': NOTEBOOK},
)

INFO:google.cloud.aiplatform.jobs:Creating BatchPredictionJob
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob created. Resource name: projects/746038361521/locations/us-central1/batchPredictionJobs/4379795167838732288
INFO:google.cloud.aiplatform.jobs:To use this BatchPredictionJob in another session:
INFO:google.cloud.aiplatform.jobs:bpj = aiplatform.BatchPredictionJob('projects/746038361521/locations/us-central1/batchPredictionJobs/4379795167838732288')
INFO:google.cloud.aiplatform.jobs:View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/batch-predictions/4379795167838732288?project=746038361521
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/746038361521/locations/us-central1/batchPredictionJobs/4379795167838732288 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/746038361521/locations/us-central1/batchPredictionJobs/4379795167838732288 current state:
JobState.JOB_STAT