In [1]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Feature engineering for large scale recommenders with NVIDIA NVTabular and Vertex AI

# Overview

The focus of this guide is to compile prescriptive guidelines for developing and operationalizing data preprocessing and feature engineering workflows using Vertex AI and NVIDIA NVTabular.  

The guide also covers high-performance storage management and access, as well as configuration management for NVTabular feature engineering artifacts.

This tutorial demonstrates how to use NVIDIA NVTabular to preprocess a large dataset on GPUs orchestrated by Vertex AI Pipelines.

# Dataset

The dataset used for this tutorial is the [Criteo 1TB Click Logs dataset](https://ailab.criteo.com/download-criteo-1tb-click-logs-dataset/).  

### From the Criteo website:
 - This dataset contains feature values and click feedback for millions of display ads. Its purpose is to benchmark algorithms for clickthrough rate (CTR) prediction.
 - This dataset contains 24 files, each one corresponding to one day of data.
 - Each row corresponds to a display ad served by Criteo and the first column indicates whether this ad has been clicked or not. The positive (clicked) and negative (non-clicked) examples have both been subsampled (but at different rates) in order to reduce the dataset size.
 - There are 13 features taking integer values (mostly count features) and 26 categorical features. The values of the categorical features have been hashed
onto 32 bits for anonymization purposes.
 - The semantic of these features is undisclosed. Some features may have missing values.
 - The rows are chronologically ordered.

#### Data fields
 - Label - Target variable that indicates if an ad was clicked (1) or not (0).
 - I1-I13 - A total of 13 columns of integer features (mostly count features).
 - C1-C26 - A total of 26 columns of categorical features. The values of these features have been hashed onto 32 bits for anonymization purposes. 
 - The semantic of the features is undisclosed.  
 - When a value is missing, the field is empty.

#### Format
The columns are tab separated with the following schema:  
`<label> <integer feature 1> … <integer feature 13> <categorical feature 1> … <categorical feature 26>`

# Objective
This notebook demonstrates how to do data preprocessing with NVIDIA NVTabular on Vertex AI Pipeline steps.
Three different pipelines are created to show possible alternatives on how to input and output data.

*Pipeline 1*: Input CSV files from Google Cloud Storage (GCS for short)  
 - Read CSV files from Google Cloud Storage
 - Fit the dataset (calculate statistics necessary for data transformation)
 - Transform the data
 - Output to GCS

*Pipeline 2*: Input Parquet files exported from BigQuery
 - Export Parquet files from a table in BigQuery
 - Fit the dataset (calculate statistics necessary for data transformation)
 - Transform the data
 - Output to GCS

*Pipeline 3*: Input CSV files from GCS and output to Vertex AI Feature Store 
 - Read CSV files from Google Cloud Storage
 - Fit the dataset (calculate statistics necessary for data transformation)
 - Transform the data
 - Output to GCS
 - Load the transformed data to BigQuery
 - Create a Vertex AI Feature Store and load the data from BigQuery

The goal is to present how to use NVTabular to transform the data on GPU, and the different ways of inputting (GCS and BigQuery) and outputting (GCS, BigQuery, Feature Store) data.

# Costs
This tutorial uses billable components of Google Cloud (GCP):
 - Vertex AI (Pipelines, Feature Store)
 - Cloud Storage
 - BigQuery

Use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

# Installation

## Vertex AI SDK, NVTabular


In [1]:
# Standard
import json

# Google Cloud
from google.cloud import aiplatform

# Kubeflow Pipelines
from kfp.v2 import compiler

# NVTabular
import nvtabular as nvt
from nvtabular.ops import (
    Categorify,
    Clip,
    FillMissing,
    Normalize,
)



# 1) Pipeline: Source data in GCS
## Data Definition

In [2]:
# Import components and pipeline definition
from pipeline_gcs import preprocessing_pipeline_gcs

In [3]:
# Column layout of the raw files: the click label first, then the 13
# integer features (I1..I13), then the 26 categorical features (C1..C26).
cont_names = [f"I{i}" for i in range(1, 14)]
cat_names = [f"C{i}" for i in range(1, 27)]
columns = ["label", *cont_names, *cat_names]

# Per-column dtypes, in the same order as `columns`. The marker "hex"
# flags values that are hexadecimal strings and should be converted
# to int32.
cols_dtype = dict.fromkeys(["label", *cont_names], 'int32')
cols_dtype.update(dict.fromkeys(cat_names, 'hex'))

In [4]:
# Feature-engineering graph.
# Categorical columns go through Categorify with max_size=num_buckets.
num_buckets = 10_000_000
categorify_op = Categorify(max_size=num_buckets)
cat_features = cat_names >> categorify_op

# Continuous columns: fill missing values, clip below at 0, then normalize.
cont_features = (
    cont_names
    >> FillMissing()
    >> Clip(min_value=0)
    >> Normalize()
)

# The label column is carried along without any op applied to it.
features = cat_features + cont_features + ['label']

# Build the workflow and save it to a local directory.
# NOTE(review): the pipeline parameter `workflow_path` points at
# 'renatoleite-criteo-partial/saved_workflow', not at this local folder --
# presumably the saved workflow has to be copied there; confirm.
workflow = nvt.Workflow(features)
workflow.save('./saved_workflow')

In [5]:
# Storage locations for pipeline 1 (CSV files in GCS in, GCS out).
# The bucket name is defined once so the notebook can be pointed at a
# different bucket by editing a single line.
# NOTE(review): the paths carry no "gs://" scheme; presumably the pipeline
# components expect bucket-relative paths -- confirm against pipeline_gcs.
BUCKET = 'renatoleite-criteo-partial'

train_paths = [f'{BUCKET}/flat_data/day_0']
valid_paths = [f'{BUCKET}/flat_data/day_1']
output_path = f'{BUCKET}/converted'
workflow_path = f'{BUCKET}/saved_workflow'
output_transformed = f'{BUCKET}/transformed_data'

# The raw Criteo files are tab separated.
sep = '\t'
# Handed to the pipeline as the 'gpus' parameter (kept as a string).
gpus = '0'

# These two are JSON-encoded when the pipeline parameters are assembled.
recursive = False
shuffle = None

In [6]:
# Runtime parameters for the GCS pipeline. Lists, dicts, booleans and None
# are JSON-encoded into strings; plain strings are passed through unchanged.
parameter_values = dict(
    train_paths=json.dumps(train_paths),
    valid_paths=json.dumps(valid_paths),
    output_path=output_path,
    columns=json.dumps(columns),
    cols_dtype=json.dumps(cols_dtype),
    output_transformed=output_transformed,
    workflow_path=workflow_path,
    sep=sep,
    gpus=gpus,
    recursive=json.dumps(recursive),
    shuffle=json.dumps(shuffle),
)

In [7]:
# Compile the GCS pipeline function; the result is written to PACKAGE_PATH.
PACKAGE_PATH = 'pipeline_gcs.json'
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=preprocessing_pipeline_gcs,
    package_path=PACKAGE_PATH,
)

In [8]:
# Project and region handed to aiplatform.init, plus its staging bucket.
project_id, region = 'renatoleite-mldemos', 'us-central1'
staging_bucket = 'gs://renatoleite-staging'

In [9]:
# Initialise the Vertex AI SDK with the project, region and staging bucket.
aiplatform.init(project=project_id, location=region, staging_bucket=staging_bucket)

In [10]:
# Create the pipeline job from the compiled template and run it.
# NOTE(review): the output recorded for this cell shows the run failing with
# "Invalid image URI nvcr.io/nvidia/merlin/merlin-training:21.09" --
# presumably the image must be hosted in a registry Vertex AI accepts;
# confirm and update the pipeline definition before re-running.
job_kwargs = {
    'display_name': 'nvt_convert_pipeline_gcs',
    'template_path': PACKAGE_PATH,
    'enable_caching': False,
    'parameter_values': parameter_values,
}
pipeline_job = aiplatform.PipelineJob(**job_kwargs)
pipeline_job.run()


INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob


InvalidArgument: 400 Invalid image URI nvcr.io/nvidia/merlin/merlin-training:21.09.

# 2) Pipeline: Source data in BQ

In [2]:
# Import components and pipeline definition
from pipeline_bq import preprocessing_pipeline_bq

In [3]:
# Configuration for pipeline 2 (BigQuery tables in, GCS out).
# The bucket name is defined once so the notebook can be pointed at a
# different bucket by editing a single line.
# NOTE(review): the paths carry no "gs://" scheme; presumably the pipeline
# components expect bucket-relative paths -- confirm against pipeline_bq.
BUCKET = 'renatoleite-criteo-partial'

# BigQuery source: the train and valid tables and where they live.
bq_project = 'renatoleite-mldemos'
bq_dataset_id = 'criteo_pipeline'
bq_table_train = 'train'
bq_table_valid = 'valid'
location = 'US'

# Storage locations inside the bucket.
output_path = f'{BUCKET}/bq_converted'
workflow_path = f'{BUCKET}/saved_workflow'
output_transformed = f'{BUCKET}/bq_transformed_data'

# Handed to the pipeline as the 'gpus' parameter (kept as a string).
gpus = '0'

# These two are JSON-encoded when the pipeline parameters are assembled.
recursive = False
shuffle = None

In [4]:
# Runtime parameters for the BigQuery pipeline. The boolean and None values
# are JSON-encoded into strings; plain strings are passed through unchanged.
parameter_values = dict(
    bq_table_train=bq_table_train,
    bq_table_valid=bq_table_valid,
    output_path=output_path,
    bq_project=bq_project,
    bq_dataset_id=bq_dataset_id,
    location=location,
    gpus=gpus,
    workflow_path=workflow_path,
    output_transformed=output_transformed,
    recursive=json.dumps(recursive),
    shuffle=json.dumps(shuffle),
)

In [7]:
# Compile the BigQuery pipeline function; the result is written to PACKAGE_PATH.
PACKAGE_PATH = 'pipeline_bq.json'
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=preprocessing_pipeline_bq,
    package_path=PACKAGE_PATH,
)

In [8]:
# Project and region handed to aiplatform.init, plus its staging bucket.
project_id, region = 'renatoleite-mldemos', 'us-central1'
staging_bucket = 'gs://renatoleite-staging'

In [9]:
# Initialise the Vertex AI SDK with the project, region and staging bucket.
aiplatform.init(project=project_id, location=region, staging_bucket=staging_bucket)

In [None]:
# Create the pipeline job from the compiled BigQuery template and run it.
job_kwargs = {
    'display_name': 'nvt_convert_pipeline_bq',
    'template_path': PACKAGE_PATH,
    'enable_caching': False,
    'parameter_values': parameter_values,
}
pipeline_job = aiplatform.PipelineJob(**job_kwargs)
pipeline_job.run()

# 3) Pipeline: Source GCS and output to Feature Store

In [2]:
# 1) Create op to export from gcs parquet back to BQ
# 2) Export dataset to Feature Store

# Import components and pipeline definition
from pipeline_gcs_feat import preprocessing_pipeline_gcs_feat

In [3]:
# Column layout of the raw files: the click label first, then the 13
# integer features (I1..I13), then the 26 categorical features (C1..C26).
cont_names = [f"I{i}" for i in range(1, 14)]
cat_names = [f"C{i}" for i in range(1, 27)]
columns = ["label", *cont_names, *cat_names]

# Per-column dtypes, in the same order as `columns`. The marker "hex"
# flags values that are hexadecimal strings and should be converted
# to int32.
cols_dtype = dict.fromkeys(["label", *cont_names], 'int32')
cols_dtype.update(dict.fromkeys(cat_names, 'hex'))

In [4]:
# Configuration for pipeline 3 (CSV files in GCS in; GCS, BigQuery and
# Feature Store out).
# The bucket name is defined once so the notebook can be pointed at a
# different bucket by editing a single line.
# NOTE(review): the paths carry no "gs://" scheme; presumably the pipeline
# components expect bucket-relative paths -- confirm against pipeline_gcs_feat.
BUCKET = 'renatoleite-criteo-partial'

train_paths = [f'{BUCKET}/flat_data/day_0']
valid_paths = [f'{BUCKET}/flat_data/day_1']
output_path = f'{BUCKET}/converted'
workflow_path = f'{BUCKET}/saved_workflow'
output_transformed = f'{BUCKET}/transformed_data'

# The raw Criteo files are tab separated.
sep = '\t'
# Handed to the pipeline as the 'gpus' parameter (kept as a string).
gpus = '0'

# These two are JSON-encoded when the pipeline parameters are assembled.
recursive = False
shuffle = None

# BigQuery destination for the transformed data.
bq_project = 'renatoleite-mldemos'
bq_dataset_id = 'criteo_pipeline'
bq_dest_table_id = 'transformed_train'

In [5]:
# Runtime parameters for the GCS-to-Feature-Store pipeline. Lists, dicts,
# booleans and None are JSON-encoded into strings; plain strings are passed
# through unchanged.
parameter_values = dict(
    train_paths=json.dumps(train_paths),
    valid_paths=json.dumps(valid_paths),
    output_path=output_path,
    columns=json.dumps(columns),
    cols_dtype=json.dumps(cols_dtype),
    output_transformed=output_transformed,
    workflow_path=workflow_path,
    sep=sep,
    gpus=gpus,
    bq_project=bq_project,
    bq_dataset_id=bq_dataset_id,
    bq_dest_table_id=bq_dest_table_id,
    recursive=json.dumps(recursive),
    shuffle=json.dumps(shuffle),
)

In [7]:
# Compile the GCS-to-Feature-Store pipeline function; the result is written
# to PACKAGE_PATH.
PACKAGE_PATH = 'pipeline_nvt_gcs_feat.json'
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=preprocessing_pipeline_gcs_feat,
    package_path=PACKAGE_PATH,
)

In [8]:
# Project and region handed to aiplatform.init, plus its staging bucket.
project_id, region = 'renatoleite-mldemos', 'us-central1'
staging_bucket = 'gs://renatoleite-staging'

In [9]:
# Initialise the Vertex AI SDK with the project, region and staging bucket.
aiplatform.init(project=project_id, location=region, staging_bucket=staging_bucket)

In [None]:
# Create the pipeline job from the compiled Feature Store template and run it.
job_kwargs = {
    'display_name': 'pipeline_nvt_gcs_feat',
    'template_path': PACKAGE_PATH,
    'enable_caching': False,
    'parameter_values': parameter_values,
}
pipeline_job = aiplatform.PipelineJob(**job_kwargs)
pipeline_job.run()