In [1]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Using NVTabular for large scale feature engineering on CSV files in Google Cloud Storage

This notebook demonstrates how to preprocess data with NVIDIA NVTabular in Vertex AI Pipelines steps, using Google Cloud Storage as the data source.  
You will create a pipeline with the following steps:
 - Read CSV files from Google Cloud Storage (GCS)
 - Convert these files to parquet format and write to GCS
 - Define the DAG with transformation steps and create a Workflow
 - Fit the dataset (calculate statistics necessary for data transformation)
 - Transform the data
 - Output transformed parquet files to GCS

The goal is to show how to use NVTabular to transform the data on multiple GPUs.  
The dataset used for this tutorial is the [Criteo 1TB Click Logs dataset](https://ailab.criteo.com/download-criteo-1tb-click-logs-dataset/).  

Architecture overview:

<img src="./images/pipeline_1.png" alt="Pipeline" style="height: 60%; width:60%;"/>

## Setup

### Import libraries

In [1]:
# Environment variables
import config

# Standard
import json
from datetime import datetime

# Google Cloud
from google.cloud import aiplatform

# Kubeflow Pipelines
from kfp.v2 import compiler

# NVTabular
import nvtabular as nvt
from nvtabular.ops import (
    Categorify,
    Clip,
    FillMissing,
    Normalize,
)

# Import components and pipeline definition
from src.preprocessing.pipeline_gcs import preprocessing_pipeline_gcs

# Run timestamp (local time, YYYYMMDDHHMMSS); used as the prefix of the pipeline job's display name
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"

## 1. Create Workflow DAG definition for data transformation

### Columns and dtype definitions

In [2]:
# Column names: the "label" column, followed by the continuous
# features I1..I13 and the categorical features C1..C26
cont_names = [f"I{idx}" for idx in range(1, 14)]
cat_names = [f"C{idx}" for idx in range(1, 27)]
columns = ["label", *cont_names, *cat_names]

# Per-column dtypes, keyed in the same order as `columns`.
# The "hex" marker flags columns whose values are hexadecimal
# strings that should be converted to int32
cols_dtype = {
    "label": "int32",
    **dict.fromkeys(cont_names, "int32"),
    **dict.fromkeys(cat_names, "hex"),
}

### Create transformation pipeline

In [3]:
# Transformation graph: categorical and continuous columns each get their own operator chain
num_buckets = 10_000_000  # passed to Categorify as max_size
categorify_op = Categorify(max_size=num_buckets)
cat_features = cat_names >> categorify_op

# Continuous columns: fill missing values, clip with a minimum of 0, then normalize
cont_filled = cont_names >> FillMissing()
cont_clipped = cont_filled >> Clip(min_value=0)
cont_features = cont_clipped >> Normalize()

# Output of the graph: both transformed groups plus the "label" column
features = cat_features + cont_features + ['label']

In [4]:
# Wrap the feature graph in a Workflow (not fitted here) and persist it to a local directory
local_workflow_dir = './saved_workflow'
workflow = nvt.Workflow(features)
workflow.save(local_workflow_dir)

In [5]:
# Upload the locally saved workflow directory to GCS.
# IPython substitutes {config.SAVED_WORKFLOW_PATH} with the Python value before
# running the shell command; the same config value is later passed to the
# pipeline as the 'workflow_path' parameter.
! gsutil cp -r ./saved_workflow/ 'gs://{config.SAVED_WORKFLOW_PATH}'

Copying file://./saved_workflow/workflow.pkl [Content-Type=application/octet-stream]...
Copying file://./saved_workflow/metadata.json [Content-Type=application/json]...

Operation completed over 2 objects/1.6 KiB.                                      


## 2. Pipeline definition

### Parameters values for pipeline execution

In [3]:
# Sample CSV inputs to be converted: one path for training, one for validation
train_paths = ["gs://workshop-datasets/criteo/day_0"]
valid_paths = ["gs://workshop-datasets/criteo/day_1"]

# Separator for the CSV file
sep = "\t"

# Identifier(s) of the GPU(s) to use. As you will execute with only 1 GPU, only the
# first identifier is passed. To execute the pipeline with 4 GPUs, use '0,1,2,3'.
gpus = "0"

# Whether the train/valid paths should be navigated recursively
recursive = False

# How to shuffle the dataset, both in the conversion from CSV to Parquet and during transformation
shuffle = None

### Create dictionary with parameter values

In [4]:
# Gather every parameter defined so far into the mapping handed to the pipeline job.
# Lists, dicts, booleans and None are serialized to JSON strings; the rest are passed as-is.
parameter_values = dict(
    train_paths=json.dumps(train_paths),
    valid_paths=json.dumps(valid_paths),
    output_converted=config.OUTPUT_CONVERTED,
    columns=json.dumps(columns),
    cols_dtype=json.dumps(cols_dtype),
    output_transformed=config.OUTPUT_TRANSFORMED,
    workflow_path=config.SAVED_WORKFLOW_PATH,
    sep=sep,
    gpus=gpus,
    recursive=json.dumps(recursive),
    shuffle=json.dumps(shuffle),
)

## 3. Pipeline execution

### KFP pipeline compilation

In [5]:
# Compile the pipeline: this validates the pipeline definition and
# writes its specification to the JSON file named by PACKAGE_PATH
PACKAGE_PATH = "nvt_gcs_pipeline.json"
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(pipeline_func=preprocessing_pipeline_gcs, package_path=PACKAGE_PATH)

### Initialize aiplatform SDK client

In [6]:
# Configure the aiplatform SDK with the project, region and staging bucket taken from config
sdk_settings = {
    "project": config.PROJECT_ID,
    "location": config.REGION,
    "staging_bucket": config.STAGING_BUCKET,
}
aiplatform.init(**sdk_settings)

### Submit job to Vertex AI Pipelines

In [None]:
# Build the job from the compiled pipeline spec; its display name is prefixed with TIMESTAMP
job_display_name = f'{TIMESTAMP}_nvt_gcs_pipeline'
pipeline_job = aiplatform.PipelineJob(
    display_name=job_display_name,
    template_path=PACKAGE_PATH,
    parameter_values=parameter_values,
    enable_caching=False,
)

# Submit the job to Vertex AI Pipelines
pipeline_job.run()