In [1]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Using NVTabular for large scale feature engineering on CSV files in Google Cloud Storage

This notebook demonstrates how to preprocess data with NVIDIA NVTabular, running as steps of a Vertex AI pipeline.  
You will create a pipeline with the following steps:
 - Read CSV files from Google Cloud Storage (GCS)
 - Convert these files to parquet format and write to GCS
 - Define the DAG with transformation steps and create a Workflow
 - Fit the dataset (calculate statistics necessary for data transformation)
 - Transform the data
 - Output transformed parquet files to GCS

The goal is to show how to use NVTabular to transform the data on multiple GPUs.  
The dataset used for this tutorial is the [Criteo 1TB Click Logs dataset](https://ailab.criteo.com/download-criteo-1tb-click-logs-dataset/).  

Architecture overview:

<img src="./images/pipeline_1.png" alt="Pipeline" style="height: 60%; width:60%;"/>

## Setup

### Import libraries

In [1]:
# Environment variables
import config

# Standard
import json
from datetime import datetime

# Google Cloud
from google.cloud import aiplatform

# Kubeflow Pipelines
from kfp.v2 import compiler

# NVTabular
import nvtabular as nvt
from nvtabular.ops import (
    Categorify,
    Clip,
    FillMissing,
    Normalize,
)

# Import components and pipeline definition
from src.preprocessing.pipeline_gcs import preprocessing_pipeline_gcs

# Run identifier in YYYYMMDDHHMMSS form, captured once when this cell executes
_cell_run_time = datetime.now()
TIMESTAMP = _cell_run_time.strftime("%Y%m%d%H%M%S")

## 1. Create Workflow DAG definition for data transformation

### Columns and dtype definitions

In [2]:
# Column names: "I1".."I13" (continuous), "C1".."C26" (categorical),
# preceded by the "label" column
cont_names = [f"I{idx}" for idx in range(1, 14)]
cat_names = [f"C{idx}" for idx in range(1, 27)]
columns = ["label", *cont_names, *cat_names]

# Dtype of every column. "hex" marks values that are hexadecimal
# strings and should be converted to int32
cols_dtype = {
    "label": "int32",
    **{name: "int32" for name in cont_names},
    **{name: "hex" for name in cat_names},
}

### Create transformation pipeline

In [3]:
# Transformation graph

# Categorical columns: Categorify, with max_size set to num_buckets
num_buckets = 10_000_000
categorify_op = Categorify(max_size=num_buckets)
cat_features = cat_names >> categorify_op

# Continuous columns: FillMissing, then Clip(min_value=0), then Normalize
cont_filled = cont_names >> FillMissing()
cont_clipped = cont_filled >> Clip(min_value=0)
cont_features = cont_clipped >> Normalize()

# Workflow output: both feature groups plus the 'label' column, which has no ops applied
features = cat_features + cont_features + ['label']

In [5]:
# Wrap the feature graph in an NVTabular Workflow and save it to a local
# directory (the next cell copies that directory to GCS)
LOCAL_WORKFLOW_DIR = './saved_workflow'
workflow = nvt.Workflow(features)
workflow.save(LOCAL_WORKFLOW_DIR)

In [6]:
# Upload the locally saved workflow directory to GCS.
# The first 5 characters of config.SAVED_WORKFLOW_PATH are dropped and 'gs://'
# is prepended -- presumably to normalise a 'gs://' or '/gcs/' prefix;
# verify against the value defined in config.
! gsutil cp -r ./saved_workflow/ 'gs://{config.SAVED_WORKFLOW_PATH[5:]}'

Copying file://./saved_workflow/workflow.pkl [Content-Type=application/octet-stream]...
Copying file://./saved_workflow/metadata.json [Content-Type=application/json]...

Operation completed over 2 objects/1.8 KiB.                                      


## 2. Pipeline definition

### Parameters values for pipeline execution

In [3]:
# Sample CSV inputs to be converted: one for training, one for validation
train_paths = ["gs://workshop-datasets/criteo/day_0"]
valid_paths = ["gs://workshop-datasets/criteo/day_1"]

# Separator used in the CSV files
sep = "\t"

# GPU identifiers. Only 1 GPU is used here, so only the first identifier is
# passed; to run the pipeline with 4 GPUs, use '0,1,2,3'.
gpus = "0"

# Whether the train/valid paths should be navigated recursively
recursive = False

# How to shuffle the dataset, both in the CSV-to-Parquet conversion
# and during transformation
shuffle = None

### Create dictionary with parameter values

In [8]:
# Gather all the parameters defined so far into a single dictionary.
# Lists, dicts, booleans and None are JSON-encoded; the remaining values
# are passed through as they are.
parameter_values = dict(
    train_paths=json.dumps(train_paths),
    valid_paths=json.dumps(valid_paths),
    output_path=config.OUTPUT_PATH,
    columns=json.dumps(columns),
    cols_dtype=json.dumps(cols_dtype),
    output_transformed=config.OUTPUT_TRANSFORMED,
    workflow_path=config.SAVED_WORKFLOW_PATH,
    sep=sep,
    gpus=gpus,
    recursive=json.dumps(recursive),
    shuffle=json.dumps(shuffle),
)

## 3. Pipeline execution

### KFP pipeline compilation

In [11]:
# Compile the pipeline definition: this validates the pipeline and
# writes its specification to a JSON file at PACKAGE_PATH
PACKAGE_PATH = 'nvt_gcs_pipeline.json'
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=preprocessing_pipeline_gcs,
    package_path=PACKAGE_PATH,
)

### Initialize aiplatform SDK client

In [12]:
# Configure the aiplatform SDK with the project, region and staging
# bucket taken from config
aiplatform_settings = {
    'project': config.PROJECT_ID,
    'location': config.REGION,
    'staging_bucket': config.STAGING_BUCKET,
}
aiplatform.init(**aiplatform_settings)

### Submit job to Vertex AI Pipelines

In [None]:
# Build the job from the compiled pipeline spec; the display name is
# prefixed with TIMESTAMP and caching is disabled
job_display_name = f'{TIMESTAMP}_nvt_gcs_pipeline'
pipeline_job = aiplatform.PipelineJob(
    display_name=job_display_name,
    template_path=PACKAGE_PATH,
    enable_caching=False,
    parameter_values=parameter_values,
)

# Submit the job to Vertex AI Pipelines
pipeline_job.run()

# Test section

In [1]:
# NOTE(review): this cell repeats the column and parameter definitions from
# sections 1 and 2 -- presumably so the test section can run on its own; confirm.

# Column names: "I1".."I13" (continuous), "C1".."C26" (categorical),
# preceded by the "label" column
cont_names = [f"I{idx}" for idx in range(1, 14)]
cat_names = [f"C{idx}" for idx in range(1, 27)]
columns = ["label", *cont_names, *cat_names]

# Dtype of every column. "hex" marks values that are hexadecimal
# strings and should be converted to int32
cols_dtype = {
    "label": "int32",
    **{name: "int32" for name in cont_names},
    **{name: "hex" for name in cat_names},
}

# Sample CSV inputs to be converted: one for training, one for validation
train_paths = ["gs://workshop-datasets/criteo/day_0"]
valid_paths = ["gs://workshop-datasets/criteo/day_1"]

# Destination for the converted files.
# NOTE(review): looks like a developer-specific bucket -- replace with your own.
output_path = "gs://renatoleite-criteo-partial/converted"

# Separator used in the CSV files
sep = "\t"

# GPU identifiers. Only 1 GPU is used here, so only the first identifier is
# passed; to run the pipeline with 4 GPUs, use '0,1,2,3'.
gpus = "0"

# Whether the train/valid paths should be navigated recursively
recursive = False

# How to shuffle the dataset, both in the CSV-to-Parquet conversion
# and during transformation
shuffle = None

In [2]:
def convert_csv_to_parquet(
    train_paths: list,
    valid_paths: list,
    output_path: str,
    columns: list,
    cols_dtype: dict,
    sep: str,
    gpus: str,
    shuffle: str = None,
    recursive: bool = False
):
    '''
    Convert the training and validation CSV file(s) to Parquet format
    using NVTabular.

    A local Dask CUDA cluster is started for the conversion and is shut
    down when the function returns or raises. Every split ('train',
    'valid') that has at least one path is read as CSV and written to
    '<output_path>/<split>'. Splits with no paths are skipped with a
    warning.

    Parameters
    ----------
    train_paths: list
        List of paths to folders or files in GCS for training.
        Format:
            'gs://<bucket_name>/<subfolder1>/<subfolder>/' or
            'gs://<bucket_name>/<subfolder1>/<subfolder>/flat_file.csv' or
            a combination of both.
    valid_paths: list
        List of paths to folders or files in GCS for validation.
        Same format as train_paths.
    output_path: str
        Path in GCS to write the converted parquet files.
        Format:
            'gs://<bucket_name>/<subfolder1>/<subfolder>'
    columns: list
        List with the columns name from CSV file.
        Format:
            ['I1', 'I2', ..., 'C1', ...]
    cols_dtype: dict
        Dict with the dtype of the columns from CSV. 'hex' is kept as the
        string 'hex'; any other value must be the name of a numpy
        attribute (e.g. 'int32').
        Format:
            {'I1':'int32', ..., 'C20':'hex'}
    sep: str
        Separator of the CSV file(s).
    gpus: str
        GPUs the Dask CUDA cluster may use, forwarded as
        CUDA_VISIBLE_DEVICES. An empty value leaves the choice to
        dask_cuda.
        Format:
            If there are 4 gpus available, must be '0,1,2,3'
    shuffle: str
        How to shuffle the converted CSV, default to None.
        Options:
            PER_PARTITION
            PER_WORKER
            FULL
    recursive: bool
        Accepted for interface compatibility. This implementation does
        not use it: the paths are handed to nvt.Dataset unchanged.

    Returns
    -------
    dict
        Maps each converted split to the location of its parquet files.
        Usage:
            output_datasets['train']
                .example: 'gs://my_bucket/folders/train'
            output_datasets['valid']
                .example: 'gs://my_bucket/folders/valid'

    Raises
    ------
    AttributeError
        If shuffle is not a member of nvtabular's Shuffle, or a dtype
        name in cols_dtype is not a numpy attribute.
    '''

    # Standard Libraries
    import logging
    import os

    # External Dependencies
    from dask_cuda import LocalCUDACluster
    from dask.distributed import Client
    import numpy as np

    # NVTabular
    from nvtabular.utils import device_mem_size, get_rmm_size
    import nvtabular as nvt
    from nvtabular.io.shuffle import Shuffle

    logging.basicConfig(level=logging.INFO)

    # Specify column dtypes (from numpy). Note that 'hex' means that
    # the values will be hexadecimal strings that should be converted to int32
    logging.info('Converting columns dtypes to numpy objects')
    converted_col_dtype = {
        col: 'hex' if dt == 'hex' else getattr(np, dt)
        for col, dt in cols_dtype.items()
    }

    # Resolve the shuffle mode once, before any GPU resource is allocated,
    # so an invalid name fails fast and the argument itself is not rebound.
    shuffle_mode = getattr(Shuffle, shuffle) if shuffle else None

    output_datasets = {}
    splits = (('train', train_paths), ('valid', valid_paths))

    logging.info('Creating a Dask CUDA cluster')
    # The context managers close the client and the cluster even when the
    # conversion raises, so no Dask workers are left holding GPU memory.
    with LocalCUDACluster(
        CUDA_VISIBLE_DEVICES=gpus or None,
        rmm_pool_size=get_rmm_size(0.8 * device_mem_size())
    ) as cluster, Client(cluster) as client:
        for split_name, split_paths in splits:
            if not split_paths:
                logging.warning(
                    f'No paths provided for {split_name}, skipping'
                )
                continue

            logging.info(f'Creating Dataset definition for {split_name}')
            dataset = nvt.Dataset(
                path_or_source=split_paths,
                engine='csv',
                names=columns,
                sep=sep,
                dtypes=converted_col_dtype,
                client=client
            )

            full_output_path = os.path.join(output_path, split_name)

            logging.info(f'Writing parquet file(s) to {full_output_path}')
            dataset.to_parquet(
                full_output_path,
                preserve_files=True,
                dtypes=cols_dtype,
                shuffle=shuffle_mode
            )

            # Record where this split was written
            output_datasets[split_name] = full_output_path

    return output_datasets

In [None]:
# Run the conversion with the test-section parameters defined above
ret = convert_csv_to_parquet(
    train_paths=train_paths,
    valid_paths=valid_paths,
    output_path=output_path,
    columns=columns,
    cols_dtype=cols_dtype,
    sep=sep,
    gpus=gpus,
    shuffle=shuffle,
    recursive=recursive,
)

In [2]:
import logging
import fsspec
import os

# External Dependencies
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
import numpy as np

# NVTabular
from nvtabular.utils import device_mem_size, get_rmm_size
import nvtabular as nvt
from nvtabular.io.shuffle import Shuffle

In [3]:
# RMM pool size: 0.8 of the value reported by device_mem_size(),
# passed through get_rmm_size()
rmm_pool_bytes = get_rmm_size(0.8 * device_mem_size())

# Start a local Dask CUDA cluster and connect a client to it
cluster = LocalCUDACluster(rmm_pool_size=rmm_pool_bytes)
client = Client(cluster)

distributed.diskutils - INFO - Found stale lock file and directory '/home/renatoleite/workspace/merlin-on-vertex/dask-worker-space/worker-rddhbolk', purging
distributed.preloading - INFO - Import preload module: dask_cuda.initialize


In [4]:
import numpy as np
# Map each dtype name to its numpy attribute; 'hex' entries stay as the string 'hex'
converted_col_dtype = {
    name: ('hex' if kind == 'hex' else getattr(np, kind))
    for name, kind in cols_dtype.items()
}

In [5]:
# Define an NVTabular dataset over a single sample CSV file.
# NOTE(review): the output recorded for this cell is a ValueError from dask
# ("Mismatched dtypes found in `pd.read_csv`/`pd.read_table`": I2, I7 and I12
# found as float64 where int64 was expected). The message suggests passing
# explicit float64 dtypes or `assume_missing=True` -- confirm which applies here.
csv_source_paths = ['gs://renatoleite-criteo-partial/flat_data/day_0']
dataset = nvt.Dataset(
    path_or_source=csv_source_paths,
    engine='csv',
    names=columns,
    sep=sep,
    dtypes=converted_col_dtype,
    client=client,
)

Function:  execute_task
args:      ((subgraph_callable-39b3570a-9b7a-4835-a73a-106b0b27ee24, [(<function read_block_from_file at 0x7f4ca14f95f0>, <OpenFile 'renatoleite-criteo-partial/flat_data/day_0'>, 0, 1980465152, b'\n'), None, True]))
kwargs:    {}
Exception: ValueError("Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n\n+--------+---------+----------+\n| Column | Found   | Expected |\n+--------+---------+----------+\n| I12    | float64 | int64    |\n| I2     | float64 | int64    |\n| I7     | float64 | int64    |\n+--------+---------+----------+\n\nUsually this is due to dask's dtype inference failing, and\n*may* be fixed by specifying dtypes manually by adding:\n\ndtype={'I12': 'float64',\n       'I2': 'float64',\n       'I7': 'float64'}\n\nto the call to `read_csv`/`read_table`.\n\nAlternatively, provide `assume_missing=True` to interpret\nall unspecified integer columns as floats.")



ValueError: Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.

+--------+---------+----------+
| Column | Found   | Expected |
+--------+---------+----------+
| I12    | float64 | int64    |
| I2     | float64 | int64    |
| I7     | float64 | int64    |
+--------+---------+----------+

Usually this is due to dask's dtype inference failing, and
*may* be fixed by specifying dtypes manually by adding:

dtype={'I12': 'float64',
       'I2': 'float64',
       'I7': 'float64'}

to the call to `read_csv`/`read_table`.

Alternatively, provide `assume_missing=True` to interpret
all unspecified integer columns as floats.