In [2]:
# Standard
import json

# Google Cloud
from google.cloud import aiplatform

# Kubeflow Pipelines
import kfp
from kfp.v2 import compiler
from kfp.v2.google.client import AIPlatformClient

# NVTabular
import nvtabular as nvt
from nvtabular.ops import (
    Categorify,
    Clip,
    FillMissing,
    Normalize,
)

# 1) Pipeline: Source data in GCS
## Data Definition

In [None]:
# Import components and pipeline definition
from pipeline_gcs import preprocessing_pipeline_gcs

In [2]:
# Column schema: one label column, 13 continuous columns (I1..I13)
# and 26 categorical columns (C1..C26), in that order.
cont_names = [f"I{idx}" for idx in range(1, 14)]
cat_names = [f"C{idx}" for idx in range(1, 27)]
columns = ["label", *cont_names, *cat_names]

# Per-column dtypes, keyed in the same order as `columns`.
# "hex" marks values that are hexadecimal strings and should be
# converted to int32.
cols_dtype = {"label": "int32"}
cols_dtype.update(dict.fromkeys(cont_names, "int32"))
cols_dtype.update(dict.fromkeys(cat_names, "hex"))

In [3]:
# Transformation pipeline

# Categorical branch: every categorical column goes through Categorify,
# with num_buckets passed as its max_size.
num_buckets = 10_000_000
categorify_op = Categorify(max_size=num_buckets)
cat_features = cat_names >> categorify_op

# Continuous branch: FillMissing -> Clip(min_value=0) -> Normalize.
cont_features = (
    cont_names
    >> FillMissing()
    >> Clip(min_value=0)
    >> Normalize()
)

# Both branches plus the label column, which has no op applied here.
features = cat_features + cont_features + ['label']

# Create the workflow. Saving is left disabled; `local_path` is not
# defined in the cells visible here.
workflow = nvt.Workflow(features)
# workflow.save(local_path)

In [3]:
# Input locations, one list per split.
train_paths = ["renatoleite-criteo-partial/flat_data/day_0"]
valid_paths = ["renatoleite-criteo-partial/flat_data/day_1"]

# Output locations handed to the pipeline.
output_path = "renatoleite-criteo-partial/converted"
workflow_path = "renatoleite-criteo-partial/saved_workflow"
output_transformed = "renatoleite-criteo-partial/transformed_data"

# Field separator (a tab character).
sep = "\t"
# NOTE(review): presumably the GPU device id(s) used by the pipeline
# steps -- confirm against the pipeline definition.
gpus = "0"

# Remaining options forwarded to the pipeline as-is.
recursive = False
shuffle = None

In [4]:
# Runtime parameters for the GCS pipeline. Lists, dicts, the boolean
# and the None value are serialized to JSON strings; plain strings are
# passed through unchanged.
parameter_values = dict(
    train_paths=json.dumps(train_paths),
    valid_paths=json.dumps(valid_paths),
    output_path=output_path,
    columns=json.dumps(columns),
    cols_dtype=json.dumps(cols_dtype),
    output_transformed=output_transformed,
    workflow_path=workflow_path,
    sep=sep,
    gpus=gpus,
    recursive=json.dumps(recursive),
    shuffle=json.dumps(shuffle),
)

In [5]:
# Compile the GCS pipeline function, writing the result to PACKAGE_PATH.
PACKAGE_PATH = "pipeline_gcs.json"
compiler.Compiler().compile(
    pipeline_func=preprocessing_pipeline_gcs,
    package_path=PACKAGE_PATH,
)

In [6]:
# Project, region and staging bucket passed to aiplatform.init().
project_id = "renatoleite-mldemos"
region = "us-central1"
staging_bucket = "gs://renatoleite-staging"

In [7]:
# Initialize the aiplatform SDK with the project, region and staging
# bucket defined above.
aiplatform.init(project=project_id, location=region, staging_bucket=staging_bucket)

In [None]:
# Build a pipeline job from the compiled GCS package with caching
# disabled, then run it.
pipeline_job = aiplatform.PipelineJob(
    display_name="nvt_convert_pipeline_gcs",
    template_path=PACKAGE_PATH,
    parameter_values=parameter_values,
    enable_caching=False,
)
pipeline_job.run()


# 2) Pipeline: Source data in BQ

In [3]:
# Import components and pipeline definition
from pipeline_bq import preprocessing_pipeline_bq

In [15]:
# BigQuery source: project, dataset and the train/valid tables.
bq_project = "renatoleite-mldemos"
bq_dataset_id = "criteo_pipeline"
bq_table_train = "train"
bq_table_valid = "valid"
# NOTE(review): presumably the BigQuery dataset location -- confirm
# against the pipeline definition.
location = "US"

# Output locations handed to the pipeline.
output_path = "renatoleite-criteo-partial/bq_converted"
workflow_path = "renatoleite-criteo-partial/saved_workflow"
output_transformed = "renatoleite-criteo-partial/bq_transformed_data"

# NOTE(review): presumably the GPU device id(s) used by the pipeline
# steps -- confirm against the pipeline definition.
gpus = "0"

# Remaining options forwarded to the pipeline as-is.
recursive = False
shuffle = None

In [5]:
# Runtime parameters for the BigQuery pipeline.
# NOTE(review): `recursive` is passed as a raw bool here, while the GCS
# section serializes it with json.dumps -- confirm which form
# preprocessing_pipeline_bq expects.
parameter_values = dict(
    bq_table_train=bq_table_train,
    bq_table_valid=bq_table_valid,
    output_path=output_path,
    bq_project=bq_project,
    bq_dataset_id=bq_dataset_id,
    location=location,
    gpus=gpus,
    workflow_path=workflow_path,
    output_transformed=output_transformed,
    recursive=recursive,
    shuffle=json.dumps(shuffle),
)

In [6]:
# Compile the BigQuery pipeline function, writing the result to
# PACKAGE_PATH.
PACKAGE_PATH = "pipeline_bq.json"
compiler.Compiler().compile(
    pipeline_func=preprocessing_pipeline_bq,
    package_path=PACKAGE_PATH,
)

In [7]:
# Project, region and staging bucket passed to aiplatform.init().
# These repeat the values assigned in the GCS section of this notebook.
project_id = "renatoleite-mldemos"
region = "us-central1"
staging_bucket = "gs://renatoleite-staging"

In [9]:
# Initialize the aiplatform SDK with the project, region and staging
# bucket defined above.
aiplatform.init(project=project_id, location=region, staging_bucket=staging_bucket)

In [None]:
# Build a pipeline job from the compiled BigQuery package with caching
# disabled, then run it.
pipeline_job = aiplatform.PipelineJob(
    display_name="nvt_convert_pipeline_bq",
    template_path=PACKAGE_PATH,
    parameter_values=parameter_values,
    enable_caching=False,
)
pipeline_job.run()

# 3) Pipeline: Source GCS and output to Feature Store

In [10]:
# TODO 1) Create an op to export the Parquet output in GCS back to BQ
# TODO 2) Export the dataset to Feature Store