# 1) Pipeline: Source data in GCS
## Data Definition

In [4]:
# Standard
import json

# Import components and pipeline definition
from pipeline_gcs import preprocessing_pipeline_gcs

# Google Cloud
from google.cloud import aiplatform

# Kubeflow Pipelines
import kfp
from kfp.v2 import compiler

# NVTabular
import nvtabular as nvt
from nvtabular.ops import (
    Categorify,
    Clip,
    FillMissing,
    Normalize,
)

In [1]:
# GCP variables
# Google Cloud project ID; later passed as `project` to aiplatform.init().
project_id = 'renatoleite-mldemos'

In [3]:
# Column layout: one label column, 13 continuous columns (I1..I13)
# and 26 categorical columns (C1..C26), in that order.
cont_names = [f"I{idx}" for idx in range(1, 14)]
cat_names = [f"C{idx}" for idx in range(1, 27)]
columns = ["label", *cont_names, *cat_names]

# Per-column dtypes, inserted in the same order as `columns`.
# "hex" marks values that arrive as hexadecimal strings and should
# be converted to int32.
cols_dtype = {"label": "int32"}
cols_dtype.update(dict.fromkeys(cont_names, "int32"))
cols_dtype.update(dict.fromkeys(cat_names, "hex"))

In [None]:
# Transformation pipeline (NVTabular operator graph)

# Categorical columns: Categorify, with max_size set to num_buckets.
num_buckets = 10_000_000
categorify_op = Categorify(max_size=num_buckets)
cat_features = cat_names >> categorify_op

# Continuous columns: FillMissing -> Clip(min_value=0) -> Normalize.
cont_features = (
    cont_names
    >> FillMissing()
    >> Clip(min_value=0)
    >> Normalize()
)

# Full output: categorical and continuous features plus the 'label' column.
features = cat_features + cont_features + ['label']

In [None]:
# Create the NVTabular workflow from the `features` graph.
# NOTE(review): saving is currently disabled - the call below is commented
# out, and `local_path` is not defined in the visible cells.
workflow = nvt.Workflow(features)
# workflow.save(local_path)

In [5]:
# Data locations, written as "<bucket>/<path>" without a gs:// scheme.
# NOTE(review): presumably the pipeline components resolve these against
# GCS themselves - confirm against pipeline_gcs.
train_paths = ['renatoleite-criteo-partial/flat_data/day_0']
valid_paths = ['renatoleite-criteo-partial/flat_data/day_1']
output_path = 'renatoleite-criteo-partial/converted'
workflow_path = 'renatoleite-criteo-partial/saved_workflow'
output_transformed = 'renatoleite-criteo-partial/transformed_data/'

# Field separator of the source files (tab character).
sep = '\t'
# Passed to the pipeline as a string.
# NOTE(review): presumably GPU device id(s) - confirm against pipeline_gcs.
gpus = '0'

In [6]:
# Runtime parameters handed to the compiled pipeline.
# List and dict values are JSON-encoded into strings; everything else is
# passed through as a plain string.
parameter_values = {
    'train_paths': json.dumps(train_paths),
    'valid_paths': json.dumps(valid_paths),
    'output_path': output_path,
    'columns': json.dumps(columns),
    'cols_dtype': json.dumps(cols_dtype),
    'output_transformed': output_transformed,
    'sep': sep,
    'gpus': gpus,
    # Derived from `workflow_path` so the location is defined in one place
    # instead of being duplicated as a literal. Unlike the other paths,
    # this one is passed with an explicit gs:// scheme.
    'workflow_path': f'gs://{workflow_path}',
}

In [10]:
# Compile the pipeline function into a pipeline package written to
# PACKAGE_PATH (a path relative to the current working directory).
PACKAGE_PATH = 'gcs_pipeline.json'
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=preprocessing_pipeline_gcs,
    package_path=PACKAGE_PATH,
)

In [None]:
# Settings passed to aiplatform.init().
project_id = 'renatoleite-mldemos'
region = 'us-central1'
# NOTE(review): staging_bucket is an empty string here - presumably it must
# be set to a real bucket before the pipeline is submitted; confirm.
staging_bucket = ''

In [11]:
# Initialise the Vertex AI SDK with the project, region and staging bucket
# defined in the notebook.
aiplatform.init(project=project_id, location=region, staging_bucket=staging_bucket)

In [None]:
# Build the pipeline job from the compiled package and the runtime
# parameters, with step caching turned off.
pipeline_job = aiplatform.PipelineJob(
    template_path=PACKAGE_PATH,
    parameter_values=parameter_values,
    display_name='nvt_convert_pipeline',
    enable_caching=False,
)

# Submit the job for execution.
pipeline_job.run()