In [1]:
from local_test_kfp_components import *

In [2]:
import nvtabular as nvt
from nvtabular.ops import (
    Categorify,
    Clip,
    FillMissing,
    Normalize,
)

# Column layout: one "label" column, then 13 continuous columns (I1..I13),
# then 26 categorical columns (C1..C26), in that order.
cont_names = [f"I{idx}" for idx in range(1, 14)]
cat_names = [f"C{idx}" for idx in range(1, 27)]
columns = ["label", *cont_names, *cat_names]

# Per-column dtype map, keyed in the same order as `columns`.
# "hex" marks columns whose values are hexadecimal strings that
# should be converted to int32.
cols_dtype = {
    "label": "int32",
    **dict.fromkeys(cont_names, "int32"),
    **dict.fromkeys(cat_names, "hex"),
}

# Feature-engineering graph.
# Categorical columns go through Categorify, capped at `num_buckets` via max_size.
num_buckets = 10_000_000
categorify_op = Categorify(max_size=num_buckets)
cat_features = cat_names >> categorify_op

# Continuous columns: fill missing values, clip at a minimum of 0, then normalize.
cont_features = (
    cont_names
    >> FillMissing()
    >> Clip(min_value=0)
    >> Normalize()
)

# Final graph output: categorical + continuous features plus the label column.
features = cat_features + cont_features + ['label']

In [2]:
# Parameters for the CSV -> Parquet conversion step.
train_paths = ["renatoleite-criteo-partial/flat_data/day_0"]  # list of training inputs
valid_paths = ["renatoleite-criteo-partial/flat_data/day_1"]  # list of validation inputs
output_path = "renatoleite-criteo-partial/converted"  # str: destination of the converted data
gpus = "0"  # passed as a string; presumably GPU device id(s) -- TODO confirm format
sep = "\t"  # input files are tab-separated

In [None]:
# Run the CSV -> Parquet conversion component locally.
# `output_dataset` is given an empty dict; presumably it stands in for the
# KFP output artifact during local testing -- TODO confirm.
result = convert_csv_to_parquet_op(
    train_paths=train_paths,
    valid_paths=valid_paths,
    output_path=output_path,
    columns=columns,
    cols_dtype=cols_dtype,
    sep=sep,
    gpus=gpus,
    output_dataset={},
    shuffle=None,
    recursive=False,
)

In [3]:
# Hard-coded locations of the converted train/valid datasets, used as the
# input of the fit step (presumably mirrors the conversion output -- TODO confirm).
result = dict(
    train='/gcs/renatoleite-criteo-partial/converted/train',
    valid='/gcs/renatoleite-criteo-partial/converted/valid',
)

# Where the workflow is stored for the fit step.
workflow_path = "renatoleite-criteo-partial/saved_workflow"

In [8]:
# Fit the workflow on the converted datasets.
# `fitted_workflow` is given an empty dict; presumably it stands in for the
# KFP output artifact during local testing -- TODO confirm.
fit_result = fit_dataset_op(
    datasets=result, fitted_workflow={}, workflow_path=workflow_path, gpus=gpus,
)

INFO:numba.cuda.cudadrv.driver:init
INFO:root:Creating a Dask CUDA cluster
INFO:root:Loading saved workflow
INFO:root:Starting workflow fitting
INFO:root:Finished generating statistics for dataset.
INFO:root:Saving workflow to /gcs/renatoleite-criteo-partial/saved_workflow/fitted_workflow


In [4]:
# Hard-coded description of the fit step's outputs: the dataset locations
# plus the path of the fitted workflow. Used as input of the transform step.
fit_result = dict(
    datasets=dict(
        train='/gcs/renatoleite-criteo-partial/converted/train',
        valid='/gcs/renatoleite-criteo-partial/converted/valid',
    ),
    fitted_workflow='/gcs/renatoleite-criteo-partial/saved_workflow/fitted_workflow',
)

# Destination prefix for the transformed data.
output_transformed = "renatoleite-criteo-partial/transformed_data/"

In [9]:
# Apply the fitted workflow to the data.
# Note: the whole `fit_result` dict (dataset locations + fitted workflow path)
# is what gets passed as `fitted_workflow`.
transform_result = transform_dataset_op(
    fitted_workflow=fit_result, transformed_dataset={}, output_transformed=output_transformed, gpus=gpus,
)

INFO:numba.cuda.cudadrv.driver:init
INFO:root:Creating a Dask CUDA cluster
INFO:root:Loading workflow and statistics
INFO:root:Creating dataset definition
INFO:root:Start workflow transformation
INFO:root:Finished transformation


In [5]:
# Hard-coded description of the transform step's outputs: the location of the
# transformed data and the original (converted) train/valid datasets.
transform_result = dict(
    transformed_dataset='/gcs/renatoleite-criteo-partial/transformed_data/train',
    original_datasets=dict(
        train='/gcs/renatoleite-criteo-partial/converted/train',
        valid='/gcs/renatoleite-criteo-partial/converted/valid',
    ),
)

In [4]:
# Parameters for the BigQuery -> Parquet export test.
# Note: this rebinds `output_path`, which an earlier cell set to the
# conversion output location.
output_path = "renatoleite-criteo-partial/bq_exported/"
project = 'bigquery-public-data'
dataset_id = 'samples'
table_id = 'shakespeare'

In [5]:
# BigQuery export: extract a table to Parquet for both splits.
# The same table is passed for train and valid here.
result = export_parquet_from_bq_op(
    output_path=output_path,
    bq_project=project,
    bq_dataset_id=dataset_id,
    bq_table_train=table_id, bq_table_valid=table_id,
    location="US",
    output_dataset={},
)

INFO:root:Extracting bigquery-public-data.samples.shakespeare to gs://renatoleite-criteo-partial/bq_exported/train/train-*.parquet
INFO:root:Saving metadata for train path: /gcs/renatoleite-criteo-partial/bq_exported/train
INFO:root:Extracting bigquery-public-data.samples.shakespeare to gs://renatoleite-criteo-partial/bq_exported/valid/valid-*.parquet
INFO:root:Saving metadata for valid path: /gcs/renatoleite-criteo-partial/bq_exported/valid
