# Convert CSV to Parquet

In [None]:
def convert_csv_to_parquet_op(
    train_paths: list,
    valid_paths: list,
    output_dir: str,
    sep: str,
    shuffle: str = None,
    recursive: bool = False,
    device_limit_frac: float = 0.8,
    device_pool_frac: float = 0.9,
    part_mem_frac: float = 0.125
):
    '''
    Component to convert CSV file(s) to Parquet format using NVTabular.

    The 'train' and 'valid' splits are converted one after the other and
    written to '<output_dir>/train' and '<output_dir>/valid'.

    train_paths: list
        List of paths to folders or files in GCS for training.
        For recursive folder search, set the recursive variable to True
        Format:
            'gs://<bucket_name>/<subfolder1>/<subfolder>/' or
            'gs://<bucket_name>/<subfolder1>/<subfolder>/flat_file.csv' or
            a combination of both.
    valid_paths: list
        List of paths to folders or files in GCS for validation.
        Same format as train_paths.
    output_dir: str
        Path in GCS to write the converted parquet files.
        Accepted formats:
            'gs://<bucket_name>/<subfolder1>/<subfolder>' or
            '<bucket_name>/<subfolder1>/<subfolder>' or
            '/gcs/<bucket_name>/<subfolder1>/<subfolder>'
    sep: str
        Field separator of the CSV files (for example a tab character).
    shuffle: str
        How to shuffle the converted CSV, default to None.
        Options:
            PER_PARTITION
            PER_WORKER
            FULL
    recursive: bool
        Whether to look for files recursively in the given paths.
    device_limit_frac: float
        Forwarded unchanged to etl.create_cluster.
    device_pool_frac: float
        Forwarded unchanged to etl.create_cluster.
    part_mem_frac: float
        Forwarded unchanged to etl.create_csv_dataset.

    Returns a dict with references to the converted CSVs in GCS:
        output_datasets['train']
            .example: 'gs://my_bucket/folders/train'
        output_datasets['valid']
            .example: 'gs://my_bucket/folders/valid'
    '''
    # Standard Libraries
    import logging
    import os

    # ETL library
    from preprocessing import etl

    logging.basicConfig(level=logging.INFO)
    output_datasets = {}

    # Reduce output_dir to its bucket-relative form so that every documented
    # spelling resolves to the same locations. Previously only the 'gs://'
    # spelling was mapped to the /gcs/ mount; a bare '<bucket>/<folder>' value
    # was silently treated as a relative local directory.
    bucket_path = output_dir
    for prefix in ('gs://', '/gcs/'):
        if output_dir.startswith(prefix):
            bucket_path = output_dir[len(prefix):]
            break

    # Files are written through the /gcs/ mount, while the returned
    # references use the gs:// URI form. Both are loop-invariant.
    fuse_output_dir = os.path.join('/gcs', bucket_path)
    gcs_output_dir = 'gs://' + bucket_path

    logging.info('Getting column names and dtypes')
    col_dtypes = etl.get_criteo_col_dtypes()

    # Create Dask cluster
    logging.info('Creating Dask cluster.')
    client = etl.create_cluster(
        n_workers=1,
        device_limit_frac=device_limit_frac,
        device_pool_frac=device_pool_frac
    )

    splits = (('train', train_paths), ('valid', valid_paths))
    for folder_name, data_paths in splits:
        logging.info(f'Creating {folder_name} dataset.')
        dataset = etl.create_csv_dataset(
            data_paths=data_paths,
            sep=sep,
            recursive=recursive,
            col_dtypes=col_dtypes,
            part_mem_frac=part_mem_frac,
            client=client
        )

        fuse_output_path = os.path.join(fuse_output_dir, folder_name)
        logging.info(f'Writing parquet file(s) to {fuse_output_path}')
        etl.convert_csv_to_parquet(fuse_output_path, dataset, shuffle)

        # Record where this split was written
        output_datasets[folder_name] = os.path.join(
            gcs_output_dir, folder_name
        )

    return output_datasets

In [None]:
# Sample CSV inputs to be converted to parquet
train_paths = [
    'gs://workshop-datasets/criteo/day_0',  # training file
]
valid_paths = [
    'gs://workshop-datasets/criteo/day_1',  # validation file
]

# Separator for the CSV file
sep = '\t'

# Whether the train/valid paths should be navigated recursively
recursive = False

# Location in GCS that receives the converted files
output_dir = 'gs://renatoleite-nvtabular/converted'

In [None]:
# Run the conversion with the settings defined in the cell above.
# `recursive` is passed explicitly so that changing it there takes effect;
# previously the op always fell back to its own default.
convert_csv_to_parquet_op(
    train_paths,
    valid_paths,
    output_dir,
    sep,
    recursive=recursive,
)

# Fit and transform 1 file from Criteo Dataset

In [1]:
def analyze_dataset_op(
    datasets,
    workflow_path: str,
    split_name = 'train',
    device_limit_frac = 0.8,
    device_pool_frac = 0.9,
    part_mem_frac = 0.125
):
    '''
    Component to fit the Criteo NVTabular workflow (collect statistics)
    on one split of the converted dataset and save the fitted workflow.

    datasets: dict
        References to the train and valid converted datasets in GCS,
        indexed by split name.
        Usage:
            datasets['train']
                .example: 'gs://my_bucket/folders/converted/train'
            datasets['valid']
                .example: 'gs://my_bucket/folders/converted/valid'
    workflow_path: str
        Path to write the fitted workflow; it is joined onto '/gcs'.
        Format:
            '<bucket_name>/<subfolder1>/<subfolder>'
    split_name: str
        Which dataset split to calculate the statistics. 'train' or 'valid'
    device_limit_frac, device_pool_frac: float
        Forwarded unchanged to etl.create_cluster.
    part_mem_frac: float
        Forwarded unchanged to etl.create_parquet_dataset.

    Returns a dict describing the result:
        ['workflow']
            Location the fitted workflow was saved to.
            .example: '/gcs/my_bucket/fitted_workflow'
        ['datasets']
            The `datasets` argument, passed through untouched.
    '''
    import logging
    import os

    from preprocessing import etl

    logging.basicConfig(level=logging.INFO)

    # Location of the split whose statistics are computed
    split_path = datasets[split_name]

    logging.info('Creating Dask cluster.')
    dask_client = etl.create_cluster(
        n_workers=1,
        device_limit_frac=device_limit_frac,
        device_pool_frac=device_pool_frac,
    )

    # Build the transformation workflow definition; fitting it below
    # only calculates statistics for the transformations.
    logging.info('Creating transformation workflow.')
    nvt_workflow = etl.create_criteo_nvt_workflow(client=dask_client)

    # Dataset the workflow is fitted on
    split_dataset = etl.create_parquet_dataset(
        client=dask_client,
        data_path=split_path,
        part_mem_frac=part_mem_frac,
    )

    logging.info('Starting workflow fitting')
    fitted_workflow = etl.analyze_dataset(nvt_workflow, split_dataset)
    logging.info('Finished generating statistics for dataset.')

    # The same location is used for saving and for the returned reference
    saved_workflow_path = os.path.join('/gcs', workflow_path)
    etl.save_workflow(fitted_workflow, saved_workflow_path)
    logging.info('Workflow saved to GCS')

    return {
        'workflow': saved_workflow_path,
        'datasets': datasets,
    }

In [2]:
# Locations of the converted train/valid splits, indexed by split name
datasets = {
    'train': 'gs://renatoleite-nvtabular/converted/train',
    'valid': 'gs://renatoleite-nvtabular/converted/valid',
}

# Bucket-relative path that receives the fitted workflow
workflow_path = 'renatoleite-nvtabular/workflow'

In [3]:
# Fit the workflow on the default ('train') split
analyze_dataset_op(
    datasets=datasets,
    workflow_path=workflow_path,
)

INFO:root:Creating Dask cluster.
INFO:numba.cuda.cudadrv.driver:init
distributed.diskutils - INFO - Found stale lock file and directory '/home/renatoleite/workspace/merlin-on-vertex/src/preprocessing/dask-worker-space/worker-ynasmpt2', purging
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
INFO:root:Creating transformation workflow.
INFO:root:Starting workflow fitting
INFO:root:Finished generating statistics for dataset.
INFO:root:Workflow saved to GCS


{'workflow': '/gcs/renatoleite-nvtabular/workflow',
 'datasets': {'train': 'gs://renatoleite-nvtabular/converted/train',
  'valid': 'gs://renatoleite-nvtabular/converted/valid'}}

# Transform dataset

In [1]:
def transform_dataset_op(
    workflow,
    transformed_output_dir: str,
    split_name: str = 'train',
    shuffle: str = None,
    device_limit_frac: float = 0.8,
    device_pool_frac: float = 0.9,
    part_mem_frac: float = 0.125,
):
    '''
    Component to transform a dataset according to the workflow specifications.

    workflow: dict
        Path to the fitted workflow and the location of the converted
        datasets in GCS (train and validation).
        Usage:
            workflow['workflow']
                example: '/gcs/my_bucket/fitted_workflow'
            workflow['datasets']['train']
                example: 'gs://my_bucket/converted/train'
    transformed_output_dir: str
        Path in GCS to write the transformed parquet files. The files are
        written under '<transformed_output_dir>/<split_name>'.
        Accepted formats:
            '<bucket_name>/<subfolder1>/<subfolder>' or
            'gs://<bucket_name>/<subfolder1>/<subfolder>'
    split_name: str
        Which dataset split to transform, e.g. 'train' or 'valid'.
    shuffle: str
        NOTE: accepted for interface compatibility but currently not used
        by this component.
    device_limit_frac, device_pool_frac: float
        Forwarded unchanged to etl.create_cluster.
    part_mem_frac: float
        Forwarded unchanged to etl.create_parquet_dataset.

    Returns a dict describing the result:
        ['transformed_dataset']
            Location of the transformed split.
            .example: 'gs://my_bucket/transformed_data/train'
        ['original_datasets']
            The 'datasets' entry of the `workflow` argument.
            .example: {'train': 'gs://my_bucket/converted/train', ...}
    '''
    from preprocessing import etl
    import logging
    import os

    logging.basicConfig(level=logging.INFO)

    # Accept the output dir with or without the 'gs://' scheme. Joining a
    # 'gs://...' value onto '/gcs' would otherwise produce '/gcs/gs://...'.
    gcs_scheme = 'gs://'
    if transformed_output_dir.startswith(gcs_scheme):
        bucket_path = transformed_output_dir[len(gcs_scheme):]
    else:
        bucket_path = transformed_output_dir

    # Define output path for transformed files
    transformed_fuse_dir = os.path.join('/gcs', bucket_path, split_name)

    # Create Dask cluster
    client = etl.create_cluster(
        n_workers=1,
        device_limit_frac=device_limit_frac,
        device_pool_frac=device_pool_frac
    )

    logging.info('Loading workflow and statistics')
    criteo_workflow = etl.load_workflow(
        workflow_path=workflow['workflow'],
        client=client
    )

    logging.info(f'Creating dataset definition for {split_name} split')
    dataset = etl.create_parquet_dataset(
        client=client,
        data_path=workflow['datasets'][split_name],
        part_mem_frac=part_mem_frac
    )

    logging.info('Workflow is loaded')
    logging.info('Starting workflow transformation')
    dataset = etl.transform_dataset(
        dataset=dataset,
        workflow=criteo_workflow
    )

    logging.info('Applying transformation')
    etl.save_dataset(dataset, transformed_fuse_dir)

    # Report the location that was actually written (including the split
    # folder) and hand the result back to the caller; it used to be built
    # and then discarded.
    transformed_dataset = {
        'transformed_dataset': os.path.join(
            gcs_scheme, bucket_path, split_name
        ),
        'original_datasets': workflow.get('datasets'),
    }
    return transformed_dataset

In [2]:
# Fitted workflow location plus the converted splits it applies to
workflow = {
    'workflow': '/gcs/renatoleite-nvtabular/workflow',
    'datasets': {
        'train': 'gs://renatoleite-nvtabular/converted/train',
        'valid': 'gs://renatoleite-nvtabular/converted/valid',
    },
}

# Bucket-relative path that receives the transformed files
transformed_output_dir = 'renatoleite-nvtabular/transformed'

In [3]:
# Transform the default ('train') split with the fitted workflow
transform_dataset_op(
    workflow=workflow,
    transformed_output_dir=transformed_output_dir,
)

INFO:numba.cuda.cudadrv.driver:init
distributed.diskutils - INFO - Found stale lock file and directory '/home/renatoleite/workspace/merlin-on-vertex/src/preprocessing/dask-worker-space/worker-tev8icoj', purging
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
INFO:root:Loading workflow and statistics
INFO:root:Creating dataset definition for train split
INFO:root:Workflow is loaded
INFO:root:Starting workflow transformation
INFO:root:Applying transformation
