# Convert CSV to Parquet

In [None]:
def convert_csv_to_parquet_op(
    train_paths: list,
    valid_paths: list,
    output_converted: str,
    sep: str,
    shuffle = None,
    recursive = False
):
    # Standard Libraries
    import logging
    import os

    # ETL library
    import etl

    logging.basicConfig(level=logging.INFO)

    logging.info('Getting column names and dtypes')
    col_dtypes = etl.get_criteo_col_dtypes()

    logging.info('Creating a Dask CUDA cluster')
    client = etl.create_convert_cluster()

    for folder_name, data_paths in zip(
        ['train', 'valid'],
        [train_paths, valid_paths]
    ):
        logging.info(f'Creating {folder_name} dataset.')
        dataset = etl.create_csv_dataset(
            data_paths=data_paths,
            sep=sep,
            recursive=recursive, 
            col_dtypes=col_dtypes, 
            client=client
        )

        fuse_output_path = os.path.join('/gcs', output_converted, folder_name)
        logging.info(f'Writing parquet file(s) to {fuse_output_path}')
        etl.convert_csv_to_parquet(fuse_output_path, dataset, shuffle)

        # Write output path to metadata
        print(os.path.join('gs://', output_converted, folder_name))

In [None]:
train_paths = ['gs://workshop-datasets/criteo/day_0'] # Sample training CSV file to be converted to parquet
valid_paths = ['gs://workshop-datasets/criteo/day_1'] # Sample validation CSV file to be converted to parquet
sep = '\t' # Separator for the CSV file
recursive = False # If the train/valid paths should be navigated recursivelly
output_converted = 'renatoleite-criteo-partial/converted'

In [None]:
convert_csv_to_parquet_op(train_paths, valid_paths, output_converted, sep)

# Fit and transform 1 file from Criteo Dataset

In [1]:
def analyze_dataset_op(
    datasets,
    workflow_path,
    split_name = 'train',
    device_limit_frac = 0.8,
    device_pool_frac = 0.9,
    part_mem_frac = 0.125
):
    from preprocessing import etl
    import logging
    import os

    logging.basicConfig(level=logging.INFO)
    workflow = {}

    # Retrieve `split_name` from metadata
    data_path = datasets[split_name]

    # Create Dask cluster
    logging.info('Creating Dask cluster.')
    client = etl.create_transform_cluster(device_limit_frac, device_pool_frac)

    # Create data transformation workflow. This step will only 
    # calculate statistics based on the transformations
    logging.info('Creating transformation workflow.')
    criteo_workflow = etl.create_criteo_nvt_workflow(client)

    logging.info('Creating dataset.')
    # Create dataset to be fitted
    dataset = etl.create_parquet_dataset(
        data_path=data_path,
        part_mem_frac=part_mem_frac
    )

    logging.info('Starting workflow fitting')
    criteo_workflow = etl.analyze_dataset(criteo_workflow, dataset)
    logging.info('Finished generating statistics for dataset.')

    etl.save_workflow(criteo_workflow, os.path.join('/gcs', workflow_path))
    logging.info('Workflow saved to GCS')

    workflow['workflow'] = os.path.join('/gcs', workflow_path)
    workflow['datasets'] = datasets.metadata

In [2]:
datasets = {}
datasets['train'] = 'gs://renatoleite-criteo-partial/converted/train'
datasets['valid'] = 'gs://renatoleite-criteo-partial/converted/valid'
workflow_path = 'renatoleite-criteo-partial/workflow'

In [3]:
analyze_dataset_op(datasets, workflow_path)

INFO:root:Creating Dask cluster.
INFO:numba.cuda.cudadrv.driver:init
distributed.diskutils - INFO - Found stale lock file and directory '/home/renatoleite/workspace/merlin-on-vertex/src/preprocessing/dask-worker-space/worker-3a8ay57y', purging
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
INFO:root:Creating transformation workflow.
INFO:root:Creating dataset.
INFO:root:Starting workflow fitting
INFO:root:Finished generating statistics for dataset.
INFO:root:Workflow saved to GCS


AttributeError: 'dict' object has no attribute 'metadata'