# Data preparation

In [1]:
import os
import glob

import numpy as np
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

import nvtabular as nvt
from nvtabular.utils import device_mem_size, get_rmm_size

from nvtabular.io.parquet import ParquetDatasetEngine

# Show the installed NVTabular version so the run can be tied to a release.
print(nvt.__version__)

0.6.1


In [2]:
# Notebook configuration: dataset locations and GPU/memory settings.

# NOTE(review): INPUT_PATH is not used by any active code in the cells shown — confirm it is still needed.
INPUT_PATH = '/home/jupyter/criteo/criteo_16_per_file'
# Directory the regenerated Parquet dataset is written to.
OUTPUT_PATH = '/home/jupyter/criteo/criteo_out'

# GPU ids passed to the local Dask-CUDA cluster.
CUDA_VISIBLE_DEVICES = "0,1,2,3"
# NOTE(review): FRAC_SIZE and PART_SIZE are not referenced in the cells shown — confirm or remove.
FRAC_SIZE = 0.12
PART_SIZE = 1900000000
# Fraction of device_mem_size() requested for the cluster's RMM pool.
MEM_POOL_FRAC = 0.9
# NOTE(review): DEVICE_LIMIT_FRAC is not used by any active code in the cells shown.
DEVICE_LIMIT_FRAC = 0.8

In [3]:
# Start a local Dask-CUDA cluster (unless an existing one is supplied) and
# attach a distributed client to it.
cluster = None  # Assign an existing cluster here to connect to it instead
if cluster is None:
    # RMM pool size derived from the configured fraction of device memory.
    rmm_pool_bytes = get_rmm_size(MEM_POOL_FRAC * device_mem_size())
    dask_workdir = os.path.join('/tmp', "dask-space")
    cluster = LocalCUDACluster(
        CUDA_VISIBLE_DEVICES=CUDA_VISIBLE_DEVICES,
        rmm_pool_size=rmm_pool_bytes,
        # Disabled: device_memory = int(DEVICE_LIMIT_FRAC * device_mem_size(kind='total')),
        local_directory=dask_workdir,
    )
client = Client(cluster)

In [31]:
# Open the raw Parquet data as an NVTabular dataset bound to the Dask client.
dataset = nvt.Dataset(
    '/home/jupyter/criteo/criteo_2',  # alternative source (disabled): INPUT_PATH
    engine="parquet",
    client=client,
    # Disabled option: part_mem_fraction=0.124
)

In [32]:
# Report row count, per-partition row counts and partition count, one per line.
print(dataset.num_rows, dataset.partition_lens, dataset.npartitions, sep="\n")

395405518
[11189798, 11213952, 10458815, 11354902, 10473752, 11232959, 10405390, 11207407, 10369068, 11177235, 11126889, 10272419, 11111079, 10241712, 11087178, 10230421, 11098650, 10266055, 1324302, 11186362, 11189364, 10446171, 11323558, 10528346, 11248042, 10436635, 11218825, 10387232, 11175054, 11141698, 10281536, 11108497, 10237284, 11094852, 10222212, 11090340, 10249703, 4997824]
38


In [33]:
# Derive the target partition size (11% of the value reported by
# device_mem_size(), plus one byte) and the matching output file size.
partitions_per_file = 1

part_size = 1 + int(device_mem_size() * 0.11)
file_size = part_size * partitions_per_file

print(part_size, file_size, sep="\n")

1742809334
1742809334


In [34]:
# Rewrite the dataset under OUTPUT_PATH with the requested file and partition
# sizes.
# NOTE(review): new_dataset holds whatever .compute() returns, which may not be
# a Dataset object — confirm before relying on it.
new_dataset = ParquetDatasetEngine.regenerate_dataset(
    dataset,
    output_path=OUTPUT_PATH,
    file_size=file_size,
    part_size=part_size,
).compute()

  "General-metadata information not detected! "


In [36]:
# Re-open the regenerated dataset from the directory it was written to.
# Reading through OUTPUT_PATH, rather than repeating the literal path, keeps
# this cell in sync with the regeneration step if the output location changes.
processed_dataset = nvt.Dataset(
    OUTPUT_PATH,
    engine="parquet",
    part_mem_fraction=0.125,
    client=client,
)

# Row count, per-partition row counts and partition count of the new layout.
print(processed_dataset.num_rows)
print(processed_dataset.partition_lens)
print(processed_dataset.npartitions)

395405518
[7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 7142661, 2559163]
56
