In [1]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
import nvtabular as nvt
from glob import glob
import argparse
import yaml
from pprint import pprint
import numpy as np
from shutil import rmtree

In [2]:
cur_dir = '/home/jovyan/work/projects/GPU_gene_ETL'
config_subdir = 'configs/make_nvtab_data_config.yaml'

In [3]:
config_dir = f"{cur_dir}/{config_subdir}"

In [4]:
print(f"loading yaml file...")
config = yaml.safe_load(open(config_dir, 'r'))
pprint(config)

loading yaml file...
{'CUDA_VISIBLE_DEVICES': '0',
 'dmake_file': '/home/jovyan/work/projects/GPU_gene_ETL/configs/make_dataset_config.yaml',
 'do_cuda_vis_dev': False,
 'input_col_name': 'seq',
 'label_col_name': 'label',
 'max_seq_len': 150,
 'out_base_dir': 'gs://jcosme/clean_data/genomics/nvtab_datasets',
 'row_group_size': 10000}


In [5]:
out_base_dir = config['out_base_dir']  
dmake_file = config['dmake_file'] 
label_col_name = config['label_col_name']  
input_col_name = config['input_col_name']  
max_seq_len = config['max_seq_len']  
row_group_size = config['row_group_size']  

CUDA_VISIBLE_DEVICES = config['CUDA_VISIBLE_DEVICES']  
do_cuda_vis_dev = config['do_cuda_vis_dev']  

In [6]:
dmake_config = yaml.safe_load(open(dmake_file, 'r'))
in_base_dir = dmake_config['out_base_dir']
dsplit_file = dmake_config['dsplit_file']
numbers_of_classes = dmake_config['numbers_of_classes']
sizes_per_class = dmake_config['sizes_per_class']
in_dir = dmake_config['in_dir']
do_unknown_class = dmake_config['do_unknown_class']

In [7]:
dsplit_config = yaml.safe_load(open(dsplit_file, 'r'))
splits = dsplit_config['splits'] 

split_names = []
for key, val in splits.items():
    split_names.append(key)

split_names = [ x + '.parquet' for x in split_names]

In [8]:
split_names

['train.parquet', 'val.parquet', 'test.parquet']

In [9]:
for c_i, cur_n_class in enumerate(numbers_of_classes):
    for s_i, cur_size in enumerate(sizes_per_class):
        cur_name = f"num_cls_{cur_n_class}_sz_cls_{cur_size}_unkn_cls_{do_unknown_class}"
        cur_in_dir = f"{in_base_dir}/{cur_name}"
        # print(cur_in_dir)
        cur_out_dir = f"{out_base_dir}/{cur_name}"
        print(cur_out_dir)

gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_1_sz_cls_500_unkn_cls_True
gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_1_sz_cls_1000_unkn_cls_True
gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_1_sz_cls_2000_unkn_cls_True
gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_1_sz_cls_4000_unkn_cls_True
gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_2_sz_cls_500_unkn_cls_True
gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_2_sz_cls_1000_unkn_cls_True
gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_2_sz_cls_2000_unkn_cls_True
gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_2_sz_cls_4000_unkn_cls_True
gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_4_sz_cls_500_unkn_cls_True
gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_4_sz_cls_1000_unkn_cls_True
gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_4_sz_cls_2000_unkn_cls_True
gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_4_sz_cls_4000_unkn_cls_True
gs://jc

In [10]:
tmp_dask_dir = '/tmp/dask'
try:
    rmtree(tmp_dask_dir)
except:
    pass

In [11]:
print(f"starting Dask GPU cluster...")
if do_cuda_vis_dev:
    cluster = LocalCUDACluster(
        protocol="ucx",
        enable_tcp_over_ucx=True,
        CUDA_VISIBLE_DEVICES=CUDA_VISIBLE_DEVICES,
        local_directory=tmp_dask_dir,
    )
else:
    cluster = LocalCUDACluster(
        protocol="ucx",
        enable_tcp_over_ucx=True,
        local_directory=tmp_dask_dir,
    )
client = Client(cluster)

starting Dask GPU cluster...


2022-07-13 02:43:17,894 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-07-13 02:43:17,920 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize


In [12]:
print(f"creating pipeline...")
# create the pipeline
# nvt.ColumnGroup(
# cat_features = [input_col_name] >> nvt.ops.Categorify() >> nvt.ops.ListSlice(0, end=max_seq_len, pad=True,
#                                                                              pad_value=0.0)
# lab_features = [label_col_name] >> nvt.ops.Categorify()
# # add label column
# output = cat_features + lab_features
# # create workflow
# workflow = nvt.Workflow(output, client=client)

# shuffle = nvt.io.Shuffle.PER_PARTITION

creating pipeline...


In [13]:
for c_i, cur_n_class in enumerate(numbers_of_classes):
    for s_i, cur_size in enumerate(sizes_per_class):
        cur_name = f"num_cls_{cur_n_class}_sz_cls_{cur_size}_unkn_cls_{do_unknown_class}"
        cur_in_dir = f"{in_base_dir}/{cur_name}"
        #print(cur_in_dir)
        cur_out_dir = f"{out_base_dir}/{cur_name}"
        #print(cur_out_dir)
        cur_workflow_file = f"{cur_out_dir}/workflow"
        #print(cur_workflow_file)
        
        cat_features = [input_col_name] >> nvt.ops.Categorify() >> nvt.ops.ListSlice(0, end=max_seq_len, pad=True,
                                                                             pad_value=0.0)
        lab_features = [label_col_name] >> nvt.ops.Categorify()
        output = cat_features + lab_features
        workflow = nvt.Workflow(output, client=client)
        shuffle = nvt.io.Shuffle.PER_PARTITION
        
        for cur_split_name in split_names:
            cur_split = cur_split_name.split('.')[0]
            #print(f"\tcur_split: {cur_split}")
            cur_out_file = f"{cur_out_dir}/{cur_split_name}"
            #print(f"\tcur_out_file: {cur_out_file}")
            cur_in_file = f"{cur_in_dir}/{cur_split_name}"
            print(f"\tcur_in_file: {cur_in_file}")
            if cur_split == 'train':
                #print(f"\t\tfitting nvtab workflow on: {cur_in_file}")
                workflow.fit(nvt.Dataset(cur_in_file, engine='parquet', row_group_size=row_group_size))
                #print(f"\t\tsaving workflow: {cur_workflow_file}")
                workflow.save(cur_workflow_file)
                
            #print(f"\ttransforming: {cur_in_file}")
            workflow.transform(nvt.Dataset(cur_in_file, engine='parquet', row_group_size=row_group_size)).to_parquet(
                output_path=cur_out_file,
                shuffle=shuffle,
                cats=[input_col_name],
                labels=[label_col_name],
                )
            print(f"\tsaved into: {cur_out_file}")
        client.cancel(workflow)

gs://jcosme/clean_data/genomics/datasets/num_cls_1_sz_cls_500_unkn_cls_True
gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_1_sz_cls_500_unkn_cls_True
gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_1_sz_cls_500_unkn_cls_True/workflow
	cur_split: train
	cur_out_file: gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_1_sz_cls_500_unkn_cls_True/train.parquet
	cur_in_file: gs://jcosme/clean_data/genomics/datasets/num_cls_1_sz_cls_500_unkn_cls_True/train.parquet
		fitting nvtab workflow on: gs://jcosme/clean_data/genomics/datasets/num_cls_1_sz_cls_500_unkn_cls_True/train.parquet




		saving workflow: gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_1_sz_cls_500_unkn_cls_True/workflow
	transforming: gs://jcosme/clean_data/genomics/datasets/num_cls_1_sz_cls_500_unkn_cls_True/train.parquet
	saved into: gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_1_sz_cls_500_unkn_cls_True/train.parquet
	cur_split: val
	cur_out_file: gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_1_sz_cls_500_unkn_cls_True/val.parquet
	cur_in_file: gs://jcosme/clean_data/genomics/datasets/num_cls_1_sz_cls_500_unkn_cls_True/val.parquet
	transforming: gs://jcosme/clean_data/genomics/datasets/num_cls_1_sz_cls_500_unkn_cls_True/val.parquet
	saved into: gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_1_sz_cls_500_unkn_cls_True/val.parquet
	cur_split: test
	cur_out_file: gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_1_sz_cls_500_unkn_cls_True/test.parquet
	cur_in_file: gs://jcosme/clean_data/genomics/datasets/num_cls_1_sz_cls_500_unkn_cls_True/test.parquet
	transfor



		saving workflow: gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_1_sz_cls_1000_unkn_cls_True/workflow
	transforming: gs://jcosme/clean_data/genomics/datasets/num_cls_1_sz_cls_1000_unkn_cls_True/train.parquet
	saved into: gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_1_sz_cls_1000_unkn_cls_True/train.parquet
	cur_split: val
	cur_out_file: gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_1_sz_cls_1000_unkn_cls_True/val.parquet
	cur_in_file: gs://jcosme/clean_data/genomics/datasets/num_cls_1_sz_cls_1000_unkn_cls_True/val.parquet
	transforming: gs://jcosme/clean_data/genomics/datasets/num_cls_1_sz_cls_1000_unkn_cls_True/val.parquet
	saved into: gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_1_sz_cls_1000_unkn_cls_True/val.parquet
	cur_split: test
	cur_out_file: gs://jcosme/clean_data/genomics/nvtab_datasets/num_cls_1_sz_cls_1000_unkn_cls_True/test.parquet
	cur_in_file: gs://jcosme/clean_data/genomics/datasets/num_cls_1_sz_cls_1000_unkn_cls_True/test.parquet


In [14]:
print(f"shutting down Dask client")
client.shutdown()
print(f"finished")

shutting down Dask client
finished


2022-07-13 03:28:10,136 - distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
