# make nvtab data

This is an explanation of the  
**make_nvtab.py**    
file.  

We take our split data and create:
+ NVTabular datasets

Then we will save the output as parquet files.

## Step 0: import required libraries

In [1]:
import sys
sys.path.append('..') # this is to allow the script to read from the parent folder

from scripts.global_funcs import load_data_config
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
import dask_cudf
import nvtabular as nvt

# import rmm
# from nvtabular.utils import device_mem_size
# import shutil
# import pathlib


## Step 1: load config file data

In [2]:
# load the project settings with the helper imported from scripts.global_funcs;
# the result is used below as a mapping of setting name -> value
configs = load_data_config()

In [3]:
# show every setting from the config, one "name: value" pair per line
for setting_name, setting_value in configs.items():
    print(f"{setting_name}: {setting_value}")

clean_fasta_file: /media/jcosme/Data/MarRef_parquet_10_cats
output_dir: /media/jcosme/Data
project_name: MarRef_parquet_10_cats
base_col_names: ['seq', 'label']
label_col_name: label
input_col_name: seq
label_regex: (?:[^a-zA-Z0-9]+)([a-zA-Z]+[0-9]+)(?:[^a-zA-Z0-9]+)
k_mer: 1
possible_gene_values: ['A', 'C', 'G', 'T']
max_seq_len: 150
data_splits: {'train': 0.9, 'val': 0.05, 'test': 0.05}
random_seed: 42
fasta_sep: >
unq_labs_dir: /media/jcosme/Data/MarRef_parquet_10_cats/data/unq_labels
unq_labs_dir_csv: /media/jcosme/Data/MarRef_parquet_10_cats/data/unq_labels.csv
data_dir: /media/jcosme/Data/MarRef_parquet_10_cats/data/MarRef_parquet_10_cats
nvtab_dir: /media/jcosme/Data/MarRef_parquet_10_cats/nvtab
dask_dir: /media/jcosme/Data/MarRef_parquet_10_cats/dask
tensorboard_dir: /media/jcosme/Data/MarRef_parquet_10_cats/tensorboard
model_checkpoints_dir: /media/jcosme/Data/MarRef_parquet_10_cats/checkpoints/model_checkpoints
model_checkpoints_parent_dir: /media/jcosme/Data/MarRef_parquet_1

In [4]:
# pull the settings this notebook needs out of the config into plain variables

# column names
label_col_name = configs['label_col_name']
input_col_name = configs['input_col_name']

# split definitions and maximum sequence length
max_seq_len = configs['max_seq_len']
data_splits = configs['data_splits']

# directories
data_dir = configs['data_dir']
nvtab_dir = configs['nvtab_dir']
dask_dir = configs['dask_dir']

## Step 2: start the cluster

In [5]:
# # define some information about where to get our data
# dask_workdir = pathlib.Path(nvtab_dir, "dask", "workdir")
# stats_path = pathlib.Path(nvtab_dir, "dask", "stats")

# # Make sure we have a clean worker space for Dask
# if pathlib.Path.is_dir(dask_workdir):
#     shutil.rmtree(dask_workdir)
# dask_workdir.mkdir(parents=True)

# # Make sure we have a clean stats space for Dask
# if pathlib.Path.is_dir(stats_path):
#     shutil.rmtree(stats_path)
# stats_path.mkdir(parents=True)

# # Get device memory capacity
# capacity = device_mem_size(kind="total")

# # Deploy a Single-Machine Multi-GPU Cluster
# protocol = "tcp"  # "tcp" or "ucx"
# visible_devices = "0"  # Select devices to place workers
# device_spill_frac = 0.5  # Spill GPU-Worker memory to host at this limit.
# # Reduce if spilling fails to prevent
# # device memory errors.
# cluster = None  # (Optional) Specify existing scheduler port
# if cluster is None:
#     cluster = LocalCUDACluster(
#         protocol=protocol,
#         CUDA_VISIBLE_DEVICES=visible_devices,
#         local_directory=dask_workdir,
#         device_memory_limit=capacity * device_spill_frac,
#     )

# # Create the distributed client
# client = Client(cluster)
# client

# # Initialize RMM pool on ALL workers
# def _rmm_pool():
#     rmm.reinitialize(
#         pool_allocator=True,
#         initial_pool_size=None,  # Use default size
#     )


# client.run(_rmm_pool)
# client


In [6]:
# start a local Dask CUDA cluster, passing the configured dask directory as its local_directory
cluster = LocalCUDACluster(local_directory=dask_dir)
# connect a Dask client to that cluster
client = Client(cluster)
# bare expression: in a notebook this displays the client summary
# (scheduler address, dashboard link, worker/memory info)
client

0,1
Client  Scheduler: tcp://127.0.0.1:37919  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 1  Memory: 31.21 GiB


### Optional:
Click the Dashboard link above to open the Dask Dashboard, which lets you monitor the progress of your job.  
**Note:** this only works in a Jupyter notebook.

## Step 3: create NVTabular workflow

In [8]:
# create the pipeline:
#   1. Categorify encodes the values of the input column as integer category ids
#   2. ListSlice keeps positions [0, max_seq_len) and, because pad=True, pads
#      shorter entries with pad_value up to that length
# The slice end is taken from the config's max_seq_len instead of a hard-coded
# 150, so the workflow stays in sync if the configured length changes.
cat_features = (
    [input_col_name]
    >> nvt.ops.Categorify()
    >> nvt.ops.ListSlice(0, end=max_seq_len, pad=True, pad_value=0.0)
)

# add label column
output = cat_features + label_col_name

# create workflow
workflow = nvt.Workflow(output)

## Step 5: fit workflow on training data

In [9]:
%%time
# fit the workflow on the training split only, then persist the fitted workflow
train_key = 'train'
if train_key in data_splits:
    print("fitting nvtab workflow on training data...")
    train_dataset = nvt.Dataset(f"{data_dir}_{train_key}", engine='parquet', row_group_size=10000)
    workflow.fit(train_dataset)

    print("saving fitting nvtab workflow...")
    workflow.save(f"{nvtab_dir}/workflow")

fitting nvtab workflow on training data...
saving fitting nvtab workflow...
CPU times: user 441 ms, sys: 216 ms, total: 656 ms
Wall time: 2.35 s


## Step 6: create datasets

In [14]:
%%time
# shuffle mode applied to the training split only; the other splits are written unshuffled
shuffle = nvt.io.Shuffle.PER_PARTITION

# transform every split with the fitted workflow and write it under nvtab_dir/<split>
for key in data_splits:
    if key == 'train':
        print("making nvtab dataset for training...")
        write_opts = {"shuffle": shuffle}
    else:
        print(f"making nvtab dataset for {key}...")
        write_opts = {"shuffle": None, "out_files_per_proc": None}

    split_dataset = nvt.Dataset(f"{data_dir}_{key}", engine='parquet', row_group_size=10000)
    transformed = workflow.transform(split_dataset)
    transformed.to_parquet(
        output_path=f"{nvtab_dir}/{key}",
        cats=[input_col_name],
        labels=[label_col_name],
        **write_opts,
    )

making nvtab dataset for training...
making nvtab dataset for val...
making nvtab dataset for test...
CPU times: user 140 ms, sys: 4.32 ms, total: 144 ms
Wall time: 1.62 s


## Step 7: cleanup

In [15]:
# shut down the Dask scheduler and workers this client is connected to
client.shutdown()

# finally, close the client object itself
client.close()



## finished!