In [1]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
import dask_cudf
import argparse
import yaml
from pprint import pprint
from shutil import rmtree

In [2]:
# Project root and the location of the split config relative to that root.
cur_dir, config_subdir = (
    '/home/jovyan/work/projects/GPU_gene_ETL',
    'configs/make_data_splits_config.yaml',
)

In [3]:
# Full path of the YAML config: project root joined with the relative config path.
config_dir = "/".join([cur_dir, config_subdir])

In [4]:
# Load the run configuration from the YAML file at `config_dir` and echo it.
print("loading yaml file...")
# Use a context manager so the file handle is closed as soon as parsing is
# done (the bare `open(...)` previously left it open for the kernel's lifetime).
with open(config_dir, 'r') as config_file:
    config = yaml.safe_load(config_file)
pprint(config)

loading yaml file...
{'CUDA_VISIBLE_DEVICES': '0',
 'do_cuda_vis_dev': False,
 'do_rand_seed': True,
 'in_file': 'gs://jcosme/clean_data/genomics/MarRef_species.parquet',
 'out_dir': 'gs://jcosme/clean_data/genomics/data_splits',
 'partition_size': '100M',
 'rand_seed': 42,
 'splits': {'test': 0.08, 'train': 0.84, 'val': 0.08}}


In [5]:
# Pull each setting out of the loaded config into its own variable.
# Input/output locations, split fractions and seeding options:
in_file, out_dir, splits, rand_seed, do_rand_seed = (
    config[key]
    for key in ('in_file', 'out_dir', 'splits', 'rand_seed', 'do_rand_seed')
)

# GPU visibility and partition-size options:
CUDA_VISIBLE_DEVICES, do_cuda_vis_dev, partition_size = (
    config[key]
    for key in ('CUDA_VISIBLE_DEVICES', 'do_cuda_vis_dev', 'partition_size')
)

In [6]:
# Separate the `splits` mapping into two parallel lists: the split names
# (keys) and their fractions (values). Both follow the mapping's iteration
# order, so split_names[i] always corresponds to split_perc[i].
split_names = list(splits)
split_perc = list(splits.values())

In [7]:
# Keep the configured seed only when seeding is enabled; otherwise use None.
rand_seed = rand_seed if do_rand_seed else None

In [8]:
# Scratch directory handed to the Dask cluster; remove any leftover copy first.
tmp_dask_dir = '/tmp/dask'
try:
    rmtree(tmp_dask_dir)
except FileNotFoundError:
    # Nothing to clean up: the directory does not exist yet.
    pass
except OSError as err:
    # Cleanup is best-effort, so keep going, but surface the reason instead of
    # silently swallowing every exception (a bare `except:` would also hide
    # KeyboardInterrupt and genuine programming errors).
    print(f"warning: could not remove {tmp_dask_dir}: {err}")

In [9]:
# Start a local CUDA cluster and connect a client to it.
print("starting Dask GPU cluster...")
# Options used in every case.
cluster_kwargs = dict(
    protocol="ucx",
    enable_tcp_over_ucx=True,
    local_directory=tmp_dask_dir,
)
# Pass the configured device list through only when the config enables it.
if do_cuda_vis_dev:
    cluster_kwargs["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES
cluster = LocalCUDACluster(**cluster_kwargs)
client = Client(cluster)

starting Dask GPU cluster...


2022-07-13 01:11:36,938 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-07-13 01:11:36,958 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize


In [10]:
# Create the Dask-cuDF dataframe from the input parquet location.
# NOTE(review): `partition_size` is forwarded as an extra keyword argument;
# confirm the installed dask_cudf parquet reader actually honours it.
print(f"reading file {in_file}")
df = dask_cudf.read_parquet(in_file, partition_size=partition_size)

reading file gs://jcosme/clean_data/genomics/MarRef_species.parquet


In [11]:
# Randomly partition the dataframe using the configured fractions and seed.
print("splitting data")
df_split = df.random_split(frac=split_perc, random_state=rand_seed)

splitting data


In [12]:
# Write every split to its own parquet dataset under `out_dir`.
# `split_names` and `df_split` are parallel sequences, so pair them directly
# instead of indexing one by the other's position.
for split_name, cur_split in zip(split_names, df_split):
    cur_out_file = f"{out_dir}/{split_name}.parquet"
    print(f"saving {split_name} data to {cur_out_file}")
    # Drop the existing index (explicit `drop=True` rather than an ambiguous
    # positional flag) before saving; the result is not bound to `_`, which
    # would shadow IPython's last-output variable.
    cur_split.reset_index(drop=True).to_parquet(cur_out_file)

saving train data to gs://jcosme/clean_data/genomics/data_splits/train.parquet
saving val data to gs://jcosme/clean_data/genomics/data_splits/val.parquet
saving test data to gs://jcosme/clean_data/genomics/data_splits/test.parquet


In [13]:
# Shut the Dask client down and report completion.
print("shutting down Dask client")
client.shutdown()
print("finished")

shutting down Dask client
finished
