In [1]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
import dask_cudf
import argparse
import yaml
from pprint import pprint
from shutil import rmtree

In [2]:
# Project root on this machine, and the notebook's YAML config path relative to that root.
cur_dir = "/home/jovyan/work/projects/GPU_gene_ETL"
config_subdir = "configs/make_inverse_config.yaml"

In [3]:
# Full path of the YAML config file (despite the name, this is a file path, not a directory).
config_dir = "/".join((cur_dir, config_subdir))

In [4]:
# Parse the notebook's YAML config and echo it so the run records its settings.
print("loading yaml file...")
# A context manager closes the file handle as soon as parsing finishes;
# a bare open() would leave that to the garbage collector.
with open(config_dir, 'r') as config_fh:
    config = yaml.safe_load(config_fh)
pprint(config)

loading yaml file...
{'CUDA_VISIBLE_DEVICES': '0',
 'do_cuda_vis_dev': False,
 'dsplit_file': '/home/jovyan/work/projects/GPU_gene_ETL/configs/make_data_splits_config.yaml',
 'in_dir': 'gs://jcosme/clean_data/genomics/data_splits',
 'inverse_col': 'seq',
 'out_dir': 'gs://jcosme/clean_data/genomics/data_splits_inverse',
 'partition_size': '100M'}


In [5]:
# Bind each config entry this notebook uses to a module-level name.
# The order of names on the left matches the order of keys on the right.
(
    in_dir,
    out_dir,
    inverse_col,
    dsplit_file,
    CUDA_VISIBLE_DEVICES,
    do_cuda_vis_dev,
    partition_size,
) = (
    config[key]
    for key in (
        'in_dir',
        'out_dir',
        'inverse_col',
        'dsplit_file',
        'CUDA_VISIBLE_DEVICES',
        'do_cuda_vis_dev',
        'partition_size',
    )
)

In [6]:
# Read the data-split config and derive one input parquet path per split.
# The context manager closes the file handle right after parsing.
with open(dsplit_file, 'r') as dsplit_fh:
    dsplit_config = yaml.safe_load(dsplit_fh)
splits = dsplit_config['splits']

# Only the keys of `splits` (the split names) are used; the mapping's order is kept.
split_names = [f"{name}.parquet" for name in splits]
# Input location of each split, in the same order as `split_names`.
split_files = [f"{in_dir}/{file_name}" for file_name in split_names]


In [7]:
# Scratch directory handed to the Dask cluster as its local_directory.
tmp_dask_dir = '/tmp/dask'
# Best-effort removal of leftovers from a previous run. ignore_errors=True keeps
# the "never fail here" behaviour (a missing directory is fine) without a bare
# `except:` that would also swallow KeyboardInterrupt / SystemExit.
rmtree(tmp_dask_dir, ignore_errors=True)

In [8]:
print("starting Dask GPU cluster...")

# Arguments common to both configurations.
cluster_kwargs = dict(
    protocol="ucx",
    enable_tcp_over_ucx=True,
    local_directory=tmp_dask_dir,
)
# Pass the device list through only when the config enables it;
# otherwise the argument is omitted entirely.
if do_cuda_vis_dev:
    cluster_kwargs["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES

cluster = LocalCUDACluster(**cluster_kwargs)
client = Client(cluster)

starting Dask GPU cluster...


2022-07-13 01:12:39,196 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-07-13 01:12:39,243 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize


In [9]:
def get_inverse(df, col=None):
    """Return a copy of ``df`` whose string column ``col`` is reversed character by character.

    The input frame is left untouched: the result is built with ``assign`` rather
    than by writing into ``df``. This matters when the function is applied with
    ``map_partitions``, where the same partition object may also be consumed by
    other tasks.

    Parameters
    ----------
    df : DataFrame
        A frame (or partition) containing the string column ``col``.
    col : str, optional
        Name of the column to reverse. Defaults to the module-level
        ``inverse_col`` so existing ``map_partitions(get_inverse)`` calls
        keep working unchanged.

    Returns
    -------
    DataFrame
        A new frame with the same columns, where ``col`` holds the reversed strings.
    """
    if col is None:
        col = inverse_col
    # A slice with start=-1 and step=-1 walks each string from its last character to its first.
    reversed_values = df[col].str.slice(start=-1, stop=None, step=-1)
    return df.assign(**{col: reversed_values})

In [10]:
# For each split: read it, derive a second frame with `inverse_col` passed through
# get_inverse, stack the two frames, and write the result under `out_dir`
# using the same file name as the input.
for split_name, in_file in zip(split_names, split_files):
    out_file = f"{out_dir}/{split_name}"

    print(f"reading file {in_file}")
    split_df = dask_cudf.read_parquet(in_file, partition_size=partition_size)

    print(f"inverting column {inverse_col}...")
    inverse_df = split_df.copy().map_partitions(get_inverse)

    # Original rows followed by transformed rows, with a fresh index and
    # partitions resized to the configured target.
    combined_df = dask_cudf.concat([split_df, inverse_df])
    combined_df = combined_df.reset_index(drop=True)
    combined_df = combined_df.repartition(partition_size=partition_size)

    print(f"saving data to {out_file}")
    combined_df.to_parquet(out_file)

    client.cancel(combined_df)

reading file gs://jcosme/clean_data/genomics/data_splits/train.parquet
inverting column seq...
saving data to gs://jcosme/clean_data/genomics/data_splits_inverse/train.parquet
reading file gs://jcosme/clean_data/genomics/data_splits/val.parquet
inverting column seq...
saving data to gs://jcosme/clean_data/genomics/data_splits_inverse/val.parquet
reading file gs://jcosme/clean_data/genomics/data_splits/test.parquet
inverting column seq...
saving data to gs://jcosme/clean_data/genomics/data_splits_inverse/test.parquet


In [11]:
# Stop the Dask client created earlier in the notebook, logging before and after.
print("shutting down Dask client")
client.shutdown()
print("finished")

shutting down Dask client
finished
