In [1]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
import dask_cudf
import argparse
import yaml
from pprint import pprint
from shutil import rmtree

In [2]:
# Location of the run configuration: a path relative to the project root,
# and the project root itself on the notebook host.
config_subdir = "configs/remove_inverse_config.yaml"
cur_dir = "/home/jovyan/work/projects/GPU_gene_ETL"

In [3]:
# Full path of the YAML config file (despite the name, this is a file path,
# not a directory): project root and relative config path joined by "/".
config_dir = "/".join((cur_dir, config_subdir))

In [4]:
# Load the run configuration from the YAML file at `config_dir` and echo it
# so the parameters used for this run are visible in the notebook output.
print("loading yaml file...")
# Use a context manager so the file handle is closed as soon as parsing is
# done, instead of leaking it for the lifetime of the kernel.
with open(config_dir, 'r') as config_file:
    config = yaml.safe_load(config_file)
pprint(config)

loading yaml file...
{'CUDA_VISIBLE_DEVICES': '0',
 'base_col_names': ['seq', 'label'],
 'do_cuda_vis_dev': False,
 'drop_col': 'label',
 'in_file': 'gs://jcosme/clean_data/genomics/MarRef_training.parquet',
 'out_file': 'gs://jcosme/clean_data/genomics/MarRef_inverse_rm.parquet',
 'partition_size': '100M',
 'remove_str': '/2'}


In [5]:
# Pull each setting out of the loaded config into its own notebook variable.

# Input / output parquet locations.
in_file = config["in_file"]
out_file = config["out_file"]

# Column settings: `drop_col` is the column inspected by the filter step and
# `remove_str` is the substring that marks a row for removal.
base_col_names = config["base_col_names"]
remove_str = config["remove_str"]
drop_col = config["drop_col"]

# GPU cluster settings: `CUDA_VISIBLE_DEVICES` is only passed to the cluster
# when `do_cuda_vis_dev` is truthy.
CUDA_VISIBLE_DEVICES = config["CUDA_VISIBLE_DEVICES"]
do_cuda_vis_dev = config["do_cuda_vis_dev"]

# Forwarded to the parquet reader.
partition_size = config["partition_size"]

In [6]:
# Scratch directory handed to the Dask cluster as its `local_directory`.
# Remove whatever is already at that path before the cluster starts.
tmp_dask_dir = '/tmp/dask'
try:
    rmtree(tmp_dask_dir)
except FileNotFoundError:
    # Nothing to clean up: the directory does not exist yet.
    pass
except OSError as err:
    # Cleanup is best-effort, so keep going, but report the problem instead
    # of hiding it (a bare `except:` would also swallow KeyboardInterrupt).
    print(f"warning: could not remove {tmp_dask_dir}: {err}")

In [7]:
print(f"starting Dask GPU cluster...")

# Options shared by both configurations: UCX protocol with TCP over UCX
# enabled, and the scratch directory used as the cluster's local directory.
cluster_kwargs = {
    "protocol": "ucx",
    "enable_tcp_over_ucx": True,
    "local_directory": tmp_dask_dir,
}

# Only pin the cluster to specific devices when the config asks for it.
if do_cuda_vis_dev:
    cluster_kwargs["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES

cluster = LocalCUDACluster(**cluster_kwargs)
client = Client(cluster)

starting Dask GPU cluster...


2022-07-13 00:54:42,066 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-07-13 00:54:42,096 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize


In [8]:
# Open the parquet input at `in_file` as a Dask-cuDF dataframe.
# NOTE(review): confirm `partition_size` is an option this reader honors.
print(f"reading file {in_file}")
df = dask_cudf.read_parquet(in_file, partition_size=partition_size)

reading file gs://jcosme/clean_data/genomics/MarRef_training.parquet


In [9]:
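# Earlier map_partitions-based filter, kept for reference only; the
# str.contains filter further down replaces it.
# def remove_str_rows(df):
#     keep_mask = df[drop_col].str.rfind(remove_str) == -1
#     df = df.loc[keep_mask]
#     return df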

In [10]:
# Disabled call site for the commented-out remove_str_rows helper above:
# print(f"dropping rows with '{remove_str}' from {drop_col}")
# df = df.map_partitions(remove_str_rows).reset_index(True)

In [11]:
# Row count recorded at this point, before the filter cell below:
# len(df)  ->  14864656

In [12]:
# Flag every row whose `drop_col` value contains `remove_str` as a literal
# substring (regex matching is disabled), then keep only the unflagged rows.
drop_col_values = df[drop_col]
has_str_mask = drop_col_values.str.contains(remove_str, regex=False)
df = df.loc[~has_str_mask]

In [13]:
# Row count recorded at this point, after the filter cell above:
# len(df)  ->  7432328

In [14]:
# Final step: write the filtered dataframe to the configured output location.
print(f"saving data to {out_file}")
# Binding the return value to `_` keeps the cell from echoing it as output.
_ = df.to_parquet(out_file)

saving data to gs://jcosme/clean_data/genomics/MarRef_inverse_rm.parquet


In [15]:
# Tear-down: ask the client to cancel anything still associated with `df`,
# then shut the client down. Run this only after the write cell has finished.
# NOTE(review): presumably `client.shutdown()` also stops the local cluster
# created earlier -- confirm against the distributed documentation.
client.cancel(df)
print(f"shutting down Dask client")
client.shutdown()
print(f"finished")

shutting down Dask client
finished
