In [1]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
import dask_cudf
import argparse
import yaml
from pprint import pprint
from shutil import rmtree

In [2]:
# Project root on the notebook host, and the YAML config location relative to it.
cur_dir = "/home/jovyan/work/projects/GPU_gene_ETL"
config_subdir = "configs/convert_labels_to_species_config.yaml"

In [3]:
# Full path of the YAML config file (despite the name, this points at a file,
# not a directory).
config_dir = "/".join((cur_dir, config_subdir))

In [4]:
print(f"loading yaml file...")
# Parse the job settings. The context manager closes the file handle as soon as
# parsing finishes instead of leaving an open handle for the garbage collector.
with open(config_dir, 'r') as config_fh:
    config = yaml.safe_load(config_fh)
# Echo the parsed settings so the run is self-documenting.
pprint(config)

loading yaml file...
{'CUDA_VISIBLE_DEVICES': '0',
 'do_cuda_vis_dev': False,
 'in_file': 'gs://jcosme/clean_data/genomics/MarRef_inverse_rm.parquet',
 'label_col_name': 'label',
 'label_regex': '(?:[^a-zA-Z0-9]+)([a-zA-Z]+[0-9]+)(?:[^a-zA-Z0-9]+)',
 'out_file': 'gs://jcosme/clean_data/genomics/MarRef_species.parquet',
 'partition_size': '100M',
 'unq_label_out_file': 'gs://jcosme/clean_data/genomics/MarRef_species_unq.parquet'}


In [5]:
# Unpack the settings used by the rest of the notebook into plain names.

# -- input / output locations --
in_file = config['in_file']
out_file = config['out_file']
unq_label_out_file = config['unq_label_out_file']

# -- label extraction --
label_col_name = config['label_col_name']
label_regex = config['label_regex']

# -- cluster and partitioning --
CUDA_VISIBLE_DEVICES = config['CUDA_VISIBLE_DEVICES']
do_cuda_vis_dev = config['do_cuda_vis_dev']
partition_size = config['partition_size']

In [6]:
# Scratch directory handed to the Dask workers; clear leftovers from earlier runs.
tmp_dask_dir = '/tmp/dask'
try:
    rmtree(tmp_dask_dir)
except FileNotFoundError:
    # Nothing to clean up: the directory does not exist yet.
    pass
except OSError as cleanup_err:
    # Cleanup stays best-effort, but the failure is reported instead of being
    # silently swallowed. Unlike a bare `except:`, this does not trap
    # KeyboardInterrupt or SystemExit.
    print(f"warning: could not remove {tmp_dask_dir}: {cleanup_err}")

In [7]:
print(f"starting Dask GPU cluster...")
# Options shared by both configurations. Keeping them in one place means the
# two variants cannot drift apart.
cluster_kwargs = dict(
    protocol="ucx",
    enable_tcp_over_ucx=True,
    local_directory=tmp_dask_dir,
)
# Only pass an explicit device list when the config asks for it; otherwise
# LocalCUDACluster is left to apply its own default.
if do_cuda_vis_dev:
    cluster_kwargs["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES
cluster = LocalCUDACluster(**cluster_kwargs)
client = Client(cluster)

starting Dask GPU cluster...


2022-07-13 00:56:44,854 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-07-13 00:56:44,882 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize


In [8]:
# Build the Dask dataframe from the raw input location named in the config.
print(f"reading file {in_file}")
df = dask_cudf.read_parquet(in_file, partition_size=partition_size)

reading file gs://jcosme/clean_data/genomics/MarRef_inverse_rm.parquet


In [9]:
#len(df)
#7432328

7432328

In [10]:
def extract_labels(df, col_name=None, regex=None):
    """Replace the label column with the first capture group of a regex.

    Intended to be applied per partition via ``map_partitions``. The input
    frame is not modified; a new frame with the rewritten column is returned.

    Parameters
    ----------
    df : DataFrame
        A partition with a string column to rewrite.
    col_name : str, optional
        Column to rewrite. Defaults to the notebook-level ``label_col_name``.
    regex : str, optional
        Pattern with at least one capture group. Defaults to the
        notebook-level ``label_regex``.

    Returns
    -------
    DataFrame
        Copy of ``df`` where ``col_name`` holds the text matched by the first
        capture group (a missing value where the pattern does not match).
    """
    # Fall back to the notebook configuration so existing single-argument
    # callers (e.g. df.map_partitions(extract_labels)) behave as before.
    if col_name is None:
        col_name = label_col_name
    if regex is None:
        regex = label_regex
    # str.extract returns one column per capture group; column 0 is the first.
    extracted = df[col_name].str.extract(regex).loc[:, 0]
    # assign() builds a new frame, so the caller's partition is left untouched.
    return df.assign(**{col_name: extracted})

In [11]:
# Apply the label extraction to every partition.
# NOTE: this rebinds `df`, so re-running the cell applies the regex to labels
# that were already extracted.
print(f"extracting labels...")
df = df.map_partitions(extract_labels)

extracting labels...


In [12]:
#len(df)
#7432328

7432328

In [16]:
# df[label_col_name].value_counts().compute()

In [None]:
# Final step of the main pipeline: write the cleaned data to the output location.
print(f"saving data to {out_file}")
_ = df.to_parquet(out_file)

In [14]:
print(f"creating unique labels...")
# Release the previous dataframe and re-read the data from out_file.
del df
df = dask_cudf.read_parquet(out_file)
# Distinct labels as a single-column frame with a fresh index.
unq_labs_df = (
    df[label_col_name]
    .unique()
    .to_frame()
    .reset_index(drop=True)
)

creating unique labels...
saving data to gs://jcosme/clean_data/genomics/MarRef_species_unq.parquet


In [None]:
# Write the table of distinct labels to its own parquet location.
print(f"saving data to {unq_label_out_file}")
_ = unq_labs_df.to_parquet(unq_label_out_file)

In [19]:
# Tear the Dask client down now that all outputs have been written.
print(f"shutting down Dask client")
client.shutdown()
print(f"finished")

shutting down Dask client
finished


2022-07-12 22:39:43,249 - distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
