In [1]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
import dask_cudf
import argparse
import yaml
from pprint import pprint
from shutil import rmtree

In [2]:
# Project root and the config file's location relative to that root.
# NOTE(review): the root is an absolute, machine-specific path; adjust it
# when running this notebook in a different environment.
cur_dir, config_subdir = (
    '/home/jovyan/work/projects/GPU_gene_ETL',
    'configs/fasta_to_parquet_config.yaml',
)

In [3]:
# Full path of the YAML config file: the project root and the relative
# config path joined with a forward slash. Despite the name, this is a
# file path, not a directory.
config_dir = "/".join((cur_dir, config_subdir))
config_dir  # bare expression so the notebook displays the resolved path

'/home/jovyan/work/projects/GPU_gene_ETL/configs/fasta_to_parquet_config.yaml'

In [4]:
print("loading yaml file...")
# Read the config inside a context manager so the file handle is closed as
# soon as parsing finishes, even if the YAML turns out to be malformed.
with open(config_dir, 'r') as config_file:
    config = yaml.safe_load(config_file)
# Echo the parsed settings so the run records which configuration was used.
pprint(config)

loading yaml file...
{'CUDA_VISIBLE_DEVICES': '0',
 'base_col_names': ['seq', 'label'],
 'do_cuda_vis_dev': False,
 'fasta_sep': '>',
 'in_file': 'gs://jcosme/raw_data/genomics/MarRef.training.fasta',
 'label_col_name': 'label',
 'out_file': 'gs://jcosme/clean_data/genomics/MarRef_training.parquet',
 'partition_size': '100M'}


In [5]:
# Unpack the settings used by the rest of the notebook into module-level names.

# Source and destination of the ETL step.
in_file, out_file = config['in_file'], config['out_file']

# How the FASTA text is parsed: column names, the separator character, and
# the name of the column that carries the record label.
base_col_names, fasta_sep, label_col_name = (
    config['base_col_names'],
    config['fasta_sep'],
    config['label_col_name'],
)

# Cluster options: CUDA_VISIBLE_DEVICES is only passed to the cluster when
# do_cuda_vis_dev is truthy; partition_size is used when reading and when
# repartitioning the dataframe.
CUDA_VISIBLE_DEVICES, do_cuda_vis_dev, partition_size = (
    config['CUDA_VISIBLE_DEVICES'],
    config['do_cuda_vis_dev'],
    config['partition_size'],
)

In [6]:
# Scratch directory handed to the Dask workers as their local_directory.
tmp_dask_dir = '/tmp/dask'
# Best-effort cleanup of leftovers from a previous run. ignore_errors=True
# keeps the "missing directory is fine" behaviour without a bare `except`,
# which would also have swallowed KeyboardInterrupt and SystemExit.
rmtree(tmp_dask_dir, ignore_errors=True)

In [7]:
print("starting Dask GPU cluster...")
# Options common to both configurations: UCX transport with TCP-over-UCX
# enabled, and worker scratch space in the temporary Dask directory.
cluster_kwargs = dict(
    protocol="ucx",
    enable_tcp_over_ucx=True,
    local_directory=tmp_dask_dir,
)
# Only pass an explicit device list when the config asks for it; otherwise
# LocalCUDACluster is left to its default device selection.
if do_cuda_vis_dev:
    cluster_kwargs["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES
cluster = LocalCUDACluster(**cluster_kwargs)
# Connect a client to the freshly started cluster.
client = Client(cluster)

starting Dask GPU cluster...


2022-07-13 00:51:02,511 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-07-13 00:51:02,523 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize


In [8]:
# Build the Dask dataframe by reading the FASTA file as delimited text.
print(f"reading .fasta file {in_file}")
# NOTE(review): confirm that dask_cudf.read_csv actually honours
# `partition_size`; verify the keyword against the installed version.
read_options = {
    "sep": fasta_sep,                  # the '>' character from the config
    "names": base_col_names,           # column names for the parsed fields
    "dtype": str,                      # read every column as a string
    "partition_size": partition_size,  # requested partition size
}
df = dask_cudf.read_csv(in_file, **read_options)

reading .fasta file gs://jcosme/raw_data/genomics/MarRef.training.fasta


In [9]:
# Correct the row offset between labels and sequences by moving every label
# down one row. Presumably each label is parsed onto the row before its
# sequence -- TODO confirm against the input file.
# Caution: this cell is not idempotent; re-running it shifts the labels again.
print("shifting data...")
shifted_labels = df[label_col_name].shift(periods=1)
df[label_col_name] = shifted_labels

shifting data...


In [10]:
# Clean-up pass over the shifted dataframe.
print("dropping empty & duplicate rows...")
df = (
    df.dropna()                                  # drop rows with any missing value
    .drop_duplicates()                           # keep one copy of each identical row
    .reset_index(drop=True)                      # discard the old index
    .repartition(partition_size=partition_size)  # rebalance to the configured size
)

dropping empty & duplicate rows...


In [11]:
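# Optional row-count check (kept disabled); uncomment the next line to run it.
# len(df)  # 14864656 rows when last executed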

14864656

In [12]:
# Last step of the pipeline: write the cleaned dataframe to `out_file` in
# Parquet format.
print(f"saving cleaned data to {out_file}")
# The result is bound to `_` so the notebook does not display it.
_ = df.to_parquet(out_file)

saving cleaned data to gs://jcosme/clean_data/genomics/MarRef_training.parquet


In [13]:
# Ask the client to cancel any work still associated with `df`, then tear
# the client down.
client.cancel(df)
print("shutting down Dask client")
client.shutdown()
print("finished")

shutting down Dask client
finished
