In [1]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
import dask_cudf
import argparse
import yaml
from pprint import pprint
from shutil import rmtree

In [2]:
# Root directory of the project; the config path below is joined onto it.
cur_dir = "/home/jovyan/work/projects/GPU_gene_ETL"
# Path of the k-mer job's YAML config, relative to ``cur_dir``.
config_subdir = "configs/make_kmers_config.yaml"

In [3]:
# Full path of the YAML config file: project root + relative config path.
config_dir = "/".join((cur_dir, config_subdir))

In [4]:
print("loading yaml file...")
# Open the config inside a context manager so the file handle is closed as
# soon as the YAML has been parsed, instead of leaking until garbage
# collection.
with open(config_dir, 'r') as config_file:
    config = yaml.safe_load(config_file)
pprint(config)

loading yaml file...
{'CUDA_VISIBLE_DEVICES': '0',
 'do_cuda_vis_dev': False,
 'dsplit_file': '/home/jovyan/work/projects/GPU_gene_ETL/configs/make_data_splits_config.yaml',
 'in_dir': 'gs://jcosme/clean_data/genomics/data_splits_inverse',
 'input_col_name': 'seq',
 'k_mer': 1,
 'label_col_name': 'label',
 'max_seq_len': 150,
 'out_dir': 'gs://jcosme/clean_data/genomics/data_splits_inverse_1_mer',
 'partition_size': '100M',
 'possible_gene_values': ['A', 'C', 'G', 'T']}


In [5]:
# --- storage locations ---
in_dir = config["in_dir"]            # directory the split parquet files are read from
out_dir = config["out_dir"]          # directory the k-mer parquet files are written to
dsplit_file = config["dsplit_file"]  # YAML file describing the data splits

# --- dataframe column names ---
label_col_name = config["label_col_name"]
input_col_name = config["input_col_name"]

# --- k-mer parameters ---
k_mer = config["k_mer"]
possible_gene_values = config["possible_gene_values"]
max_seq_len = config["max_seq_len"]

# --- Dask / GPU settings ---
CUDA_VISIBLE_DEVICES = config["CUDA_VISIBLE_DEVICES"]
do_cuda_vis_dev = config["do_cuda_vis_dev"]
partition_size = config["partition_size"]

In [6]:
# Load the data-split config; the context manager closes the file handle
# once parsing is done (the bare ``open`` call never closed it).
with open(dsplit_file, 'r') as dsplit_fh:
    dsplit_config = yaml.safe_load(dsplit_fh)
splits = dsplit_config['splits']

# Only the keys of ``splits`` are needed: each key names one parquet dataset
# (``<key>.parquet``) that lives under ``in_dir``.
split_names = [split_key + '.parquet' for split_key in splits]
split_files = [f"{in_dir}/{split_name}" for split_name in split_names]

In [7]:
# Replacement strings: every possible gene value followed by one space,
# index-aligned with ``possible_gene_values``.
replace_gene_values = [gene_val + ' ' for gene_val in possible_gene_values]

In [8]:
# Scratch directory handed to the Dask cluster; clear leftovers from earlier runs.
tmp_dask_dir = '/tmp/dask'
try:
    rmtree(tmp_dask_dir)
except FileNotFoundError:
    # Nothing to clean up: the directory does not exist yet.
    pass
except OSError as err:
    # Cleanup is best-effort, so do not abort the run, but surface the
    # problem instead of silently swallowing it (a bare ``except`` would
    # also have hidden KeyboardInterrupt / SystemExit).
    print(f"warning: could not remove {tmp_dask_dir}: {err}")

In [9]:
print("starting Dask GPU cluster...")
# Build the constructor arguments once; only the CUDA_VISIBLE_DEVICES entry
# depends on the ``do_cuda_vis_dev`` switch from the config.
cluster_kwargs = {"protocol": "ucx", "enable_tcp_over_ucx": True}
if do_cuda_vis_dev:
    # Pass the configured device list through to the cluster.
    cluster_kwargs["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES
cluster_kwargs["local_directory"] = tmp_dask_dir

cluster = LocalCUDACluster(**cluster_kwargs)
client = Client(cluster)

starting Dask GPU cluster...


2022-07-13 01:14:06,054 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-07-13 01:14:06,061 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize


In [10]:
def add_whitespace(df):
    """Put a space after every gene value in the sequence column.

    Reads the module-level ``input_col_name``, ``possible_gene_values`` and
    ``replace_gene_values``. The two lists are passed to ``.str.replace`` as
    pattern/replacement lists with ``regex=False``
    (NOTE(review): presumably replaced pairwise by cuDF — confirm).

    Args:
        df: dataframe partition containing the string column
            ``input_col_name``.

    Returns:
        The same ``df`` object, with ``input_col_name`` overwritten in place.
    """
    spaced = df[input_col_name].str.replace(
        possible_gene_values,
        replace_gene_values,
        regex=False,
    )
    df[input_col_name] = spaced
    return df

def get_kmers(df):
    """Replace each sequence with its space-separated, overlapping k-mers.

    Reads the module-level ``input_col_name`` and ``k_mer``. A window of
    ``k_mer`` characters is slid over every start position of the longest
    sequence in the partition; windows shorter than ``k_mer`` (past the end
    of a shorter sequence) and null sequences contribute a blank, which the
    final ``normalize_spaces`` call removes.

    Differences from the previous implementation:
      * uses the built-in ``range`` instead of ``np.arange`` (``np`` was never
        imported, so the old code raised ``NameError``);
      * iterates over ``max_len - k_mer + 1`` start positions, so the last
        k-mer of the longest sequences is no longer dropped;
      * accumulates in a local Series instead of a scratch ``'temp'`` column,
        so an existing column with that name cannot be clobbered;
      * returns empty partitions untouched instead of failing on a NaN max.

    Args:
        df: dataframe partition containing the string column
            ``input_col_name`` (cuDF is assumed, since
            ``.str.normalize_spaces()`` is used).

    Returns:
        ``df`` with ``input_col_name`` overwritten by the k-mer string.
    """
    if len(df) == 0:
        # No rows: the max length below would be undefined.
        return df

    sequences = df[input_col_name]
    # Null sequences count as length 0 so the maximum is always a real number.
    max_len = int(sequences.str.len().fillna(0).max())

    # A sequence of length L has L - k + 1 k-mers. Always run at least one
    # pass so that, when every sequence is shorter than k, the result is
    # still a non-null (blank) string per row.
    n_windows = max(max_len - k_mer + 1, 1)

    kmers = None
    for start in range(n_windows):
        window = sequences.str[start: start + k_mer].fillna(' ')
        # Incomplete windows (sequence ended early) must not become tokens.
        too_short = window.str.len() < k_mer
        window[too_short] = ' '
        kmers = window if kmers is None else kmers + ' ' + window

    # Collapse the blanks left by incomplete windows and trim the ends.
    df[input_col_name] = kmers.str.normalize_spaces()
    return df

In [11]:
# Process every split: read it, turn the sequences into k-mer token lists,
# and write the result under ``out_dir`` with the same file name.
for split_name, in_file in zip(split_names, split_files):
    out_file = f"{out_dir}/{split_name}"

    # Build the (lazy) Dask dataframe for this split.
    print(f"reading file {in_file}")
    df = dask_cudf.read_parquet(in_file, partition_size=partition_size)

    print(f"creating {k_mer} mer...")
    if k_mer == 1:
        # 1-mers only need a space inserted after each gene value.
        df = df.map_partitions(add_whitespace)
    elif k_mer > 1:
        df = df.map_partitions(get_kmers)

    # Turn the space-separated string into a list of tokens.
    df[input_col_name] = df[input_col_name].str.split()

    # Write the transformed split out.
    print(f"saving data to {out_file}")
    _ = df.to_parquet(out_file)

    client.cancel(df)

reading file gs://jcosme/clean_data/genomics/data_splits_inverse/train.parquet
creating 1 mer...
saving data to gs://jcosme/clean_data/genomics/data_splits_inverse_1_mer/train.parquet




reading file gs://jcosme/clean_data/genomics/data_splits_inverse/val.parquet
creating 1 mer...
saving data to gs://jcosme/clean_data/genomics/data_splits_inverse_1_mer/val.parquet
reading file gs://jcosme/clean_data/genomics/data_splits_inverse/test.parquet
creating 1 mer...
saving data to gs://jcosme/clean_data/genomics/data_splits_inverse_1_mer/test.parquet


In [12]:
# All splits have been written; shut the Dask client down.
print("shutting down Dask client")
client.shutdown()
print("finished")

shutting down Dask client




finished
