In [None]:
import argparse
import os
from glob import glob
from pprint import pprint

import dask_cudf
import numpy as np
import yaml
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

In [None]:
# Path of the YAML config file, relative to the project root defined below.
config_subdir = "configs/make_kmers_config.yaml"
# Absolute path of the project root that the config path is resolved against.
cur_dir = "/home/jovyan/work/projects/COSME"

In [None]:
# Full path of the YAML config file (despite the "_dir" suffix this is a file path).
config_dir = "/".join((cur_dir, config_subdir))

In [None]:
print(f"loading yaml file...")
# Parse the YAML into a mapping: the cells below index `config` by key, which
# would fail on the raw file text. safe_load is used so that the config file
# cannot instantiate arbitrary Python objects, and the `with` block makes sure
# the file handle is closed.
with open(config_dir, 'r') as config_file:
    config = yaml.safe_load(config_file)
pprint(config)

In [None]:
# Unpack the settings used by the rest of the notebook. Every key is required:
# a missing one raises KeyError here rather than later in the pipeline.

# Directory scanned for input *.parquet files / directory the results are written to.
in_dir, out_dir = config['in_dir'], config['out_dir']

# Column names; only input_col_name is used in the cells visible below.
label_col_name, input_col_name = config['label_col_name'], config['input_col_name']

# Tokenisation settings: window size and the symbols that get a trailing space
# when k_mer == 1. max_seq_len is read but not used in the cells visible below.
k_mer = config['k_mer']
possible_gene_values = config['possible_gene_values']
max_seq_len = config['max_seq_len']

# GPU cluster settings: CUDA_VISIBLE_DEVICES is only forwarded to the cluster
# when do_cuda_vis_dev is truthy.
CUDA_VISIBLE_DEVICES, do_cuda_vis_dev = config['CUDA_VISIBLE_DEVICES'], config['do_cuda_vis_dev']

# Forwarded to dask_cudf.read_parquet for each input file.
partition_size = config['partition_size']

In [None]:
# Collect the input parquet files. glob() returns matches in arbitrary order,
# so sort them to make the processing order reproducible between runs.
split_files = sorted(glob(f"{in_dir}/*.parquet"))
# File name of each input (without its directory), reused as the output name.
split_names = [os.path.basename(path) for path in split_files]

In [None]:
# Replacement strings for add_whitespace: each gene value followed by one space,
# in the same order as possible_gene_values.
replace_gene_values = [gene_val + ' ' for gene_val in possible_gene_values]

In [None]:
print(f"starting Dask GPU cluster...")
# Options shared by both configurations: UCX protocol with TCP-over-UCX
# enabled, and /tmp as the workers' local directory.
cluster_kwargs = {
    "protocol": "ucx",
    "enable_tcp_over_ucx": True,
    "local_directory": '/tmp',
}
# Only pass an explicit device list when the config asks for it.
if do_cuda_vis_dev:
    cluster_kwargs["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES
cluster = LocalCUDACluster(**cluster_kwargs)
client = Client(cluster)

In [None]:
def add_whitespace(df):
    """Replace each gene value in ``input_col_name`` with its spaced counterpart.

    The column is rewritten in place on ``df`` by a literal (non-regex)
    replacement of ``possible_gene_values`` with ``replace_gene_values``.
    Both are module-level lists passed straight to ``.str.replace``; this
    relies on the string accessor accepting list-valued patterns and
    replacements (presumably cuDF behaviour -- confirm).

    Parameters
    ----------
    df : dataframe partition containing the ``input_col_name`` column.

    Returns
    -------
    The same ``df`` object, with ``input_col_name`` updated.
    """
    spaced = df[input_col_name].str.replace(
        possible_gene_values,
        replace_gene_values,
        regex=False,
    )
    df[input_col_name] = spaced
    return df

def get_kmers(df):
    """Replace each sequence in ``input_col_name`` with its space-separated k-mers.

    Every window of ``k_mer`` consecutive characters (stride 1) is extracted
    and the windows are joined with single spaces. Rows that are null or
    shorter than ``k_mer`` become an empty string. ``input_col_name`` and
    ``k_mer`` are read from module level.

    Parameters
    ----------
    df : dataframe partition containing the ``input_col_name`` column.
        Presumably a cuDF frame, since ``str.normalize_spaces`` is used --
        confirm against the caller.

    Returns
    -------
    A new frame with the same columns as ``df`` in which ``input_col_name``
    holds the k-mer string. The input partition is not modified and no
    temporary column is created on it.
    """
    sequences = df[input_col_name]

    # A sequence of length L has L - k + 1 windows, i.e. the last valid start
    # offset is L - k inclusive. Stopping at L - k (exclusive) would drop the
    # final k-mer of the longest sequences.
    try:
        n_windows = int(sequences.str.len().max()) - k_mer + 1
    except (TypeError, ValueError):
        # Empty or all-null partition: max() is not an integer (e.g. NaN/NA).
        n_windows = 0

    # Always evaluate at least the window at offset 0 so that null / too-short
    # rows still go through the blanking step and end up as empty strings.
    joined = None
    for start in range(max(n_windows, 1)):
        window = sequences.str[start:start + k_mer].fillna(' ')
        # Windows that run past the end of a shorter sequence come back
        # truncated; blank them so no partial k-mer is emitted.
        window[window.str.len() < k_mer] = ' '
        joined = window if joined is None else joined + ' ' + window

    # Collapse the runs of blanks left by the placeholders and trim the ends.
    return df.assign(**{input_col_name: joined.str.normalize_spaces()})

In [None]:
# Process every input split independently: read, tokenise, write, release.
for in_file, split_name in zip(split_files, split_names):
    out_file = f"{out_dir}/{split_name}"

    # Build the Dask dataframe for this split from the raw parquet file.
    print(f"reading file {in_file}")
    df = dask_cudf.read_parquet(in_file, partition_size=partition_size)

    # k > 1 builds sliding-window k-mers; k == 1 only puts a space after each
    # gene value. Any other k leaves the column untouched before the split.
    print(f"creating {k_mer} mer...")
    if k_mer > 1:
        df = df.map_partitions(get_kmers)
    elif k_mer == 1:
        df = df.map_partitions(add_whitespace)

    # Turn the space-separated string into a list of tokens.
    tokens = df[input_col_name].str.split()
    df[input_col_name] = tokens

    # Write the processed split under the same file name in out_dir.
    print(f"saving data to {out_file}")
    _ = df.to_parquet(out_file)

    # Ask the scheduler to drop anything still held for this dataframe.
    client.cancel(df)

In [None]:
# Tear down the Dask client once every split has been written.
print("shutting down Dask client")
client.shutdown()
print("finished")