In [None]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
import dask_cudf
import argparse
import yaml
from pprint import pprint

In [None]:
# Project root and the config file's location beneath it.
# Note that the sub-path itself starts with a slash.
cur_dir, config_subdir = (
    "/home/jovyan/work/projects/COSME",
    "/configs/fasta_to_parquet_config.yaml",
)

In [None]:
# Full path of the YAML config file: project root + "/" + config sub-path.
# config_subdir already begins with "/", so the result contains a doubled slash.
config_dir = cur_dir + "/" + config_subdir

In [None]:
# Load the pipeline configuration from the YAML file at `config_dir`.
print("loading yaml file...")

# Parse the YAML text into Python objects (the raw file contents are just a
# string and cannot be indexed by key). The context manager closes the handle.
with open(config_dir, 'r') as config_file:
    config = yaml.safe_load(config_file)

# An empty file parses to None and a top-level list/scalar is not a mapping;
# the key lookups in the later cells need a dict, so fail early with a clear message.
if not isinstance(config, dict):
    raise ValueError(
        f"expected a YAML mapping in {config_dir}, got {type(config).__name__}"
    )

# Show the parsed configuration for a quick visual check.
pprint(config)

In [None]:
# Pull the individual settings out of the loaded config into module-level names.
# A missing key raises KeyError. The order of the targets on the left matches
# the order of the keys on the right.
(
    raw_fasta_file,
    clean_fasta_file,
    base_col_names,
    fasta_sep,
    CUDA_VISIBLE_DEVICES,
    do_cuda_vis_dev,
    partition_size,
) = (
    config[key]
    for key in (
        "raw_fasta_file",
        "clean_fasta_file",
        "base_col_names",
        "fasta_sep",
        "CUDA_VISIBLE_DEVICES",
        "do_cuda_vis_dev",
        "partition_size",
    )
)

In [None]:
# Start a LocalCUDACluster and connect a Dask client to it.
print("starting Dask GPU cluster...")

# Options shared by both configurations.
cluster_options = dict(
    protocol="ucx",
    enable_tcp_over_ucx=True,
    local_directory='/tmp',
)

# Only pass an explicit device list when the config asks for it.
if do_cuda_vis_dev:
    cluster_options["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES

cluster = LocalCUDACluster(**cluster_options)
client = Client(cluster)

In [None]:
# Build the Dask dataframe by reading the raw .fasta file as delimited text.
print(f"reading .fasta file {raw_fasta_file}")

# NOTE(review): confirm the installed dask_cudf.read_csv accepts `partition_size`;
# some releases appear to name the partition-size argument differently — TODO verify.
read_options = {
    "sep": fasta_sep,                  # separator from the config (expected to be the '>' sign)
    "names": base_col_names,           # column names
    "dtype": str,                      # read every column as string
    "partition_size": partition_size,
}
df = dask_cudf.read_csv(raw_fasta_file, **read_options)

In [None]:
# Move every value in the 'label' column down by one row to correct the
# offset left by the read step.
print("shifting data...")
shifted = df.assign(label=df["label"].shift())

# Discard rows that contain missing values, then renumber the index.
print("dropping empty rows...")
df = shifted.dropna().reset_index(drop=True)

# Write the cleaned dataframe out as parquet; the return value is not needed.
print(f"saving cleaned data to {clean_fasta_file}")
_ = df.to_parquet(clean_fasta_file)

In [None]:
# Tear down the Dask client now that the output has been written.
print("shutting down Dask client")
client.shutdown()
print("finished")