In [None]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
import dask_cudf
import argparse
import yaml
from pprint import pprint
from glob import glob

In [None]:
# Project root and the config file's location relative to it.
# NOTE(review): the root is an absolute, machine-specific path — adjust per environment.
cur_dir, config_subdir = (
    "/home/jovyan/work/projects/COSME",
    "configs/make_inverse_config.yaml",
)

In [None]:
# Full path to the YAML config file (despite the name, this is a file path, not a directory).
config_dir = "/".join((cur_dir, config_subdir))

In [None]:
print("loading yaml file...")
# Parse the YAML so that `config` is a mapping: the following cell indexes it
# by key (config['in_dir'], ...), which does not work on the raw file text.
# `safe_load` refuses arbitrary Python object tags, and the `with` block
# guarantees the file handle is closed.
with open(config_dir, 'r') as config_file:
    config = yaml.safe_load(config_file)
pprint(config)

In [None]:
# Pull the run settings out of `config`, one variable per key (same order as the key list).
(
    in_dir,
    out_dir,
    inverse_col,
    CUDA_VISIBLE_DEVICES,
    do_cuda_vis_dev,
    partition_size,
) = (
    config[setting_key]
    for setting_key in (
        'in_dir',
        'out_dir',
        'inverse_col',
        'CUDA_VISIBLE_DEVICES',
        'do_cuda_vis_dev',
        'partition_size',
    )
)

In [None]:
# Every parquet file directly inside the input directory, plus its bare file
# name (the text after the last '/'), which is reused as the output file name.
split_files = glob(in_dir + "/*.parquet")
split_names = [file_path.rsplit('/', 1)[-1] for file_path in split_files]

In [None]:
print("starting Dask GPU cluster...")
# Arguments common to both configurations.
cluster_options = {
    "protocol": "ucx",
    "enable_tcp_over_ucx": True,
    "local_directory": '/tmp',
}
if do_cuda_vis_dev:
    # Only pass an explicit device list when the config asks for it.
    cluster_options["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES
cluster = LocalCUDACluster(**cluster_options)
client = Client(cluster)

In [None]:
def get_inverse(df):
    """Return a copy of ``df`` whose ``inverse_col`` strings are reversed.

    Meant to be applied per partition (e.g. through ``map_partitions``).
    The input frame is deliberately left untouched: the result is later
    concatenated with the original data, so mutating the incoming partition
    in place could leak reversed strings into the "original" half whenever
    both share the same underlying partition.

    Parameters
    ----------
    df : DataFrame
        Frame containing the string column named by the module-level
        ``inverse_col``.

    Returns
    -------
    DataFrame
        A new frame; every column other than ``inverse_col`` is unchanged.
    """
    reversed_df = df.copy()
    # Slicing from the last character backwards with step -1 reverses each string.
    reversed_df[inverse_col] = reversed_df[inverse_col].str.slice(start=-1, stop=None, step=-1)
    return reversed_df

In [None]:
# For each input split: read it, build a second copy whose `inverse_col`
# strings are reversed, stack both, and write the result under the same file name.
for in_file, split_name in zip(split_files, split_names):
    out_file = f"{out_dir}/{split_name}"

    print(f"reading file {in_file}")
    df = dask_cudf.read_parquet(
        in_file,  # location of raw file
        partition_size=partition_size,
    )

    print(f"inverting column {inverse_col}...")
    df_inverse = df.copy().map_partitions(get_inverse)
    # Original rows followed by reversed rows; discard the old index and
    # re-split into partitions of the configured size.
    combined = dask_cudf.concat([df, df_inverse])
    df = combined.reset_index(drop=True).repartition(partition_size=partition_size)

    print(f"saving data to {out_file}")
    _ = df.to_parquet(out_file)

    # NOTE(review): `df` is never persisted in this loop, so this cancel is
    # presumably just a precaution — confirm it is still needed.
    client.cancel(df)

In [None]:
# Tear down the Dask client now that every split has been written.
print("shutting down Dask client")
client.shutdown()
print("finished")