In [1]:
# Initialization
import pyhdk 
import pandas
import time
import pyarrow as pa
import pyarrow.csv
import os, sys

# Build the single HDK configuration object shared by the data manager, Calcite
# and the executor created below. The keyword flags switch on the heterogeneous
# execution options and the debug timer; their exact semantics are defined by
# pyhdk -- NOTE(review): confirm against the installed pyhdk version.
config = pyhdk.buildConfig(enable_heterogeneous=True,
                           force_heterogeneous_distribution=True,
                           enable_multifrag_heterogeneous=True,
                           enable_debug_timer=True,
                           )
# Logger is initialised at INFO severity before the remaining components are created.
pyhdk.initLogger(log_severity="INFO")
# NOTE(review): the positional argument 1 is presumably a schema/storage id -- confirm.
storage = pyhdk.storage.ArrowStorage(1)
data_mgr = pyhdk.storage.DataMgr(config)
# Make the Arrow storage visible to the data manager.
data_mgr.registerDataProvider(storage)

# Module-level singletons used by every helper in this notebook:
# `calcite` turns SQL into the plan passed to RelAlgExecutor, `executor` runs it.
calcite = pyhdk.sql.Calcite(storage, config)
executor = pyhdk.Executor(data_mgr, config)

In [2]:
# Helper Functions
# Default increment between the GPU proportions swept by range(0, 101, step).
# These two values are bound as parameter defaults when the helpers below are
# defined, so changing them afterwards requires re-running the cell.
default_step = 50
# Default number of timed repetitions of a query at each GPU proportion.
default_iters = 3

def get_rel_alg(sql):
    """
    Hands the SQL string to the notebook-level `calcite` object and returns
    whatever `calcite.process` produces (the plan later given to RelAlgExecutor).
    """
    rel_alg = calcite.process(sql)
    return rel_alg

def run_query(sql):
    """
    Plans and executes one SQL statement, returning the result of `execute()`.
    A fresh RelAlgExecutor is built for every call.
    """
    plan = get_rel_alg(sql)
    return pyhdk.sql.RelAlgExecutor(executor, storage, data_mgr, plan).execute()


def import_hdk_pyarrow(storage, arrow_table, hdk_table_name, fragment_size, overwrite=True):
    """
    Imports a pyarrow table to HDK with the given fragment size and prints the
    elapsed import time.

        storage: object providing importArrowTable() and dropTable().
        arrow_table: table to import.
        hdk_table_name: name under which the table is registered.
        fragment_size: value passed to pyhdk.storage.TableOptions.
        overwrite: By default overwrites previously existing table. When the
            first import attempt fails, the table is dropped and imported again.

    Raises:
        Exception: when the first import attempt fails and overwrite is False;
            the original error is attached as the cause.
    """
    opt = pyhdk.storage.TableOptions(fragment_size)
    start_timer = time.perf_counter()
    try:
        storage.importArrowTable(arrow_table, hdk_table_name, opt)
    except Exception as import_error:
        # Catch only ordinary errors: KeyboardInterrupt / SystemExit must propagate
        # instead of silently triggering a drop-and-reimport of the table.
        if not overwrite:
            raise Exception(
                f"Cannot overwrite table {hdk_table_name}, overwrite={overwrite}"
            ) from import_error
        # A failed import is taken to mean the table already exists; replace it.
        # If the drop or retry fails, Python chains that error to the first one.
        storage.dropTable(hdk_table_name)
        storage.importArrowTable(arrow_table, hdk_table_name, opt)
    print(f"[PyHDK] Importing pyarrow table: {(time.perf_counter()-start_timer):.4f}s")


def run_query_het_all_props(sql, query_name="", prop_step=default_step, n_iters=default_iters, clear_memory_devices=[]):
    """
    Runs SQL query multiple times at each proportion, feel free try and experiment with loops order.

        sql: query text, planned once and re-executed for every measurement.
        query_name: suffix used in the timing column name.
        prop_step: increment between forced GPU proportions in range(0, 101, prop_step).
        n_iters: number of timed executions per proportion (at least 1).
        clear_memory_devices: device names ("CPU", "GPU") whose memory is cleared
            through executor.clearMemory after every timed execution.

    Returns:
        [df_prop_time, df_output]: a DataFrame with one row per timed execution
        (GPU proportion and elapsed seconds; the timing column name is suffixed
        with the output size in KB) and the pandas form of the last result.

    Raises:
        ValueError: if prop_step or n_iters is smaller than 1.
        KeyError: if clear_memory_devices contains an unknown device name.
    """
    # Reject arguments that would leave the loops empty and `result` unbound.
    if prop_step < 1:
        raise ValueError(f"prop_step must be >= 1, got {prop_step}")
    if n_iters < 1:
        raise ValueError(f"n_iters must be >= 1, got {n_iters}")

    cython_enum_dict = {"CPU":1, "GPU":2} # May move up to cython for easier interface
    # Resolve device names up front so a typo fails before any query is run.
    device_ids = [cython_enum_dict[device] for device in clear_memory_devices]

    ra = get_rel_alg(sql)
    col_names = ["GPU_prop", f"QueryT_{query_name}"]
    prop_time = {col_names[0] : [], col_names[1]: []}
    result = None
    # Walking over proportions
    for gpu_proportion in range(0, 101, prop_step):
        # Multiple iterations
        for _ in range(n_iters):
            rel_alg_executor = pyhdk.sql.RelAlgExecutor(executor, storage, data_mgr, ra)
            query_start = time.perf_counter()
            result = rel_alg_executor.execute(forced_gpu_proportion=gpu_proportion)
            query_finish = time.perf_counter()
            prop_time[col_names[0]].append(gpu_proportion)
            prop_time[col_names[1]].append(query_finish - query_start)
            # Memory is cleared outside the timed region.
            for device_id in device_ids:
                executor.clearMemory(data_mgr, device_id)

    # Built once, after all measurements have been collected.
    df_prop_time = pandas.DataFrame(prop_time, columns=col_names)
    # Some metadata to get idea about the output cardinality
    df_output = result.to_arrow().to_pandas()
    output_size_KB = df_output.memory_usage(index=True).sum() // (1024)
    df_prop_time = df_prop_time.rename(columns={col_names[1]:f"{col_names[1]}_{output_size_KB}KB"})
    return [df_prop_time, df_output]

def run_queries_all_props(query_dict, step=default_step, n_iters=default_iters, clear_memory_devices=[]):
    """
    Benchmarks every query of a {query_name: SQL_string} dictionary across GPU proportions.

        step: increment between GPU proportions, forwarded as prop_step.
        n_iters: timed repetitions per proportion.
        clear_memory_devices: clear memory of the device manager after each query: "CPU", "GPU"

    Returns a dict {query_name: timings DataFrame indexed by "GPU_prop"}; the
    query outputs themselves are discarded.
    """
    timings_by_query = {}
    for name, sql_text in query_dict.items():
        timings, _unused_output = run_query_het_all_props(
            sql_text,
            query_name=name,
            prop_step=step,
            n_iters=n_iters,
            clear_memory_devices=clear_memory_devices,
        )
        timings.set_index("GPU_prop", inplace=True)
        timings_by_query[name] = timings
    return timings_by_query

def fragment_size_calc(num_rows):
    """
    Heuristic fragment size taken from Modin, you can experiment with it.

    Splits the rows evenly over the CPUs reported by os.cpu_count() and clamps
    the result to the range [2**18, 2**25]. Returns None when the CPU count is
    unavailable.
    """
    n_cpus = os.cpu_count()
    if n_cpus is None:
        return None
    smallest_allowed, largest_allowed = 2**18, 2**25
    rows_per_cpu = num_rows // n_cpus
    return max(min(rows_per_cpu, largest_allowed), smallest_allowed)

def fragment_size_test_range(num_rows):
    """
    Lists the fragment sizes to benchmark: starting two halvings below the
    default fragment size and doubling up to two doublings above it,
    i.e. [x/4, x/2, x, x*2, x*4]. Also prints the default size.
    """
    baseline = fragment_size_calc(num_rows)
    print(f"Default fragment_size={baseline}")
    doublings_each_side = 2
    scale = 2**doublings_each_side
    smallest, largest = baseline // scale, baseline * scale
    candidates = []
    size = smallest
    # Keep doubling until the upper bound (inclusive) is passed.
    while size <= largest:
        candidates.append(size)
        size *= 2
    return candidates

def test_groups_fragment_sizes(storage, pyarrow_tbl, table_name, q_dict, step, n_iters, clear_memory_devices=[]):
    """ 
    Produces the follwing result grouping: fragment_size{query_name{timings_df}}
    """
    part_group_timings_dict = dict()
    for frag_size in fragment_size_test_range(pyarrow_tbl.num_rows):
        table_size_MB = pyarrow_tbl.nbytes // (1024*1024)
        print(f"Testing {table_size_MB}MB Table with Frag.size={frag_size}")
        import_hdk_pyarrow(storage, pyarrow_tbl, table_name, frag_size)
        part_group_timings_dict[f"Tbl_size_{table_size_MB}MB_frag_size_{frag_size}"] = run_queries_all_props(q_dict, step, n_iters, clear_memory_devices)
    return part_group_timings_dict

In [3]:
import numpy as np

# Cardinalities of the GROUP BY key columns: 100, 200, ..., 900 distinct values.
# Other cardinalities (e.g. 512, 513, 1000, 5000, 10000) can be substituted here.
num_groups = [100 * i for i in range(1, 10)]
total_rows = 90000000

# Fixed seed so that every Restart & Run All benchmarks identical data.
random_seed = 42
rng = np.random.default_rng(random_seed)

table_columns = []
column_data = []
for group in num_groups:
    # Uniformly distributed integer keys in [0, group).
    groups = rng.integers(0, group, total_rows)
    column_name = f"group_{group}"
    column = pa.array(groups, pa.int64())
    table_columns.append(pa.field(column_name, column.type))
    # Keep the converted Arrow array (not the raw numpy array) so the data matches
    # the declared field type and is not converted a second time by from_arrays.
    column_data.append(column)

# float64 payload column that the queries can aggregate over.
aggregated_data = rng.uniform(0.0, 1000000.0, total_rows)
aggregated_column = pa.array(aggregated_data, pa.float64())
table_columns.append(pa.field("aggregated_data", aggregated_column.type))
column_data.append(aggregated_column)

table_schema = pa.schema(table_columns)
groups_tbl = pa.Table.from_arrays(column_data, schema=table_schema)

# print(groups_tbl)

In [4]:
# Fragment size the heuristic would choose; not used by the import below.
default_fragment_size = fragment_size_calc(groups_tbl.num_rows)
# Import with fragment_size equal to the row count (presumably so the whole table
# lands in one fragment -- confirm). Derived from the table, not a hard-coded copy.
import_hdk_pyarrow(storage, groups_tbl, "groups_table", groups_tbl.num_rows)

[PyHDK] Importing pyarrow table: 0.1585s


In [5]:
# Proportion forced onto the GPU for every run below (same 0-100 scale the
# sweep helper iterates over).
gpu_prop = 100
# Run a COUNT(*) GROUP BY over the first four key columns; only the last
# `result` stays bound after the loop.
for group in num_groups[:4]:
    # Alternative aggregate to experiment with:
    # ra = get_rel_alg(f"SELECT MIN({groups_tbl.column_names[-1]}), MAX({groups_tbl.column_names[-1]}) FROM groups_table GROUP BY group_{group};")
    ra = get_rel_alg(f"SELECT COUNT(*) FROM groups_table GROUP BY group_{group};")
    rel_alg_executor = pyhdk.sql.RelAlgExecutor(executor, storage, data_mgr, ra)
    result = rel_alg_executor.execute(forced_gpu_proportion=gpu_prop)

In [10]:
# HDK Cleanup
# Drop the table by the literal name it was imported under. No cell visible in
# this notebook defines `table_name` at module scope, so the previous
# `storage.dropTable(table_name)` raised NameError on a fresh kernel.
storage.dropTable("groups_table")