In [None]:
import pyhdk
import time
import pandas as pd
import pyarrow as pa
import numpy as np
import os

# Create the HDK instance that every cell below uses.
# NOTE(review): judging only by their names, the three enabled flags turn on
# mixed CPU+GPU ("heterogeneous") execution and force work to be distributed
# across devices -- confirm the exact semantics against the HDK config docs.
hdk = pyhdk.hdk.HDK(
    enable_heterogeneous=True,
    force_heterogeneous_distribution=True,
    enable_multifrag_heterogeneous=True,
    # Optional diagnostics, disabled by default:
    # enable_debug_timer=True,
    # debug_logs="INFO" # generates log file, DEBUG2 for more verbosity 
) 

In [None]:
def synthensizeTable(num_groups_per_col,
                    num_rows,
                    random_data_col_dt=pa.float64(),
                    chunk_size=None,
                    seed=None):
    """
    Build a synthetic Arrow table for GROUP BY benchmarks.

    For every entry ``g`` in ``num_groups_per_col`` the table gets an int64 column
    named ``group_<g>`` filled with random integers drawn from ``[0, g)``, so the
    column holds at most ``g`` distinct keys. A final column ``rand_data`` is filled
    with random values for MIN/MAX/AVG/... reductions.

    Parameters
    ----------
    num_groups_per_col : iterable of int
        One grouping column is generated per entry.
    num_rows : int
        Number of rows in every column; must be positive.
    random_data_col_dt : pyarrow.DataType
        Type of ``rand_data``. Floating types get uniform values in
        [0, 1000000), anything else gets integers in [0, 1000000).
    chunk_size : int, optional
        Rows per Arrow chunk. Used to simulate reading a file with Arrow, which
        yields a chunked array (affects materialization). Defaults to
        ``num_rows``, i.e. a single chunk.
    seed : int, optional
        When given, data comes from a private ``np.random.RandomState(seed)`` so
        the table is reproducible. When omitted, the global ``np.random`` state
        is used exactly as before.

    Returns
    -------
    pyarrow.Table

    Raises
    ------
    ValueError
        If ``num_rows`` or ``chunk_size`` is not positive.
    """
    if num_rows <= 0:
        raise ValueError(f"num_rows must be a positive integer, got {num_rows}")
    if chunk_size is None:
        chunk_size = num_rows
    elif chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # The legacy RandomState exposes the same randint/uniform API as the
    # np.random module, so both code paths below are shared and the unseeded
    # behaviour is unchanged.
    rng = np.random if seed is None else np.random.RandomState(seed)

    def to_chunked(values, arrow_type):
        """Slice a numpy array into chunk_size pieces and wrap them as a ChunkedArray."""
        return pa.chunked_array([
            pa.array(values[start:start + chunk_size], arrow_type)
            for start in range(0, len(values), chunk_size)
        ])

    data_col_dt = pa.int64()
    column_names = []
    column_data = []
    for groups_count in num_groups_per_col:
        group_keys = rng.randint(0, groups_count, num_rows)
        column_names.append(f"group_{groups_count}")
        column_data.append(to_chunked(group_keys, data_col_dt))

    if pa.types.is_floating(random_data_col_dt):
        random_data = rng.uniform(0.0, 1000000.0, num_rows)
    else:
        random_data = rng.randint(0, 1000000, num_rows)
    column_names.append("rand_data")
    column_data.append(to_chunked(random_data, random_data_col_dt))

    groups_tbl = pa.Table.from_arrays(column_data, names=column_names)

    # The size estimate refers to one int64 grouping column.
    print(f"One column has {num_rows/(1000000)} Mil. rows and takes {(num_rows*data_col_dt.bit_width//8)/(1024*1024):.2f} MiB")
    print(f"Chunk size: {len(groups_tbl.column(0).chunks[0])}")
    return groups_tbl

def fragment_size_calc(num_rows):
    """Derive a fragment size from the row count (heuristic taken from Modin).

    The rows are divided evenly across ``os.cpu_count()`` and the result is
    clamped to the range ``[2**18, 2**25]``. Returns ``None`` when the CPU
    count cannot be determined. Feel free to experiment with the bounds.
    """
    n_cpus = os.cpu_count()
    if n_cpus is None:
        return None

    smallest_fragment, largest_fragment = 2**18, 2**25
    rows_per_cpu = num_rows // n_cpus
    return max(smallest_fragment, min(rows_per_cpu, largest_fragment))

In [None]:
# First experiment: n_data_cols grouping columns whose key counts are
# 500, 1000, ..., 500 * n_data_cols.
n_data_cols = 5
num_groups = list(range(500, 500 * n_data_cols + 1, 500))
# Alternative key counts to try:
# num_groups = [200, 512, 513, 1000, 2000, 5000, 10000]
tbl = synthensizeTable(
    num_groups_per_col=num_groups,
    num_rows=20_000_000,
    # random_data_col_dt=pa.int64(),
    chunk_size=50000,
)


In [None]:
# Import the Arrow table into HDK under a fixed name; the third argument is
# the value returned by fragment_size_calc for this row count.
table_name = "groups_table"
default_fragment_size = fragment_size_calc(tbl.num_rows)
hdk_tbl = hdk.import_arrow(tbl, table_name, default_fragment_size)

In [None]:
# Query option passed to every hdk.sql call in this cell.
# NOTE(review): presumably a percentage (0..100) of the work forced onto the
# GPU -- confirm against the HDK docs.
gpu_proportion = 100
q_opts = {"forced_gpu_proportion": gpu_proportion}

# Experiment 1: count rows per group, then sum the counts.
q = f"SELECT SUM(x) FROM (SELECT COUNT(*) x FROM {table_name} "
print(f"Running query {q}")
for group in num_groups:
    query_start = time.perf_counter()
    res = hdk.sql(q + f"GROUP BY group_{group});", q_opts)
    query_time = time.perf_counter() - query_start
    print(f"Query for group_{group} took {query_time:.3f}s")

# Experiment 2: MIN/MAX of the random-data column (the last column) per group.
# The table is referenced through table_name, like the first query, instead of
# a second hard-coded copy of the name.
rand_col = tbl.column_names[-1]
q = f"SELECT MIN({rand_col}), MAX({rand_col}) FROM {table_name} "
print(f"Running query {q}")
for group in num_groups:
    query_start = time.perf_counter()
    res = hdk.sql(q + f"GROUP BY group_{group};", q_opts)
    # Stop the clock before validating, so both experiments time only the
    # hdk.sql call and the Arrow conversion below is not counted as query time.
    query_time = time.perf_counter() - query_start
    n_result_rows = res.to_arrow().num_rows
    # Every key in [0, group) is expected to appear at least once in the data.
    assert n_result_rows == group, (
        f"group_{group}: expected {group} result rows, got {n_result_rows}"
    )
    print(f"Query for group_{group} took {query_time:.3f}s")

In [None]:
# Does heterogeneous execution win anywhere for simple aggregations?
# For group_count < 512 CPU will always win.

# Key counts for the grid: 1500 plus every multiple of 2000 up to 30000.
num_groups = sorted([*range(2000, 30_001, 2000), 1500])
tbl = synthensizeTable(num_rows=300_000_000,
                       num_groups_per_col=num_groups)

# Import with fragments half the size suggested by fragment_size_calc.
default_fragment_size = fragment_size_calc(tbl.num_rows)
frag_size = default_fragment_size // 2
table_name = "table_for_grid"
# Drop whatever is registered under this name before importing
# (presumably so the cell can be re-run).
hdk.drop_table(table_name)
hdk_het_tbl = hdk.import_arrow(tbl, table_name, frag_size)

In [None]:
# Grid benchmark: for each grouping column, time avg(rand_data) GROUP BY that
# column at every forced GPU proportion and keep the median per proportion.
n_iters_check = 10                    # timed repetitions per (group, proportion)
proportion_range = range(0, 101, 10)  # forced_gpu_proportion values to try
prop_time = dict()                    # proportion -> timings (ms) for the current group

# Rows are proportions, one column is added per grouping column. Creating the
# index up front avoids special-casing the first column.
q_per_group_size_df = pd.DataFrame(index=list(proportion_range))

for g in num_groups:
    # Announce the group before the work starts so the message reflects progress.
    print(f"Running group_{g}")
    prop_time = {prop: [] for prop in proportion_range}
    # Iteration 0 is a warm-up whose timings are discarded.
    for it in range(1 + n_iters_check):
        for prop in proportion_range:
            agg_res1 = hdk_het_tbl.agg(f"group_{g}", "avg(rand_data)")
            # agg_res1 = agg_res1.agg("rand_data_avg", aggs ={"min":"min(rand_data_avg)", "max":"max(rand_data_avg)"})
            agg_start = time.perf_counter()
            agg_res1.run(forced_gpu_proportion=prop)
            # Milliseconds as a float: truncating to int would discard the
            # sub-millisecond part that the heatmap prints with one decimal.
            q_time = (time.perf_counter() - agg_start) * 1000
            if it:
                prop_time[prop].append(q_time)
    q_per_group_size_df[f"group_{g}"] = [np.median(prop_time[prop]) for prop in proportion_range]
    hdk.clear_gpu_mem()



In [None]:
# Heatmap of the median query time (ms): rows are forced GPU proportions,
# columns are grouping columns. The fastest proportion per column is in red.
import importlib.util
if importlib.util.find_spec("matplotlib") is None:
    # ModuleNotFoundError names the real problem and is still an Exception
    # subclass, so anything catching the old bare Exception keeps working.
    raise ModuleNotFoundError("Please install matplotlib")

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # not referenced in this cell; kept in case other cells use it

fig, ax = plt.subplots(figsize=(13, 8), dpi=200)
heatmap = ax.imshow(q_per_group_size_df, cmap='summer', aspect='auto')
fig.colorbar(heatmap, ax=ax, label='Time(ms)')
ax.set_ylabel("GPU proportion")
ax.set_xlabel("Number of groups per column")

# Index label of the minimum of every column, computed once instead of once
# per annotated cell.
fastest_prop = q_per_group_size_df.idxmin()

for i in range(len(q_per_group_size_df.columns)):
    for j, prop in enumerate(q_per_group_size_df.index):
        cell_color = 'red' if prop == fastest_prop.iloc[i] else 'black'
        ax.text(i, j, f'{q_per_group_size_df.iloc[j, i]:.1f}', ha='center', va='center', color=cell_color)

ax.set_xticks(np.arange(len(q_per_group_size_df.columns)))
ax.set_xticklabels(q_per_group_size_df.columns, rotation=330)
ax.set_yticks(np.arange(len(q_per_group_size_df.index)))
ax.set_yticklabels(q_per_group_size_df.index)
ax.set_title(f"Heterogeneous Aggregation, #Rows={tbl.num_rows//1_000_000}Mil., #Frags={int(np.ceil(tbl.num_rows/frag_size))}")
plt.show()


In [None]:
# Clean-up. When the notebook is run top to bottom, table_name has been
# rebound to the grid-search table by now, so dropping only table_name would
# leave the first experiment's "groups_table" registered.
# dict.fromkeys removes the duplicate (keeping order) when only the first
# experiment was run and table_name is still "groups_table".
for registered_table in dict.fromkeys(("groups_table", table_name)):
    hdk.drop_table(registered_table)