In [None]:
import numpy as np
import pyarrow as pa
import pyhdk
import time

# Single HDK instance shared by every cell below, created with debug_logs="INFO".
hdk = pyhdk.hdk.HDK(debug_logs="INFO")

In [None]:
# Init data
# Seeded generator: the synthetic data below is identical on every
# "Restart Kernel -> Run All", so results can be compared across runs.
SEED = 42
rng = np.random.default_rng(SEED)

# Small lookup table: one row per color, keyed by a 0-based integer ID.
col2_tbl1 = np.array(['red', 'orange', 'yellow', 'green', 'blue'])
col1_tbl1 = np.arange(len(col2_tbl1))

table1 = pa.Table.from_arrays(
    [pa.array(col1_tbl1, pa.int64()), pa.array(col2_tbl1, pa.string())],
    schema=pa.schema([('ID', pa.int64()), ('color', pa.string())])
)

# Large table of random rows that is joined against table1 via color_ID.
table2_nrows = 10_000_000 # with more data, we expect GPU to beat CPU
col1_table2 = rng.integers(1, 100, size=table2_nrows)  # price in [1, 100)
col2_table2 = rng.integers(1, 100, size=table2_nrows)  # Region in [1, 100)
# color_ID is drawn from [0, len(col2_tbl1)), so every value matches an ID in table1.
col3_table2 = rng.integers(len(col2_tbl1), size=table2_nrows)

table2 = pa.Table.from_arrays(
    [pa.array(col1_table2, pa.int64()), pa.array(col2_table2, pa.int64()), pa.array(col3_table2, pa.int64())],
    schema=pa.schema([("price", pa.int64()), ('Region', pa.int64()), ('color_ID', pa.int64())])
)


In [None]:
# Import both Arrow tables into HDK under the names "ht1" and "ht2".
# The third argument is ceil(num_rows / fragment_count), computed here with
# integer ceiling division; presumably it is the fragment size in rows
# (TODO confirm against the pyhdk import_arrow signature).
fragment_count = 8
tbl1_rows_per_fragment = -(-table1.num_rows // fragment_count)
tbl2_rows_per_fragment = -(-table2.num_rows // fragment_count)
hdk_tbl1 = hdk.import_arrow(table1, "ht1", tbl1_rows_per_fragment)
hdk_tbl2 = hdk.import_arrow(table2, "ht2", tbl2_rows_per_fragment)

Note that a cold run may not show significant speedups, because HDK
may need to materialize/build some info about the table and/or the individual columns.
That info, however, is preserved, so subsequent runs should be faster.

To get "fair" results, do not run the optimized versions back-to-back, as each run will try to reuse the results of previous compilations. 

You could run the unoptimized version before an optimized one to 
wipe the cached plan and get a time that includes compilation.


In [None]:
# Existing tables can also be refragmented into new views, e.g.:
# hdk_tbl3 = hdk_tbl1.refragmented_view(500_000)
# hdk_tbl4 = hdk.refragmented_view("ht2", 500_000)
# OR
# hdk_tbl4 = hdk.refragmented_view(hdk_tbl2, 500_000)

In [None]:
# For "fair" results, first execute all cells once, then use "execute this cell and below".
# Independent ops on CPU (dataframe-like, naive and suboptimal):
# each step is run on its own via .run() and timed separately.
unopt_start = time.perf_counter()
join_res = hdk_tbl2.join(hdk_tbl1, lhs_cols="color_ID", rhs_cols="ID").run()
join_elapsed = time.perf_counter() - unopt_start
print(f"Join time: {join_elapsed:.3f}s")

sort_begin = time.perf_counter()
sort_res = join_res.sort(fields={"price": "desc"}).run()
sort_elapsed = time.perf_counter() - sort_begin
print(f"Sort time: {sort_elapsed:.3f}s")

agg_begin = time.perf_counter()
agg_res = sort_res.agg("color_ID", "avg(price)").run()
agg_elapsed = time.perf_counter() - agg_begin
print(f"Agg time: {agg_elapsed:.3f}s")

# Baseline wall time for the whole join -> sort -> agg sequence; the cells
# below divide this by their own time to report a speedup.
unopt_query_t = time.perf_counter() - unopt_start
print(f"Total time (unopt): {unopt_query_t:.3f}s")

In [None]:
# Combined plan on CPU
# Only the final node is run, so the join, sort and agg are handed over as one
# plan. Giving the compiler more overview of what we want to achieve allows
# for more optimizations.

cpu_plan_start = time.perf_counter()
join_res = hdk_tbl2.join(hdk_tbl1, lhs_cols="color_ID", rhs_cols="ID")
sort_res = join_res.sort(fields={"price": "desc"})
agg_res = sort_res.agg("color_ID", "avg(price)").run()
opt_query_t = time.perf_counter() - cpu_plan_start

# Speedup is relative to the step-by-step CPU run (unopt_query_t).
cpu_plan_speedup = unopt_query_t / opt_query_t
print(f"Total time (Combined plan on CPU): {opt_query_t:.3f}s")
print(f"Speedup: {cpu_plan_speedup:.2f}")

In [None]:
# Combined plan on GPU
# Same join -> sort -> agg plan as the combined CPU cell, executed with
# device_type="GPU" so the two timings are comparable.
q_start = time.perf_counter()
join_res = hdk_tbl2.join(hdk_tbl1, lhs_cols="color_ID", rhs_cols="ID")
sort_res = join_res.sort(fields={"price" : "desc"})
# Aggregate the sorted node (not join_res): otherwise the sort is never part
# of the executed plan and the GPU run does less work than the CPU run.
agg_res = sort_res.agg("color_ID", "avg(price)").run(device_type="GPU")
opt_query_t = time.perf_counter() - q_start
print(f"Total time (Combined plan on GPU): {(opt_query_t):.3f}s")
# Speedup is relative to the step-by-step CPU run (unopt_query_t).
print(f"Speedup: {(unopt_query_t/opt_query_t):.2f}")

In [None]:
# Independent ops: Join on CPU, Sort and Agg on GPU
# Q: Why could it be so much slower than running fully on either device?
# A: The GPU must *fetch the intermediate results* of the join on each run,
#    whereas in the full-GPU mode the GPU can retain columns of the table for
#    the next run via BufferManager and thus only needs to transfer the
#    aggregate back to the CPU.

mixed_start = time.perf_counter()
join_res = hdk_tbl2.join(hdk_tbl1, lhs_cols="color_ID", rhs_cols="ID").run()
join_elapsed = time.perf_counter() - mixed_start
print(f"Join time: {join_elapsed:.3f}s")

# The sort node is built before the timer starts; it only executes as part of
# the GPU run below.
sort_res = join_res.sort(fields={"price": "desc"})
gpu_part_start = time.perf_counter()
agg_res = sort_res.agg("color_ID", "avg(price)").run(device_type="GPU")
gpu_part_elapsed = time.perf_counter() - gpu_part_start
print(f"Sort+Agg time: {gpu_part_elapsed:.3f}s")

# Speedup is relative to the step-by-step CPU run (unopt_query_t).
opt_query_t = time.perf_counter() - mixed_start
mixed_speedup = unopt_query_t / opt_query_t
print(f"Speedup (Join on CPU, Agg on GPU): {mixed_speedup:.2f}")

In [None]:
# SQL example
# Join ht2 with ht1 on the color ID and compute the average price per
# color_ID, ordered by that average (descending), as one SQL statement.
# The literal keeps the exact whitespace of the original continuation lines.
sql_query = (
    "SELECT color_ID, AVG(price) "
    "        FROM ht2 "
    "        JOIN ht1 ON ht1.ID = ht2.color_ID "
    "        GROUP BY color_ID "
    "        ORDER BY AVG(price) DESC"
)

sql_start = time.perf_counter()
hdk.sql(sql_query)
sql_elapsed = time.perf_counter() - sql_start
print(f"SQL time: {sql_elapsed:.3f}s")