In [None]:
# Initialization
import pyhdk
import time
import pandas as pd
import pyarrow as pa
import pyarrow.parquet
import pyarrow.csv
import numpy as np
import os

# Global HDK engine instance; every helper function and cell in this notebook
# talks to the engine through this `hdk` name.
# NOTE(review): the three enabled flags presumably switch on (and force) mixed
# CPU+GPU "heterogeneous" execution, including multi-fragment kernels -- confirm
# against the pyhdk configuration documentation.
hdk = pyhdk.hdk.HDK(
    enable_heterogeneous=True,
    force_heterogeneous_distribution=True,
    enable_multifrag_heterogeneous=True,
    # Optional debugging switches, disabled by default:
    # enable_debug_timer=True,
    # debug_logs="INFO" # generates log file, DEBUG2 for more verbosity 
) 

In [None]:
# Helper Functions
def import_hdk_pyarrow(arrow_tbl, hdk_tbl_name, fragment_size, overwrite=True):
    """
    Load a pyarrow table into HDK and report how long the import took.

    Args:
        arrow_tbl: table object handed unchanged to `hdk.import_arrow`.
        hdk_tbl_name: name the table gets inside HDK.
        fragment_size: fragment size forwarded to `hdk.import_arrow`.
        overwrite: when True (default), `hdk.drop_table(hdk_tbl_name)` is
            called first so a previous table of that name is replaced.

    Returns:
        Whatever `hdk.import_arrow` returns for the imported table.
    """
    if overwrite:
        hdk.drop_table(hdk_tbl_name)

    # Only the import itself is timed; the optional drop above is excluded.
    began = time.perf_counter()
    imported = hdk.import_arrow(arrow_tbl, hdk_tbl_name, fragment_size)
    elapsed = time.perf_counter() - began
    print(f"[PyHDK] Importing pyarrow table: {elapsed:.4f}s")
    return imported


def fragment_size_calc(num_rows):
    """
    Default fragment size for a table of `num_rows` rows (heuristic taken from
    Modin, you can experiment with it).

    The rows are split evenly across `os.cpu_count()` cores and the result is
    clamped to the range [2**18, 2**25].

    Returns:
        The fragment size as an int, or None when the CPU count is unknown.
    """
    n_cpus = os.cpu_count()
    if n_cpus is None:
        return None
    rows_per_cpu = num_rows // n_cpus
    # Upper bound first, then lower bound -- same order as a min() then max().
    return max(min(rows_per_cpu, 2**25), 2**18)

def fragment_size_test_range(num_rows):
    """
    Fragment sizes to benchmark: two power-of-two steps on each side of the
    default fragment size x, i.e. [x//4, x//2, x, x*2, x*4].

    Every entry is derived directly from x, so the default size itself is
    always part of the result. (Doubling up from x//4 instead would skip x
    whenever x is not a multiple of 4, and callers that compare a tested size
    against the default would then never find a match.)

    Args:
        num_rows: number of rows of the table, forwarded to `fragment_size_calc`.

    Returns:
        List of five ints in ascending order.

    Note:
        `fragment_size_calc` returns None when the CPU count is unknown; that
        case is not handled here and fails with a TypeError on the arithmetic.
    """
    default_fragment_size = fragment_size_calc(num_rows)
    print(f"Default fragment_size={default_fragment_size}")
    power_two_steps = 2
    smaller = [default_fragment_size // 2**k for k in range(power_two_steps, 0, -1)]
    larger = [default_fragment_size * 2**k for k in range(1, power_two_steps + 1)]
    return smaller + [default_fragment_size] + larger


def run_single_q_all_props(sql, q_name, prop_step, n_iters, clear_gpu_mem=False):
    """
    Runs one SQL query `n_iters` times at each GPU proportion and times every run.

    Proportions are `range(0, 101, prop_step)`, so 100 is only reached when
    `prop_step` divides 100. Feel free to experiment with the order of the loops.

    Args:
        sql: SQL text passed to `hdk.sql`.
        q_name: name of the timing column in the returned DataFrame.
        prop_step: positive step between consecutive GPU proportions.
        n_iters: positive number of runs per proportion.
        clear_gpu_mem: when True, clear GPU memory between runs.

    Returns:
        A two-element list `[df_prop_time, df_output]`: the timings (columns
        "GPU_prop" and `q_name`, one row per run, seconds from
        `time.perf_counter`) and the output of the last executed run
        converted to pandas.

    Raises:
        ValueError: if `prop_step` or `n_iters` is less than 1. Without this
            check no query would run and the result lookup below would fail
            on an unbound variable.
    """
    if prop_step < 1:
        raise ValueError(f"prop_step must be a positive integer, got {prop_step}")
    if n_iters < 1:
        raise ValueError(f"n_iters must be a positive integer, got {n_iters}")

    col_names = ["GPU_prop", q_name]
    gpu_props = []
    timings = []
    result = None
    # Walking over proportions
    for gpu_proportion in range(0, 101, prop_step):
        # Multiple iterations
        for _ in range(n_iters):
            query_start = time.perf_counter()
            result = hdk.sql(sql, {"forced_gpu_proportion": gpu_proportion})
            query_finish = time.perf_counter()
            gpu_props.append(gpu_proportion)
            timings.append(query_finish - query_start)
            # Clearing happens outside the timed region.
            if clear_gpu_mem:
                hdk.clear_gpu_mem()
    # Only the last run's result is materialised; it is used by callers for row counts.
    df_output = result.to_arrow().to_pandas()
    df_prop_time = pd.DataFrame(
        {col_names[0]: gpu_props, col_names[1]: timings}, columns=col_names
    )
    return [df_prop_time, df_output]

def run_queries_all_props(query_dict, step, n_iters, clear_gpu_mem=False):
    """
    Times every query of `query_dict` (structure: dict(query_name: SQL_string))
    over all GPU proportions and gathers the timings in one wide DataFrame.

    The first query contributes the "GPU_prop" column plus its timing column;
    each further query adds one timing column. Timing columns are named
    "<query_name>_#RowsOut=<number of rows the query returned>".

        clear_gpu_mem: when True, clear GPU memory between runs

    Returns:
        The combined timings DataFrame (empty when `query_dict` is empty).
    """
    combined = pd.DataFrame()
    for name, sql_text in query_dict.items():
        timings, rows_out = run_single_q_all_props(
            sql_text,
            q_name=name,
            prop_step=step,
            n_iters=n_iters,
            clear_gpu_mem=clear_gpu_mem,
        )
        labelled = f"{name}_#RowsOut={rows_out.shape[0]}"
        if combined.empty:
            # First result: take over its frame, including the "GPU_prop" column.
            combined = timings.rename(columns={name: labelled})
        else:
            combined[labelled] = timings[name]
    return combined

def test_groups_fragment_sizes(
        pyarrow_tbl, 
        table_name,
        get_queries_for_table_callback, 
        step, 
        n_iters, 
        clear_memory_devices=False
    ):
    """ 
    Runs queries for different fragment sizes and returns a dictionary of structure: `frag_size: timings_df`
    """
    
    q_per_frag_size_df = pd.DataFrame()
    for frag_size in fragment_size_test_range(pyarrow_tbl.num_rows):
        table_rows = pyarrow_tbl.num_rows
        print(f"Testing {table_rows} rows table with Frag.size={frag_size}")
        refragmented_view_name = f"{table_name}_{frag_size}"
        hdk.refragmented_view(table_name, frag_size, refragmented_view_name)
        queries_timings = run_queries_all_props(
            get_queries_for_table_callback(refragmented_view_name), 
            step, 
            n_iters, 
            clear_memory_devices
        )
        queries_timings["Frag.size"]=frag_size
        if q_per_frag_size_df.empty:
            q_per_frag_size_df = queries_timings
        else:
            q_per_frag_size_df = pd.concat([q_per_frag_size_df, queries_timings])
        hdk.drop_table(refragmented_view_name)
        hdk.clear_gpu_mem()
    return q_per_frag_size_df

In [None]:
# Read data (replace with real dataset)
# The path is relative to the directory the notebook is run from.
dataset_path = "../omniscidb/Tests/ArrowStorageDataFiles/taxi_sample_header.csv"
# Name under which the data is registered in HDK; also the prefix of the
# refragmented views created during the fragment-size sweep.
table_name = "taxi"
# Column names are taken from the CSV header row. If the CSV does not have a
# header, provide the column names explicitly, e.g. via
# read_options=pa.csv.ReadOptions(column_names=[...]).
pyarrow_tbl = pa.csv.read_csv(dataset_path)

In [None]:
# Queries (NY Taxi example)
def getTaxiQ_for_table(tbl_name):
    """
    Build the four NY Taxi benchmark queries against the table `tbl_name`.

    Args:
        tbl_name: table (or view) name interpolated into every FROM clause.

    Returns:
        dict mapping the query names "Q1".."Q4" to their SQL text.

    Note:
        Each statement is a single f-string continued with trailing
        backslashes, so the leading indentation of the continuation lines is
        part of the SQL text (whitespace only).
    """
    return {
    # Q1: row count per cab type.
    "Q1": f"SELECT cab_type, count(*)\
            FROM {tbl_name}\
            GROUP BY cab_type;",
    # Q2: average total amount per passenger count.
    "Q2": f"SELECT passenger_count, avg(total_amount)\
            FROM {tbl_name}\
            GROUP BY passenger_count;",
    # Q3: row count per passenger count and pickup year.
    "Q3": f"SELECT passenger_count, extract(year from pickup_datetime) as pickup_year, count(*)\
            FROM {tbl_name}\
            GROUP BY passenger_count, extract(year from pickup_datetime);",
    # Q4: row count per passenger count, pickup year and integer trip distance,
    # sorted; unlike Q3 it groups by the SELECT aliases (pickup_year, distance).
    "Q4": f"SELECT passenger_count,\
                extract(year from pickup_datetime) as pickup_year,\
                cast(trip_distance as int) AS distance,\
                count(*) AS the_count\
            FROM {tbl_name}\
            GROUP BY passenger_count,\
                    pickup_year,\
                    distance\
            ORDER BY passenger_count, pickup_year, distance, the_count;"
}

In [None]:
# Register the Arrow table in HDK under `table_name`, replacing any previous
# table of that name, using the default fragment size for its row count.
hdk_tbl = import_hdk_pyarrow(pyarrow_tbl, table_name, fragment_size_calc(pyarrow_tbl.num_rows))

In [None]:
# Sweep configuration shared by the benchmark cells below.
# Step between tested GPU proportions: 25 -> 0, 25, 50, 75, 100 (%).
prop_step = 25
# Number of timed runs of each query at each proportion.
n_iters_per_prop = 3

In [None]:
# Baseline: time all taxi queries on the table as imported above
# (default fragment size), across every GPU proportion.
default_timings_df = run_queries_all_props(
    getTaxiQ_for_table(table_name),
    prop_step,
    n_iters_per_prop
)

In [None]:
# Repeat the same sweep for a range of fragment sizes around the default one.
# The function itself (not its result) is passed for the queries, because the
# SQL has to be rebuilt for every temporary refragmented view name.
timing_per_frag_df = test_groups_fragment_sizes(
    pyarrow_tbl,
    table_name,
    getTaxiQ_for_table,
    prop_step,
    n_iters_per_prop
)

In [None]:
# matplotlib is only needed from this cell on; fail early with an installation
# hint when it is not available.
import importlib.util
if importlib.util.find_spec("matplotlib") is None:
    raise Exception("Please install matplotlib")

import matplotlib.pyplot as plt
%matplotlib inline
# Default figure size in inches (width, height).
plt.rcParams["figure.figsize"] = (8,4)
# Line/marker format strings; plotTimingsFrags picks one per query subplot by index.
styles = ['s-','o-','^-','+-','*-',',-']

def plotTimings(dict_of_df_timings, plot_name="Time vs GPU proportion"):
    """
    Plot the median run time per GPU proportion, one line per timing column.

    Args:
        dict_of_df_timings: despite the name, a timings DataFrame with a
            "GPU_prop" column (it is grouped with `.groupby`); all other
            columns are plotted.
        plot_name: title of the plot.
    """
    median_by_prop = dict_of_df_timings.groupby(["GPU_prop"]).median()
    median_by_prop.plot(
        xlabel="Data proportion on GPU (%)",
        ylabel="Time (s)",
        title=plot_name,
    )
    # Put the legend to the right of the axes so it does not cover the lines.
    plt.legend(loc="upper left", bbox_to_anchor=(1.01, 1))
    plt.tight_layout()

def plotTimingsFrags(timing_per_frag_df, default_frag_size = None, row_count = None, plot_name="Time vs GPU proportion"):
    """
    Plot median query time (ms) against GPU proportion, one subplot per query
    and one line per fragment size.

    Args:
        timing_per_frag_df: timings DataFrame with the columns "GPU_prop" and
            "Frag.size"; every other column is treated as one query.
        default_frag_size: if given, the line of this fragment size gets the
            " (CPU opt)" suffix in the legend.
        row_count: if given, legend entries show the number of fragments
            (ceil(row_count / frag_size)) instead of the fragment size.
        plot_name: figure title.

    The line of the largest fragment size is always tagged " (GPU opt.)".
    """
    ylab = "Time (ms)"
    xlab = "Data proportion on GPU (%)"
    # Every column except "GPU_prop" and "Frag.size" is one query -> one subplot.
    n_queries = timing_per_frag_df.shape[1]-2
    # squeeze=False always yields a 2-D array of axes, so indexing below also
    # works when there is just a single query.
    fig, axes_grid = plt.subplots(n_queries, 1, squeeze=False)
    axes = axes_grid[:, 0]
    fig.set_size_inches(7,9)
    frag_sizes = timing_per_frag_df["Frag.size"].unique()
    largest_frag_size = np.max(frag_sizes)
    for frag_size in frag_sizes:
        frag_df = timing_per_frag_df[timing_per_frag_df["Frag.size"]==frag_size].groupby(["GPU_prop"]).median()
        frag_df = frag_df.drop("Frag.size", axis=1)
        # Timings are recorded in seconds; plot milliseconds.
        frag_df *= 1000
        # The legend label depends only on the fragment size, not on the query.
        lab = f"Frag.size={frag_size}" if row_count is None else f"Num.frags={int(np.ceil(row_count/frag_size))}"
        if default_frag_size is not None and default_frag_size == frag_size:
            lab = f"{lab} (CPU opt)"
        if frag_size == largest_frag_size:
            lab = f"{lab} (GPU opt.)"
        for enum, q_name in enumerate(frag_df):
            frag_df[q_name].plot(
                ax=axes[enum],
                xlabel=xlab,
                ylabel=ylab,
                title=q_name,
                # Wrap around so more queries than styles does not raise IndexError.
                style=styles[enum % len(styles)],
                label=lab
            )
            axes[enum].legend(bbox_to_anchor=(1.01, 1.02), loc="upper left")
    fig.suptitle(plot_name)
    fig.tight_layout()
    fig.show()

In [None]:
# Recompute the default fragment size so its line can be marked in the legend.
default_frag_size = fragment_size_calc(pyarrow_tbl.num_rows)
# The title shows the row count in whole millions (integer division), so a
# table with fewer than one million rows is reported as "0Mil.".
plotTimingsFrags(
    timing_per_frag_df, 
    default_frag_size=default_frag_size, 
    row_count=pyarrow_tbl.num_rows,
    plot_name=f"Taxi, #Rows={pyarrow_tbl.num_rows//(1000*1000)}Mil."
    )

In [None]:
# HDK Cleanup
# Drop the benchmark table through the same snake_case `drop_table` method the
# helper functions use; the HDK wrapper object is not called with `dropTable`
# anywhere else in this notebook.
hdk.drop_table(table_name)