Skip to content
This repository has been archived by the owner on May 9, 2024. It is now read-only.

[L0] Building hashtable on GPU #583

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions omniscidb/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -617,6 +617,19 @@ else()
set(PROFILER_LIBS "")
endif()

# Locate the prebuilt Level Zero hash-table runtime library.
# TODO(review): SOMEPATH is a placeholder — make this location configurable
# (e.g. a cache variable or CMAKE_PREFIX_PATH entry) instead of hard-coding
# a local build directory.
find_library(libhash_table
  NAMES
    hash_table
  PATHS
    SOMEPATH/l0_physops/build/hash_table
)

if(NOT libhash_table)
  # Missing library is non-fatal: dependent targets are simply skipped.
  message(WARNING "hash_table library NOT FOUND - the respective targets won't be built")
else()
  message(STATUS "hash_table library : ${libhash_table}")
endif()

add_subdirectory(SqliteConnector)

add_subdirectory(StringDictionary)
Expand Down
1 change: 1 addition & 0 deletions omniscidb/QueryEngine/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@ list(APPEND QUERY_ENGINE_LIBS ${llvm_libs} ${ZLIB_LIBRARIES})

add_subdirectory(CostModel)
list(APPEND QUERY_ENGINE_LIBS CostModel)
# Only link the optional hash_table library when find_library actually found
# it; otherwise ${libhash_table} expands to "libhash_table-NOTFOUND" and
# target_link_libraries would fail at configure/link time.
if(libhash_table)
  list(APPEND QUERY_ENGINE_LIBS ${libhash_table})
endif()

target_link_libraries(QueryEngine ${QUERY_ENGINE_LIBS})

Expand Down
17 changes: 17 additions & 0 deletions omniscidb/QueryEngine/Compiler/CommonGpuRuntime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,21 @@ DEVICE const GENERIC_ADDR_SPACE int64_t* init_shared_mem_nop(
const int32_t groups_buffer_size) {
return groups_buffer;
}

// GPU-runtime entry point for baseline hash-join lookups with 32-bit slots:
// delegates to baseline_hash_join_idx_impl<int32_t> unchanged.
// Returns the matching slot value, or a negative sentinel when `key` is not
// found (see the impl for the exact sentinel semantics).
DEVICE ALWAYS_INLINE int64_t
baseline_hash_join_idx_32(GENERIC_ADDR_SPACE const int8_t* hash_buff,
GENERIC_ADDR_SPACE const int8_t* key,
const size_t key_bytes,
const size_t entry_count) {
return baseline_hash_join_idx_impl<int32_t>(hash_buff, key, key_bytes, entry_count);
}

// GPU-runtime entry point for composite-key dictionary lookups over 32-bit
// key components: forwards all arguments to get_composite_key_index_impl.
// NEVER_INLINE keeps this callable as a distinct symbol from generated code.
NEVER_INLINE DEVICE int64_t
get_composite_key_index_32(GENERIC_ADDR_SPACE const int32_t* key,
const size_t key_component_count,
GENERIC_ADDR_SPACE const int32_t* composite_key_dict,
const size_t entry_count) {
return get_composite_key_index_impl(
key, key_component_count, composite_key_dict, entry_count);
}
}
109 changes: 109 additions & 0 deletions omniscidb/QueryEngine/Compiler/genx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,117 @@
#include <algorithm>
#include <cstdint>

#include "../GpuRtConstants.h"
#include "CommonRuntimeDefs.h"
#include "QueryEngine/MurmurHash1Inl.h"
#include "Shared/funcannotations.h"

// Sentinel value that marks an empty/invalid hash-table slot.
// Primary template: 64-bit keys use EMPTY_KEY_64.
template <typename T = int64_t>
inline DEVICE T SUFFIX(get_invalid_key)() {
return EMPTY_KEY_64;
}

// Specialization for 32-bit keys: the empty-slot sentinel is EMPTY_KEY_32.
template <>
inline DEVICE int32_t SUFFIX(get_invalid_key)() {
return EMPTY_KEY_32;
}

// Byte-by-byte comparison of a stored hash-table entry against a probe key.
// Returns true iff the first `key_bytes` bytes of `entry` and `key` are
// identical.
DEVICE bool compare_to_key(GENERIC_ADDR_SPACE const int8_t* entry,
                           GENERIC_ADDR_SPACE const int8_t* key,
                           const size_t key_bytes) {
  size_t idx = 0;
  while (idx < key_bytes) {
    if (entry[idx] != key[idx]) {
      return false;  // first differing byte => keys differ
    }
    ++idx;
  }
  return true;
}

// Component-wise equality of two composite keys, each consisting of
// `key_component_count` elements of type T.
template <typename T>
inline bool keys_are_equal(GENERIC_ADDR_SPACE const T* key1,
                           GENERIC_ADDR_SPACE const T* key2,
                           const size_t key_component_count) {
  size_t remaining = key_component_count;
  while (remaining != 0) {
    --remaining;
    if (key1[remaining] != key2[remaining]) {
      return false;
    }
  }
  return true;
}

namespace {

// Sentinel results for hash-table slot probing (see get_matching_slot):
//   kNoMatch    - slot is occupied by a different key; probing continues.
//   kNotPresent - slot holds the invalid (empty) key, proving the probed
//                 key is absent from the table.
constexpr int kNoMatch = -1;
constexpr int kNotPresent = -2;

}  // namespace

// Inspect slot `h` of a baseline hash table whose entries are laid out as
// [key_bytes of key | sizeof(T) payload].
// Returns the payload when the stored key equals `key`, kNotPresent when the
// slot holds the invalid (empty) key, and kNoMatch otherwise (occupied by a
// different key, so the caller must keep probing).
template <class T>
DEVICE int64_t get_matching_slot(GENERIC_ADDR_SPACE const int8_t* hash_buff,
const uint32_t h,
GENERIC_ADDR_SPACE const int8_t* key,
const size_t key_bytes) {
// Entry stride is key_bytes plus the trailing payload of type T.
const auto lookup_result_ptr = hash_buff + h * (key_bytes + sizeof(T));
if (compare_to_key(lookup_result_ptr, key, key_bytes)) {
return *reinterpret_cast<GENERIC_ADDR_SPACE const T*>(lookup_result_ptr + key_bytes);
}
// The odd spacing around '< ... >' below is a clang-format artifact; this is
// an explicit template-argument call of get_invalid_key.
if (*reinterpret_cast<GENERIC_ADDR_SPACE const T*>(lookup_result_ptr) ==
SUFFIX(get_invalid_key) < typename remove_addr_space<T>::type > ()) {
return kNotPresent;
}
return kNoMatch;
}

// Look up `key` in a baseline hash table using linear probing: start at the
// Murmur hash of the key and visit every slot at most once.
// Returns the payload stored beside the matching key, kNotPresent when an
// empty slot proves the key is absent, or kNoMatch if no slot matched after
// a full cycle.
template <class T>
DEVICE int64_t baseline_hash_join_idx_impl(GENERIC_ADDR_SPACE const int8_t* hash_buff,
                                           GENERIC_ADDR_SPACE const int8_t* key,
                                           const size_t key_bytes,
                                           const size_t entry_count) {
  // An empty table can never contain the key (also avoids % by zero).
  if (entry_count == 0) {
    return kNoMatch;
  }
  const uint32_t start = MurmurHash1Impl(key, key_bytes, 0) % entry_count;
  uint32_t slot = start;
  do {
    const int64_t result = get_matching_slot<T>(hash_buff, slot, key, key_bytes);
    if (result != kNoMatch) {
      return result;  // either a real payload or kNotPresent
    }
    slot = (slot + 1) % entry_count;
  } while (slot != start);
  return kNoMatch;
}

// Linear-probe lookup of a composite (multi-component) key in the flat
// `composite_key_dict` array, where entry i occupies components
// [i * key_component_count, (i + 1) * key_component_count).
// Returns the entry index on a match, or -1 when the key is absent.
// NOTE(review): assumes entry_count > 0 — the initial `% entry_count` would
// otherwise divide by zero; confirm callers guarantee a non-empty dict.
template <typename T>
FORCE_INLINE DEVICE int64_t get_composite_key_index_impl(const T* key,
const size_t key_component_count,
const T* composite_key_dict,
const size_t entry_count) {
const uint32_t h =
MurmurHash1Impl(key, key_component_count * sizeof(T), 0) % entry_count;
uint32_t off = h * key_component_count;
if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
return h;
}
// NOTE(review): unlike the loop below, the initial slot `h` is not tested
// against the invalid-key sentinel, so probing continues even past an empty
// initial slot — confirm whether an early return there is intended.
uint32_t h_probe = (h + 1) % entry_count;
while (h_probe != h) {
off = h_probe * key_component_count;
if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
return h_probe;
}
// First component equal to the invalid key marks an empty slot: the key
// cannot be in the dictionary.
if (composite_key_dict[off] ==
SUFFIX(get_invalid_key) < typename remove_addr_space<T>::type > ()) {
return -1;
}
h_probe = (h_probe + 1) % entry_count;
}
return -1;
}

extern "C" {
int64_t atomic_cas_int_64(GENERIC_ADDR_SPACE int64_t*, int64_t, int64_t);
int32_t atomic_cas_int_32(GENERIC_ADDR_SPACE int32_t*, int32_t, int32_t);
Expand Down
8 changes: 4 additions & 4 deletions omniscidb/QueryEngine/IRCodegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -825,10 +825,10 @@ std::shared_ptr<HashJoin> Executor::buildCurrentLevelHashTable(
check_valid_join_qual(qual_bin_oper);
JoinHashTableOrError hash_table_or_error;
if (!current_level_hash_table) {
if (co.device_type == ExecutorDeviceType::GPU && getDataMgr()->getGpuMgr() &&
getDataMgr()->getGpuMgr()->getPlatform() == GpuMgrPlatform::L0) {
throw QueryMustRunOnCpu();
}
// if (co.device_type == ExecutorDeviceType::GPU && getDataMgr()->getGpuMgr() &&
// getDataMgr()->getGpuMgr()->getPlatform() == GpuMgrPlatform::L0) {
// throw QueryMustRunOnCpu();
// }
hash_table_or_error = buildHashTableForQualifier(
qual_bin_oper,
query_infos,
Expand Down
10 changes: 5 additions & 5 deletions omniscidb/QueryEngine/JoinHashTable/BaselineHashTable.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class BaselineHashTable : public HashTable {
const size_t hash_table_size)
: cpu_hash_table_buff_size_(hash_table_size)
, gpu_hash_table_buff_(nullptr)
#ifdef HAVE_CUDA
#if defined(HAVE_CUDA) || defined(HAVE_L0)
, device_id_(0)
, buffer_provider_(nullptr)
#endif
Expand All @@ -51,14 +51,14 @@ class BaselineHashTable : public HashTable {
const size_t hash_table_size,
const size_t device_id)
: gpu_hash_table_buff_(nullptr)
#ifdef HAVE_CUDA
#if defined(HAVE_CUDA) || defined(HAVE_L0)
, device_id_(device_id)
, buffer_provider_(buffer_provider)
#endif
, layout_(layout)
, entry_count_(entry_count)
, emitted_keys_count_(emitted_keys_count) {
#ifdef HAVE_CUDA
#if defined(HAVE_CUDA) || defined(HAVE_L0)
CHECK(buffer_provider_);
gpu_hash_table_buff_ = GpuAllocator::allocGpuAbstractBuffer(
buffer_provider_, hash_table_size, device_id_);
Expand All @@ -68,7 +68,7 @@ class BaselineHashTable : public HashTable {
}

~BaselineHashTable() override {
#ifdef HAVE_CUDA
#if defined(HAVE_CUDA) || defined(HAVE_L0)
if (gpu_hash_table_buff_) {
CHECK(buffer_provider_);
buffer_provider_->free(gpu_hash_table_buff_);
Expand Down Expand Up @@ -108,7 +108,7 @@ class BaselineHashTable : public HashTable {
size_t cpu_hash_table_buff_size_;
Data_Namespace::AbstractBuffer* gpu_hash_table_buff_;

#ifdef HAVE_CUDA
#if defined(HAVE_CUDA) || defined(HAVE_L0)
const size_t device_id_;
BufferProvider* buffer_provider_;
#endif
Expand Down
18 changes: 13 additions & 5 deletions omniscidb/QueryEngine/JoinHashTable/BaselineJoinHashTable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ std::string BaselineJoinHashTable::toString(const ExecutorDeviceType device_type
auto hash_table = hash_tables_for_device_[device_id];
CHECK(hash_table);
auto buffer_size = hash_table->getHashTableBufferSize(device_type);
#ifdef HAVE_CUDA
#if defined(HAVE_CUDA) || defined(HAVE_L0)
auto buffer_provider = executor_->getBufferProvider();
std::unique_ptr<int8_t[]> buffer_copy;
if (device_type == ExecutorDeviceType::GPU) {
Expand Down Expand Up @@ -204,7 +204,7 @@ std::set<DecodedJoinHashBufferEntry> BaselineJoinHashTable::toSet(
auto hash_table = getHashTableForDevice(device_id);
CHECK(hash_table);
auto buffer_size = hash_table->getHashTableBufferSize(device_type);
#ifdef HAVE_CUDA
#if defined(HAVE_CUDA) || defined(HAVE_L0)
auto buffer_provider = executor_->getBufferProvider();
std::unique_ptr<int8_t[]> buffer_copy;
if (device_type == ExecutorDeviceType::GPU) {
Expand Down Expand Up @@ -375,7 +375,7 @@ std::pair<size_t, size_t> BaselineJoinHashTable::approximateTupleCount(
}
return std::make_pair(hll_size(hll_result, count_distinct_desc.bitmap_sz_bits), 0);
}
#ifdef HAVE_CUDA
#if defined(HAVE_CUDA) || defined(HAVE_L0)
auto buffer_provider = executor_->getBufferProvider();
std::vector<std::vector<uint8_t>> host_hll_buffers(device_count_);
for (auto& host_hll_buffer : host_hll_buffers) {
Expand Down Expand Up @@ -409,11 +409,19 @@ std::pair<size_t, size_t> BaselineJoinHashTable::approximateTupleCount(
nullptr);
const auto key_handler_gpu =
transfer_flat_object_to_gpu(key_handler, allocator);
#ifdef HAVE_CUDA
approximate_distinct_tuples_on_device(
reinterpret_cast<uint8_t*>(device_hll_buffer),
count_distinct_desc.bitmap_sz_bits,
key_handler_gpu,
columns_for_device.join_columns[0].num_elems);
#else
approximate_distinct_tuples_on_l0(reinterpret_cast<uint8_t*>(device_hll_buffer),
nullptr,
count_distinct_desc.bitmap_sz_bits,
columns_for_device.join_columns[0].num_elems,
key_handler_gpu);
#endif

auto& host_hll_buffer = host_hll_buffers[device_id];
buffer_provider->copyFromDevice(reinterpret_cast<int8_t*>(&host_hll_buffer[0]),
Expand Down Expand Up @@ -675,7 +683,7 @@ int BaselineJoinHashTable::initHashTableForDevice(
// but the query runs on GPU (join on dictionary encoded columns).
// Don't transfer the buffer if there was an error since we'll bail anyway.
if (memory_level_ == Data_Namespace::GPU_LEVEL && !err) {
#ifdef HAVE_CUDA
#if defined(HAVE_CUDA) || defined(HAVE_L0)
BaselineJoinHashTableBuilder builder;

builder.allocateDeviceMemory(hashtable_layout,
Expand Down Expand Up @@ -706,7 +714,7 @@ int BaselineJoinHashTable::initHashTableForDevice(
#endif
}
} else {
#ifdef HAVE_CUDA
#if defined(HAVE_CUDA) || defined(HAVE_L0)
BaselineJoinHashTableBuilder builder;

GpuAllocator allocator(executor_->getBufferProvider(), device_id);
Expand Down
Loading
Loading