Skip to content
This repository has been archived by the owner on May 9, 2024. It is now read-only.

[L0] Building hashtable on GPU #583

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions omniscidb/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -617,6 +617,19 @@ else()
set(PROFILER_LIBS "")
endif()

# Locate the prebuilt Level Zero hash-table runtime library.
# TODO(review): SOMEPATH is a placeholder — make this location configurable
# (e.g. a cache variable or CMAKE_PREFIX_PATH entry) instead of hard-coding
# a local build directory.
find_library(libhash_table
  NAMES
    hash_table
  PATHS
    SOMEPATH/l0_physops/build/hash_table
)

if(NOT libhash_table)
  # Missing library is non-fatal: dependent targets are simply skipped.
  message(WARNING "hash_table library NOT FOUND - the respective targets won't be built")
else()
  message(STATUS "hash_table library : ${libhash_table}")
endif()

add_subdirectory(SqliteConnector)

add_subdirectory(StringDictionary)
Expand Down
1 change: 1 addition & 0 deletions omniscidb/QueryEngine/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@ list(APPEND QUERY_ENGINE_LIBS ${llvm_libs} ${ZLIB_LIBRARIES})

add_subdirectory(CostModel)
list(APPEND QUERY_ENGINE_LIBS CostModel)
# Only link the optional hash_table library when find_library actually found
# it; otherwise ${libhash_table} expands to "libhash_table-NOTFOUND" and
# target_link_libraries would fail at configure/link time.
if(libhash_table)
  list(APPEND QUERY_ENGINE_LIBS ${libhash_table})
endif()

target_link_libraries(QueryEngine ${QUERY_ENGINE_LIBS})

Expand Down
17 changes: 17 additions & 0 deletions omniscidb/QueryEngine/Compiler/CommonGpuRuntime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,21 @@ DEVICE const GENERIC_ADDR_SPACE int64_t* init_shared_mem_nop(
const int32_t groups_buffer_size) {
return groups_buffer;
}

// GPU-runtime entry point for baseline hash-join lookups with 32-bit slots:
// delegates to baseline_hash_join_idx_impl<int32_t> unchanged.
// Returns the matching slot value, or a negative sentinel when `key` is not
// found (see the impl for the exact sentinel semantics).
DEVICE ALWAYS_INLINE int64_t
baseline_hash_join_idx_32(GENERIC_ADDR_SPACE const int8_t* hash_buff,
GENERIC_ADDR_SPACE const int8_t* key,
const size_t key_bytes,
const size_t entry_count) {
return baseline_hash_join_idx_impl<int32_t>(hash_buff, key, key_bytes, entry_count);
}

// GPU-runtime entry point for composite-key dictionary lookups over 32-bit
// key components: forwards all arguments to get_composite_key_index_impl.
// NEVER_INLINE keeps this callable as a distinct symbol from generated code.
NEVER_INLINE DEVICE int64_t
get_composite_key_index_32(GENERIC_ADDR_SPACE const int32_t* key,
const size_t key_component_count,
GENERIC_ADDR_SPACE const int32_t* composite_key_dict,
const size_t entry_count) {
return get_composite_key_index_impl(
key, key_component_count, composite_key_dict, entry_count);
}
}
109 changes: 109 additions & 0 deletions omniscidb/QueryEngine/Compiler/genx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,117 @@
#include <algorithm>
#include <cstdint>

#include "../GpuRtConstants.h"
#include "CommonRuntimeDefs.h"
#include "QueryEngine/MurmurHash1Inl.h"
#include "Shared/funcannotations.h"

// Sentinel value that marks an empty/invalid hash-table slot.
// Primary template: 64-bit keys use EMPTY_KEY_64.
template <typename T = int64_t>
inline DEVICE T SUFFIX(get_invalid_key)() {
return EMPTY_KEY_64;
}

// Specialization for 32-bit keys: the empty-slot sentinel is EMPTY_KEY_32.
template <>
inline DEVICE int32_t SUFFIX(get_invalid_key)() {
return EMPTY_KEY_32;
}

// Byte-by-byte comparison of a stored hash-table entry against a probe key.
// Returns true iff the first `key_bytes` bytes of `entry` and `key` are
// identical.
DEVICE bool compare_to_key(GENERIC_ADDR_SPACE const int8_t* entry,
                           GENERIC_ADDR_SPACE const int8_t* key,
                           const size_t key_bytes) {
  size_t idx = 0;
  while (idx < key_bytes) {
    if (entry[idx] != key[idx]) {
      return false;  // first differing byte => keys differ
    }
    ++idx;
  }
  return true;
}

// Component-wise equality of two composite keys, each consisting of
// `key_component_count` elements of type T.
template <typename T>
inline bool keys_are_equal(GENERIC_ADDR_SPACE const T* key1,
                           GENERIC_ADDR_SPACE const T* key2,
                           const size_t key_component_count) {
  size_t remaining = key_component_count;
  while (remaining != 0) {
    --remaining;
    if (key1[remaining] != key2[remaining]) {
      return false;
    }
  }
  return true;
}

namespace {

// Sentinel results for hash-table slot probing (see get_matching_slot):
//   kNoMatch    - slot is occupied by a different key; probing continues.
//   kNotPresent - slot holds the invalid (empty) key, proving the probed
//                 key is absent from the table.
constexpr int kNoMatch = -1;
constexpr int kNotPresent = -2;

}  // namespace

// Inspect slot `h` of a baseline hash table whose entries are laid out as
// [key_bytes of key | sizeof(T) payload].
// Returns the payload when the stored key equals `key`, kNotPresent when the
// slot holds the invalid (empty) key, and kNoMatch otherwise (occupied by a
// different key, so the caller must keep probing).
template <class T>
DEVICE int64_t get_matching_slot(GENERIC_ADDR_SPACE const int8_t* hash_buff,
const uint32_t h,
GENERIC_ADDR_SPACE const int8_t* key,
const size_t key_bytes) {
// Entry stride is key_bytes plus the trailing payload of type T.
const auto lookup_result_ptr = hash_buff + h * (key_bytes + sizeof(T));
if (compare_to_key(lookup_result_ptr, key, key_bytes)) {
return *reinterpret_cast<GENERIC_ADDR_SPACE const T*>(lookup_result_ptr + key_bytes);
}
// The odd spacing around '< ... >' below is a clang-format artifact; this is
// an explicit template-argument call of get_invalid_key.
if (*reinterpret_cast<GENERIC_ADDR_SPACE const T*>(lookup_result_ptr) ==
SUFFIX(get_invalid_key) < typename remove_addr_space<T>::type > ()) {
return kNotPresent;
}
return kNoMatch;
}

// Look up `key` in a baseline hash table using linear probing: start at the
// Murmur hash of the key and visit every slot at most once.
// Returns the payload stored beside the matching key, kNotPresent when an
// empty slot proves the key is absent, or kNoMatch if no slot matched after
// a full cycle.
template <class T>
DEVICE int64_t baseline_hash_join_idx_impl(GENERIC_ADDR_SPACE const int8_t* hash_buff,
                                           GENERIC_ADDR_SPACE const int8_t* key,
                                           const size_t key_bytes,
                                           const size_t entry_count) {
  // An empty table can never contain the key (also avoids % by zero).
  if (entry_count == 0) {
    return kNoMatch;
  }
  const uint32_t start = MurmurHash1Impl(key, key_bytes, 0) % entry_count;
  uint32_t slot = start;
  do {
    const int64_t result = get_matching_slot<T>(hash_buff, slot, key, key_bytes);
    if (result != kNoMatch) {
      return result;  // either a real payload or kNotPresent
    }
    slot = (slot + 1) % entry_count;
  } while (slot != start);
  return kNoMatch;
}

// Linear-probe lookup of a composite (multi-component) key in the flat
// `composite_key_dict` array, where entry i occupies components
// [i * key_component_count, (i + 1) * key_component_count).
// Returns the entry index on a match, or -1 when the key is absent.
// NOTE(review): assumes entry_count > 0 — the initial `% entry_count` would
// otherwise divide by zero; confirm callers guarantee a non-empty dict.
template <typename T>
FORCE_INLINE DEVICE int64_t get_composite_key_index_impl(const T* key,
const size_t key_component_count,
const T* composite_key_dict,
const size_t entry_count) {
const uint32_t h =
MurmurHash1Impl(key, key_component_count * sizeof(T), 0) % entry_count;
uint32_t off = h * key_component_count;
if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
return h;
}
// NOTE(review): unlike the loop below, the initial slot `h` is not tested
// against the invalid-key sentinel, so probing continues even past an empty
// initial slot — confirm whether an early return there is intended.
uint32_t h_probe = (h + 1) % entry_count;
while (h_probe != h) {
off = h_probe * key_component_count;
if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
return h_probe;
}
// First component equal to the invalid key marks an empty slot: the key
// cannot be in the dictionary.
if (composite_key_dict[off] ==
SUFFIX(get_invalid_key) < typename remove_addr_space<T>::type > ()) {
return -1;
}
h_probe = (h_probe + 1) % entry_count;
}
return -1;
}

extern "C" {
int64_t atomic_cas_int_64(GENERIC_ADDR_SPACE int64_t*, int64_t, int64_t);
int32_t atomic_cas_int_32(GENERIC_ADDR_SPACE int32_t*, int32_t, int32_t);
Expand Down
8 changes: 4 additions & 4 deletions omniscidb/QueryEngine/IRCodegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -825,10 +825,10 @@ std::shared_ptr<HashJoin> Executor::buildCurrentLevelHashTable(
check_valid_join_qual(qual_bin_oper);
JoinHashTableOrError hash_table_or_error;
if (!current_level_hash_table) {
if (co.device_type == ExecutorDeviceType::GPU && getDataMgr()->getGpuMgr() &&
getDataMgr()->getGpuMgr()->getPlatform() == GpuMgrPlatform::L0) {
throw QueryMustRunOnCpu();
}
// if (co.device_type == ExecutorDeviceType::GPU && getDataMgr()->getGpuMgr() &&
// getDataMgr()->getGpuMgr()->getPlatform() == GpuMgrPlatform::L0) {
// throw QueryMustRunOnCpu();
// }
hash_table_or_error = buildHashTableForQualifier(
qual_bin_oper,
query_infos,
Expand Down
10 changes: 5 additions & 5 deletions omniscidb/QueryEngine/JoinHashTable/BaselineHashTable.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class BaselineHashTable : public HashTable {
const size_t hash_table_size)
: cpu_hash_table_buff_size_(hash_table_size)
, gpu_hash_table_buff_(nullptr)
#ifdef HAVE_CUDA
#if defined(HAVE_CUDA) || defined(HAVE_L0)
, device_id_(0)
, buffer_provider_(nullptr)
#endif
Expand All @@ -51,14 +51,14 @@ class BaselineHashTable : public HashTable {
const size_t hash_table_size,
const size_t device_id)
: gpu_hash_table_buff_(nullptr)
#ifdef HAVE_CUDA
#if defined(HAVE_CUDA) || defined(HAVE_L0)
, device_id_(device_id)
, buffer_provider_(buffer_provider)
#endif
, layout_(layout)
, entry_count_(entry_count)
, emitted_keys_count_(emitted_keys_count) {
#ifdef HAVE_CUDA
#if defined(HAVE_CUDA) || defined(HAVE_L0)
CHECK(buffer_provider_);
gpu_hash_table_buff_ = GpuAllocator::allocGpuAbstractBuffer(
buffer_provider_, hash_table_size, device_id_);
Expand All @@ -68,7 +68,7 @@ class BaselineHashTable : public HashTable {
}

~BaselineHashTable() override {
#ifdef HAVE_CUDA
#if defined(HAVE_CUDA) || defined(HAVE_L0)
if (gpu_hash_table_buff_) {
CHECK(buffer_provider_);
buffer_provider_->free(gpu_hash_table_buff_);
Expand Down Expand Up @@ -108,7 +108,7 @@ class BaselineHashTable : public HashTable {
size_t cpu_hash_table_buff_size_;
Data_Namespace::AbstractBuffer* gpu_hash_table_buff_;

#ifdef HAVE_CUDA
#if defined(HAVE_CUDA) || defined(HAVE_L0)
const size_t device_id_;
BufferProvider* buffer_provider_;
#endif
Expand Down
18 changes: 13 additions & 5 deletions omniscidb/QueryEngine/JoinHashTable/BaselineJoinHashTable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ std::string BaselineJoinHashTable::toString(const ExecutorDeviceType device_type
auto hash_table = hash_tables_for_device_[device_id];
CHECK(hash_table);
auto buffer_size = hash_table->getHashTableBufferSize(device_type);
#ifdef HAVE_CUDA
#if defined(HAVE_CUDA) || defined(HAVE_L0)
auto buffer_provider = executor_->getBufferProvider();
std::unique_ptr<int8_t[]> buffer_copy;
if (device_type == ExecutorDeviceType::GPU) {
Expand Down Expand Up @@ -204,7 +204,7 @@ std::set<DecodedJoinHashBufferEntry> BaselineJoinHashTable::toSet(
auto hash_table = getHashTableForDevice(device_id);
CHECK(hash_table);
auto buffer_size = hash_table->getHashTableBufferSize(device_type);
#ifdef HAVE_CUDA
#if defined(HAVE_CUDA) || defined(HAVE_L0)
auto buffer_provider = executor_->getBufferProvider();
std::unique_ptr<int8_t[]> buffer_copy;
if (device_type == ExecutorDeviceType::GPU) {
Expand Down Expand Up @@ -375,7 +375,7 @@ std::pair<size_t, size_t> BaselineJoinHashTable::approximateTupleCount(
}
return std::make_pair(hll_size(hll_result, count_distinct_desc.bitmap_sz_bits), 0);
}
#ifdef HAVE_CUDA
#if defined(HAVE_CUDA) || defined(HAVE_L0)
auto buffer_provider = executor_->getBufferProvider();
std::vector<std::vector<uint8_t>> host_hll_buffers(device_count_);
for (auto& host_hll_buffer : host_hll_buffers) {
Expand Down Expand Up @@ -409,11 +409,19 @@ std::pair<size_t, size_t> BaselineJoinHashTable::approximateTupleCount(
nullptr);
const auto key_handler_gpu =
transfer_flat_object_to_gpu(key_handler, allocator);
#ifdef HAVE_CUDA
approximate_distinct_tuples_on_device(
reinterpret_cast<uint8_t*>(device_hll_buffer),
count_distinct_desc.bitmap_sz_bits,
key_handler_gpu,
columns_for_device.join_columns[0].num_elems);
#else
approximate_distinct_tuples_on_l0(reinterpret_cast<uint8_t*>(device_hll_buffer),
nullptr,
count_distinct_desc.bitmap_sz_bits,
columns_for_device.join_columns[0].num_elems,
key_handler_gpu);
#endif

auto& host_hll_buffer = host_hll_buffers[device_id];
buffer_provider->copyFromDevice(reinterpret_cast<int8_t*>(&host_hll_buffer[0]),
Expand Down Expand Up @@ -675,7 +683,7 @@ int BaselineJoinHashTable::initHashTableForDevice(
// but the query runs on GPU (join on dictionary encoded columns).
// Don't transfer the buffer if there was an error since we'll bail anyway.
if (memory_level_ == Data_Namespace::GPU_LEVEL && !err) {
#ifdef HAVE_CUDA
#if defined(HAVE_CUDA) || defined(HAVE_L0)
BaselineJoinHashTableBuilder builder;

builder.allocateDeviceMemory(hashtable_layout,
Expand Down Expand Up @@ -706,7 +714,7 @@ int BaselineJoinHashTable::initHashTableForDevice(
#endif
}
} else {
#ifdef HAVE_CUDA
#if defined(HAVE_CUDA) || defined(HAVE_L0)
BaselineJoinHashTableBuilder builder;

GpuAllocator allocator(executor_->getBufferProvider(), device_id);
Expand Down
Loading
Loading